gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "attribs.h"
  37 #include "optabs.h"
  38 #include "regs.h"
  39 #include "emit-rtl.h"
  40 #include "recog.h"
  41 #include "diagnostic.h"
  42 #include "insn-attr.h"
  43 #include "alias.h"
  44 #include "fold-const.h"
  45 #include "stor-layout.h"
  46 #include "calls.h"
  47 #include "varasm.h"
  48 #include "output.h"
  49 #include "flags.h"
  50 #include "explow.h"
  51 #include "expr.h"
  52 #include "reload.h"
  53 #include "langhooks.h"
  54 #include "opts.h"
  55 #include "params.h"
  56 #include "gimplify.h"
  57 #include "dwarf2.h"
  58 #include "gimple-iterator.h"
  59 #include "tree-vectorizer.h"
  60 #include "aarch64-cost-tables.h"
  61 #include "dumpfile.h"
  62 #include "builtins.h"
  63 #include "rtl-iter.h"
  64 #include "tm-constrs.h"
  65 #include "sched-int.h"
  66 #include "target-globals.h"
  67 #include "common/common-target.h"
  68 #include "selftest.h"
  69 #include "selftest-rtl.h"
  70
  71 /* This file should be included last.  */
  72 #include "target-def.h"
  73
  74 /* Defined for convenience.  */
  75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  76
  77 /* Classifies an address.
  78
  79    ADDRESS_REG_IMM
  80        A simple base register plus immediate offset.
  81
  82    ADDRESS_REG_WB
  83        A base register indexed by immediate offset with writeback.
  84
  85    ADDRESS_REG_REG
  86        A base register indexed by (optionally scaled) register.
  87
  88    ADDRESS_REG_UXTW
  89        A base register indexed by (optionally scaled) zero-extended register.
  90
  91    ADDRESS_REG_SXTW
  92        A base register indexed by (optionally scaled) sign-extended register.
  93
  94    ADDRESS_LO_SUM
  95        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  96
  97    ADDRESS_SYMBOLIC:
  98        A constant symbolic address, in pc-relative literal pool.  */
  99
 100 enum aarch64_address_type {
 101   ADDRESS_REG_IMM,
 102   ADDRESS_REG_WB,
 103   ADDRESS_REG_REG,
 104   ADDRESS_REG_UXTW,
 105   ADDRESS_REG_SXTW,
 106   ADDRESS_LO_SUM,
 107   ADDRESS_SYMBOLIC
 108 };
 109
 110 struct aarch64_address_info {
 111   enum aarch64_address_type type;
 112   rtx base;
 113   rtx offset;
 114   int shift;
 115   enum aarch64_symbol_type symbol_type;
 116 };
 117
 118 struct simd_immediate_info
 119 {
 120   rtx value;
 121   int shift;
 122   int element_width;
 123   bool mvn;
 124   bool msl;
 125 };
 126
 127 /* The current code model.  */
 128 enum aarch64_code_model aarch64_cmodel;
 129
 130 #ifdef HAVE_AS_TLS
 131 #undef TARGET_HAVE_TLS
 132 #define TARGET_HAVE_TLS 1
 133 #endif
 134
 135 static bool aarch64_composite_type_p (const_tree, machine_mode);
 136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 137                                                      const_tree,
 138                                                      machine_mode *, int *,
 139                                                      bool *);
 140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 142 static void aarch64_override_options_after_change (void);
 143 static bool aarch64_vector_mode_supported_p (machine_mode);
 144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 145                                                  const unsigned char *sel);
 146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 148                                                          const_tree type,
 149                                                          int misalignment,
 150                                                          bool is_packed);
 151 static machine_mode
 152 aarch64_simd_container_mode (machine_mode mode, unsigned width);
 153
 154 /* Major revision number of the ARM Architecture implemented by the target.  */
 155 unsigned aarch64_architecture_version;
 156
 157 /* The processor for which instructions should be scheduled.  */
 158 enum aarch64_processor aarch64_tune = cortexa53;
 159
 160 /* Mask to specify which instruction scheduling options should be used.  */
 161 unsigned long aarch64_tune_flags = 0;
 162
 163 /* Global flag for PC relative loads.  */
 164 bool aarch64_pcrelative_literal_loads;
 165
 166 /* Support for command line parsing of boolean flags in the tuning
 167    structures.  */
 168 struct aarch64_flag_desc
 169 {
 170   const char* name;
 171   unsigned int flag;
 172 };
 173
 174 #define AARCH64_FUSION_PAIR(name, internal_name) \
 175   { name, AARCH64_FUSE_##internal_name },
 176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 177 {
 178   { "none", AARCH64_FUSE_NOTHING },
 179 #include "aarch64-fusion-pairs.def"
 180   { "all", AARCH64_FUSE_ALL },
 181   { NULL, AARCH64_FUSE_NOTHING }
 182 };
 183
 184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 185   { name, AARCH64_EXTRA_TUNE_##internal_name },
 186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 187 {
 188   { "none", AARCH64_EXTRA_TUNE_NONE },
 189 #include "aarch64-tuning-flags.def"
 190   { "all", AARCH64_EXTRA_TUNE_ALL },
 191   { NULL, AARCH64_EXTRA_TUNE_NONE }
 192 };
 193
 194 /* Tuning parameters.  */
 195
 196 static const struct cpu_addrcost_table generic_addrcost_table =
 197 {
 198     {
 199       1, /* hi  */
 200       0, /* si  */
 201       0, /* di  */
 202       1, /* ti  */
 203     },
 204   0, /* pre_modify  */
 205   0, /* post_modify  */
 206   0, /* register_offset  */
 207   0, /* register_sextend  */
 208   0, /* register_zextend  */
 209   0 /* imm_offset  */
 210 };
 211
 212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 213 {
 214     {
 215       0, /* hi  */
 216       0, /* si  */
 217       0, /* di  */
 218       2, /* ti  */
 219     },
 220   0, /* pre_modify  */
 221   0, /* post_modify  */
 222   1, /* register_offset  */
 223   1, /* register_sextend  */
 224   2, /* register_zextend  */
 225   0, /* imm_offset  */
 226 };
 227
 228 static const struct cpu_addrcost_table xgene1_addrcost_table =
 229 {
 230     {
 231       1, /* hi  */
 232       0, /* si  */
 233       0, /* di  */
 234       1, /* ti  */
 235     },
 236   1, /* pre_modify  */
 237   0, /* post_modify  */
 238   0, /* register_offset  */
 239   1, /* register_sextend  */
 240   1, /* register_zextend  */
 241   0, /* imm_offset  */
 242 };
 243
 244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 245 {
 246     {
 247       1, /* hi  */
 248       1, /* si  */
 249       1, /* di  */
 250       2, /* ti  */
 251     },
 252   0, /* pre_modify  */
 253   0, /* post_modify  */
 254   2, /* register_offset  */
 255   3, /* register_sextend  */
 256   3, /* register_zextend  */
 257   0, /* imm_offset  */
 258 };
 259
 260 static const struct cpu_regmove_cost generic_regmove_cost =
 261 {
 262   1, /* GP2GP  */
 263   /* Avoid the use of slow int<->fp moves for spilling by setting
 264      their cost higher than memmov_cost.  */
 265   5, /* GP2FP  */
 266   5, /* FP2GP  */
 267   2 /* FP2FP  */
 268 };
 269
 270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 271 {
 272   1, /* GP2GP  */
 273   /* Avoid the use of slow int<->fp moves for spilling by setting
 274      their cost higher than memmov_cost.  */
 275   5, /* GP2FP  */
 276   5, /* FP2GP  */
 277   2 /* FP2FP  */
 278 };
 279
 280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 281 {
 282   1, /* GP2GP  */
 283   /* Avoid the use of slow int<->fp moves for spilling by setting
 284      their cost higher than memmov_cost.  */
 285   5, /* GP2FP  */
 286   5, /* FP2GP  */
 287   2 /* FP2FP  */
 288 };
 289
 290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 291 {
 292   1, /* GP2GP  */
 293   /* Avoid the use of slow int<->fp moves for spilling by setting
 294      their cost higher than memmov_cost (actual, 4 and 9).  */
 295   9, /* GP2FP  */
 296   9, /* FP2GP  */
 297   1 /* FP2FP  */
 298 };
 299
 300 static const struct cpu_regmove_cost thunderx_regmove_cost =
 301 {
 302   2, /* GP2GP  */
 303   2, /* GP2FP  */
 304   6, /* FP2GP  */
 305   4 /* FP2FP  */
 306 };
 307
 308 static const struct cpu_regmove_cost xgene1_regmove_cost =
 309 {
 310   1, /* GP2GP  */
 311   /* Avoid the use of slow int<->fp moves for spilling by setting
 312      their cost higher than memmov_cost.  */
 313   8, /* GP2FP  */
 314   8, /* FP2GP  */
 315   2 /* FP2FP  */
 316 };
 317
 318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 319 {
 320   2, /* GP2GP  */
 321   /* Avoid the use of int<->fp moves for spilling.  */
 322   6, /* GP2FP  */
 323   6, /* FP2GP  */
 324   4 /* FP2FP  */
 325 };
 326
 327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 328 {
 329   1, /* GP2GP  */
 330   /* Avoid the use of int<->fp moves for spilling.  */
 331   8, /* GP2FP  */
 332   8, /* FP2GP  */
 333   4  /* FP2FP  */
 334 };
 335
 336 /* Generic costs for vector insn classes.  */
 337 static const struct cpu_vector_cost generic_vector_cost =
 338 {
 339   1, /* scalar_int_stmt_cost  */
 340   1, /* scalar_fp_stmt_cost  */
 341   1, /* scalar_load_cost  */
 342   1, /* scalar_store_cost  */
 343   1, /* vec_int_stmt_cost  */
 344   1, /* vec_fp_stmt_cost  */
 345   2, /* vec_permute_cost  */
 346   1, /* vec_to_scalar_cost  */
 347   1, /* scalar_to_vec_cost  */
 348   1, /* vec_align_load_cost  */
 349   1, /* vec_unalign_load_cost  */
 350   1, /* vec_unalign_store_cost  */
 351   1, /* vec_store_cost  */
 352   3, /* cond_taken_branch_cost  */
 353   1 /* cond_not_taken_branch_cost  */
 354 };
 355
 356 /* ThunderX costs for vector insn classes.  */
 357 static const struct cpu_vector_cost thunderx_vector_cost =
 358 {
 359   1, /* scalar_int_stmt_cost  */
 360   1, /* scalar_fp_stmt_cost  */
 361   3, /* scalar_load_cost  */
 362   1, /* scalar_store_cost  */
 363   4, /* vec_int_stmt_cost  */
 364   1, /* vec_fp_stmt_cost  */
 365   4, /* vec_permute_cost  */
 366   2, /* vec_to_scalar_cost  */
 367   2, /* scalar_to_vec_cost  */
 368   3, /* vec_align_load_cost  */
 369   5, /* vec_unalign_load_cost  */
 370   5, /* vec_unalign_store_cost  */
 371   1, /* vec_store_cost  */
 372   3, /* cond_taken_branch_cost  */
 373   3 /* cond_not_taken_branch_cost  */
 374 };
 375
 376 /* Generic costs for vector insn classes.  */
 377 static const struct cpu_vector_cost cortexa57_vector_cost =
 378 {
 379   1, /* scalar_int_stmt_cost  */
 380   1, /* scalar_fp_stmt_cost  */
 381   4, /* scalar_load_cost  */
 382   1, /* scalar_store_cost  */
 383   2, /* vec_int_stmt_cost  */
 384   2, /* vec_fp_stmt_cost  */
 385   3, /* vec_permute_cost  */
 386   8, /* vec_to_scalar_cost  */
 387   8, /* scalar_to_vec_cost  */
 388   4, /* vec_align_load_cost  */
 389   4, /* vec_unalign_load_cost  */
 390   1, /* vec_unalign_store_cost  */
 391   1, /* vec_store_cost  */
 392   1, /* cond_taken_branch_cost  */
 393   1 /* cond_not_taken_branch_cost  */
 394 };
 395
 396 static const struct cpu_vector_cost exynosm1_vector_cost =
 397 {
 398   1, /* scalar_int_stmt_cost  */
 399   1, /* scalar_fp_stmt_cost  */
 400   5, /* scalar_load_cost  */
 401   1, /* scalar_store_cost  */
 402   3, /* vec_int_stmt_cost  */
 403   3, /* vec_fp_stmt_cost  */
 404   3, /* vec_permute_cost  */
 405   3, /* vec_to_scalar_cost  */
 406   3, /* scalar_to_vec_cost  */
 407   5, /* vec_align_load_cost  */
 408   5, /* vec_unalign_load_cost  */
 409   1, /* vec_unalign_store_cost  */
 410   1, /* vec_store_cost  */
 411   1, /* cond_taken_branch_cost  */
 412   1 /* cond_not_taken_branch_cost  */
 413 };
 414
 415 /* Generic costs for vector insn classes.  */
 416 static const struct cpu_vector_cost xgene1_vector_cost =
 417 {
 418   1, /* scalar_int_stmt_cost  */
 419   1, /* scalar_fp_stmt_cost  */
 420   5, /* scalar_load_cost  */
 421   1, /* scalar_store_cost  */
 422   2, /* vec_int_stmt_cost  */
 423   2, /* vec_fp_stmt_cost  */
 424   2, /* vec_permute_cost  */
 425   4, /* vec_to_scalar_cost  */
 426   4, /* scalar_to_vec_cost  */
 427   10, /* vec_align_load_cost  */
 428   10, /* vec_unalign_load_cost  */
 429   2, /* vec_unalign_store_cost  */
 430   2, /* vec_store_cost  */
 431   2, /* cond_taken_branch_cost  */
 432   1 /* cond_not_taken_branch_cost  */
 433 };
 434
 435 /* Costs for vector insn classes for Vulcan.  */
 436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 437 {
 438   1, /* scalar_int_stmt_cost  */
 439   6, /* scalar_fp_stmt_cost  */
 440   4, /* scalar_load_cost  */
 441   1, /* scalar_store_cost  */
 442   5, /* vec_int_stmt_cost  */
 443   6, /* vec_fp_stmt_cost  */
 444   3, /* vec_permute_cost  */
 445   6, /* vec_to_scalar_cost  */
 446   5, /* scalar_to_vec_cost  */
 447   8, /* vec_align_load_cost  */
 448   8, /* vec_unalign_load_cost  */
 449   4, /* vec_unalign_store_cost  */
 450   4, /* vec_store_cost  */
 451   2, /* cond_taken_branch_cost  */
 452   1  /* cond_not_taken_branch_cost  */
 453 };
 454
 455 /* Generic costs for branch instructions.  */
 456 static const struct cpu_branch_cost generic_branch_cost =
 457 {
 458   1,  /* Predictable.  */
 459   3   /* Unpredictable.  */
 460 };
 461
 462 /* Generic approximation modes.  */
 463 static const cpu_approx_modes generic_approx_modes =
 464 {
 465   AARCH64_APPROX_NONE,  /* division  */
 466   AARCH64_APPROX_NONE,  /* sqrt  */
 467   AARCH64_APPROX_NONE   /* recip_sqrt  */
 468 };
 469
 470 /* Approximation modes for Exynos M1.  */
 471 static const cpu_approx_modes exynosm1_approx_modes =
 472 {
 473   AARCH64_APPROX_NONE,  /* division  */
 474   AARCH64_APPROX_ALL,   /* sqrt  */
 475   AARCH64_APPROX_ALL    /* recip_sqrt  */
 476 };
 477
 478 /* Approximation modes for X-Gene 1.  */
 479 static const cpu_approx_modes xgene1_approx_modes =
 480 {
 481   AARCH64_APPROX_NONE,  /* division  */
 482   AARCH64_APPROX_NONE,  /* sqrt  */
 483   AARCH64_APPROX_ALL    /* recip_sqrt  */
 484 };
 485
 486 /* Generic prefetch settings (which disable prefetch).  */
 487 static const cpu_prefetch_tune generic_prefetch_tune =
 488 {
 489   0,                    /* num_slots  */
 490   -1,                   /* l1_cache_size  */
 491   -1,                   /* l1_cache_line_size  */
 492   -1,                   /* l2_cache_size  */
 493   -1                    /* default_opt_level  */
 494 };
 495
 496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 497 {
 498   0,                    /* num_slots  */
 499   -1,                   /* l1_cache_size  */
 500   64,                   /* l1_cache_line_size  */
 501   -1,                   /* l2_cache_size  */
 502   -1                    /* default_opt_level  */
 503 };
 504
 505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 506 {
 507   4,                    /* num_slots  */
 508   32,                   /* l1_cache_size  */
 509   64,                   /* l1_cache_line_size  */
 510   1024,                 /* l2_cache_size  */
 511   3                     /* default_opt_level  */
 512 };
 513
 514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 515 {
 516   8,                    /* num_slots  */
 517   32,                   /* l1_cache_size  */
 518   128,                  /* l1_cache_line_size  */
 519   16*1024,              /* l2_cache_size  */
 520   3                     /* default_opt_level  */
 521 };
 522
 523 static const cpu_prefetch_tune thunderx_prefetch_tune =
 524 {
 525   8,                    /* num_slots  */
 526   32,                   /* l1_cache_size  */
 527   128,                  /* l1_cache_line_size  */
 528   -1,                   /* l2_cache_size  */
 529   -1                    /* default_opt_level  */
 530 };
 531
 532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 533 {
 534   8,                    /* num_slots  */
 535   32,                   /* l1_cache_size  */
 536   64,                   /* l1_cache_line_size  */
 537   256,                  /* l2_cache_size  */
 538   -1                    /* default_opt_level  */
 539 };
 540
 541 static const struct tune_params generic_tunings =
 542 {
 543   &cortexa57_extra_costs,
 544   &generic_addrcost_table,
 545   &generic_regmove_cost,
 546   &generic_vector_cost,
 547   &generic_branch_cost,
 548   &generic_approx_modes,
 549   4, /* memmov_cost  */
 550   2, /* issue_rate  */
 551   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 552   8,    /* function_align.  */
 553   4,    /* jump_align.  */
 554   8,    /* loop_align.  */
 555   2,    /* int_reassoc_width.  */
 556   4,    /* fp_reassoc_width.  */
 557   1,    /* vec_reassoc_width.  */
 558   2,    /* min_div_recip_mul_sf.  */
 559   2,    /* min_div_recip_mul_df.  */
 560   0,    /* max_case_values.  */
 561   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 562   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 563   &generic_prefetch_tune
 564 };
 565
 566 static const struct tune_params cortexa35_tunings =
 567 {
 568   &cortexa53_extra_costs,
 569   &generic_addrcost_table,
 570   &cortexa53_regmove_cost,
 571   &generic_vector_cost,
 572   &generic_branch_cost,
 573   &generic_approx_modes,
 574   4, /* memmov_cost  */
 575   1, /* issue_rate  */
 576   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 577    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 578   16,   /* function_align.  */
 579   4,    /* jump_align.  */
 580   8,    /* loop_align.  */
 581   2,    /* int_reassoc_width.  */
 582   4,    /* fp_reassoc_width.  */
 583   1,    /* vec_reassoc_width.  */
 584   2,    /* min_div_recip_mul_sf.  */
 585   2,    /* min_div_recip_mul_df.  */
 586   0,    /* max_case_values.  */
 587   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 588   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 589   &generic_prefetch_tune
 590 };
 591
 592 static const struct tune_params cortexa53_tunings =
 593 {
 594   &cortexa53_extra_costs,
 595   &generic_addrcost_table,
 596   &cortexa53_regmove_cost,
 597   &generic_vector_cost,
 598   &generic_branch_cost,
 599   &generic_approx_modes,
 600   4, /* memmov_cost  */
 601   2, /* issue_rate  */
 602   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 603    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 604   16,   /* function_align.  */
 605   4,    /* jump_align.  */
 606   8,    /* loop_align.  */
 607   2,    /* int_reassoc_width.  */
 608   4,    /* fp_reassoc_width.  */
 609   1,    /* vec_reassoc_width.  */
 610   2,    /* min_div_recip_mul_sf.  */
 611   2,    /* min_div_recip_mul_df.  */
 612   0,    /* max_case_values.  */
 613   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 614   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 615   &generic_prefetch_tune
 616 };
 617
 618 static const struct tune_params cortexa57_tunings =
 619 {
 620   &cortexa57_extra_costs,
 621   &generic_addrcost_table,
 622   &cortexa57_regmove_cost,
 623   &cortexa57_vector_cost,
 624   &generic_branch_cost,
 625   &generic_approx_modes,
 626   4, /* memmov_cost  */
 627   3, /* issue_rate  */
 628   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 629    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 630   16,   /* function_align.  */
 631   4,    /* jump_align.  */
 632   8,    /* loop_align.  */
 633   2,    /* int_reassoc_width.  */
 634   4,    /* fp_reassoc_width.  */
 635   1,    /* vec_reassoc_width.  */
 636   2,    /* min_div_recip_mul_sf.  */
 637   2,    /* min_div_recip_mul_df.  */
 638   0,    /* max_case_values.  */
 639   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 640   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 641   &generic_prefetch_tune
 642 };
 643
 644 static const struct tune_params cortexa72_tunings =
 645 {
 646   &cortexa57_extra_costs,
 647   &generic_addrcost_table,
 648   &cortexa57_regmove_cost,
 649   &cortexa57_vector_cost,
 650   &generic_branch_cost,
 651   &generic_approx_modes,
 652   4, /* memmov_cost  */
 653   3, /* issue_rate  */
 654   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 655    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 656   16,   /* function_align.  */
 657   4,    /* jump_align.  */
 658   8,    /* loop_align.  */
 659   2,    /* int_reassoc_width.  */
 660   4,    /* fp_reassoc_width.  */
 661   1,    /* vec_reassoc_width.  */
 662   2,    /* min_div_recip_mul_sf.  */
 663   2,    /* min_div_recip_mul_df.  */
 664   0,    /* max_case_values.  */
 665   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 666   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 667   &generic_prefetch_tune
 668 };
 669
 670 static const struct tune_params cortexa73_tunings =
 671 {
 672   &cortexa57_extra_costs,
 673   &generic_addrcost_table,
 674   &cortexa57_regmove_cost,
 675   &cortexa57_vector_cost,
 676   &generic_branch_cost,
 677   &generic_approx_modes,
 678   4, /* memmov_cost.  */
 679   2, /* issue_rate.  */
 680   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 681    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 682   16,   /* function_align.  */
 683   4,    /* jump_align.  */
 684   8,    /* loop_align.  */
 685   2,    /* int_reassoc_width.  */
 686   4,    /* fp_reassoc_width.  */
 687   1,    /* vec_reassoc_width.  */
 688   2,    /* min_div_recip_mul_sf.  */
 689   2,    /* min_div_recip_mul_df.  */
 690   0,    /* max_case_values.  */
 691   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 692   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 693   &generic_prefetch_tune
 694 };
 695
 696
 697
 698 static const struct tune_params exynosm1_tunings =
 699 {
 700   &exynosm1_extra_costs,
 701   &exynosm1_addrcost_table,
 702   &exynosm1_regmove_cost,
 703   &exynosm1_vector_cost,
 704   &generic_branch_cost,
 705   &exynosm1_approx_modes,
 706   4,    /* memmov_cost  */
 707   3,    /* issue_rate  */
 708   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 709   4,    /* function_align.  */
 710   4,    /* jump_align.  */
 711   4,    /* loop_align.  */
 712   2,    /* int_reassoc_width.  */
 713   4,    /* fp_reassoc_width.  */
 714   1,    /* vec_reassoc_width.  */
 715   2,    /* min_div_recip_mul_sf.  */
 716   2,    /* min_div_recip_mul_df.  */
 717   48,   /* max_case_values.  */
 718   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 719   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 720   &exynosm1_prefetch_tune
 721 };
 722
 723 static const struct tune_params thunderxt88_tunings =
 724 {
 725   &thunderx_extra_costs,
 726   &generic_addrcost_table,
 727   &thunderx_regmove_cost,
 728   &thunderx_vector_cost,
 729   &generic_branch_cost,
 730   &generic_approx_modes,
 731   6, /* memmov_cost  */
 732   2, /* issue_rate  */
 733   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 734   8,    /* function_align.  */
 735   8,    /* jump_align.  */
 736   8,    /* loop_align.  */
 737   2,    /* int_reassoc_width.  */
 738   4,    /* fp_reassoc_width.  */
 739   1,    /* vec_reassoc_width.  */
 740   2,    /* min_div_recip_mul_sf.  */
 741   2,    /* min_div_recip_mul_df.  */
 742   0,    /* max_case_values.  */
 743   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 744   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 745   &thunderxt88_prefetch_tune
 746 };
 747
 748 static const struct tune_params thunderx_tunings =
 749 {
 750   &thunderx_extra_costs,
 751   &generic_addrcost_table,
 752   &thunderx_regmove_cost,
 753   &thunderx_vector_cost,
 754   &generic_branch_cost,
 755   &generic_approx_modes,
 756   6, /* memmov_cost  */
 757   2, /* issue_rate  */
 758   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 759   8,    /* function_align.  */
 760   8,    /* jump_align.  */
 761   8,    /* loop_align.  */
 762   2,    /* int_reassoc_width.  */
 763   4,    /* fp_reassoc_width.  */
 764   1,    /* vec_reassoc_width.  */
 765   2,    /* min_div_recip_mul_sf.  */
 766   2,    /* min_div_recip_mul_df.  */
 767   0,    /* max_case_values.  */
 768   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 769   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 770    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 771   &thunderx_prefetch_tune
 772 };
 773
 774 static const struct tune_params xgene1_tunings =
 775 {
 776   &xgene1_extra_costs,
 777   &xgene1_addrcost_table,
 778   &xgene1_regmove_cost,
 779   &xgene1_vector_cost,
 780   &generic_branch_cost,
 781   &xgene1_approx_modes,
 782   6, /* memmov_cost  */
 783   4, /* issue_rate  */
 784   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 785   16,   /* function_align.  */
 786   8,    /* jump_align.  */
 787   16,   /* loop_align.  */
 788   2,    /* int_reassoc_width.  */
 789   4,    /* fp_reassoc_width.  */
 790   1,    /* vec_reassoc_width.  */
 791   2,    /* min_div_recip_mul_sf.  */
 792   2,    /* min_div_recip_mul_df.  */
 793   0,    /* max_case_values.  */
 794   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 795   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 796   &generic_prefetch_tune
 797 };
 798
 799 static const struct tune_params qdf24xx_tunings =
 800 {
 801   &qdf24xx_extra_costs,
 802   &generic_addrcost_table,
 803   &qdf24xx_regmove_cost,
 804   &generic_vector_cost,
 805   &generic_branch_cost,
 806   &generic_approx_modes,
 807   4, /* memmov_cost  */
 808   4, /* issue_rate  */
 809   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 811   16,   /* function_align.  */
 812   8,    /* jump_align.  */
 813   16,   /* loop_align.  */
 814   2,    /* int_reassoc_width.  */
 815   4,    /* fp_reassoc_width.  */
 816   1,    /* vec_reassoc_width.  */
 817   2,    /* min_div_recip_mul_sf.  */
 818   2,    /* min_div_recip_mul_df.  */
 819   0,    /* max_case_values.  */
 820   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 821   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 822   &qdf24xx_prefetch_tune
 823 };
 824
 825 static const struct tune_params thunderx2t99_tunings =
 826 {
 827   &thunderx2t99_extra_costs,
 828   &thunderx2t99_addrcost_table,
 829   &thunderx2t99_regmove_cost,
 830   &thunderx2t99_vector_cost,
 831   &generic_branch_cost,
 832   &generic_approx_modes,
 833   4, /* memmov_cost.  */
 834   4, /* issue_rate.  */
 835   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 836    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 837   16,   /* function_align.  */
 838   8,    /* jump_align.  */
 839   16,   /* loop_align.  */
 840   3,    /* int_reassoc_width.  */
 841   2,    /* fp_reassoc_width.  */
 842   2,    /* vec_reassoc_width.  */
 843   2,    /* min_div_recip_mul_sf.  */
 844   2,    /* min_div_recip_mul_df.  */
 845   0,    /* max_case_values.  */
 846   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 847   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 848   &thunderx2t99_prefetch_tune
 849 };
 850
 851 /* Support for fine-grained override of the tuning structures.  */
 852 struct aarch64_tuning_override_function
 853 {
 854   const char* name;
 855   void (*parse_override)(const char*, struct tune_params*);
 856 };
 857
 858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 860
 861 static const struct aarch64_tuning_override_function
 862 aarch64_tuning_override_functions[] =
 863 {
 864   { "fuse", aarch64_parse_fuse_string },
 865   { "tune", aarch64_parse_tune_string },
 866   { NULL, NULL }
 867 };
 868
 869 /* A processor implementing AArch64.  */
 870 struct processor
 871 {
 872   const char *const name;
 873   enum aarch64_processor ident;
 874   enum aarch64_processor sched_core;
 875   enum aarch64_arch arch;
 876   unsigned architecture_version;
 877   const unsigned long flags;
 878   const struct tune_params *const tune;
 879 };
 880
 881 /* Architectures implementing AArch64.  */
 882 static const struct processor all_architectures[] =
 883 {
 884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 885   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 886 #include "aarch64-arches.def"
 887   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 888 };
 889
 890 /* Processor cores implementing AArch64.  */
 891 static const struct processor all_cores[] =
 892 {
 893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 894   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 895   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 896   FLAGS, &COSTS##_tunings},
 897 #include "aarch64-cores.def"
 898   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 899     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 900   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 901 };
 902
 903
 904 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 905    handling code or by target attributes.  */
 906 static const struct processor *selected_arch;
 907 static const struct processor *selected_cpu;
 908 static const struct processor *selected_tune;
 909
 910 /* The current tuning set.  */
 911 struct tune_params aarch64_tune_params = generic_tunings;
 912
 913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 914
 915 /* An ISA extension in the co-processor and main instruction set space.  */
 916 struct aarch64_option_extension
 917 {
 918   const char *const name;
 919   const unsigned long flags_on;
 920   const unsigned long flags_off;
 921 };
 922
 923 typedef enum aarch64_cond_code
 924 {
 925   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 926   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 927   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 928 }
 929 aarch64_cc;
 930
 931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 932
 933 /* The condition codes of the processor, and the inverse function.  */
 934 static const char * const aarch64_condition_codes[] =
 935 {
 936   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 937   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 938 };
 939
 940 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 941 const char *
 942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 943                         const char * branch_format)
 944 {
 945     rtx_code_label * tmp_label = gen_label_rtx ();
 946     char label_buf[256];
 947     char buffer[128];
 948     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 949                                  CODE_LABEL_NUMBER (tmp_label));
 950     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 951     rtx dest_label = operands[pos_label];
 952     operands[pos_label] = tmp_label;
 953
 954     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 955     output_asm_insn (buffer, operands);
 956
 957     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 958     operands[pos_label] = dest_label;
 959     output_asm_insn (buffer, operands);
 960     return "";
 961 }
 962
 963 void
 964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 965 {
 966   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 967   if (TARGET_GENERAL_REGS_ONLY)
 968     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 969   else
 970     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 971 }
 972
 973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 974    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 975    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 976    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 977    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 978    irrespectively of its cost results in bad allocations with many redundant
 979    int<->FP moves which are expensive on various cores.
 980    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 981    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 982    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 983    Otherwise set the allocno class depending on the mode.
 984    The result of this is that it is no longer inefficient to have a higher
 985    memory move cost than the register move cost.
 986 */
 987
 988 static reg_class_t
 989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 990                                          reg_class_t best_class)
 991 {
 992   machine_mode mode;
 993
 994   if (allocno_class != ALL_REGS)
 995     return allocno_class;
 996
 997   if (best_class != ALL_REGS)
 998     return best_class;
 999
1000   mode = PSEUDO_REGNO_MODE (regno);
1001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1002 }
1003
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 {
1007   if (GET_MODE_UNIT_SIZE (mode) == 4)
1008     return aarch64_tune_params.min_div_recip_mul_sf;
1009   return aarch64_tune_params.min_div_recip_mul_df;
1010 }
1011
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014                              machine_mode mode)
1015 {
1016   if (VECTOR_MODE_P (mode))
1017     return aarch64_tune_params.vec_reassoc_width;
1018   if (INTEGRAL_MODE_P (mode))
1019     return aarch64_tune_params.int_reassoc_width;
1020   if (FLOAT_MODE_P (mode))
1021     return aarch64_tune_params.fp_reassoc_width;
1022   return 1;
1023 }
1024
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1028 {
1029    if (GP_REGNUM_P (regno))
1030      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031    else if (regno == SP_REGNUM)
1032      return AARCH64_DWARF_SP;
1033    else if (FP_REGNUM_P (regno))
1034      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035
1036    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037       equivalent DWARF register.  */
1038    return DWARF_FRAME_REGISTERS;
1039 }
1040
1041 /* Return TRUE if MODE is any of the large INT modes.  */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1044 {
1045   return mode == OImode || mode == CImode || mode == XImode;
1046 }
1047
1048 /* Return TRUE if MODE is any of the vector modes.  */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1051 {
1052   return aarch64_vector_mode_supported_p (mode)
1053          || aarch64_vect_struct_mode_p (mode);
1054 }
1055
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059                                 unsigned HOST_WIDE_INT nelems)
1060 {
1061   if (TARGET_SIMD
1062       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064       && (nelems >= 2 && nelems <= 4))
1065     return true;
1066
1067   return false;
1068 }
1069
1070 /* Implement HARD_REGNO_NREGS.  */
1071
1072 int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 {
1075   switch (aarch64_regno_regclass (regno))
1076     {
1077     case FP_REGS:
1078     case FP_LO_REGS:
1079       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080     default:
1081       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082     }
1083   gcc_unreachable ();
1084 }
1085
1086 /* Implement HARD_REGNO_MODE_OK.  */
1087
1088 int
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 {
1091   if (GET_MODE_CLASS (mode) == MODE_CC)
1092     return regno == CC_REGNUM;
1093
1094   if (regno == SP_REGNUM)
1095     /* The purpose of comparing with ptr_mode is to support the
1096        global register variable associated with the stack pointer
1097        register via the syntax of asm ("wsp") in ILP32.  */
1098     return mode == Pmode || mode == ptr_mode;
1099
1100   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101     return mode == Pmode;
1102
1103   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104     return 1;
1105
1106   if (FP_REGNUM_P (regno))
1107     {
1108       if (aarch64_vect_struct_mode_p (mode))
1109         return
1110           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111       else
1112         return 1;
1113     }
1114
1115   return 0;
1116 }
1117
1118 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1119 machine_mode
1120 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1121                                      machine_mode mode)
1122 {
1123   /* Handle modes that fit within single registers.  */
1124   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1125     {
1126       if (GET_MODE_SIZE (mode) >= 4)
1127         return mode;
1128       else
1129         return SImode;
1130     }
1131   /* Fall back to generic for multi-reg and very large modes.  */
1132   else
1133     return choose_hard_reg_mode (regno, nregs, false);
1134 }
1135
1136 /* Return true if calls to DECL should be treated as
1137    long-calls (ie called via a register).  */
1138 static bool
1139 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1140 {
1141   return false;
1142 }
1143
1144 /* Return true if calls to symbol-ref SYM should be treated as
1145    long-calls (ie called via a register).  */
1146 bool
1147 aarch64_is_long_call_p (rtx sym)
1148 {
1149   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1150 }
1151
1152 /* Return true if calls to symbol-ref SYM should not go through
1153    plt stubs.  */
1154
1155 bool
1156 aarch64_is_noplt_call_p (rtx sym)
1157 {
1158   const_tree decl = SYMBOL_REF_DECL (sym);
1159
1160   if (flag_pic
1161       && decl
1162       && (!flag_plt
1163           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1164       && !targetm.binds_local_p (decl))
1165     return true;
1166
1167   return false;
1168 }
1169
1170 /* Return true if the offsets to a zero/sign-extract operation
1171    represent an expression that matches an extend operation.  The
1172    operands represent the paramters from
1173
1174    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1175 bool
1176 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1177                                 rtx extract_imm)
1178 {
1179   HOST_WIDE_INT mult_val, extract_val;
1180
1181   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1182     return false;
1183
1184   mult_val = INTVAL (mult_imm);
1185   extract_val = INTVAL (extract_imm);
1186
1187   if (extract_val > 8
1188       && extract_val < GET_MODE_BITSIZE (mode)
1189       && exact_log2 (extract_val & ~7) > 0
1190       && (extract_val & 7) <= 4
1191       && mult_val == (1 << (extract_val & 7)))
1192     return true;
1193
1194   return false;
1195 }
1196
1197 /* Emit an insn that's a simple single-set.  Both the operands must be
1198    known to be valid.  */
1199 inline static rtx_insn *
1200 emit_set_insn (rtx x, rtx y)
1201 {
1202   return emit_insn (gen_rtx_SET (x, y));
1203 }
1204
1205 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1206    return the rtx for register 0 in the proper mode.  */
1207 rtx
1208 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1209 {
1210   machine_mode mode = SELECT_CC_MODE (code, x, y);
1211   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1212
1213   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1214   return cc_reg;
1215 }
1216
1217 /* Build the SYMBOL_REF for __tls_get_addr.  */
1218
1219 static GTY(()) rtx tls_get_addr_libfunc;
1220
1221 rtx
1222 aarch64_tls_get_addr (void)
1223 {
1224   if (!tls_get_addr_libfunc)
1225     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1226   return tls_get_addr_libfunc;
1227 }
1228
1229 /* Return the TLS model to use for ADDR.  */
1230
1231 static enum tls_model
1232 tls_symbolic_operand_type (rtx addr)
1233 {
1234   enum tls_model tls_kind = TLS_MODEL_NONE;
1235   rtx sym, addend;
1236
1237   if (GET_CODE (addr) == CONST)
1238     {
1239       split_const (addr, &sym, &addend);
1240       if (GET_CODE (sym) == SYMBOL_REF)
1241         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1242     }
1243   else if (GET_CODE (addr) == SYMBOL_REF)
1244     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1245
1246   return tls_kind;
1247 }
1248
1249 /* We'll allow lo_sum's in addresses in our legitimate addresses
1250    so that combine would take care of combining addresses where
1251    necessary, but for generation purposes, we'll generate the address
1252    as :
1253    RTL                               Absolute
1254    tmp = hi (symbol_ref);            adrp  x1, foo
1255    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1256                                      nop
1257
1258    PIC                               TLS
1259    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1260    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1261                                      bl   __tls_get_addr
1262                                      nop
1263
1264    Load TLS symbol, depending on TLS mechanism and TLS access model.
1265
1266    Global Dynamic - Traditional TLS:
1267    adrp tmp, :tlsgd:imm
1268    add  dest, tmp, #:tlsgd_lo12:imm
1269    bl   __tls_get_addr
1270
1271    Global Dynamic - TLS Descriptors:
1272    adrp dest, :tlsdesc:imm
1273    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1274    add  dest, dest, #:tlsdesc_lo12:imm
1275    blr  tmp
1276    mrs  tp, tpidr_el0
1277    add  dest, dest, tp
1278
1279    Initial Exec:
1280    mrs  tp, tpidr_el0
1281    adrp tmp, :gottprel:imm
1282    ldr  dest, [tmp, #:gottprel_lo12:imm]
1283    add  dest, dest, tp
1284
1285    Local Exec:
1286    mrs  tp, tpidr_el0
1287    add  t0, tp, #:tprel_hi12:imm, lsl #12
1288    add  t0, t0, #:tprel_lo12_nc:imm
1289 */
1290
1291 static void
1292 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1293                                    enum aarch64_symbol_type type)
1294 {
1295   switch (type)
1296     {
1297     case SYMBOL_SMALL_ABSOLUTE:
1298       {
1299         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1300         rtx tmp_reg = dest;
1301         machine_mode mode = GET_MODE (dest);
1302
1303         gcc_assert (mode == Pmode || mode == ptr_mode);
1304
1305         if (can_create_pseudo_p ())
1306           tmp_reg = gen_reg_rtx (mode);
1307
1308         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1309         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1310         return;
1311       }
1312
1313     case SYMBOL_TINY_ABSOLUTE:
1314       emit_insn (gen_rtx_SET (dest, imm));
1315       return;
1316
1317     case SYMBOL_SMALL_GOT_28K:
1318       {
1319         machine_mode mode = GET_MODE (dest);
1320         rtx gp_rtx = pic_offset_table_rtx;
1321         rtx insn;
1322         rtx mem;
1323
1324         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1325            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1326            decide rtx costs, in which case pic_offset_table_rtx is not
1327            initialized.  For that case no need to generate the first adrp
1328            instruction as the final cost for global variable access is
1329            one instruction.  */
1330         if (gp_rtx != NULL)
1331           {
1332             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1333                using the page base as GOT base, the first page may be wasted,
1334                in the worst scenario, there is only 28K space for GOT).
1335
1336                The generate instruction sequence for accessing global variable
1337                is:
1338
1339                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1340
1341                Only one instruction needed. But we must initialize
1342                pic_offset_table_rtx properly.  We generate initialize insn for
1343                every global access, and allow CSE to remove all redundant.
1344
1345                The final instruction sequences will look like the following
1346                for multiply global variables access.
1347
1348                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1349
1350                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1351                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1352                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1353                  ...  */
1354
1355             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1356             crtl->uses_pic_offset_table = 1;
1357             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1358
1359             if (mode != GET_MODE (gp_rtx))
1360              gp_rtx = gen_lowpart (mode, gp_rtx);
1361
1362           }
1363
1364         if (mode == ptr_mode)
1365           {
1366             if (mode == DImode)
1367               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1368             else
1369               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1370
1371             mem = XVECEXP (SET_SRC (insn), 0, 0);
1372           }
1373         else
1374           {
1375             gcc_assert (mode == Pmode);
1376
1377             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1378             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1379           }
1380
1381         /* The operand is expected to be MEM.  Whenever the related insn
1382            pattern changed, above code which calculate mem should be
1383            updated.  */
1384         gcc_assert (GET_CODE (mem) == MEM);
1385         MEM_READONLY_P (mem) = 1;
1386         MEM_NOTRAP_P (mem) = 1;
1387         emit_insn (insn);
1388         return;
1389       }
1390
1391     case SYMBOL_SMALL_GOT_4G:
1392       {
1393         /* In ILP32, the mode of dest can be either SImode or DImode,
1394            while the got entry is always of SImode size.  The mode of
1395            dest depends on how dest is used: if dest is assigned to a
1396            pointer (e.g. in the memory), it has SImode; it may have
1397            DImode if dest is dereferenced to access the memeory.
1398            This is why we have to handle three different ldr_got_small
1399            patterns here (two patterns for ILP32).  */
1400
1401         rtx insn;
1402         rtx mem;
1403         rtx tmp_reg = dest;
1404         machine_mode mode = GET_MODE (dest);
1405
1406         if (can_create_pseudo_p ())
1407           tmp_reg = gen_reg_rtx (mode);
1408
1409         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1410         if (mode == ptr_mode)
1411           {
1412             if (mode == DImode)
1413               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1414             else
1415               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1416
1417             mem = XVECEXP (SET_SRC (insn), 0, 0);
1418           }
1419         else
1420           {
1421             gcc_assert (mode == Pmode);
1422
1423             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1424             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1425           }
1426
1427         gcc_assert (GET_CODE (mem) == MEM);
1428         MEM_READONLY_P (mem) = 1;
1429         MEM_NOTRAP_P (mem) = 1;
1430         emit_insn (insn);
1431         return;
1432       }
1433
1434     case SYMBOL_SMALL_TLSGD:
1435       {
1436         rtx_insn *insns;
1437         machine_mode mode = GET_MODE (dest);
1438         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1439
1440         start_sequence ();
1441         if (TARGET_ILP32)
1442           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1443         else
1444           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1445         insns = get_insns ();
1446         end_sequence ();
1447
1448         RTL_CONST_CALL_P (insns) = 1;
1449         emit_libcall_block (insns, dest, result, imm);
1450         return;
1451       }
1452
1453     case SYMBOL_SMALL_TLSDESC:
1454       {
1455         machine_mode mode = GET_MODE (dest);
1456         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1457         rtx tp;
1458
1459         gcc_assert (mode == Pmode || mode == ptr_mode);
1460
1461         /* In ILP32, the got entry is always of SImode size.  Unlike
1462            small GOT, the dest is fixed at reg 0.  */
1463         if (TARGET_ILP32)
1464           emit_insn (gen_tlsdesc_small_si (imm));
1465         else
1466           emit_insn (gen_tlsdesc_small_di (imm));
1467         tp = aarch64_load_tp (NULL);
1468
1469         if (mode != Pmode)
1470           tp = gen_lowpart (mode, tp);
1471
1472         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1473         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1474         return;
1475       }
1476
1477     case SYMBOL_SMALL_TLSIE:
1478       {
1479         /* In ILP32, the mode of dest can be either SImode or DImode,
1480            while the got entry is always of SImode size.  The mode of
1481            dest depends on how dest is used: if dest is assigned to a
1482            pointer (e.g. in the memory), it has SImode; it may have
1483            DImode if dest is dereferenced to access the memeory.
1484            This is why we have to handle three different tlsie_small
1485            patterns here (two patterns for ILP32).  */
1486         machine_mode mode = GET_MODE (dest);
1487         rtx tmp_reg = gen_reg_rtx (mode);
1488         rtx tp = aarch64_load_tp (NULL);
1489
1490         if (mode == ptr_mode)
1491           {
1492             if (mode == DImode)
1493               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1494             else
1495               {
1496                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1497                 tp = gen_lowpart (mode, tp);
1498               }
1499           }
1500         else
1501           {
1502             gcc_assert (mode == Pmode);
1503             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1504           }
1505
1506         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1507         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508         return;
1509       }
1510
1511     case SYMBOL_TLSLE12:
1512     case SYMBOL_TLSLE24:
1513     case SYMBOL_TLSLE32:
1514     case SYMBOL_TLSLE48:
1515       {
1516         machine_mode mode = GET_MODE (dest);
1517         rtx tp = aarch64_load_tp (NULL);
1518
1519         if (mode != Pmode)
1520           tp = gen_lowpart (mode, tp);
1521
1522         switch (type)
1523           {
1524           case SYMBOL_TLSLE12:
1525             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1526                         (dest, tp, imm));
1527             break;
1528           case SYMBOL_TLSLE24:
1529             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1530                         (dest, tp, imm));
1531           break;
1532           case SYMBOL_TLSLE32:
1533             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1534                         (dest, imm));
1535             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1536                         (dest, dest, tp));
1537           break;
1538           case SYMBOL_TLSLE48:
1539             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1540                         (dest, imm));
1541             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1542                         (dest, dest, tp));
1543             break;
1544           default:
1545             gcc_unreachable ();
1546           }
1547
1548         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1549         return;
1550       }
1551
1552     case SYMBOL_TINY_GOT:
1553       emit_insn (gen_ldr_got_tiny (dest, imm));
1554       return;
1555
1556     case SYMBOL_TINY_TLSIE:
1557       {
1558         machine_mode mode = GET_MODE (dest);
1559         rtx tp = aarch64_load_tp (NULL);
1560
1561         if (mode == ptr_mode)
1562           {
1563             if (mode == DImode)
1564               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1565             else
1566               {
1567                 tp = gen_lowpart (mode, tp);
1568                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1569               }
1570           }
1571         else
1572           {
1573             gcc_assert (mode == Pmode);
1574             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1575           }
1576
1577         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1578         return;
1579       }
1580
1581     default:
1582       gcc_unreachable ();
1583     }
1584 }
1585
1586 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1587    handle all moves if !can_create_pseudo_p ().  The distinction is
1588    important because, unlike emit_move_insn, the move expanders know
1589    how to force Pmode objects into the constant pool even when the
1590    constant pool address is not itself legitimate.  */
1591 static rtx
1592 aarch64_emit_move (rtx dest, rtx src)
1593 {
1594   return (can_create_pseudo_p ()
1595           ? emit_move_insn (dest, src)
1596           : emit_move_insn_1 (dest, src));
1597 }
1598
1599 /* Split a 128-bit move operation into two 64-bit move operations,
1600    taking care to handle partial overlap of register to register
1601    copies.  Special cases are needed when moving between GP regs and
1602    FP regs.  SRC can be a register, constant or memory; DST a register
1603    or memory.  If either operand is memory it must not have any side
1604    effects.  */
1605 void
1606 aarch64_split_128bit_move (rtx dst, rtx src)
1607 {
1608   rtx dst_lo, dst_hi;
1609   rtx src_lo, src_hi;
1610
1611   machine_mode mode = GET_MODE (dst);
1612
1613   gcc_assert (mode == TImode || mode == TFmode);
1614   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1615   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1616
1617   if (REG_P (dst) && REG_P (src))
1618     {
1619       int src_regno = REGNO (src);
1620       int dst_regno = REGNO (dst);
1621
1622       /* Handle FP <-> GP regs.  */
1623       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1624         {
1625           src_lo = gen_lowpart (word_mode, src);
1626           src_hi = gen_highpart (word_mode, src);
1627
1628           if (mode == TImode)
1629             {
1630               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1631               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1632             }
1633           else
1634             {
1635               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1636               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1637             }
1638           return;
1639         }
1640       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1641         {
1642           dst_lo = gen_lowpart (word_mode, dst);
1643           dst_hi = gen_highpart (word_mode, dst);
1644
1645           if (mode == TImode)
1646             {
1647               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1648               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1649             }
1650           else
1651             {
1652               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1653               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1654             }
1655           return;
1656         }
1657     }
1658
1659   dst_lo = gen_lowpart (word_mode, dst);
1660   dst_hi = gen_highpart (word_mode, dst);
1661   src_lo = gen_lowpart (word_mode, src);
1662   src_hi = gen_highpart_mode (word_mode, mode, src);
1663
1664   /* At most one pairing may overlap.  */
1665   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1666     {
1667       aarch64_emit_move (dst_hi, src_hi);
1668       aarch64_emit_move (dst_lo, src_lo);
1669     }
1670   else
1671     {
1672       aarch64_emit_move (dst_lo, src_lo);
1673       aarch64_emit_move (dst_hi, src_hi);
1674     }
1675 }
1676
1677 bool
1678 aarch64_split_128bit_move_p (rtx dst, rtx src)
1679 {
1680   return (! REG_P (src)
1681           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1682 }
1683
1684 /* Split a complex SIMD combine.  */
1685
1686 void
1687 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1688 {
1689   machine_mode src_mode = GET_MODE (src1);
1690   machine_mode dst_mode = GET_MODE (dst);
1691
1692   gcc_assert (VECTOR_MODE_P (dst_mode));
1693   gcc_assert (register_operand (dst, dst_mode)
1694               && register_operand (src1, src_mode)
1695               && register_operand (src2, src_mode));
1696
1697   rtx (*gen) (rtx, rtx, rtx);
1698
1699   switch (src_mode)
1700     {
1701     case E_V8QImode:
1702       gen = gen_aarch64_simd_combinev8qi;
1703       break;
1704     case E_V4HImode:
1705       gen = gen_aarch64_simd_combinev4hi;
1706       break;
1707     case E_V2SImode:
1708       gen = gen_aarch64_simd_combinev2si;
1709       break;
1710     case E_V4HFmode:
1711       gen = gen_aarch64_simd_combinev4hf;
1712       break;
1713     case E_V2SFmode:
1714       gen = gen_aarch64_simd_combinev2sf;
1715       break;
1716     case E_DImode:
1717       gen = gen_aarch64_simd_combinedi;
1718       break;
1719     case E_DFmode:
1720       gen = gen_aarch64_simd_combinedf;
1721       break;
1722     default:
1723       gcc_unreachable ();
1724     }
1725
1726   emit_insn (gen (dst, src1, src2));
1727   return;
1728 }
1729
1730 /* Split a complex SIMD move.  */
1731
1732 void
1733 aarch64_split_simd_move (rtx dst, rtx src)
1734 {
1735   machine_mode src_mode = GET_MODE (src);
1736   machine_mode dst_mode = GET_MODE (dst);
1737
1738   gcc_assert (VECTOR_MODE_P (dst_mode));
1739
1740   if (REG_P (dst) && REG_P (src))
1741     {
1742       rtx (*gen) (rtx, rtx);
1743
1744       gcc_assert (VECTOR_MODE_P (src_mode));
1745
1746       switch (src_mode)
1747         {
1748         case E_V16QImode:
1749           gen = gen_aarch64_split_simd_movv16qi;
1750           break;
1751         case E_V8HImode:
1752           gen = gen_aarch64_split_simd_movv8hi;
1753           break;
1754         case E_V4SImode:
1755           gen = gen_aarch64_split_simd_movv4si;
1756           break;
1757         case E_V2DImode:
1758           gen = gen_aarch64_split_simd_movv2di;
1759           break;
1760         case E_V8HFmode:
1761           gen = gen_aarch64_split_simd_movv8hf;
1762           break;
1763         case E_V4SFmode:
1764           gen = gen_aarch64_split_simd_movv4sf;
1765           break;
1766         case E_V2DFmode:
1767           gen = gen_aarch64_split_simd_movv2df;
1768           break;
1769         default:
1770           gcc_unreachable ();
1771         }
1772
1773       emit_insn (gen (dst, src));
1774       return;
1775     }
1776 }
1777
1778 bool
1779 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1780                               machine_mode ymode, rtx y)
1781 {
1782   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1783   gcc_assert (r != NULL);
1784   return rtx_equal_p (x, r);
1785 }
1786
1787
1788 static rtx
1789 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1790 {
1791   if (can_create_pseudo_p ())
1792     return force_reg (mode, value);
1793   else
1794     {
1795       x = aarch64_emit_move (x, value);
1796       return x;
1797     }
1798 }
1799
1800
1801 static rtx
1802 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1803 {
1804   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1805     {
1806       rtx high;
1807       /* Load the full offset into a register.  This
1808          might be improvable in the future.  */
1809       high = GEN_INT (offset);
1810       offset = 0;
1811       high = aarch64_force_temporary (mode, temp, high);
1812       reg = aarch64_force_temporary (mode, temp,
1813                                      gen_rtx_PLUS (mode, high, reg));
1814     }
1815   return plus_constant (mode, reg, offset);
1816 }
1817
1818 static int
1819 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1820                                 machine_mode mode)
1821 {
1822   int i;
1823   unsigned HOST_WIDE_INT val, val2, mask;
1824   int one_match, zero_match;
1825   int num_insns;
1826
1827   val = INTVAL (imm);
1828
1829   if (aarch64_move_imm (val, mode))
1830     {
1831       if (generate)
1832         emit_insn (gen_rtx_SET (dest, imm));
1833       return 1;
1834     }
1835
1836   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1837      (with XXXX non-zero). In that case check to see if the move can be done in
1838      a smaller mode.  */
1839   val2 = val & 0xffffffff;
1840   if (mode == DImode
1841       && aarch64_move_imm (val2, SImode)
1842       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1843     {
1844       if (generate)
1845         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1846
1847       /* Check if we have to emit a second instruction by checking to see
1848          if any of the upper 32 bits of the original DI mode value is set.  */
1849       if (val == val2)
1850         return 1;
1851
1852       i = (val >> 48) ? 48 : 32;
1853
1854       if (generate)
1855          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1856                                     GEN_INT ((val >> i) & 0xffff)));
1857
1858       return 2;
1859     }
1860
1861   if ((val >> 32) == 0 || mode == SImode)
1862     {
1863       if (generate)
1864         {
1865           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1866           if (mode == SImode)
1867             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1868                                        GEN_INT ((val >> 16) & 0xffff)));
1869           else
1870             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1871                                        GEN_INT ((val >> 16) & 0xffff)));
1872         }
1873       return 2;
1874     }
1875
1876   /* Remaining cases are all for DImode.  */
1877
1878   mask = 0xffff;
1879   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1880     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1881   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1882     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1883
1884   if (zero_match != 2 && one_match != 2)
1885     {
1886       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1887          For a 64-bit bitmask try whether changing 16 bits to all ones or
1888          zeroes creates a valid bitmask.  To check any repeated bitmask,
1889          try using 16 bits from the other 32-bit half of val.  */
1890
1891       for (i = 0; i < 64; i += 16, mask <<= 16)
1892         {
1893           val2 = val & ~mask;
1894           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1895             break;
1896           val2 = val | mask;
1897           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1898             break;
1899           val2 = val2 & ~mask;
1900           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1901           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1902             break;
1903         }
1904       if (i != 64)
1905         {
1906           if (generate)
1907             {
1908               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1909               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1910                                          GEN_INT ((val >> i) & 0xffff)));
1911             }
1912           return 2;
1913         }
1914     }
1915
1916   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1917      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1918      otherwise skip zero bits.  */
1919
1920   num_insns = 1;
1921   mask = 0xffff;
1922   val2 = one_match > zero_match ? ~val : val;
1923   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1924
1925   if (generate)
1926     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1927                                            ? (val | ~(mask << i))
1928                                            : (val & (mask << i)))));
1929   for (i += 16; i < 64; i += 16)
1930     {
1931       if ((val2 & (mask << i)) == 0)
1932         continue;
1933       if (generate)
1934         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1935                                    GEN_INT ((val >> i) & 0xffff)));
1936       num_insns ++;
1937     }
1938
1939   return num_insns;
1940 }
1941
1942
1943 void
1944 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1945 {
1946   machine_mode mode = GET_MODE (dest);
1947
1948   gcc_assert (mode == SImode || mode == DImode);
1949
1950   /* Check on what type of symbol it is.  */
1951   if (GET_CODE (imm) == SYMBOL_REF
1952       || GET_CODE (imm) == LABEL_REF
1953       || GET_CODE (imm) == CONST)
1954     {
1955       rtx mem, base, offset;
1956       enum aarch64_symbol_type sty;
1957
1958       /* If we have (const (plus symbol offset)), separate out the offset
1959          before we start classifying the symbol.  */
1960       split_const (imm, &base, &offset);
1961
1962       sty = aarch64_classify_symbol (base, offset);
1963       switch (sty)
1964         {
1965         case SYMBOL_FORCE_TO_MEM:
1966           if (offset != const0_rtx
1967               && targetm.cannot_force_const_mem (mode, imm))
1968             {
1969               gcc_assert (can_create_pseudo_p ());
1970               base = aarch64_force_temporary (mode, dest, base);
1971               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1972               aarch64_emit_move (dest, base);
1973               return;
1974             }
1975
1976           mem = force_const_mem (ptr_mode, imm);
1977           gcc_assert (mem);
1978
1979           /* If we aren't generating PC relative literals, then
1980              we need to expand the literal pool access carefully.
1981              This is something that needs to be done in a number
1982              of places, so could well live as a separate function.  */
1983           if (!aarch64_pcrelative_literal_loads)
1984             {
1985               gcc_assert (can_create_pseudo_p ());
1986               base = gen_reg_rtx (ptr_mode);
1987               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1988               if (ptr_mode != Pmode)
1989                 base = convert_memory_address (Pmode, base);
1990               mem = gen_rtx_MEM (ptr_mode, base);
1991             }
1992
1993           if (mode != ptr_mode)
1994             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1995
1996           emit_insn (gen_rtx_SET (dest, mem));
1997
1998           return;
1999
2000         case SYMBOL_SMALL_TLSGD:
2001         case SYMBOL_SMALL_TLSDESC:
2002         case SYMBOL_SMALL_TLSIE:
2003         case SYMBOL_SMALL_GOT_28K:
2004         case SYMBOL_SMALL_GOT_4G:
2005         case SYMBOL_TINY_GOT:
2006         case SYMBOL_TINY_TLSIE:
2007           if (offset != const0_rtx)
2008             {
2009               gcc_assert(can_create_pseudo_p ());
2010               base = aarch64_force_temporary (mode, dest, base);
2011               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2012               aarch64_emit_move (dest, base);
2013               return;
2014             }
2015           /* FALLTHRU */
2016
2017         case SYMBOL_SMALL_ABSOLUTE:
2018         case SYMBOL_TINY_ABSOLUTE:
2019         case SYMBOL_TLSLE12:
2020         case SYMBOL_TLSLE24:
2021         case SYMBOL_TLSLE32:
2022         case SYMBOL_TLSLE48:
2023           aarch64_load_symref_appropriately (dest, imm, sty);
2024           return;
2025
2026         default:
2027           gcc_unreachable ();
2028         }
2029     }
2030
2031   if (!CONST_INT_P (imm))
2032     {
2033       if (GET_CODE (imm) == HIGH)
2034         emit_insn (gen_rtx_SET (dest, imm));
2035       else
2036         {
2037           rtx mem = force_const_mem (mode, imm);
2038           gcc_assert (mem);
2039           emit_insn (gen_rtx_SET (dest, mem));
2040         }
2041
2042       return;
2043     }
2044
2045   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2046 }
2047
2048 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
2049    temporary value if necessary.  FRAME_RELATED_P should be true if
2050    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2051    to the generated instructions.  If SCRATCHREG is known to hold
2052    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2053    immediate again.
2054
2055    Since this function may be used to adjust the stack pointer, we must
2056    ensure that it cannot cause transient stack deallocation (for example
2057    by first incrementing SP and then decrementing when adjusting by a
2058    large immediate).  */
2059
2060 static void
2061 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2062                                HOST_WIDE_INT delta, bool frame_related_p,
2063                                bool emit_move_imm)
2064 {
2065   HOST_WIDE_INT mdelta = abs_hwi (delta);
2066   rtx this_rtx = gen_rtx_REG (mode, regnum);
2067   rtx_insn *insn;
2068
2069   if (!mdelta)
2070     return;
2071
2072   /* Single instruction adjustment.  */
2073   if (aarch64_uimm12_shift (mdelta))
2074     {
2075       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2076       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2077       return;
2078     }
2079
2080   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2081      Only do this if mdelta is not a 16-bit move as adjusting using a move
2082      is better.  */
2083   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2084     {
2085       HOST_WIDE_INT low_off = mdelta & 0xfff;
2086
2087       low_off = delta < 0 ? -low_off : low_off;
2088       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2089       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2090       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2091       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2092       return;
2093     }
2094
2095   /* Emit a move immediate if required and an addition/subtraction.  */
2096   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2097   if (emit_move_imm)
2098     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2099   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2100                               : gen_add2_insn (this_rtx, scratch_rtx));
2101   if (frame_related_p)
2102     {
2103       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2104       rtx adj = plus_constant (mode, this_rtx, delta);
2105       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2106     }
2107 }
2108
2109 static inline void
2110 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2111                       HOST_WIDE_INT delta)
2112 {
2113   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2114 }
2115
2116 static inline void
2117 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2118 {
2119   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2120                                  true, emit_move_imm);
2121 }
2122
2123 static inline void
2124 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2125 {
2126   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2127                                  frame_related_p, true);
2128 }
2129
2130 static bool
2131 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2132                                  tree exp ATTRIBUTE_UNUSED)
2133 {
2134   /* Currently, always true.  */
2135   return true;
2136 }
2137
2138 /* Implement TARGET_PASS_BY_REFERENCE.  */
2139
2140 static bool
2141 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2142                            machine_mode mode,
2143                            const_tree type,
2144                            bool named ATTRIBUTE_UNUSED)
2145 {
2146   HOST_WIDE_INT size;
2147   machine_mode dummymode;
2148   int nregs;
2149
2150   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2151   size = (mode == BLKmode && type)
2152     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2153
2154   /* Aggregates are passed by reference based on their size.  */
2155   if (type && AGGREGATE_TYPE_P (type))
2156     {
2157       size = int_size_in_bytes (type);
2158     }
2159
2160   /* Variable sized arguments are always returned by reference.  */
2161   if (size < 0)
2162     return true;
2163
2164   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2165   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2166                                                &dummymode, &nregs,
2167                                                NULL))
2168     return false;
2169
2170   /* Arguments which are variable sized or larger than 2 registers are
2171      passed by reference unless they are a homogenous floating point
2172      aggregate.  */
2173   return size > 2 * UNITS_PER_WORD;
2174 }
2175
2176 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2177 static bool
2178 aarch64_return_in_msb (const_tree valtype)
2179 {
2180   machine_mode dummy_mode;
2181   int dummy_int;
2182
2183   /* Never happens in little-endian mode.  */
2184   if (!BYTES_BIG_ENDIAN)
2185     return false;
2186
2187   /* Only composite types smaller than or equal to 16 bytes can
2188      be potentially returned in registers.  */
2189   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2190       || int_size_in_bytes (valtype) <= 0
2191       || int_size_in_bytes (valtype) > 16)
2192     return false;
2193
2194   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2195      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2196      is always passed/returned in the least significant bits of fp/simd
2197      register(s).  */
2198   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2199                                                &dummy_mode, &dummy_int, NULL))
2200     return false;
2201
2202   return true;
2203 }
2204
2205 /* Implement TARGET_FUNCTION_VALUE.
2206    Define how to find the value returned by a function.  */
2207
2208 static rtx
2209 aarch64_function_value (const_tree type, const_tree func,
2210                         bool outgoing ATTRIBUTE_UNUSED)
2211 {
2212   machine_mode mode;
2213   int unsignedp;
2214   int count;
2215   machine_mode ag_mode;
2216
2217   mode = TYPE_MODE (type);
2218   if (INTEGRAL_TYPE_P (type))
2219     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2220
2221   if (aarch64_return_in_msb (type))
2222     {
2223       HOST_WIDE_INT size = int_size_in_bytes (type);
2224
2225       if (size % UNITS_PER_WORD != 0)
2226         {
2227           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2228           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2229         }
2230     }
2231
2232   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2233                                                &ag_mode, &count, NULL))
2234     {
2235       if (!aarch64_composite_type_p (type, mode))
2236         {
2237           gcc_assert (count == 1 && mode == ag_mode);
2238           return gen_rtx_REG (mode, V0_REGNUM);
2239         }
2240       else
2241         {
2242           int i;
2243           rtx par;
2244
2245           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2246           for (i = 0; i < count; i++)
2247             {
2248               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2249               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2250                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2251               XVECEXP (par, 0, i) = tmp;
2252             }
2253           return par;
2254         }
2255     }
2256   else
2257     return gen_rtx_REG (mode, R0_REGNUM);
2258 }
2259
2260 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2261    Return true if REGNO is the number of a hard register in which the values
2262    of called function may come back.  */
2263
2264 static bool
2265 aarch64_function_value_regno_p (const unsigned int regno)
2266 {
2267   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2268      of 16-byte return values are: 128-bit integers and 16-byte small
2269      structures (excluding homogeneous floating-point aggregates).  */
2270   if (regno == R0_REGNUM || regno == R1_REGNUM)
2271     return true;
2272
2273   /* Up to four fp/simd registers can return a function value, e.g. a
2274      homogeneous floating-point aggregate having four members.  */
2275   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2276     return TARGET_FLOAT;
2277
2278   return false;
2279 }
2280
2281 /* Implement TARGET_RETURN_IN_MEMORY.
2282
2283    If the type T of the result of a function is such that
2284      void func (T arg)
2285    would require that arg be passed as a value in a register (or set of
2286    registers) according to the parameter passing rules, then the result
2287    is returned in the same registers as would be used for such an
2288    argument.  */
2289
2290 static bool
2291 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2292 {
2293   HOST_WIDE_INT size;
2294   machine_mode ag_mode;
2295   int count;
2296
2297   if (!AGGREGATE_TYPE_P (type)
2298       && TREE_CODE (type) != COMPLEX_TYPE
2299       && TREE_CODE (type) != VECTOR_TYPE)
2300     /* Simple scalar types always returned in registers.  */
2301     return false;
2302
2303   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2304                                                type,
2305                                                &ag_mode,
2306                                                &count,
2307                                                NULL))
2308     return false;
2309
2310   /* Types larger than 2 registers returned in memory.  */
2311   size = int_size_in_bytes (type);
2312   return (size < 0 || size > 2 * UNITS_PER_WORD);
2313 }
2314
2315 static bool
2316 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2317                                const_tree type, int *nregs)
2318 {
2319   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2320   return aarch64_vfp_is_call_or_return_candidate (mode,
2321                                                   type,
2322                                                   &pcum->aapcs_vfp_rmode,
2323                                                   nregs,
2324                                                   NULL);
2325 }
2326
2327 /* Given MODE and TYPE of a function argument, return the alignment in
2328    bits.  The idea is to suppress any stronger alignment requested by
2329    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2330    This is a helper function for local use only.  */
2331
2332 static unsigned int
2333 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2334 {
2335   if (!type)
2336     return GET_MODE_ALIGNMENT (mode);
2337
2338   if (integer_zerop (TYPE_SIZE (type)))
2339     return 0;
2340
2341   gcc_assert (TYPE_MODE (type) == mode);
2342
2343   if (!AGGREGATE_TYPE_P (type))
2344     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2345
2346   if (TREE_CODE (type) == ARRAY_TYPE)
2347     return TYPE_ALIGN (TREE_TYPE (type));
2348
2349   unsigned int alignment = 0;
2350   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2351     if (TREE_CODE (field) == FIELD_DECL)
2352       alignment = std::max (alignment, DECL_ALIGN (field));
2353
2354   return alignment;
2355 }
2356
2357 /* Layout a function argument according to the AAPCS64 rules.  The rule
2358    numbers refer to the rule numbers in the AAPCS64.  */
2359
2360 static void
2361 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2362                     const_tree type,
2363                     bool named ATTRIBUTE_UNUSED)
2364 {
2365   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2366   int ncrn, nvrn, nregs;
2367   bool allocate_ncrn, allocate_nvrn;
2368   HOST_WIDE_INT size;
2369
2370   /* We need to do this once per argument.  */
2371   if (pcum->aapcs_arg_processed)
2372     return;
2373
2374   pcum->aapcs_arg_processed = true;
2375
2376   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2377   size
2378     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2379                 UNITS_PER_WORD);
2380
2381   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2382   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2383                                                  mode,
2384                                                  type,
2385                                                  &nregs);
2386
2387   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2388      The following code thus handles passing by SIMD/FP registers first.  */
2389
2390   nvrn = pcum->aapcs_nvrn;
2391
2392   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2393      and homogenous short-vector aggregates (HVA).  */
2394   if (allocate_nvrn)
2395     {
2396       if (!TARGET_FLOAT)
2397         aarch64_err_no_fpadvsimd (mode, "argument");
2398
2399       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2400         {
2401           pcum->aapcs_nextnvrn = nvrn + nregs;
2402           if (!aarch64_composite_type_p (type, mode))
2403             {
2404               gcc_assert (nregs == 1);
2405               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2406             }
2407           else
2408             {
2409               rtx par;
2410               int i;
2411               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2412               for (i = 0; i < nregs; i++)
2413                 {
2414                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2415                                          V0_REGNUM + nvrn + i);
2416                   tmp = gen_rtx_EXPR_LIST
2417                     (VOIDmode, tmp,
2418                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2419                   XVECEXP (par, 0, i) = tmp;
2420                 }
2421               pcum->aapcs_reg = par;
2422             }
2423           return;
2424         }
2425       else
2426         {
2427           /* C.3 NSRN is set to 8.  */
2428           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2429           goto on_stack;
2430         }
2431     }
2432
2433   ncrn = pcum->aapcs_ncrn;
2434   nregs = size / UNITS_PER_WORD;
2435
2436   /* C6 - C9.  though the sign and zero extension semantics are
2437      handled elsewhere.  This is the case where the argument fits
2438      entirely general registers.  */
2439   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2440     {
2441
2442       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2443
2444       /* C.8 if the argument has an alignment of 16 then the NGRN is
2445          rounded up to the next even number.  */
2446       if (nregs == 2
2447           && ncrn % 2
2448           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2449              comparison is there because for > 16 * BITS_PER_UNIT
2450              alignment nregs should be > 2 and therefore it should be
2451              passed by reference rather than value.  */
2452           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2453         {
2454           ++ncrn;
2455           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2456         }
2457
2458       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2459          A reg is still generated for it, but the caller should be smart
2460          enough not to use it.  */
2461       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2462         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2463       else
2464         {
2465           rtx par;
2466           int i;
2467
2468           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2469           for (i = 0; i < nregs; i++)
2470             {
2471               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2472               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2473                                        GEN_INT (i * UNITS_PER_WORD));
2474               XVECEXP (par, 0, i) = tmp;
2475             }
2476           pcum->aapcs_reg = par;
2477         }
2478
2479       pcum->aapcs_nextncrn = ncrn + nregs;
2480       return;
2481     }
2482
2483   /* C.11  */
2484   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2485
2486   /* The argument is passed on stack; record the needed number of words for
2487      this argument and align the total size if necessary.  */
2488 on_stack:
2489   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2490
2491   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2492     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2493                                        16 / UNITS_PER_WORD);
2494   return;
2495 }
2496
2497 /* Implement TARGET_FUNCTION_ARG.  */
2498
2499 static rtx
2500 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2501                       const_tree type, bool named)
2502 {
2503   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2504   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2505
2506   if (mode == VOIDmode)
2507     return NULL_RTX;
2508
2509   aarch64_layout_arg (pcum_v, mode, type, named);
2510   return pcum->aapcs_reg;
2511 }
2512
2513 void
2514 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2515                            const_tree fntype ATTRIBUTE_UNUSED,
2516                            rtx libname ATTRIBUTE_UNUSED,
2517                            const_tree fndecl ATTRIBUTE_UNUSED,
2518                            unsigned n_named ATTRIBUTE_UNUSED)
2519 {
2520   pcum->aapcs_ncrn = 0;
2521   pcum->aapcs_nvrn = 0;
2522   pcum->aapcs_nextncrn = 0;
2523   pcum->aapcs_nextnvrn = 0;
2524   pcum->pcs_variant = ARM_PCS_AAPCS64;
2525   pcum->aapcs_reg = NULL_RTX;
2526   pcum->aapcs_arg_processed = false;
2527   pcum->aapcs_stack_words = 0;
2528   pcum->aapcs_stack_size = 0;
2529
2530   if (!TARGET_FLOAT
2531       && fndecl && TREE_PUBLIC (fndecl)
2532       && fntype && fntype != error_mark_node)
2533     {
2534       const_tree type = TREE_TYPE (fntype);
2535       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2536       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2537       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2538                                                    &mode, &nregs, NULL))
2539         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2540     }
2541   return;
2542 }
2543
2544 static void
2545 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2546                               machine_mode mode,
2547                               const_tree type,
2548                               bool named)
2549 {
2550   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2551   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2552     {
2553       aarch64_layout_arg (pcum_v, mode, type, named);
2554       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2555                   != (pcum->aapcs_stack_words != 0));
2556       pcum->aapcs_arg_processed = false;
2557       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2558       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2559       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2560       pcum->aapcs_stack_words = 0;
2561       pcum->aapcs_reg = NULL_RTX;
2562     }
2563 }
2564
2565 bool
2566 aarch64_function_arg_regno_p (unsigned regno)
2567 {
2568   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2569           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2570 }
2571
2572 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2573    PARM_BOUNDARY bits of alignment, but will be given anything up
2574    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2575    that both before and after the layout of each argument, the Next
2576    Stacked Argument Address (NSAA) will have a minimum alignment of
2577    8 bytes.  */
2578
2579 static unsigned int
2580 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2581 {
2582   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2583   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2584 }
2585
2586 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2587
2588    Return true if an argument passed on the stack should be padded upwards,
2589    i.e. if the least-significant byte of the stack slot has useful data.
2590
2591    Small aggregate types are placed in the lowest memory address.
2592
2593    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2594
2595 bool
2596 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2597 {
2598   /* On little-endian targets, the least significant byte of every stack
2599      argument is passed at the lowest byte address of the stack slot.  */
2600   if (!BYTES_BIG_ENDIAN)
2601     return true;
2602
2603   /* Otherwise, integral, floating-point and pointer types are padded downward:
2604      the least significant byte of a stack argument is passed at the highest
2605      byte address of the stack slot.  */
2606   if (type
2607       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2608          || POINTER_TYPE_P (type))
2609       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2610     return false;
2611
2612   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2613   return true;
2614 }
2615
2616 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2617
2618    It specifies padding for the last (may also be the only)
2619    element of a block move between registers and memory.  If
2620    assuming the block is in the memory, padding upward means that
2621    the last element is padded after its highest significant byte,
2622    while in downward padding, the last element is padded at the
2623    its least significant byte side.
2624
2625    Small aggregates and small complex types are always padded
2626    upwards.
2627
2628    We don't need to worry about homogeneous floating-point or
2629    short-vector aggregates; their move is not affected by the
2630    padding direction determined here.  Regardless of endianness,
2631    each element of such an aggregate is put in the least
2632    significant bits of a fp/simd register.
2633
2634    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2635    register has useful data, and return the opposite if the most
2636    significant byte does.  */
2637
2638 bool
2639 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2640                      bool first ATTRIBUTE_UNUSED)
2641 {
2642
2643   /* Small composite types are always padded upward.  */
2644   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2645     {
2646       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2647                             : GET_MODE_SIZE (mode));
2648       if (size < 2 * UNITS_PER_WORD)
2649         return true;
2650     }
2651
2652   /* Otherwise, use the default padding.  */
2653   return !BYTES_BIG_ENDIAN;
2654 }
2655
2656 static scalar_int_mode
2657 aarch64_libgcc_cmp_return_mode (void)
2658 {
2659   return SImode;
2660 }
2661
2662 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2663
2664 /* We use the 12-bit shifted immediate arithmetic instructions so values
2665    must be multiple of (1 << 12), i.e. 4096.  */
2666 #define ARITH_FACTOR 4096
2667
2668 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2669 #error Cannot use simple address calculation for stack probing
2670 #endif
2671
2672 /* The pair of scratch registers used for stack probing.  */
2673 #define PROBE_STACK_FIRST_REG  9
2674 #define PROBE_STACK_SECOND_REG 10
2675
2676 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2677    inclusive.  These are offsets from the current stack pointer.  */
2678
2679 static void
2680 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2681 {
2682   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2683
2684   /* See the same assertion on PROBE_INTERVAL above.  */
2685   gcc_assert ((first % ARITH_FACTOR) == 0);
2686
2687   /* See if we have a constant small number of probes to generate.  If so,
2688      that's the easy case.  */
2689   if (size <= PROBE_INTERVAL)
2690     {
2691       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2692
2693       emit_set_insn (reg1,
2694                      plus_constant (Pmode,
2695                                     stack_pointer_rtx, -(first + base)));
2696       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2697     }
2698
2699   /* The run-time loop is made up of 8 insns in the generic case while the
2700      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2701   else if (size <= 4 * PROBE_INTERVAL)
2702     {
2703       HOST_WIDE_INT i, rem;
2704
2705       emit_set_insn (reg1,
2706                      plus_constant (Pmode,
2707                                     stack_pointer_rtx,
2708                                     -(first + PROBE_INTERVAL)));
2709       emit_stack_probe (reg1);
2710
2711       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2712          it exceeds SIZE.  If only two probes are needed, this will not
2713          generate any code.  Then probe at FIRST + SIZE.  */
2714       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2715         {
2716           emit_set_insn (reg1,
2717                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2718           emit_stack_probe (reg1);
2719         }
2720
2721       rem = size - (i - PROBE_INTERVAL);
2722       if (rem > 256)
2723         {
2724           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2725
2726           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2727           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2728         }
2729       else
2730         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2731     }
2732
2733   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2734      extra careful with variables wrapping around because we might be at
2735      the very top (or the very bottom) of the address space and we have
2736      to be able to handle this case properly; in particular, we use an
2737      equality test for the loop condition.  */
2738   else
2739     {
2740       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2741
2742       /* Step 1: round SIZE to the previous multiple of the interval.  */
2743
2744       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2745
2746
2747       /* Step 2: compute initial and final value of the loop counter.  */
2748
2749       /* TEST_ADDR = SP + FIRST.  */
2750       emit_set_insn (reg1,
2751                      plus_constant (Pmode, stack_pointer_rtx, -first));
2752
2753       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2754       HOST_WIDE_INT adjustment = - (first + rounded_size);
2755       if (! aarch64_uimm12_shift (adjustment))
2756         {
2757           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2758                                           true, Pmode);
2759           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2760         }
2761       else
2762         {
2763           emit_set_insn (reg2,
2764                          plus_constant (Pmode, stack_pointer_rtx, adjustment));
2765         }
2766
2767       /* Step 3: the loop
2768
2769          do
2770            {
2771              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2772              probe at TEST_ADDR
2773            }
2774          while (TEST_ADDR != LAST_ADDR)
2775
2776          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2777          until it is equal to ROUNDED_SIZE.  */
2778
2779       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2780
2781
2782       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2783          that SIZE is equal to ROUNDED_SIZE.  */
2784
2785       if (size != rounded_size)
2786         {
2787           HOST_WIDE_INT rem = size - rounded_size;
2788
2789           if (rem > 256)
2790             {
2791               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2792
2793               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2794               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2795             }
2796           else
2797             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2798         }
2799     }
2800
2801   /* Make sure nothing is scheduled before we are done.  */
2802   emit_insn (gen_blockage ());
2803 }
2804
2805 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2806    absolute addresses.  */
2807
2808 const char *
2809 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2810 {
2811   static int labelno = 0;
2812   char loop_lab[32];
2813   rtx xops[2];
2814
2815   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2816
2817   /* Loop.  */
2818   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2819
2820   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2821   xops[0] = reg1;
2822   xops[1] = GEN_INT (PROBE_INTERVAL);
2823   output_asm_insn ("sub\t%0, %0, %1", xops);
2824
2825   /* Probe at TEST_ADDR.  */
2826   output_asm_insn ("str\txzr, [%0]", xops);
2827
2828   /* Test if TEST_ADDR == LAST_ADDR.  */
2829   xops[1] = reg2;
2830   output_asm_insn ("cmp\t%0, %1", xops);
2831
2832   /* Branch.  */
2833   fputs ("\tb.ne\t", asm_out_file);
2834   assemble_name_raw (asm_out_file, loop_lab);
2835   fputc ('\n', asm_out_file);
2836
2837   return "";
2838 }
2839
2840 static bool
2841 aarch64_frame_pointer_required (void)
2842 {
2843   /* In aarch64_override_options_after_change
2844      flag_omit_leaf_frame_pointer turns off the frame pointer by
2845      default.  Turn it back on now if we've not got a leaf
2846      function.  */
2847   if (flag_omit_leaf_frame_pointer
2848       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2849     return true;
2850
2851   /* Force a frame pointer for EH returns so the return address is at FP+8.  */
2852   if (crtl->calls_eh_return)
2853     return true;
2854
2855   return false;
2856 }
2857
2858 /* Mark the registers that need to be saved by the callee and calculate
2859    the size of the callee-saved registers area and frame record (both FP
2860    and LR may be omitted).  */
2861 static void
2862 aarch64_layout_frame (void)
2863 {
2864   HOST_WIDE_INT offset = 0;
2865   int regno, last_fp_reg = INVALID_REGNUM;
2866
2867   if (reload_completed && cfun->machine->frame.laid_out)
2868     return;
2869
2870 #define SLOT_NOT_REQUIRED (-2)
2871 #define SLOT_REQUIRED     (-1)
2872
2873   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2874   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2875
2876   /* First mark all the registers that really need to be saved...  */
2877   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2878     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2879
2880   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2881     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2882
2883   /* ... that includes the eh data registers (if needed)...  */
2884   if (crtl->calls_eh_return)
2885     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2886       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2887         = SLOT_REQUIRED;
2888
2889   /* ... and any callee saved register that dataflow says is live.  */
2890   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2891     if (df_regs_ever_live_p (regno)
2892         && (regno == R30_REGNUM
2893             || !call_used_regs[regno]))
2894       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2895
2896   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2897     if (df_regs_ever_live_p (regno)
2898         && !call_used_regs[regno])
2899       {
2900         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2901         last_fp_reg = regno;
2902       }
2903
2904   if (frame_pointer_needed)
2905     {
2906       /* FP and LR are placed in the linkage record.  */
2907       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2908       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2909       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2910       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2911       offset += 2 * UNITS_PER_WORD;
2912     }
2913
2914   /* Now assign stack slots for them.  */
2915   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2916     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2917       {
2918         cfun->machine->frame.reg_offset[regno] = offset;
2919         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2920           cfun->machine->frame.wb_candidate1 = regno;
2921         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2922           cfun->machine->frame.wb_candidate2 = regno;
2923         offset += UNITS_PER_WORD;
2924       }
2925
2926   HOST_WIDE_INT max_int_offset = offset;
2927   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2928   bool has_align_gap = offset != max_int_offset;
2929
2930   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2931     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2932       {
2933         /* If there is an alignment gap between integer and fp callee-saves,
2934            allocate the last fp register to it if possible.  */
2935         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2936           {
2937             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2938             break;
2939           }
2940
2941         cfun->machine->frame.reg_offset[regno] = offset;
2942         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2943           cfun->machine->frame.wb_candidate1 = regno;
2944         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2945                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2946           cfun->machine->frame.wb_candidate2 = regno;
2947         offset += UNITS_PER_WORD;
2948       }
2949
2950   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2951
2952   cfun->machine->frame.saved_regs_size = offset;
2953
2954   HOST_WIDE_INT varargs_and_saved_regs_size
2955     = offset + cfun->machine->frame.saved_varargs_size;
2956
2957   cfun->machine->frame.hard_fp_offset
2958     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2959                 STACK_BOUNDARY / BITS_PER_UNIT);
2960
2961   cfun->machine->frame.frame_size
2962     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2963                 + crtl->outgoing_args_size,
2964                 STACK_BOUNDARY / BITS_PER_UNIT);
2965
2966   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2967
2968   cfun->machine->frame.initial_adjust = 0;
2969   cfun->machine->frame.final_adjust = 0;
2970   cfun->machine->frame.callee_adjust = 0;
2971   cfun->machine->frame.callee_offset = 0;
2972
2973   HOST_WIDE_INT max_push_offset = 0;
2974   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2975     max_push_offset = 512;
2976   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2977     max_push_offset = 256;
2978
2979   if (cfun->machine->frame.frame_size < max_push_offset
2980       && crtl->outgoing_args_size == 0)
2981     {
2982       /* Simple, small frame with no outgoing arguments:
2983          stp reg1, reg2, [sp, -frame_size]!
2984          stp reg3, reg4, [sp, 16]  */
2985       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2986     }
2987   else if ((crtl->outgoing_args_size
2988             + cfun->machine->frame.saved_regs_size < 512)
2989            && !(cfun->calls_alloca
2990                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2991     {
2992       /* Frame with small outgoing arguments:
2993          sub sp, sp, frame_size
2994          stp reg1, reg2, [sp, outgoing_args_size]
2995          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
2996       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2997       cfun->machine->frame.callee_offset
2998         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2999     }
3000   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3001     {
3002       /* Frame with large outgoing arguments but a small local area:
3003          stp reg1, reg2, [sp, -hard_fp_offset]!
3004          stp reg3, reg4, [sp, 16]
3005          sub sp, sp, outgoing_args_size  */
3006       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3007       cfun->machine->frame.final_adjust
3008         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3009     }
3010   else if (!frame_pointer_needed
3011            && varargs_and_saved_regs_size < max_push_offset)
3012     {
3013       /* Frame with large local area and outgoing arguments (this pushes the
3014          callee-saves first, followed by the locals and outgoing area):
3015          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3016          stp reg3, reg4, [sp, 16]
3017          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
3018       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3019       cfun->machine->frame.final_adjust
3020         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3021       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3022       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3023     }
3024   else
3025     {
3026       /* Frame with large local area and outgoing arguments using frame pointer:
3027          sub sp, sp, hard_fp_offset
3028          stp x29, x30, [sp, 0]
3029          add x29, sp, 0
3030          stp reg3, reg4, [sp, 16]
3031          sub sp, sp, outgoing_args_size  */
3032       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3033       cfun->machine->frame.final_adjust
3034         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3035     }
3036
3037   cfun->machine->frame.laid_out = true;
3038 }
3039
3040 /* Return true if the register REGNO is saved on entry to
3041    the current function.  */
3042
3043 static bool
3044 aarch64_register_saved_on_entry (int regno)
3045 {
3046   return cfun->machine->frame.reg_offset[regno] >= 0;
3047 }
3048
3049 /* Return the next register up from REGNO up to LIMIT for the callee
3050    to save.  */
3051
3052 static unsigned
3053 aarch64_next_callee_save (unsigned regno, unsigned limit)
3054 {
3055   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3056     regno ++;
3057   return regno;
3058 }
3059
3060 /* Push the register number REGNO of mode MODE to the stack with write-back
3061    adjusting the stack by ADJUSTMENT.  */
3062
3063 static void
3064 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3065                            HOST_WIDE_INT adjustment)
3066  {
3067   rtx base_rtx = stack_pointer_rtx;
3068   rtx insn, reg, mem;
3069
3070   reg = gen_rtx_REG (mode, regno);
3071   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3072                             plus_constant (Pmode, base_rtx, -adjustment));
3073   mem = gen_frame_mem (mode, mem);
3074
3075   insn = emit_move_insn (mem, reg);
3076   RTX_FRAME_RELATED_P (insn) = 1;
3077 }
3078
3079 /* Generate and return an instruction to store the pair of registers
3080    REG and REG2 of mode MODE to location BASE with write-back adjusting
3081    the stack location BASE by ADJUSTMENT.  */
3082
3083 static rtx
3084 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3085                           HOST_WIDE_INT adjustment)
3086 {
3087   switch (mode)
3088     {
3089     case E_DImode:
3090       return gen_storewb_pairdi_di (base, base, reg, reg2,
3091                                     GEN_INT (-adjustment),
3092                                     GEN_INT (UNITS_PER_WORD - adjustment));
3093     case E_DFmode:
3094       return gen_storewb_pairdf_di (base, base, reg, reg2,
3095                                     GEN_INT (-adjustment),
3096                                     GEN_INT (UNITS_PER_WORD - adjustment));
3097     default:
3098       gcc_unreachable ();
3099     }
3100 }
3101
3102 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3103    stack pointer by ADJUSTMENT.  */
3104
3105 static void
3106 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3107 {
3108   rtx_insn *insn;
3109   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3110
3111   if (regno2 == INVALID_REGNUM)
3112     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3113
3114   rtx reg1 = gen_rtx_REG (mode, regno1);
3115   rtx reg2 = gen_rtx_REG (mode, regno2);
3116
3117   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3118                                               reg2, adjustment));
3119   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3120   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3121   RTX_FRAME_RELATED_P (insn) = 1;
3122 }
3123
3124 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3125    adjusting it by ADJUSTMENT afterwards.  */
3126
3127 static rtx
3128 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3129                          HOST_WIDE_INT adjustment)
3130 {
3131   switch (mode)
3132     {
3133     case E_DImode:
3134       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3135                                    GEN_INT (UNITS_PER_WORD));
3136     case E_DFmode:
3137       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3138                                    GEN_INT (UNITS_PER_WORD));
3139     default:
3140       gcc_unreachable ();
3141     }
3142 }
3143
3144 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3145    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3146    into CFI_OPS.  */
3147
3148 static void
3149 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3150                   rtx *cfi_ops)
3151 {
3152   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3153   rtx reg1 = gen_rtx_REG (mode, regno1);
3154
3155   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3156
3157   if (regno2 == INVALID_REGNUM)
3158     {
3159       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3160       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3161       emit_move_insn (reg1, gen_frame_mem (mode, mem));
3162     }
3163   else
3164     {
3165       rtx reg2 = gen_rtx_REG (mode, regno2);
3166       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3167       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3168                                           reg2, adjustment));
3169     }
3170 }
3171
3172 /* Generate and return a store pair instruction of mode MODE to store
3173    register REG1 to MEM1 and register REG2 to MEM2.  */
3174
3175 static rtx
3176 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3177                         rtx reg2)
3178 {
3179   switch (mode)
3180     {
3181     case E_DImode:
3182       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3183
3184     case E_DFmode:
3185       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3186
3187     default:
3188       gcc_unreachable ();
3189     }
3190 }
3191
3192 /* Generate and regurn a load pair isntruction of mode MODE to load register
3193    REG1 from MEM1 and register REG2 from MEM2.  */
3194
3195 static rtx
3196 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3197                        rtx mem2)
3198 {
3199   switch (mode)
3200     {
3201     case E_DImode:
3202       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3203
3204     case E_DFmode:
3205       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3206
3207     default:
3208       gcc_unreachable ();
3209     }
3210 }
3211
3212 /* Return TRUE if return address signing should be enabled for the current
3213    function, otherwise return FALSE.  */
3214
3215 bool
3216 aarch64_return_address_signing_enabled (void)
3217 {
3218   /* This function should only be called after frame laid out.   */
3219   gcc_assert (cfun->machine->frame.laid_out);
3220
3221   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3222      if it's LR is pushed onto stack.  */
3223   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3224           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3225               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3226 }
3227
3228 /* Emit code to save the callee-saved registers from register number START
3229    to LIMIT to the stack at the location starting at offset START_OFFSET,
3230    skipping any write-back candidates if SKIP_WB is true.  */
3231
3232 static void
3233 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3234                            unsigned start, unsigned limit, bool skip_wb)
3235 {
3236   rtx_insn *insn;
3237   unsigned regno;
3238   unsigned regno2;
3239
3240   for (regno = aarch64_next_callee_save (start, limit);
3241        regno <= limit;
3242        regno = aarch64_next_callee_save (regno + 1, limit))
3243     {
3244       rtx reg, mem;
3245       HOST_WIDE_INT offset;
3246
3247       if (skip_wb
3248           && (regno == cfun->machine->frame.wb_candidate1
3249               || regno == cfun->machine->frame.wb_candidate2))
3250         continue;
3251
3252       if (cfun->machine->reg_is_wrapped_separately[regno])
3253        continue;
3254
3255       reg = gen_rtx_REG (mode, regno);
3256       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3257       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3258                                                 offset));
3259
3260       regno2 = aarch64_next_callee_save (regno + 1, limit);
3261
3262       if (regno2 <= limit
3263           && !cfun->machine->reg_is_wrapped_separately[regno2]
3264           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3265               == cfun->machine->frame.reg_offset[regno2]))
3266
3267         {
3268           rtx reg2 = gen_rtx_REG (mode, regno2);
3269           rtx mem2;
3270
3271           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3272           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3273                                                      offset));
3274           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3275                                                     reg2));
3276
3277           /* The first part of a frame-related parallel insn is
3278              always assumed to be relevant to the frame
3279              calculations; subsequent parts, are only
3280              frame-related if explicitly marked.  */
3281           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3282           regno = regno2;
3283         }
3284       else
3285         insn = emit_move_insn (mem, reg);
3286
3287       RTX_FRAME_RELATED_P (insn) = 1;
3288     }
3289 }
3290
3291 /* Emit code to restore the callee registers of mode MODE from register
3292    number START up to and including LIMIT.  Restore from the stack offset
3293    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3294    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3295
3296 static void
3297 aarch64_restore_callee_saves (machine_mode mode,
3298                               HOST_WIDE_INT start_offset, unsigned start,
3299                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3300 {
3301   rtx base_rtx = stack_pointer_rtx;
3302   unsigned regno;
3303   unsigned regno2;
3304   HOST_WIDE_INT offset;
3305
3306   for (regno = aarch64_next_callee_save (start, limit);
3307        regno <= limit;
3308        regno = aarch64_next_callee_save (regno + 1, limit))
3309     {
3310       if (cfun->machine->reg_is_wrapped_separately[regno])
3311        continue;
3312
3313       rtx reg, mem;
3314
3315       if (skip_wb
3316           && (regno == cfun->machine->frame.wb_candidate1
3317               || regno == cfun->machine->frame.wb_candidate2))
3318         continue;
3319
3320       reg = gen_rtx_REG (mode, regno);
3321       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3322       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3323
3324       regno2 = aarch64_next_callee_save (regno + 1, limit);
3325
3326       if (regno2 <= limit
3327           && !cfun->machine->reg_is_wrapped_separately[regno2]
3328           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3329               == cfun->machine->frame.reg_offset[regno2]))
3330         {
3331           rtx reg2 = gen_rtx_REG (mode, regno2);
3332           rtx mem2;
3333
3334           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3335           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3336           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3337
3338           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3339           regno = regno2;
3340         }
3341       else
3342         emit_move_insn (reg, mem);
3343       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3344     }
3345 }
3346
3347 static inline bool
3348 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3349                                HOST_WIDE_INT offset)
3350 {
3351   return offset >= -256 && offset < 256;
3352 }
3353
3354 static inline bool
3355 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 {
3357   return (offset >= 0
3358           && offset < 4096 * GET_MODE_SIZE (mode)
3359           && offset % GET_MODE_SIZE (mode) == 0);
3360 }
3361
3362 bool
3363 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3364 {
3365   return (offset >= -64 * GET_MODE_SIZE (mode)
3366           && offset < 64 * GET_MODE_SIZE (mode)
3367           && offset % GET_MODE_SIZE (mode) == 0);
3368 }
3369
3370 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3371
3372 static sbitmap
3373 aarch64_get_separate_components (void)
3374 {
3375   aarch64_layout_frame ();
3376
3377   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3378   bitmap_clear (components);
3379
3380   /* The registers we need saved to the frame.  */
3381   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3382     if (aarch64_register_saved_on_entry (regno))
3383       {
3384         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3385         if (!frame_pointer_needed)
3386           offset += cfun->machine->frame.frame_size
3387                     - cfun->machine->frame.hard_fp_offset;
3388         /* Check that we can access the stack slot of the register with one
3389            direct load with no adjustments needed.  */
3390         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3391           bitmap_set_bit (components, regno);
3392       }
3393
3394   /* Don't mess with the hard frame pointer.  */
3395   if (frame_pointer_needed)
3396     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3397
3398   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3399   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3400   /* If aarch64_layout_frame has chosen registers to store/restore with
3401      writeback don't interfere with them to avoid having to output explicit
3402      stack adjustment instructions.  */
3403   if (reg2 != INVALID_REGNUM)
3404     bitmap_clear_bit (components, reg2);
3405   if (reg1 != INVALID_REGNUM)
3406     bitmap_clear_bit (components, reg1);
3407
3408   bitmap_clear_bit (components, LR_REGNUM);
3409   bitmap_clear_bit (components, SP_REGNUM);
3410
3411   return components;
3412 }
3413
3414 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3415
3416 static sbitmap
3417 aarch64_components_for_bb (basic_block bb)
3418 {
3419   bitmap in = DF_LIVE_IN (bb);
3420   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3421   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3422
3423   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3424   bitmap_clear (components);
3425
3426   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3427   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3428     if ((!call_used_regs[regno])
3429        && (bitmap_bit_p (in, regno)
3430            || bitmap_bit_p (gen, regno)
3431            || bitmap_bit_p (kill, regno)))
3432           bitmap_set_bit (components, regno);
3433
3434   return components;
3435 }
3436
3437 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3438    Nothing to do for aarch64.  */
3439
3440 static void
3441 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3442 {
3443 }
3444
3445 /* Return the next set bit in BMP from START onwards.  Return the total number
3446    of bits in BMP if no set bit is found at or after START.  */
3447
3448 static unsigned int
3449 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3450 {
3451   unsigned int nbits = SBITMAP_SIZE (bmp);
3452   if (start == nbits)
3453     return start;
3454
3455   gcc_assert (start < nbits);
3456   for (unsigned int i = start; i < nbits; i++)
3457     if (bitmap_bit_p (bmp, i))
3458       return i;
3459
3460   return nbits;
3461 }
3462
3463 /* Do the work for aarch64_emit_prologue_components and
3464    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3465    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3466    for these components or the epilogue sequence.  That is, it determines
3467    whether we should emit stores or loads and what kind of CFA notes to attach
3468    to the insns.  Otherwise the logic for the two sequences is very
3469    similar.  */
3470
3471 static void
3472 aarch64_process_components (sbitmap components, bool prologue_p)
3473 {
3474   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3475                              ? HARD_FRAME_POINTER_REGNUM
3476                              : STACK_POINTER_REGNUM);
3477
3478   unsigned last_regno = SBITMAP_SIZE (components);
3479   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3480   rtx_insn *insn = NULL;
3481
3482   while (regno != last_regno)
3483     {
3484       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3485          so DFmode for the vector registers is enough.  */
3486       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3487       rtx reg = gen_rtx_REG (mode, regno);
3488       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3489       if (!frame_pointer_needed)
3490         offset += cfun->machine->frame.frame_size
3491                   - cfun->machine->frame.hard_fp_offset;
3492       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3493       rtx mem = gen_frame_mem (mode, addr);
3494
3495       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3496       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3497       /* No more registers to handle after REGNO.
3498          Emit a single save/restore and exit.  */
3499       if (regno2 == last_regno)
3500         {
3501           insn = emit_insn (set);
3502           RTX_FRAME_RELATED_P (insn) = 1;
3503           if (prologue_p)
3504             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3505           else
3506             add_reg_note (insn, REG_CFA_RESTORE, reg);
3507           break;
3508         }
3509
3510       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3511       /* The next register is not of the same class or its offset is not
3512          mergeable with the current one into a pair.  */
3513       if (!satisfies_constraint_Ump (mem)
3514           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3515           || (offset2 - cfun->machine->frame.reg_offset[regno])
3516                 != GET_MODE_SIZE (mode))
3517         {
3518           insn = emit_insn (set);
3519           RTX_FRAME_RELATED_P (insn) = 1;
3520           if (prologue_p)
3521             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3522           else
3523             add_reg_note (insn, REG_CFA_RESTORE, reg);
3524
3525           regno = regno2;
3526           continue;
3527         }
3528
3529       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3530       rtx reg2 = gen_rtx_REG (mode, regno2);
3531       if (!frame_pointer_needed)
3532         offset2 += cfun->machine->frame.frame_size
3533                   - cfun->machine->frame.hard_fp_offset;
3534       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3535       rtx mem2 = gen_frame_mem (mode, addr2);
3536       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3537                              : gen_rtx_SET (reg2, mem2);
3538
3539       if (prologue_p)
3540         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3541       else
3542         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3543
3544       RTX_FRAME_RELATED_P (insn) = 1;
3545       if (prologue_p)
3546         {
3547           add_reg_note (insn, REG_CFA_OFFSET, set);
3548           add_reg_note (insn, REG_CFA_OFFSET, set2);
3549         }
3550       else
3551         {
3552           add_reg_note (insn, REG_CFA_RESTORE, reg);
3553           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3554         }
3555
3556       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3557     }
3558 }
3559
3560 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3561
3562 static void
3563 aarch64_emit_prologue_components (sbitmap components)
3564 {
3565   aarch64_process_components (components, true);
3566 }
3567
3568 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3569
3570 static void
3571 aarch64_emit_epilogue_components (sbitmap components)
3572 {
3573   aarch64_process_components (components, false);
3574 }
3575
3576 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3577
3578 static void
3579 aarch64_set_handled_components (sbitmap components)
3580 {
3581   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3582     if (bitmap_bit_p (components, regno))
3583       cfun->machine->reg_is_wrapped_separately[regno] = true;
3584 }
3585
3586 /* AArch64 stack frames generated by this compiler look like:
3587
3588         +-------------------------------+
3589         |                               |
3590         |  incoming stack arguments     |
3591         |                               |
3592         +-------------------------------+
3593         |                               | <-- incoming stack pointer (aligned)
3594         |  callee-allocated save area   |
3595         |  for register varargs         |
3596         |                               |
3597         +-------------------------------+
3598         |  local variables              | <-- frame_pointer_rtx
3599         |                               |
3600         +-------------------------------+
3601         |  padding0                     | \
3602         +-------------------------------+  |
3603         |  callee-saved registers       |  | frame.saved_regs_size
3604         +-------------------------------+  |
3605         |  LR'                          |  |
3606         +-------------------------------+  |
3607         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3608         +-------------------------------+
3609         |  dynamic allocation           |
3610         +-------------------------------+
3611         |  padding                      |
3612         +-------------------------------+
3613         |  outgoing stack arguments     | <-- arg_pointer
3614         |                               |
3615         +-------------------------------+
3616         |                               | <-- stack_pointer_rtx (aligned)
3617
3618    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3619    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3620    unchanged.  */
3621
3622 /* Generate the prologue instructions for entry into a function.
3623    Establish the stack frame by decreasing the stack pointer with a
3624    properly calculated size and, if necessary, create a frame record
3625    filled with the values of LR and previous frame pointer.  The
3626    current FP is also set up if it is in use.  */
3627
3628 void
3629 aarch64_expand_prologue (void)
3630 {
3631   aarch64_layout_frame ();
3632
3633   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3634   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3635   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3636   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3637   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3638   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3639   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3640   rtx_insn *insn;
3641
3642   /* Sign return address for functions.  */
3643   if (aarch64_return_address_signing_enabled ())
3644     {
3645       insn = emit_insn (gen_pacisp ());
3646       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3647       RTX_FRAME_RELATED_P (insn) = 1;
3648     }
3649
3650   if (flag_stack_usage_info)
3651     current_function_static_stack_size = frame_size;
3652
3653   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3654     {
3655       if (crtl->is_leaf && !cfun->calls_alloca)
3656         {
3657           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3658             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3659                                             frame_size - STACK_CHECK_PROTECT);
3660         }
3661       else if (frame_size > 0)
3662         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3663     }
3664
3665   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3666
3667   if (callee_adjust != 0)
3668     aarch64_push_regs (reg1, reg2, callee_adjust);
3669
3670   if (frame_pointer_needed)
3671     {
3672       if (callee_adjust == 0)
3673         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3674                                    R30_REGNUM, false);
3675       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3676                                        stack_pointer_rtx,
3677                                        GEN_INT (callee_offset)));
3678       RTX_FRAME_RELATED_P (insn) = 1;
3679       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3680     }
3681
3682   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3683                              callee_adjust != 0 || frame_pointer_needed);
3684   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3685                              callee_adjust != 0 || frame_pointer_needed);
3686   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3687 }
3688
3689 /* Return TRUE if we can use a simple_return insn.
3690
3691    This function checks whether the callee saved stack is empty, which
3692    means no restore actions are need. The pro_and_epilogue will use
3693    this to check whether shrink-wrapping opt is feasible.  */
3694
3695 bool
3696 aarch64_use_return_insn_p (void)
3697 {
3698   if (!reload_completed)
3699     return false;
3700
3701   if (crtl->profile)
3702     return false;
3703
3704   aarch64_layout_frame ();
3705
3706   return cfun->machine->frame.frame_size == 0;
3707 }
3708
3709 /* Generate the epilogue instructions for returning from a function.
3710    This is almost exactly the reverse of the prolog sequence, except
3711    that we need to insert barriers to avoid scheduling loads that read
3712    from a deallocated stack, and we optimize the unwind records by
3713    emitting them all together if possible.  */
3714 void
3715 aarch64_expand_epilogue (bool for_sibcall)
3716 {
3717   aarch64_layout_frame ();
3718
3719   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3720   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3721   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3722   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3723   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3724   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3725   rtx cfi_ops = NULL;
3726   rtx_insn *insn;
3727
3728   /* We need to add memory barrier to prevent read from deallocated stack.  */
3729   bool need_barrier_p = (get_frame_size ()
3730                          + cfun->machine->frame.saved_varargs_size) != 0;
3731
3732   /* Emit a barrier to prevent loads from a deallocated stack.  */
3733   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3734       || crtl->calls_eh_return)
3735     {
3736       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3737       need_barrier_p = false;
3738     }
3739
3740   /* Restore the stack pointer from the frame pointer if it may not
3741      be the same as the stack pointer.  */
3742   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3743     {
3744       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3745                                        hard_frame_pointer_rtx,
3746                                        GEN_INT (-callee_offset)));
3747       /* If writeback is used when restoring callee-saves, the CFA
3748          is restored on the instruction doing the writeback.  */
3749       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3750     }
3751   else
3752     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3753
3754   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3755                                 callee_adjust != 0, &cfi_ops);
3756   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3757                                 callee_adjust != 0, &cfi_ops);
3758
3759   if (need_barrier_p)
3760     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3761
3762   if (callee_adjust != 0)
3763     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3764
3765   if (callee_adjust != 0 || initial_adjust > 65536)
3766     {
3767       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3768       insn = get_last_insn ();
3769       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3770       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3771       RTX_FRAME_RELATED_P (insn) = 1;
3772       cfi_ops = NULL;
3773     }
3774
3775   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3776
3777   if (cfi_ops)
3778     {
3779       /* Emit delayed restores and reset the CFA to be SP.  */
3780       insn = get_last_insn ();
3781       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3782       REG_NOTES (insn) = cfi_ops;
3783       RTX_FRAME_RELATED_P (insn) = 1;
3784     }
3785
3786   /* We prefer to emit the combined return/authenticate instruction RETAA,
3787      however there are three cases in which we must instead emit an explicit
3788      authentication instruction.
3789
3790         1) Sibcalls don't return in a normal way, so if we're about to call one
3791            we must authenticate.
3792
3793         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3794            generating code for !TARGET_ARMV8_3 we can't use it and must
3795            explicitly authenticate.
3796
3797         3) On an eh_return path we make extra stack adjustments to update the
3798            canonical frame address to be the exception handler's CFA.  We want
3799            to authenticate using the CFA of the function which calls eh_return.
3800     */
3801   if (aarch64_return_address_signing_enabled ()
3802       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3803     {
3804       insn = emit_insn (gen_autisp ());
3805       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3806       RTX_FRAME_RELATED_P (insn) = 1;
3807     }
3808
3809   /* Stack adjustment for exception handler.  */
3810   if (crtl->calls_eh_return)
3811     {
3812       /* We need to unwind the stack by the offset computed by
3813          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3814          to be SP; letting the CFA move during this adjustment
3815          is just as correct as retaining the CFA from the body
3816          of the function.  Therefore, do nothing special.  */
3817       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3818     }
3819
3820   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3821   if (!for_sibcall)
3822     emit_jump_insn (ret_rtx);
3823 }
3824
3825 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3826    normally or return to a previous frame after unwinding.
3827
3828    An EH return uses a single shared return sequence.  The epilogue is
3829    exactly like a normal epilogue except that it has an extra input
3830    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3831    that must be applied after the frame has been destroyed.  An extra label
3832    is inserted before the epilogue which initializes this register to zero,
3833    and this is the entry point for a normal return.
3834
3835    An actual EH return updates the return address, initializes the stack
3836    adjustment and jumps directly into the epilogue (bypassing the zeroing
3837    of the adjustment).  Since the return address is typically saved on the
3838    stack when a function makes a call, the saved LR must be updated outside
3839    the epilogue.
3840
3841    This poses problems as the store is generated well before the epilogue,
3842    so the offset of LR is not known yet.  Also optimizations will remove the
3843    store as it appears dead, even after the epilogue is generated (as the
3844    base or offset for loading LR is different in many cases).
3845
3846    To avoid these problems this implementation forces the frame pointer
3847    in eh_return functions so that the location of LR is fixed and known early.
3848    It also marks the store volatile, so no optimization is permitted to
3849    remove the store.  */
3850 rtx
3851 aarch64_eh_return_handler_rtx (void)
3852 {
3853   rtx tmp = gen_frame_mem (Pmode,
3854     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3855
3856   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3857   MEM_VOLATILE_P (tmp) = true;
3858   return tmp;
3859 }
3860
3861 /* Output code to add DELTA to the first argument, and then jump
3862    to FUNCTION.  Used for C++ multiple inheritance.  */
3863 static void
3864 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3865                          HOST_WIDE_INT delta,
3866                          HOST_WIDE_INT vcall_offset,
3867                          tree function)
3868 {
3869   /* The this pointer is always in x0.  Note that this differs from
3870      Arm where the this pointer maybe bumped to r1 if r0 is required
3871      to return a pointer to an aggregate.  On AArch64 a result value
3872      pointer will be in x8.  */
3873   int this_regno = R0_REGNUM;
3874   rtx this_rtx, temp0, temp1, addr, funexp;
3875   rtx_insn *insn;
3876
3877   reload_completed = 1;
3878   emit_note (NOTE_INSN_PROLOGUE_END);
3879
3880   if (vcall_offset == 0)
3881     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3882   else
3883     {
3884       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3885
3886       this_rtx = gen_rtx_REG (Pmode, this_regno);
3887       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3888       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3889
3890       addr = this_rtx;
3891       if (delta != 0)
3892         {
3893           if (delta >= -256 && delta < 256)
3894             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3895                                        plus_constant (Pmode, this_rtx, delta));
3896           else
3897             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3898         }
3899
3900       if (Pmode == ptr_mode)
3901         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3902       else
3903         aarch64_emit_move (temp0,
3904                            gen_rtx_ZERO_EXTEND (Pmode,
3905                                                 gen_rtx_MEM (ptr_mode, addr)));
3906
3907       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3908           addr = plus_constant (Pmode, temp0, vcall_offset);
3909       else
3910         {
3911           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3912                                           Pmode);
3913           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3914         }
3915
3916       if (Pmode == ptr_mode)
3917         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3918       else
3919         aarch64_emit_move (temp1,
3920                            gen_rtx_SIGN_EXTEND (Pmode,
3921                                                 gen_rtx_MEM (ptr_mode, addr)));
3922
3923       emit_insn (gen_add2_insn (this_rtx, temp1));
3924     }
3925
3926   /* Generate a tail call to the target function.  */
3927   if (!TREE_USED (function))
3928     {
3929       assemble_external (function);
3930       TREE_USED (function) = 1;
3931     }
3932   funexp = XEXP (DECL_RTL (function), 0);
3933   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3934   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3935   SIBLING_CALL_P (insn) = 1;
3936
3937   insn = get_insns ();
3938   shorten_branches (insn);
3939   final_start_function (insn, file, 1);
3940   final (insn, file, 1);
3941   final_end_function ();
3942
3943   /* Stop pretending to be a post-reload pass.  */
3944   reload_completed = 0;
3945 }
3946
3947 static bool
3948 aarch64_tls_referenced_p (rtx x)
3949 {
3950   if (!TARGET_HAVE_TLS)
3951     return false;
3952   subrtx_iterator::array_type array;
3953   FOR_EACH_SUBRTX (iter, array, x, ALL)
3954     {
3955       const_rtx x = *iter;
3956       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3957         return true;
3958       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3959          TLS offsets, not real symbol references.  */
3960       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3961         iter.skip_subrtxes ();
3962     }
3963   return false;
3964 }
3965
3966
3967 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3968    a left shift of 0 or 12 bits.  */
3969 bool
3970 aarch64_uimm12_shift (HOST_WIDE_INT val)
3971 {
3972   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3973           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3974           );
3975 }
3976
3977
3978 /* Return true if val is an immediate that can be loaded into a
3979    register by a MOVZ instruction.  */
3980 static bool
3981 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3982 {
3983   if (GET_MODE_SIZE (mode) > 4)
3984     {
3985       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3986           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3987         return 1;
3988     }
3989   else
3990     {
3991       /* Ignore sign extension.  */
3992       val &= (HOST_WIDE_INT) 0xffffffff;
3993     }
3994   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3995           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3996 }
3997
3998 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
3999
4000 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4001   {
4002     0x0000000100000001ull,
4003     0x0001000100010001ull,
4004     0x0101010101010101ull,
4005     0x1111111111111111ull,
4006     0x5555555555555555ull,
4007   };
4008
4009
4010 /* Return true if val is a valid bitmask immediate.  */
4011
4012 bool
4013 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4014 {
4015   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4016   int bits;
4017
4018   /* Check for a single sequence of one bits and return quickly if so.
4019      The special cases of all ones and all zeroes returns false.  */
4020   val = (unsigned HOST_WIDE_INT) val_in;
4021   tmp = val + (val & -val);
4022
4023   if (tmp == (tmp & -tmp))
4024     return (val + 1) > 1;
4025
4026   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
4027   if (mode == SImode)
4028     val = (val << 32) | (val & 0xffffffff);
4029
4030   /* Invert if the immediate doesn't start with a zero bit - this means we
4031      only need to search for sequences of one bits.  */
4032   if (val & 1)
4033     val = ~val;
4034
4035   /* Find the first set bit and set tmp to val with the first sequence of one
4036      bits removed.  Return success if there is a single sequence of ones.  */
4037   first_one = val & -val;
4038   tmp = val & (val + first_one);
4039
4040   if (tmp == 0)
4041     return true;
4042
4043   /* Find the next set bit and compute the difference in bit position.  */
4044   next_one = tmp & -tmp;
4045   bits = clz_hwi (first_one) - clz_hwi (next_one);
4046   mask = val ^ tmp;
4047
4048   /* Check the bit position difference is a power of 2, and that the first
4049      sequence of one bits fits within 'bits' bits.  */
4050   if ((mask >> bits) != 0 || bits != (bits & -bits))
4051     return false;
4052
4053   /* Check the sequence of one bits is repeated 64/bits times.  */
4054   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4055 }
4056
4057 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4058    Assumed precondition: VAL_IN Is not zero.  */
4059
4060 unsigned HOST_WIDE_INT
4061 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4062 {
4063   int lowest_bit_set = ctz_hwi (val_in);
4064   int highest_bit_set = floor_log2 (val_in);
4065   gcc_assert (val_in != 0);
4066
4067   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4068           (HOST_WIDE_INT_1U << lowest_bit_set));
4069 }
4070
4071 /* Create constant where bits outside of lowest bit set to highest bit set
4072    are set to 1.  */
4073
4074 unsigned HOST_WIDE_INT
4075 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4076 {
4077   return val_in | ~aarch64_and_split_imm1 (val_in);
4078 }
4079
4080 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4081
4082 bool
4083 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4084 {
4085   if (aarch64_bitmask_imm (val_in, mode))
4086     return false;
4087
4088   if (aarch64_move_imm (val_in, mode))
4089     return false;
4090
4091   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4092
4093   return aarch64_bitmask_imm (imm2, mode);
4094 }
4095
4096 /* Return true if val is an immediate that can be loaded into a
4097    register in a single instruction.  */
4098 bool
4099 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4100 {
4101   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4102     return 1;
4103   return aarch64_bitmask_imm (val, mode);
4104 }
4105
4106 static bool
4107 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4108 {
4109   rtx base, offset;
4110
4111   if (GET_CODE (x) == HIGH)
4112     return true;
4113
4114   split_const (x, &base, &offset);
4115   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4116     {
4117       if (aarch64_classify_symbol (base, offset)
4118           != SYMBOL_FORCE_TO_MEM)
4119         return true;
4120       else
4121         /* Avoid generating a 64-bit relocation in ILP32; leave
4122            to aarch64_expand_mov_immediate to handle it properly.  */
4123         return mode != ptr_mode;
4124     }
4125
4126   return aarch64_tls_referenced_p (x);
4127 }
4128
4129 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4130    The expansion for a table switch is quite expensive due to the number
4131    of instructions, the table lookup and hard to predict indirect jump.
4132    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4133    set, otherwise use tables for > 16 cases as a tradeoff between size and
4134    performance.  When optimizing for size, use the default setting.  */
4135
4136 static unsigned int
4137 aarch64_case_values_threshold (void)
4138 {
4139   /* Use the specified limit for the number of cases before using jump
4140      tables at higher optimization levels.  */
4141   if (optimize > 2
4142       && selected_cpu->tune->max_case_values != 0)
4143     return selected_cpu->tune->max_case_values;
4144   else
4145     return optimize_size ? default_case_values_threshold () : 17;
4146 }
4147
4148 /* Return true if register REGNO is a valid index register.
4149    STRICT_P is true if REG_OK_STRICT is in effect.  */
4150
4151 bool
4152 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4153 {
4154   if (!HARD_REGISTER_NUM_P (regno))
4155     {
4156       if (!strict_p)
4157         return true;
4158
4159       if (!reg_renumber)
4160         return false;
4161
4162       regno = reg_renumber[regno];
4163     }
4164   return GP_REGNUM_P (regno);
4165 }
4166
4167 /* Return true if register REGNO is a valid base register for mode MODE.
4168    STRICT_P is true if REG_OK_STRICT is in effect.  */
4169
4170 bool
4171 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4172 {
4173   if (!HARD_REGISTER_NUM_P (regno))
4174     {
4175       if (!strict_p)
4176         return true;
4177
4178       if (!reg_renumber)
4179         return false;
4180
4181       regno = reg_renumber[regno];
4182     }
4183
4184   /* The fake registers will be eliminated to either the stack or
4185      hard frame pointer, both of which are usually valid base registers.
4186      Reload deals with the cases where the eliminated form isn't valid.  */
4187   return (GP_REGNUM_P (regno)
4188           || regno == SP_REGNUM
4189           || regno == FRAME_POINTER_REGNUM
4190           || regno == ARG_POINTER_REGNUM);
4191 }
4192
4193 /* Return true if X is a valid base register for mode MODE.
4194    STRICT_P is true if REG_OK_STRICT is in effect.  */
4195
4196 static bool
4197 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4198 {
4199   if (!strict_p
4200       && GET_CODE (x) == SUBREG
4201       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4202     x = SUBREG_REG (x);
4203
4204   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4205 }
4206
4207 /* Return true if address offset is a valid index.  If it is, fill in INFO
4208    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4209
4210 static bool
4211 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4212                         machine_mode mode, bool strict_p)
4213 {
4214   enum aarch64_address_type type;
4215   rtx index;
4216   int shift;
4217
4218   /* (reg:P) */
4219   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4220       && GET_MODE (x) == Pmode)
4221     {
4222       type = ADDRESS_REG_REG;
4223       index = x;
4224       shift = 0;
4225     }
4226   /* (sign_extend:DI (reg:SI)) */
4227   else if ((GET_CODE (x) == SIGN_EXTEND
4228             || GET_CODE (x) == ZERO_EXTEND)
4229            && GET_MODE (x) == DImode
4230            && GET_MODE (XEXP (x, 0)) == SImode)
4231     {
4232       type = (GET_CODE (x) == SIGN_EXTEND)
4233         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4234       index = XEXP (x, 0);
4235       shift = 0;
4236     }
4237   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4238   else if (GET_CODE (x) == MULT
4239            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4240                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4241            && GET_MODE (XEXP (x, 0)) == DImode
4242            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4243            && CONST_INT_P (XEXP (x, 1)))
4244     {
4245       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4246         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4247       index = XEXP (XEXP (x, 0), 0);
4248       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4249     }
4250   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4251   else if (GET_CODE (x) == ASHIFT
4252            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4253                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4254            && GET_MODE (XEXP (x, 0)) == DImode
4255            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4256            && CONST_INT_P (XEXP (x, 1)))
4257     {
4258       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4259         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4260       index = XEXP (XEXP (x, 0), 0);
4261       shift = INTVAL (XEXP (x, 1));
4262     }
4263   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4264   else if ((GET_CODE (x) == SIGN_EXTRACT
4265             || GET_CODE (x) == ZERO_EXTRACT)
4266            && GET_MODE (x) == DImode
4267            && GET_CODE (XEXP (x, 0)) == MULT
4268            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4269            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4270     {
4271       type = (GET_CODE (x) == SIGN_EXTRACT)
4272         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4273       index = XEXP (XEXP (x, 0), 0);
4274       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4275       if (INTVAL (XEXP (x, 1)) != 32 + shift
4276           || INTVAL (XEXP (x, 2)) != 0)
4277         shift = -1;
4278     }
4279   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4280      (const_int 0xffffffff<<shift)) */
4281   else if (GET_CODE (x) == AND
4282            && GET_MODE (x) == DImode
4283            && GET_CODE (XEXP (x, 0)) == MULT
4284            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4285            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4286            && CONST_INT_P (XEXP (x, 1)))
4287     {
4288       type = ADDRESS_REG_UXTW;
4289       index = XEXP (XEXP (x, 0), 0);
4290       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4291       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4292         shift = -1;
4293     }
4294   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4295   else if ((GET_CODE (x) == SIGN_EXTRACT
4296             || GET_CODE (x) == ZERO_EXTRACT)
4297            && GET_MODE (x) == DImode
4298            && GET_CODE (XEXP (x, 0)) == ASHIFT
4299            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4300            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4301     {
4302       type = (GET_CODE (x) == SIGN_EXTRACT)
4303         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4304       index = XEXP (XEXP (x, 0), 0);
4305       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4306       if (INTVAL (XEXP (x, 1)) != 32 + shift
4307           || INTVAL (XEXP (x, 2)) != 0)
4308         shift = -1;
4309     }
4310   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4311      (const_int 0xffffffff<<shift)) */
4312   else if (GET_CODE (x) == AND
4313            && GET_MODE (x) == DImode
4314            && GET_CODE (XEXP (x, 0)) == ASHIFT
4315            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4316            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4317            && CONST_INT_P (XEXP (x, 1)))
4318     {
4319       type = ADDRESS_REG_UXTW;
4320       index = XEXP (XEXP (x, 0), 0);
4321       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4322       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4323         shift = -1;
4324     }
4325   /* (mult:P (reg:P) (const_int scale)) */
4326   else if (GET_CODE (x) == MULT
4327            && GET_MODE (x) == Pmode
4328            && GET_MODE (XEXP (x, 0)) == Pmode
4329            && CONST_INT_P (XEXP (x, 1)))
4330     {
4331       type = ADDRESS_REG_REG;
4332       index = XEXP (x, 0);
4333       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4334     }
4335   /* (ashift:P (reg:P) (const_int shift)) */
4336   else if (GET_CODE (x) == ASHIFT
4337            && GET_MODE (x) == Pmode
4338            && GET_MODE (XEXP (x, 0)) == Pmode
4339            && CONST_INT_P (XEXP (x, 1)))
4340     {
4341       type = ADDRESS_REG_REG;
4342       index = XEXP (x, 0);
4343       shift = INTVAL (XEXP (x, 1));
4344     }
4345   else
4346     return false;
4347
4348   if (!strict_p
4349       && GET_CODE (index) == SUBREG
4350       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4351     index = SUBREG_REG (index);
4352
4353   if ((shift == 0 ||
4354        (shift > 0 && shift <= 3
4355         && (1 << shift) == GET_MODE_SIZE (mode)))
4356       && REG_P (index)
4357       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4358     {
4359       info->type = type;
4360       info->offset = index;
4361       info->shift = shift;
4362       return true;
4363     }
4364
4365   return false;
4366 }
4367
4368 /* Return true if MODE is one of the modes for which we
4369    support LDP/STP operations.  */
4370
4371 static bool
4372 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4373 {
4374   return mode == SImode || mode == DImode
4375          || mode == SFmode || mode == DFmode
4376          || (aarch64_vector_mode_supported_p (mode)
4377              && GET_MODE_SIZE (mode) == 8);
4378 }
4379
4380 /* Return true if REGNO is a virtual pointer register, or an eliminable
4381    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4382    include stack_pointer or hard_frame_pointer.  */
4383 static bool
4384 virt_or_elim_regno_p (unsigned regno)
4385 {
4386   return ((regno >= FIRST_VIRTUAL_REGISTER
4387            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4388           || regno == FRAME_POINTER_REGNUM
4389           || regno == ARG_POINTER_REGNUM);
4390 }
4391
4392 /* Return true if X is a valid address for machine mode MODE.  If it is,
4393    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4394    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4395
4396 static bool
4397 aarch64_classify_address (struct aarch64_address_info *info,
4398                           rtx x, machine_mode mode,
4399                           RTX_CODE outer_code, bool strict_p)
4400 {
4401   enum rtx_code code = GET_CODE (x);
4402   rtx op0, op1;
4403
4404   /* On BE, we use load/store pair for all large int mode load/stores.
4405      TI/TFmode may also use a load/store pair.  */
4406   bool load_store_pair_p = (outer_code == PARALLEL
4407                             || mode == TImode
4408                             || mode == TFmode
4409                             || (BYTES_BIG_ENDIAN
4410                                 && aarch64_vect_struct_mode_p (mode)));
4411
4412   bool allow_reg_index_p =
4413     !load_store_pair_p
4414     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4415     && !aarch64_vect_struct_mode_p (mode);
4416
4417   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4418      REG addressing.  */
4419   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4420       && (code != POST_INC && code != REG))
4421     return false;
4422
4423   switch (code)
4424     {
4425     case REG:
4426     case SUBREG:
4427       info->type = ADDRESS_REG_IMM;
4428       info->base = x;
4429       info->offset = const0_rtx;
4430       return aarch64_base_register_rtx_p (x, strict_p);
4431
4432     case PLUS:
4433       op0 = XEXP (x, 0);
4434       op1 = XEXP (x, 1);
4435
4436       if (! strict_p
4437           && REG_P (op0)
4438           && virt_or_elim_regno_p (REGNO (op0))
4439           && CONST_INT_P (op1))
4440         {
4441           info->type = ADDRESS_REG_IMM;
4442           info->base = op0;
4443           info->offset = op1;
4444
4445           return true;
4446         }
4447
4448       if (GET_MODE_SIZE (mode) != 0
4449           && CONST_INT_P (op1)
4450           && aarch64_base_register_rtx_p (op0, strict_p))
4451         {
4452           HOST_WIDE_INT offset = INTVAL (op1);
4453
4454           info->type = ADDRESS_REG_IMM;
4455           info->base = op0;
4456           info->offset = op1;
4457
4458           /* TImode and TFmode values are allowed in both pairs of X
4459              registers and individual Q registers.  The available
4460              address modes are:
4461              X,X: 7-bit signed scaled offset
4462              Q:   9-bit signed offset
4463              We conservatively require an offset representable in either mode.
4464              When performing the check for pairs of X registers i.e.  LDP/STP
4465              pass down DImode since that is the natural size of the LDP/STP
4466              instruction memory accesses.  */
4467           if (mode == TImode || mode == TFmode)
4468             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4469                     && (offset_9bit_signed_unscaled_p (mode, offset)
4470                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4471
4472           /* A 7bit offset check because OImode will emit a ldp/stp
4473              instruction (only big endian will get here).
4474              For ldp/stp instructions, the offset is scaled for the size of a
4475              single element of the pair.  */
4476           if (mode == OImode)
4477             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4478
4479           /* Three 9/12 bit offsets checks because CImode will emit three
4480              ldr/str instructions (only big endian will get here).  */
4481           if (mode == CImode)
4482             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4483                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4484                         || offset_12bit_unsigned_scaled_p (V16QImode,
4485                                                            offset + 32)));
4486
4487           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4488              instructions (only big endian will get here).  */
4489           if (mode == XImode)
4490             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4491                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4492                                                             offset + 32));
4493
4494           if (load_store_pair_p)
4495             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4496                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4497           else
4498             return (offset_9bit_signed_unscaled_p (mode, offset)
4499                     || offset_12bit_unsigned_scaled_p (mode, offset));
4500         }
4501
4502       if (allow_reg_index_p)
4503         {
4504           /* Look for base + (scaled/extended) index register.  */
4505           if (aarch64_base_register_rtx_p (op0, strict_p)
4506               && aarch64_classify_index (info, op1, mode, strict_p))
4507             {
4508               info->base = op0;
4509               return true;
4510             }
4511           if (aarch64_base_register_rtx_p (op1, strict_p)
4512               && aarch64_classify_index (info, op0, mode, strict_p))
4513             {
4514               info->base = op1;
4515               return true;
4516             }
4517         }
4518
4519       return false;
4520
4521     case POST_INC:
4522     case POST_DEC:
4523     case PRE_INC:
4524     case PRE_DEC:
4525       info->type = ADDRESS_REG_WB;
4526       info->base = XEXP (x, 0);
4527       info->offset = NULL_RTX;
4528       return aarch64_base_register_rtx_p (info->base, strict_p);
4529
4530     case POST_MODIFY:
4531     case PRE_MODIFY:
4532       info->type = ADDRESS_REG_WB;
4533       info->base = XEXP (x, 0);
4534       if (GET_CODE (XEXP (x, 1)) == PLUS
4535           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4536           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4537           && aarch64_base_register_rtx_p (info->base, strict_p))
4538         {
4539           HOST_WIDE_INT offset;
4540           info->offset = XEXP (XEXP (x, 1), 1);
4541           offset = INTVAL (info->offset);
4542
4543           /* TImode and TFmode values are allowed in both pairs of X
4544              registers and individual Q registers.  The available
4545              address modes are:
4546              X,X: 7-bit signed scaled offset
4547              Q:   9-bit signed offset
4548              We conservatively require an offset representable in either mode.
4549            */
4550           if (mode == TImode || mode == TFmode)
4551             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4552                     && offset_9bit_signed_unscaled_p (mode, offset));
4553
4554           if (load_store_pair_p)
4555             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4556                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4557           else
4558             return offset_9bit_signed_unscaled_p (mode, offset);
4559         }
4560       return false;
4561
4562     case CONST:
4563     case SYMBOL_REF:
4564     case LABEL_REF:
4565       /* load literal: pc-relative constant pool entry.  Only supported
4566          for SI mode or larger.  */
4567       info->type = ADDRESS_SYMBOLIC;
4568
4569       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4570         {
4571           rtx sym, addend;
4572
4573           split_const (x, &sym, &addend);
4574           return ((GET_CODE (sym) == LABEL_REF
4575                    || (GET_CODE (sym) == SYMBOL_REF
4576                        && CONSTANT_POOL_ADDRESS_P (sym)
4577                        && aarch64_pcrelative_literal_loads)));
4578         }
4579       return false;
4580
4581     case LO_SUM:
4582       info->type = ADDRESS_LO_SUM;
4583       info->base = XEXP (x, 0);
4584       info->offset = XEXP (x, 1);
4585       if (allow_reg_index_p
4586           && aarch64_base_register_rtx_p (info->base, strict_p))
4587         {
4588           rtx sym, offs;
4589           split_const (info->offset, &sym, &offs);
4590           if (GET_CODE (sym) == SYMBOL_REF
4591               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4592             {
4593               /* The symbol and offset must be aligned to the access size.  */
4594               unsigned int align;
4595               unsigned int ref_size;
4596
4597               if (CONSTANT_POOL_ADDRESS_P (sym))
4598                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4599               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4600                 {
4601                   tree exp = SYMBOL_REF_DECL (sym);
4602                   align = TYPE_ALIGN (TREE_TYPE (exp));
4603                   align = CONSTANT_ALIGNMENT (exp, align);
4604                 }
4605               else if (SYMBOL_REF_DECL (sym))
4606                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4607               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4608                        && SYMBOL_REF_BLOCK (sym) != NULL)
4609                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4610               else
4611                 align = BITS_PER_UNIT;
4612
4613               ref_size = GET_MODE_SIZE (mode);
4614               if (ref_size == 0)
4615                 ref_size = GET_MODE_SIZE (DImode);
4616
4617               return ((INTVAL (offs) & (ref_size - 1)) == 0
4618                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4619             }
4620         }
4621       return false;
4622
4623     default:
4624       return false;
4625     }
4626 }
4627
4628 /* Return true if the address X is valid for a PRFM instruction.
4629    STRICT_P is true if we should do strict checking with
4630    aarch64_classify_address.  */
4631
4632 bool
4633 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4634 {
4635   struct aarch64_address_info addr;
4636
4637   /* PRFM accepts the same addresses as DImode...  */
4638   bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4639   if (!res)
4640     return false;
4641
4642   /* ... except writeback forms.  */
4643   return addr.type != ADDRESS_REG_WB;
4644 }
4645
4646 bool
4647 aarch64_symbolic_address_p (rtx x)
4648 {
4649   rtx offset;
4650
4651   split_const (x, &x, &offset);
4652   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4653 }
4654
4655 /* Classify the base of symbolic expression X.  */
4656
4657 enum aarch64_symbol_type
4658 aarch64_classify_symbolic_expression (rtx x)
4659 {
4660   rtx offset;
4661
4662   split_const (x, &x, &offset);
4663   return aarch64_classify_symbol (x, offset);
4664 }
4665
4666
4667 /* Return TRUE if X is a legitimate address for accessing memory in
4668    mode MODE.  */
4669 static bool
4670 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4671 {
4672   struct aarch64_address_info addr;
4673
4674   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4675 }
4676
4677 /* Return TRUE if X is a legitimate address for accessing memory in
4678    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4679    pair operation.  */
4680 bool
4681 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4682                               RTX_CODE outer_code, bool strict_p)
4683 {
4684   struct aarch64_address_info addr;
4685
4686   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4687 }
4688
4689 /* Split an out-of-range address displacement into a base and offset.
4690    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4691    to increase opportunities for sharing the base address of different sizes.
4692    For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
4693 static bool
4694 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4695 {
4696   HOST_WIDE_INT offset = INTVAL (*disp);
4697   HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4698
4699   if (mode == TImode || mode == TFmode
4700       || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4701     base = (offset + 0x100) & ~0x1ff;
4702
4703   *off = GEN_INT (base);
4704   *disp = GEN_INT (offset - base);
4705   return true;
4706 }
4707
4708 /* Return the binary representation of floating point constant VALUE in INTVAL.
4709    If the value cannot be converted, return false without setting INTVAL.
4710    The conversion is done in the given MODE.  */
4711 bool
4712 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4713 {
4714
4715   /* We make a general exception for 0.  */
4716   if (aarch64_float_const_zero_rtx_p (value))
4717     {
4718       *intval = 0;
4719       return true;
4720     }
4721
4722   machine_mode mode = GET_MODE (value);
4723   if (GET_CODE (value) != CONST_DOUBLE
4724       || !SCALAR_FLOAT_MODE_P (mode)
4725       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4726       /* Only support up to DF mode.  */
4727       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4728     return false;
4729
4730   unsigned HOST_WIDE_INT ival = 0;
4731
4732   long res[2];
4733   real_to_target (res,
4734                   CONST_DOUBLE_REAL_VALUE (value),
4735                   REAL_MODE_FORMAT (mode));
4736
4737   if (mode == DFmode)
4738     {
4739       int order = BYTES_BIG_ENDIAN ? 1 : 0;
4740       ival = zext_hwi (res[order], 32);
4741       ival |= (zext_hwi (res[1 - order], 32) << 32);
4742     }
4743   else
4744       ival = zext_hwi (res[0], 32);
4745
4746   *intval = ival;
4747   return true;
4748 }
4749
4750 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4751    single MOV(+MOVK) followed by an FMOV.  */
4752 bool
4753 aarch64_float_const_rtx_p (rtx x)
4754 {
4755   machine_mode mode = GET_MODE (x);
4756   if (mode == VOIDmode)
4757     return false;
4758
4759   /* Determine whether it's cheaper to write float constants as
4760      mov/movk pairs over ldr/adrp pairs.  */
4761   unsigned HOST_WIDE_INT ival;
4762
4763   if (GET_CODE (x) == CONST_DOUBLE
4764       && SCALAR_FLOAT_MODE_P (mode)
4765       && aarch64_reinterpret_float_as_int (x, &ival))
4766     {
4767       machine_mode imode = (mode == HFmode
4768                             ? SImode
4769                             : int_mode_for_mode (mode).require ());
4770       int num_instr = aarch64_internal_mov_immediate
4771                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4772       return num_instr < 3;
4773     }
4774
4775   return false;
4776 }
4777
4778 /* Return TRUE if rtx X is immediate constant 0.0 */
4779 bool
4780 aarch64_float_const_zero_rtx_p (rtx x)
4781 {
4782   if (GET_MODE (x) == VOIDmode)
4783     return false;
4784
4785   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4786     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4787   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4788 }
4789
4790 /* Return TRUE if rtx X is immediate constant that fits in a single
4791    MOVI immediate operation.  */
4792 bool
4793 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4794 {
4795   if (!TARGET_SIMD)
4796      return false;
4797
4798   machine_mode vmode, imode;
4799   unsigned HOST_WIDE_INT ival;
4800
4801   if (GET_CODE (x) == CONST_DOUBLE
4802       && SCALAR_FLOAT_MODE_P (mode))
4803     {
4804       if (!aarch64_reinterpret_float_as_int (x, &ival))
4805         return false;
4806
4807       /* We make a general exception for 0.  */
4808       if (aarch64_float_const_zero_rtx_p (x))
4809         return true;
4810
4811       imode = int_mode_for_mode (mode).require ();
4812     }
4813   else if (GET_CODE (x) == CONST_INT
4814            && SCALAR_INT_MODE_P (mode))
4815     {
4816        imode = mode;
4817        ival = INTVAL (x);
4818     }
4819   else
4820     return false;
4821
4822    /* use a 64 bit mode for everything except for DI/DF mode, where we use
4823      a 128 bit vector mode.  */
4824   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4825
4826   vmode = aarch64_simd_container_mode (imode, width);
4827   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4828
4829   return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4830 }
4831
4832
4833 /* Return the fixed registers used for condition codes.  */
4834
4835 static bool
4836 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4837 {
4838   *p1 = CC_REGNUM;
4839   *p2 = INVALID_REGNUM;
4840   return true;
4841 }
4842
4843 /* This function is used by the call expanders of the machine description.
4844    RESULT is the register in which the result is returned.  It's NULL for
4845    "call" and "sibcall".
4846    MEM is the location of the function call.
4847    SIBCALL indicates whether this function call is normal call or sibling call.
4848    It will generate different pattern accordingly.  */
4849
4850 void
4851 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4852 {
4853   rtx call, callee, tmp;
4854   rtvec vec;
4855   machine_mode mode;
4856
4857   gcc_assert (MEM_P (mem));
4858   callee = XEXP (mem, 0);
4859   mode = GET_MODE (callee);
4860   gcc_assert (mode == Pmode);
4861
4862   /* Decide if we should generate indirect calls by loading the
4863      address of the callee into a register before performing
4864      the branch-and-link.  */
4865   if (SYMBOL_REF_P (callee)
4866       ? (aarch64_is_long_call_p (callee)
4867          || aarch64_is_noplt_call_p (callee))
4868       : !REG_P (callee))
4869     XEXP (mem, 0) = force_reg (mode, callee);
4870
4871   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4872
4873   if (result != NULL_RTX)
4874     call = gen_rtx_SET (result, call);
4875
4876   if (sibcall)
4877     tmp = ret_rtx;
4878   else
4879     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4880
4881   vec = gen_rtvec (2, call, tmp);
4882   call = gen_rtx_PARALLEL (VOIDmode, vec);
4883
4884   aarch64_emit_call_insn (call);
4885 }
4886
4887 /* Emit call insn with PAT and do aarch64-specific handling.  */
4888
4889 void
4890 aarch64_emit_call_insn (rtx pat)
4891 {
4892   rtx insn = emit_call_insn (pat);
4893
4894   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4895   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4896   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4897 }
4898
4899 machine_mode
4900 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4901 {
4902   /* All floating point compares return CCFP if it is an equality
4903      comparison, and CCFPE otherwise.  */
4904   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4905     {
4906       switch (code)
4907         {
4908         case EQ:
4909         case NE:
4910         case UNORDERED:
4911         case ORDERED:
4912         case UNLT:
4913         case UNLE:
4914         case UNGT:
4915         case UNGE:
4916         case UNEQ:
4917         case LTGT:
4918           return CCFPmode;
4919
4920         case LT:
4921         case LE:
4922         case GT:
4923         case GE:
4924           return CCFPEmode;
4925
4926         default:
4927           gcc_unreachable ();
4928         }
4929     }
4930
4931   /* Equality comparisons of short modes against zero can be performed
4932      using the TST instruction with the appropriate bitmask.  */
4933   if (y == const0_rtx && REG_P (x)
4934       && (code == EQ || code == NE)
4935       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4936     return CC_NZmode;
4937
4938   /* Similarly, comparisons of zero_extends from shorter modes can
4939      be performed using an ANDS with an immediate mask.  */
4940   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4941       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4942       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4943       && (code == EQ || code == NE))
4944     return CC_NZmode;
4945
4946   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4947       && y == const0_rtx
4948       && (code == EQ || code == NE || code == LT || code == GE)
4949       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4950           || GET_CODE (x) == NEG
4951           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4952               && CONST_INT_P (XEXP (x, 2)))))
4953     return CC_NZmode;
4954
4955   /* A compare with a shifted operand.  Because of canonicalization,
4956      the comparison will have to be swapped when we emit the assembly
4957      code.  */
4958   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4959       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4960       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4961           || GET_CODE (x) == LSHIFTRT
4962           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4963     return CC_SWPmode;
4964
4965   /* Similarly for a negated operand, but we can only do this for
4966      equalities.  */
4967   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4968       && (REG_P (y) || GET_CODE (y) == SUBREG)
4969       && (code == EQ || code == NE)
4970       && GET_CODE (x) == NEG)
4971     return CC_Zmode;
4972
4973   /* A test for unsigned overflow.  */
4974   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4975       && code == NE
4976       && GET_CODE (x) == PLUS
4977       && GET_CODE (y) == ZERO_EXTEND)
4978     return CC_Cmode;
4979
4980   /* For everything else, return CCmode.  */
4981   return CCmode;
4982 }
4983
4984 static int
4985 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4986
4987 int
4988 aarch64_get_condition_code (rtx x)
4989 {
4990   machine_mode mode = GET_MODE (XEXP (x, 0));
4991   enum rtx_code comp_code = GET_CODE (x);
4992
4993   if (GET_MODE_CLASS (mode) != MODE_CC)
4994     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4995   return aarch64_get_condition_code_1 (mode, comp_code);
4996 }
4997
4998 static int
4999 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5000 {
5001   switch (mode)
5002     {
5003     case E_CCFPmode:
5004     case E_CCFPEmode:
5005       switch (comp_code)
5006         {
5007         case GE: return AARCH64_GE;
5008         case GT: return AARCH64_GT;
5009         case LE: return AARCH64_LS;
5010         case LT: return AARCH64_MI;
5011         case NE: return AARCH64_NE;
5012         case EQ: return AARCH64_EQ;
5013         case ORDERED: return AARCH64_VC;
5014         case UNORDERED: return AARCH64_VS;
5015         case UNLT: return AARCH64_LT;
5016         case UNLE: return AARCH64_LE;
5017         case UNGT: return AARCH64_HI;
5018         case UNGE: return AARCH64_PL;
5019         default: return -1;
5020         }
5021       break;
5022
5023     case E_CCmode:
5024       switch (comp_code)
5025         {
5026         case NE: return AARCH64_NE;
5027         case EQ: return AARCH64_EQ;
5028         case GE: return AARCH64_GE;
5029         case GT: return AARCH64_GT;
5030         case LE: return AARCH64_LE;
5031         case LT: return AARCH64_LT;
5032         case GEU: return AARCH64_CS;
5033         case GTU: return AARCH64_HI;
5034         case LEU: return AARCH64_LS;
5035         case LTU: return AARCH64_CC;
5036         default: return -1;
5037         }
5038       break;
5039
5040     case E_CC_SWPmode:
5041       switch (comp_code)
5042         {
5043         case NE: return AARCH64_NE;
5044         case EQ: return AARCH64_EQ;
5045         case GE: return AARCH64_LE;
5046         case GT: return AARCH64_LT;
5047         case LE: return AARCH64_GE;
5048         case LT: return AARCH64_GT;
5049         case GEU: return AARCH64_LS;
5050         case GTU: return AARCH64_CC;
5051         case LEU: return AARCH64_CS;
5052         case LTU: return AARCH64_HI;
5053         default: return -1;
5054         }
5055       break;
5056
5057     case E_CC_NZmode:
5058       switch (comp_code)
5059         {
5060         case NE: return AARCH64_NE;
5061         case EQ: return AARCH64_EQ;
5062         case GE: return AARCH64_PL;
5063         case LT: return AARCH64_MI;
5064         default: return -1;
5065         }
5066       break;
5067
5068     case E_CC_Zmode:
5069       switch (comp_code)
5070         {
5071         case NE: return AARCH64_NE;
5072         case EQ: return AARCH64_EQ;
5073         default: return -1;
5074         }
5075       break;
5076
5077     case E_CC_Cmode:
5078       switch (comp_code)
5079         {
5080         case NE: return AARCH64_CS;
5081         case EQ: return AARCH64_CC;
5082         default: return -1;
5083         }
5084       break;
5085
5086     default:
5087       return -1;
5088     }
5089
5090   return -1;
5091 }
5092
5093 bool
5094 aarch64_const_vec_all_same_in_range_p (rtx x,
5095                                   HOST_WIDE_INT minval,
5096                                   HOST_WIDE_INT maxval)
5097 {
5098   HOST_WIDE_INT firstval;
5099   int count, i;
5100
5101   if (GET_CODE (x) != CONST_VECTOR
5102       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5103     return false;
5104
5105   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5106   if (firstval < minval || firstval > maxval)
5107     return false;
5108
5109   count = CONST_VECTOR_NUNITS (x);
5110   for (i = 1; i < count; i++)
5111     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5112       return false;
5113
5114   return true;
5115 }
5116
5117 bool
5118 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5119 {
5120   return aarch64_const_vec_all_same_in_range_p (x, val, val);
5121 }
5122
5123
5124 /* N Z C V.  */
5125 #define AARCH64_CC_V 1
5126 #define AARCH64_CC_C (1 << 1)
5127 #define AARCH64_CC_Z (1 << 2)
5128 #define AARCH64_CC_N (1 << 3)
5129
5130 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
5131 static const int aarch64_nzcv_codes[] =
5132 {
5133   0,            /* EQ, Z == 1.  */
5134   AARCH64_CC_Z, /* NE, Z == 0.  */
5135   0,            /* CS, C == 1.  */
5136   AARCH64_CC_C, /* CC, C == 0.  */
5137   0,            /* MI, N == 1.  */
5138   AARCH64_CC_N, /* PL, N == 0.  */
5139   0,            /* VS, V == 1.  */
5140   AARCH64_CC_V, /* VC, V == 0.  */
5141   0,            /* HI, C ==1 && Z == 0.  */
5142   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
5143   AARCH64_CC_V, /* GE, N == V.  */
5144   0,            /* LT, N != V.  */
5145   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
5146   0,            /* LE, !(Z == 0 && N == V).  */
5147   0,            /* AL, Any.  */
5148   0             /* NV, Any.  */
5149 };
5150
5151 /* Print operand X to file F in a target specific manner according to CODE.
5152    The acceptable formatting commands given by CODE are:
5153      'c':               An integer or symbol address without a preceding #
5154                         sign.
5155      'e':               Print the sign/zero-extend size as a character 8->b,
5156                         16->h, 32->w.
5157      'p':               Prints N such that 2^N == X (X must be power of 2 and
5158                         const int).
5159      'P':               Print the number of non-zero bits in X (a const_int).
5160      'H':               Print the higher numbered register of a pair (TImode)
5161                         of regs.
5162      'm':               Print a condition (eq, ne, etc).
5163      'M':               Same as 'm', but invert condition.
5164      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
5165      'S/T/U/V':         Print a FP/SIMD register name for a register list.
5166                         The register printed is the FP/SIMD register name
5167                         of X + 0/1/2/3 for S/T/U/V.
5168      'R':               Print a scalar FP/SIMD register name + 1.
5169      'X':               Print bottom 16 bits of integer constant in hex.
5170      'w/x':             Print a general register name or the zero register
5171                         (32-bit or 64-bit).
5172      '0':               Print a normal operand, if it's a general register,
5173                         then we assume DImode.
5174      'k':               Print NZCV for conditional compare instructions.
5175      'A':               Output address constant representing the first
5176                         argument of X, specifying a relocation offset
5177                         if appropriate.
5178      'L':               Output constant address specified by X
5179                         with a relocation offset if appropriate.
5180      'G':               Prints address of X, specifying a PC relative
5181                         relocation mode if appropriate.  */
5182
5183 static void
5184 aarch64_print_operand (FILE *f, rtx x, int code)
5185 {
5186   switch (code)
5187     {
5188     case 'c':
5189       switch (GET_CODE (x))
5190         {
5191         case CONST_INT:
5192           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5193           break;
5194
5195         case SYMBOL_REF:
5196           output_addr_const (f, x);
5197           break;
5198
5199         case CONST:
5200           if (GET_CODE (XEXP (x, 0)) == PLUS
5201               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5202             {
5203               output_addr_const (f, x);
5204               break;
5205             }
5206           /* Fall through.  */
5207
5208         default:
5209           output_operand_lossage ("Unsupported operand for code '%c'", code);
5210         }
5211       break;
5212
5213     case 'e':
5214       {
5215         int n;
5216
5217         if (!CONST_INT_P (x)
5218             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5219           {
5220             output_operand_lossage ("invalid operand for '%%%c'", code);
5221             return;
5222           }
5223
5224         switch (n)
5225           {
5226           case 3:
5227             fputc ('b', f);
5228             break;
5229           case 4:
5230             fputc ('h', f);
5231             break;
5232           case 5:
5233             fputc ('w', f);
5234             break;
5235           default:
5236             output_operand_lossage ("invalid operand for '%%%c'", code);
5237             return;
5238           }
5239       }
5240       break;
5241
5242     case 'p':
5243       {
5244         int n;
5245
5246         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5247           {
5248             output_operand_lossage ("invalid operand for '%%%c'", code);
5249             return;
5250           }
5251
5252         asm_fprintf (f, "%d", n);
5253       }
5254       break;
5255
5256     case 'P':
5257       if (!CONST_INT_P (x))
5258         {
5259           output_operand_lossage ("invalid operand for '%%%c'", code);
5260           return;
5261         }
5262
5263       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5264       break;
5265
5266     case 'H':
5267       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5268         {
5269           output_operand_lossage ("invalid operand for '%%%c'", code);
5270           return;
5271         }
5272
5273       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5274       break;
5275
5276     case 'M':
5277     case 'm':
5278       {
5279         int cond_code;
5280         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5281         if (x == const_true_rtx)
5282           {
5283             if (code == 'M')
5284               fputs ("nv", f);
5285             return;
5286           }
5287
5288         if (!COMPARISON_P (x))
5289           {
5290             output_operand_lossage ("invalid operand for '%%%c'", code);
5291             return;
5292           }
5293
5294         cond_code = aarch64_get_condition_code (x);
5295         gcc_assert (cond_code >= 0);
5296         if (code == 'M')
5297           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5298         fputs (aarch64_condition_codes[cond_code], f);
5299       }
5300       break;
5301
5302     case 'b':
5303     case 'h':
5304     case 's':
5305     case 'd':
5306     case 'q':
5307       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5308         {
5309           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5310           return;
5311         }
5312       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5313       break;
5314
5315     case 'S':
5316     case 'T':
5317     case 'U':
5318     case 'V':
5319       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5320         {
5321           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5322           return;
5323         }
5324       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5325       break;
5326
5327     case 'R':
5328       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5329         {
5330           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5331           return;
5332         }
5333       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5334       break;
5335
5336     case 'X':
5337       if (!CONST_INT_P (x))
5338         {
5339           output_operand_lossage ("invalid operand for '%%%c'", code);
5340           return;
5341         }
5342       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5343       break;
5344
5345     case 'w':
5346     case 'x':
5347       if (x == const0_rtx
5348           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5349         {
5350           asm_fprintf (f, "%czr", code);
5351           break;
5352         }
5353
5354       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5355         {
5356           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5357           break;
5358         }
5359
5360       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5361         {
5362           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5363           break;
5364         }
5365
5366       /* Fall through */
5367
5368     case 0:
5369       if (x == NULL)
5370         {
5371           output_operand_lossage ("missing operand");
5372           return;
5373         }
5374
5375       switch (GET_CODE (x))
5376         {
5377         case REG:
5378           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5379           break;
5380
5381         case MEM:
5382           output_address (GET_MODE (x), XEXP (x, 0));
5383           /* Check all memory references are Pmode - even with ILP32.  */
5384           gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5385           break;
5386
5387         case CONST:
5388         case LABEL_REF:
5389         case SYMBOL_REF:
5390           output_addr_const (asm_out_file, x);
5391           break;
5392
5393         case CONST_INT:
5394           asm_fprintf (f, "%wd", INTVAL (x));
5395           break;
5396
5397         case CONST_VECTOR:
5398           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5399             {
5400               gcc_assert (
5401                   aarch64_const_vec_all_same_in_range_p (x,
5402                                                          HOST_WIDE_INT_MIN,
5403                                                          HOST_WIDE_INT_MAX));
5404               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5405             }
5406           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5407             {
5408               fputc ('0', f);
5409             }
5410           else
5411             gcc_unreachable ();
5412           break;
5413
5414         case CONST_DOUBLE:
5415           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5416              be getting CONST_DOUBLEs holding integers.  */
5417           gcc_assert (GET_MODE (x) != VOIDmode);
5418           if (aarch64_float_const_zero_rtx_p (x))
5419             {
5420               fputc ('0', f);
5421               break;
5422             }
5423           else if (aarch64_float_const_representable_p (x))
5424             {
5425 #define buf_size 20
5426               char float_buf[buf_size] = {'\0'};
5427               real_to_decimal_for_mode (float_buf,
5428                                         CONST_DOUBLE_REAL_VALUE (x),
5429                                         buf_size, buf_size,
5430                                         1, GET_MODE (x));
5431               asm_fprintf (asm_out_file, "%s", float_buf);
5432               break;
5433 #undef buf_size
5434             }
5435           output_operand_lossage ("invalid constant");
5436           return;
5437         default:
5438           output_operand_lossage ("invalid operand");
5439           return;
5440         }
5441       break;
5442
5443     case 'A':
5444       if (GET_CODE (x) == HIGH)
5445         x = XEXP (x, 0);
5446
5447       switch (aarch64_classify_symbolic_expression (x))
5448         {
5449         case SYMBOL_SMALL_GOT_4G:
5450           asm_fprintf (asm_out_file, ":got:");
5451           break;
5452
5453         case SYMBOL_SMALL_TLSGD:
5454           asm_fprintf (asm_out_file, ":tlsgd:");
5455           break;
5456
5457         case SYMBOL_SMALL_TLSDESC:
5458           asm_fprintf (asm_out_file, ":tlsdesc:");
5459           break;
5460
5461         case SYMBOL_SMALL_TLSIE:
5462           asm_fprintf (asm_out_file, ":gottprel:");
5463           break;
5464
5465         case SYMBOL_TLSLE24:
5466           asm_fprintf (asm_out_file, ":tprel:");
5467           break;
5468
5469         case SYMBOL_TINY_GOT:
5470           gcc_unreachable ();
5471           break;
5472
5473         default:
5474           break;
5475         }
5476       output_addr_const (asm_out_file, x);
5477       break;
5478
5479     case 'L':
5480       switch (aarch64_classify_symbolic_expression (x))
5481         {
5482         case SYMBOL_SMALL_GOT_4G:
5483           asm_fprintf (asm_out_file, ":lo12:");
5484           break;
5485
5486         case SYMBOL_SMALL_TLSGD:
5487           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5488           break;
5489
5490         case SYMBOL_SMALL_TLSDESC:
5491           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5492           break;
5493
5494         case SYMBOL_SMALL_TLSIE:
5495           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5496           break;
5497
5498         case SYMBOL_TLSLE12:
5499           asm_fprintf (asm_out_file, ":tprel_lo12:");
5500           break;
5501
5502         case SYMBOL_TLSLE24:
5503           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5504           break;
5505
5506         case SYMBOL_TINY_GOT:
5507           asm_fprintf (asm_out_file, ":got:");
5508           break;
5509
5510         case SYMBOL_TINY_TLSIE:
5511           asm_fprintf (asm_out_file, ":gottprel:");
5512           break;
5513
5514         default:
5515           break;
5516         }
5517       output_addr_const (asm_out_file, x);
5518       break;
5519
5520     case 'G':
5521       switch (aarch64_classify_symbolic_expression (x))
5522         {
5523         case SYMBOL_TLSLE24:
5524           asm_fprintf (asm_out_file, ":tprel_hi12:");
5525           break;
5526         default:
5527           break;
5528         }
5529       output_addr_const (asm_out_file, x);
5530       break;
5531
5532     case 'k':
5533       {
5534         HOST_WIDE_INT cond_code;
5535
5536         if (!CONST_INT_P (x))
5537           {
5538             output_operand_lossage ("invalid operand for '%%%c'", code);
5539             return;
5540           }
5541
5542         cond_code = INTVAL (x);
5543         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5544         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5545       }
5546       break;
5547
5548     default:
5549       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5550       return;
5551     }
5552 }
5553
5554 static void
5555 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5556 {
5557   struct aarch64_address_info addr;
5558
5559   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5560     switch (addr.type)
5561       {
5562       case ADDRESS_REG_IMM:
5563         if (addr.offset == const0_rtx)
5564           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5565         else
5566           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5567                        INTVAL (addr.offset));
5568         return;
5569
5570       case ADDRESS_REG_REG:
5571         if (addr.shift == 0)
5572           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5573                        reg_names [REGNO (addr.offset)]);
5574         else
5575           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5576                        reg_names [REGNO (addr.offset)], addr.shift);
5577         return;
5578
5579       case ADDRESS_REG_UXTW:
5580         if (addr.shift == 0)
5581           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5582                        REGNO (addr.offset) - R0_REGNUM);
5583         else
5584           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5585                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5586         return;
5587
5588       case ADDRESS_REG_SXTW:
5589         if (addr.shift == 0)
5590           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5591                        REGNO (addr.offset) - R0_REGNUM);
5592         else
5593           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5594                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5595         return;
5596
5597       case ADDRESS_REG_WB:
5598         switch (GET_CODE (x))
5599           {
5600           case PRE_INC:
5601             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5602                          GET_MODE_SIZE (mode));
5603             return;
5604           case POST_INC:
5605             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5606                          GET_MODE_SIZE (mode));
5607             return;
5608           case PRE_DEC:
5609             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5610                          GET_MODE_SIZE (mode));
5611             return;
5612           case POST_DEC:
5613             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5614                          GET_MODE_SIZE (mode));
5615             return;
5616           case PRE_MODIFY:
5617             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5618                          INTVAL (addr.offset));
5619             return;
5620           case POST_MODIFY:
5621             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5622                          INTVAL (addr.offset));
5623             return;
5624           default:
5625             break;
5626           }
5627         break;
5628
5629       case ADDRESS_LO_SUM:
5630         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5631         output_addr_const (f, addr.offset);
5632         asm_fprintf (f, "]");
5633         return;
5634
5635       case ADDRESS_SYMBOLIC:
5636         break;
5637       }
5638
5639   output_addr_const (f, x);
5640 }
5641
5642 bool
5643 aarch64_label_mentioned_p (rtx x)
5644 {
5645   const char *fmt;
5646   int i;
5647
5648   if (GET_CODE (x) == LABEL_REF)
5649     return true;
5650
5651   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5652      referencing instruction, but they are constant offsets, not
5653      symbols.  */
5654   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5655     return false;
5656
5657   fmt = GET_RTX_FORMAT (GET_CODE (x));
5658   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5659     {
5660       if (fmt[i] == 'E')
5661         {
5662           int j;
5663
5664           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5665             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5666               return 1;
5667         }
5668       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5669         return 1;
5670     }
5671
5672   return 0;
5673 }
5674
5675 /* Implement REGNO_REG_CLASS.  */
5676
5677 enum reg_class
5678 aarch64_regno_regclass (unsigned regno)
5679 {
5680   if (GP_REGNUM_P (regno))
5681     return GENERAL_REGS;
5682
5683   if (regno == SP_REGNUM)
5684     return STACK_REG;
5685
5686   if (regno == FRAME_POINTER_REGNUM
5687       || regno == ARG_POINTER_REGNUM)
5688     return POINTER_REGS;
5689
5690   if (FP_REGNUM_P (regno))
5691     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5692
5693   return NO_REGS;
5694 }
5695
5696 static rtx
5697 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5698 {
5699   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5700      where mask is selected by alignment and size of the offset.
5701      We try to pick as large a range for the offset as possible to
5702      maximize the chance of a CSE.  However, for aligned addresses
5703      we limit the range to 4k so that structures with different sized
5704      elements are likely to use the same base.  We need to be careful
5705      not to split a CONST for some forms of address expression, otherwise
5706      it will generate sub-optimal code.  */
5707
5708   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5709     {
5710       rtx base = XEXP (x, 0);
5711       rtx offset_rtx = XEXP (x, 1);
5712       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5713
5714       if (GET_CODE (base) == PLUS)
5715         {
5716           rtx op0 = XEXP (base, 0);
5717           rtx op1 = XEXP (base, 1);
5718
5719           /* Force any scaling into a temp for CSE.  */
5720           op0 = force_reg (Pmode, op0);
5721           op1 = force_reg (Pmode, op1);
5722
5723           /* Let the pointer register be in op0.  */
5724           if (REG_POINTER (op1))
5725             std::swap (op0, op1);
5726
5727           /* If the pointer is virtual or frame related, then we know that
5728              virtual register instantiation or register elimination is going
5729              to apply a second constant.  We want the two constants folded
5730              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5731           if (virt_or_elim_regno_p (REGNO (op0)))
5732             {
5733               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5734                                    NULL_RTX, true, OPTAB_DIRECT);
5735               return gen_rtx_PLUS (Pmode, base, op1);
5736             }
5737
5738           /* Otherwise, in order to encourage CSE (and thence loop strength
5739              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5740           base = expand_binop (Pmode, add_optab, op0, op1,
5741                                NULL_RTX, true, OPTAB_DIRECT);
5742           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5743         }
5744
5745       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5746       HOST_WIDE_INT base_offset;
5747       if (GET_MODE_SIZE (mode) > 16)
5748         base_offset = (offset + 0x400) & ~0x7f0;
5749       /* For offsets aren't a multiple of the access size, the limit is
5750          -256...255.  */
5751       else if (offset & (GET_MODE_SIZE (mode) - 1))
5752         {
5753           base_offset = (offset + 0x100) & ~0x1ff;
5754
5755           /* BLKmode typically uses LDP of X-registers.  */
5756           if (mode == BLKmode)
5757             base_offset = (offset + 512) & ~0x3ff;
5758         }
5759       /* Small negative offsets are supported.  */
5760       else if (IN_RANGE (offset, -256, 0))
5761         base_offset = 0;
5762       else if (mode == TImode || mode == TFmode)
5763         base_offset = (offset + 0x100) & ~0x1ff;
5764       /* Use 12-bit offset by access size.  */
5765       else
5766         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5767
5768       if (base_offset != 0)
5769         {
5770           base = plus_constant (Pmode, base, base_offset);
5771           base = force_operand (base, NULL_RTX);
5772           return plus_constant (Pmode, base, offset - base_offset);
5773         }
5774     }
5775
5776   return x;
5777 }
5778
5779 /* Return the reload icode required for a constant pool in mode.  */
5780 static enum insn_code
5781 aarch64_constant_pool_reload_icode (machine_mode mode)
5782 {
5783   switch (mode)
5784     {
5785     case E_SFmode:
5786       return CODE_FOR_aarch64_reload_movcpsfdi;
5787
5788     case E_DFmode:
5789       return CODE_FOR_aarch64_reload_movcpdfdi;
5790
5791     case E_TFmode:
5792       return CODE_FOR_aarch64_reload_movcptfdi;
5793
5794     case E_V8QImode:
5795       return CODE_FOR_aarch64_reload_movcpv8qidi;
5796
5797     case E_V16QImode:
5798       return CODE_FOR_aarch64_reload_movcpv16qidi;
5799
5800     case E_V4HImode:
5801       return CODE_FOR_aarch64_reload_movcpv4hidi;
5802
5803     case E_V8HImode:
5804       return CODE_FOR_aarch64_reload_movcpv8hidi;
5805
5806     case E_V2SImode:
5807       return CODE_FOR_aarch64_reload_movcpv2sidi;
5808
5809     case E_V4SImode:
5810       return CODE_FOR_aarch64_reload_movcpv4sidi;
5811
5812     case E_V2DImode:
5813       return CODE_FOR_aarch64_reload_movcpv2didi;
5814
5815     case E_V2DFmode:
5816       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5817
5818     default:
5819       gcc_unreachable ();
5820     }
5821
5822   gcc_unreachable ();
5823 }
5824 static reg_class_t
5825 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5826                           reg_class_t rclass,
5827                           machine_mode mode,
5828                           secondary_reload_info *sri)
5829 {
5830
5831   /* If we have to disable direct literal pool loads and stores because the
5832      function is too big, then we need a scratch register.  */
5833   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5834       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5835           || targetm.vector_mode_supported_p (GET_MODE (x)))
5836       && !aarch64_pcrelative_literal_loads)
5837     {
5838       sri->icode = aarch64_constant_pool_reload_icode (mode);
5839       return NO_REGS;
5840     }
5841
5842   /* Without the TARGET_SIMD instructions we cannot move a Q register
5843      to a Q register directly.  We need a scratch.  */
5844   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5845       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5846       && reg_class_subset_p (rclass, FP_REGS))
5847     {
5848       if (mode == TFmode)
5849         sri->icode = CODE_FOR_aarch64_reload_movtf;
5850       else if (mode == TImode)
5851         sri->icode = CODE_FOR_aarch64_reload_movti;
5852       return NO_REGS;
5853     }
5854
5855   /* A TFmode or TImode memory access should be handled via an FP_REGS
5856      because AArch64 has richer addressing modes for LDR/STR instructions
5857      than LDP/STP instructions.  */
5858   if (TARGET_FLOAT && rclass == GENERAL_REGS
5859       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5860     return FP_REGS;
5861
5862   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5863       return GENERAL_REGS;
5864
5865   return NO_REGS;
5866 }
5867
5868 static bool
5869 aarch64_can_eliminate (const int from, const int to)
5870 {
5871   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5872      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5873
5874   if (frame_pointer_needed)
5875     {
5876       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5877         return true;
5878       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5879         return false;
5880       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5881           && !cfun->calls_alloca)
5882         return true;
5883       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5884         return true;
5885
5886       return false;
5887     }
5888   else
5889     {
5890       /* If we decided that we didn't need a leaf frame pointer but then used
5891          LR in the function, then we'll want a frame pointer after all, so
5892          prevent this elimination to ensure a frame pointer is used.  */
5893       if (to == STACK_POINTER_REGNUM
5894           && flag_omit_leaf_frame_pointer
5895           && df_regs_ever_live_p (LR_REGNUM))
5896         return false;
5897     }
5898
5899   return true;
5900 }
5901
5902 HOST_WIDE_INT
5903 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5904 {
5905   aarch64_layout_frame ();
5906
5907   if (to == HARD_FRAME_POINTER_REGNUM)
5908     {
5909       if (from == ARG_POINTER_REGNUM)
5910         return cfun->machine->frame.hard_fp_offset;
5911
5912       if (from == FRAME_POINTER_REGNUM)
5913         return cfun->machine->frame.hard_fp_offset
5914                - cfun->machine->frame.locals_offset;
5915     }
5916
5917   if (to == STACK_POINTER_REGNUM)
5918     {
5919       if (from == FRAME_POINTER_REGNUM)
5920           return cfun->machine->frame.frame_size
5921                  - cfun->machine->frame.locals_offset;
5922     }
5923
5924   return cfun->machine->frame.frame_size;
5925 }
5926
5927 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5928    previous frame.  */
5929
5930 rtx
5931 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5932 {
5933   if (count != 0)
5934     return const0_rtx;
5935   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5936 }
5937
5938
5939 static void
5940 aarch64_asm_trampoline_template (FILE *f)
5941 {
5942   if (TARGET_ILP32)
5943     {
5944       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5945       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5946     }
5947   else
5948     {
5949       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5950       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5951     }
5952   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5953   assemble_aligned_integer (4, const0_rtx);
5954   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5955   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5956 }
5957
5958 static void
5959 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5960 {
5961   rtx fnaddr, mem, a_tramp;
5962   const int tramp_code_sz = 16;
5963
5964   /* Don't need to copy the trailing D-words, we fill those in below.  */
5965   emit_block_move (m_tramp, assemble_trampoline_template (),
5966                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5967   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5968   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5969   if (GET_MODE (fnaddr) != ptr_mode)
5970     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5971   emit_move_insn (mem, fnaddr);
5972
5973   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5974   emit_move_insn (mem, chain_value);
5975
5976   /* XXX We should really define a "clear_cache" pattern and use
5977      gen_clear_cache().  */
5978   a_tramp = XEXP (m_tramp, 0);
5979   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5980                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5981                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5982                      ptr_mode);
5983 }
5984
5985 static unsigned char
5986 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5987 {
5988   switch (regclass)
5989     {
5990     case CALLER_SAVE_REGS:
5991     case POINTER_REGS:
5992     case GENERAL_REGS:
5993     case ALL_REGS:
5994     case FP_REGS:
5995     case FP_LO_REGS:
5996       return
5997         aarch64_vector_mode_p (mode)
5998           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5999           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6000     case STACK_REG:
6001       return 1;
6002
6003     case NO_REGS:
6004       return 0;
6005
6006     default:
6007       break;
6008     }
6009   gcc_unreachable ();
6010 }
6011
6012 static reg_class_t
6013 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6014 {
6015   if (regclass == POINTER_REGS)
6016     return GENERAL_REGS;
6017
6018   if (regclass == STACK_REG)
6019     {
6020       if (REG_P(x)
6021           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6022           return regclass;
6023
6024       return NO_REGS;
6025     }
6026
6027   /* Register eliminiation can result in a request for
6028      SP+constant->FP_REGS.  We cannot support such operations which
6029      use SP as source and an FP_REG as destination, so reject out
6030      right now.  */
6031   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6032     {
6033       rtx lhs = XEXP (x, 0);
6034
6035       /* Look through a possible SUBREG introduced by ILP32.  */
6036       if (GET_CODE (lhs) == SUBREG)
6037         lhs = SUBREG_REG (lhs);
6038
6039       gcc_assert (REG_P (lhs));
6040       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6041                                       POINTER_REGS));
6042       return NO_REGS;
6043     }
6044
6045   return regclass;
6046 }
6047
6048 void
6049 aarch64_asm_output_labelref (FILE* f, const char *name)
6050 {
6051   asm_fprintf (f, "%U%s", name);
6052 }
6053
6054 static void
6055 aarch64_elf_asm_constructor (rtx symbol, int priority)
6056 {
6057   if (priority == DEFAULT_INIT_PRIORITY)
6058     default_ctor_section_asm_out_constructor (symbol, priority);
6059   else
6060     {
6061       section *s;
6062       /* While priority is known to be in range [0, 65535], so 18 bytes
6063          would be enough, the compiler might not know that.  To avoid
6064          -Wformat-truncation false positive, use a larger size.  */
6065       char buf[23];
6066       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6067       s = get_section (buf, SECTION_WRITE, NULL);
6068       switch_to_section (s);
6069       assemble_align (POINTER_SIZE);
6070       assemble_aligned_integer (POINTER_BYTES, symbol);
6071     }
6072 }
6073
6074 static void
6075 aarch64_elf_asm_destructor (rtx symbol, int priority)
6076 {
6077   if (priority == DEFAULT_INIT_PRIORITY)
6078     default_dtor_section_asm_out_destructor (symbol, priority);
6079   else
6080     {
6081       section *s;
6082       /* While priority is known to be in range [0, 65535], so 18 bytes
6083          would be enough, the compiler might not know that.  To avoid
6084          -Wformat-truncation false positive, use a larger size.  */
6085       char buf[23];
6086       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6087       s = get_section (buf, SECTION_WRITE, NULL);
6088       switch_to_section (s);
6089       assemble_align (POINTER_SIZE);
6090       assemble_aligned_integer (POINTER_BYTES, symbol);
6091     }
6092 }
6093
6094 const char*
6095 aarch64_output_casesi (rtx *operands)
6096 {
6097   char buf[100];
6098   char label[100];
6099   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6100   int index;
6101   static const char *const patterns[4][2] =
6102   {
6103     {
6104       "ldrb\t%w3, [%0,%w1,uxtw]",
6105       "add\t%3, %4, %w3, sxtb #2"
6106     },
6107     {
6108       "ldrh\t%w3, [%0,%w1,uxtw #1]",
6109       "add\t%3, %4, %w3, sxth #2"
6110     },
6111     {
6112       "ldr\t%w3, [%0,%w1,uxtw #2]",
6113       "add\t%3, %4, %w3, sxtw #2"
6114     },
6115     /* We assume that DImode is only generated when not optimizing and
6116        that we don't really need 64-bit address offsets.  That would
6117        imply an object file with 8GB of code in a single function!  */
6118     {
6119       "ldr\t%w3, [%0,%w1,uxtw #2]",
6120       "add\t%3, %4, %w3, sxtw #2"
6121     }
6122   };
6123
6124   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6125
6126   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6127
6128   gcc_assert (index >= 0 && index <= 3);
6129
6130   /* Need to implement table size reduction, by chaning the code below.  */
6131   output_asm_insn (patterns[index][0], operands);
6132   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6133   snprintf (buf, sizeof (buf),
6134             "adr\t%%4, %s", targetm.strip_name_encoding (label));
6135   output_asm_insn (buf, operands);
6136   output_asm_insn (patterns[index][1], operands);
6137   output_asm_insn ("br\t%3", operands);
6138   assemble_label (asm_out_file, label);
6139   return "";
6140 }
6141
6142
6143 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6144    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6145    operator.  */
6146
6147 int
6148 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6149 {
6150   if (shift >= 0 && shift <= 3)
6151     {
6152       int size;
6153       for (size = 8; size <= 32; size *= 2)
6154         {
6155           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6156           if (mask == bits << shift)
6157             return size;
6158         }
6159     }
6160   return 0;
6161 }
6162
6163 /* Constant pools are per function only when PC relative
6164    literal loads are true or we are in the large memory
6165    model.  */
6166
6167 static inline bool
6168 aarch64_can_use_per_function_literal_pools_p (void)
6169 {
6170   return (aarch64_pcrelative_literal_loads
6171           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6172 }
6173
6174 static bool
6175 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6176 {
6177   /* Fixme:: In an ideal world this would work similar
6178      to the logic in aarch64_select_rtx_section but this
6179      breaks bootstrap in gcc go.  For now we workaround
6180      this by returning false here.  */
6181   return false;
6182 }
6183
6184 /* Select appropriate section for constants depending
6185    on where we place literal pools.  */
6186
6187 static section *
6188 aarch64_select_rtx_section (machine_mode mode,
6189                             rtx x,
6190                             unsigned HOST_WIDE_INT align)
6191 {
6192   if (aarch64_can_use_per_function_literal_pools_p ())
6193     return function_section (current_function_decl);
6194
6195   return default_elf_select_rtx_section (mode, x, align);
6196 }
6197
6198 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
6199 void
6200 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6201                                   HOST_WIDE_INT offset)
6202 {
6203   /* When using per-function literal pools, we must ensure that any code
6204      section is aligned to the minimal instruction length, lest we get
6205      errors from the assembler re "unaligned instructions".  */
6206   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6207     ASM_OUTPUT_ALIGN (f, 2);
6208 }
6209
6210 /* Costs.  */
6211
6212 /* Helper function for rtx cost calculation.  Strip a shift expression
6213    from X.  Returns the inner operand if successful, or the original
6214    expression on failure.  */
6215 static rtx
6216 aarch64_strip_shift (rtx x)
6217 {
6218   rtx op = x;
6219
6220   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6221      we can convert both to ROR during final output.  */
6222   if ((GET_CODE (op) == ASHIFT
6223        || GET_CODE (op) == ASHIFTRT
6224        || GET_CODE (op) == LSHIFTRT
6225        || GET_CODE (op) == ROTATERT
6226        || GET_CODE (op) == ROTATE)
6227       && CONST_INT_P (XEXP (op, 1)))
6228     return XEXP (op, 0);
6229
6230   if (GET_CODE (op) == MULT
6231       && CONST_INT_P (XEXP (op, 1))
6232       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6233     return XEXP (op, 0);
6234
6235   return x;
6236 }
6237
6238 /* Helper function for rtx cost calculation.  Strip an extend
6239    expression from X.  Returns the inner operand if successful, or the
6240    original expression on failure.  We deal with a number of possible
6241    canonicalization variations here. If STRIP_SHIFT is true, then
6242    we can strip off a shift also.  */
6243 static rtx
6244 aarch64_strip_extend (rtx x, bool strip_shift)
6245 {
6246   rtx op = x;
6247
6248   /* Zero and sign extraction of a widened value.  */
6249   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6250       && XEXP (op, 2) == const0_rtx
6251       && GET_CODE (XEXP (op, 0)) == MULT
6252       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6253                                          XEXP (op, 1)))
6254     return XEXP (XEXP (op, 0), 0);
6255
6256   /* It can also be represented (for zero-extend) as an AND with an
6257      immediate.  */
6258   if (GET_CODE (op) == AND
6259       && GET_CODE (XEXP (op, 0)) == MULT
6260       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6261       && CONST_INT_P (XEXP (op, 1))
6262       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6263                            INTVAL (XEXP (op, 1))) != 0)
6264     return XEXP (XEXP (op, 0), 0);
6265
6266   /* Now handle extended register, as this may also have an optional
6267      left shift by 1..4.  */
6268   if (strip_shift
6269       && GET_CODE (op) == ASHIFT
6270       && CONST_INT_P (XEXP (op, 1))
6271       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6272     op = XEXP (op, 0);
6273
6274   if (GET_CODE (op) == ZERO_EXTEND
6275       || GET_CODE (op) == SIGN_EXTEND)
6276     op = XEXP (op, 0);
6277
6278   if (op != x)
6279     return op;
6280
6281   return x;
6282 }
6283
6284 /* Return true iff CODE is a shift supported in combination
6285    with arithmetic instructions.  */
6286
6287 static bool
6288 aarch64_shift_p (enum rtx_code code)
6289 {
6290   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6291 }
6292
6293
6294 /* Return true iff X is a cheap shift without a sign extend. */
6295
6296 static bool
6297 aarch64_cheap_mult_shift_p (rtx x)
6298 {
6299   rtx op0, op1;
6300
6301   op0 = XEXP (x, 0);
6302   op1 = XEXP (x, 1);
6303
6304   if (!(aarch64_tune_params.extra_tuning_flags
6305                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6306     return false;
6307
6308   if (GET_CODE (op0) == SIGN_EXTEND)
6309     return false;
6310
6311   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6312       && UINTVAL (op1) <= 4)
6313     return true;
6314
6315   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6316     return false;
6317
6318   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6319
6320   if (l2 > 0 && l2 <= 4)
6321     return true;
6322
6323   return false;
6324 }
6325
6326 /* Helper function for rtx cost calculation.  Calculate the cost of
6327    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6328    Return the calculated cost of the expression, recursing manually in to
6329    operands where needed.  */
6330
6331 static int
6332 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6333 {
6334   rtx op0, op1;
6335   const struct cpu_cost_table *extra_cost
6336     = aarch64_tune_params.insn_extra_cost;
6337   int cost = 0;
6338   bool compound_p = (outer == PLUS || outer == MINUS);
6339   machine_mode mode = GET_MODE (x);
6340
6341   gcc_checking_assert (code == MULT);
6342
6343   op0 = XEXP (x, 0);
6344   op1 = XEXP (x, 1);
6345
6346   if (VECTOR_MODE_P (mode))
6347     mode = GET_MODE_INNER (mode);
6348
6349   /* Integer multiply/fma.  */
6350   if (GET_MODE_CLASS (mode) == MODE_INT)
6351     {
6352       /* The multiply will be canonicalized as a shift, cost it as such.  */
6353       if (aarch64_shift_p (GET_CODE (x))
6354           || (CONST_INT_P (op1)
6355               && exact_log2 (INTVAL (op1)) > 0))
6356         {
6357           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6358                            || GET_CODE (op0) == SIGN_EXTEND;
6359           if (speed)
6360             {
6361               if (compound_p)
6362                 {
6363                   /* If the shift is considered cheap,
6364                      then don't add any cost. */
6365                   if (aarch64_cheap_mult_shift_p (x))
6366                     ;
6367                   else if (REG_P (op1))
6368                     /* ARITH + shift-by-register.  */
6369                     cost += extra_cost->alu.arith_shift_reg;
6370                   else if (is_extend)
6371                     /* ARITH + extended register.  We don't have a cost field
6372                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6373                     cost += extra_cost->alu.extend_arith;
6374                   else
6375                     /* ARITH + shift-by-immediate.  */
6376                     cost += extra_cost->alu.arith_shift;
6377                 }
6378               else
6379                 /* LSL (immediate).  */
6380                 cost += extra_cost->alu.shift;
6381
6382             }
6383           /* Strip extends as we will have costed them in the case above.  */
6384           if (is_extend)
6385             op0 = aarch64_strip_extend (op0, true);
6386
6387           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6388
6389           return cost;
6390         }
6391
6392       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6393          compound and let the below cases handle it.  After all, MNEG is a
6394          special-case alias of MSUB.  */
6395       if (GET_CODE (op0) == NEG)
6396         {
6397           op0 = XEXP (op0, 0);
6398           compound_p = true;
6399         }
6400
6401       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6402       if ((GET_CODE (op0) == ZERO_EXTEND
6403            && GET_CODE (op1) == ZERO_EXTEND)
6404           || (GET_CODE (op0) == SIGN_EXTEND
6405               && GET_CODE (op1) == SIGN_EXTEND))
6406         {
6407           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6408           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6409
6410           if (speed)
6411             {
6412               if (compound_p)
6413                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6414                 cost += extra_cost->mult[0].extend_add;
6415               else
6416                 /* MUL/SMULL/UMULL.  */
6417                 cost += extra_cost->mult[0].extend;
6418             }
6419
6420           return cost;
6421         }
6422
6423       /* This is either an integer multiply or a MADD.  In both cases
6424          we want to recurse and cost the operands.  */
6425       cost += rtx_cost (op0, mode, MULT, 0, speed);
6426       cost += rtx_cost (op1, mode, MULT, 1, speed);
6427
6428       if (speed)
6429         {
6430           if (compound_p)
6431             /* MADD/MSUB.  */
6432             cost += extra_cost->mult[mode == DImode].add;
6433           else
6434             /* MUL.  */
6435             cost += extra_cost->mult[mode == DImode].simple;
6436         }
6437
6438       return cost;
6439     }
6440   else
6441     {
6442       if (speed)
6443         {
6444           /* Floating-point FMA/FMUL can also support negations of the
6445              operands, unless the rounding mode is upward or downward in
6446              which case FNMUL is different than FMUL with operand negation.  */
6447           bool neg0 = GET_CODE (op0) == NEG;
6448           bool neg1 = GET_CODE (op1) == NEG;
6449           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6450             {
6451               if (neg0)
6452                 op0 = XEXP (op0, 0);
6453               if (neg1)
6454                 op1 = XEXP (op1, 0);
6455             }
6456
6457           if (compound_p)
6458             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6459             cost += extra_cost->fp[mode == DFmode].fma;
6460           else
6461             /* FMUL/FNMUL.  */
6462             cost += extra_cost->fp[mode == DFmode].mult;
6463         }
6464
6465       cost += rtx_cost (op0, mode, MULT, 0, speed);
6466       cost += rtx_cost (op1, mode, MULT, 1, speed);
6467       return cost;
6468     }
6469 }
6470
6471 static int
6472 aarch64_address_cost (rtx x,
6473                       machine_mode mode,
6474                       addr_space_t as ATTRIBUTE_UNUSED,
6475                       bool speed)
6476 {
6477   enum rtx_code c = GET_CODE (x);
6478   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6479   struct aarch64_address_info info;
6480   int cost = 0;
6481   info.shift = 0;
6482
6483   if (!aarch64_classify_address (&info, x, mode, c, false))
6484     {
6485       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6486         {
6487           /* This is a CONST or SYMBOL ref which will be split
6488              in a different way depending on the code model in use.
6489              Cost it through the generic infrastructure.  */
6490           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6491           /* Divide through by the cost of one instruction to
6492              bring it to the same units as the address costs.  */
6493           cost_symbol_ref /= COSTS_N_INSNS (1);
6494           /* The cost is then the cost of preparing the address,
6495              followed by an immediate (possibly 0) offset.  */
6496           return cost_symbol_ref + addr_cost->imm_offset;
6497         }
6498       else
6499         {
6500           /* This is most likely a jump table from a case
6501              statement.  */
6502           return addr_cost->register_offset;
6503         }
6504     }
6505
6506   switch (info.type)
6507     {
6508       case ADDRESS_LO_SUM:
6509       case ADDRESS_SYMBOLIC:
6510       case ADDRESS_REG_IMM:
6511         cost += addr_cost->imm_offset;
6512         break;
6513
6514       case ADDRESS_REG_WB:
6515         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6516           cost += addr_cost->pre_modify;
6517         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6518           cost += addr_cost->post_modify;
6519         else
6520           gcc_unreachable ();
6521
6522         break;
6523
6524       case ADDRESS_REG_REG:
6525         cost += addr_cost->register_offset;
6526         break;
6527
6528       case ADDRESS_REG_SXTW:
6529         cost += addr_cost->register_sextend;
6530         break;
6531
6532       case ADDRESS_REG_UXTW:
6533         cost += addr_cost->register_zextend;
6534         break;
6535
6536       default:
6537         gcc_unreachable ();
6538     }
6539
6540
6541   if (info.shift > 0)
6542     {
6543       /* For the sake of calculating the cost of the shifted register
6544          component, we can treat same sized modes in the same way.  */
6545       switch (GET_MODE_BITSIZE (mode))
6546         {
6547           case 16:
6548             cost += addr_cost->addr_scale_costs.hi;
6549             break;
6550
6551           case 32:
6552             cost += addr_cost->addr_scale_costs.si;
6553             break;
6554
6555           case 64:
6556             cost += addr_cost->addr_scale_costs.di;
6557             break;
6558
6559           /* We can't tell, or this is a 128-bit vector.  */
6560           default:
6561             cost += addr_cost->addr_scale_costs.ti;
6562             break;
6563         }
6564     }
6565
6566   return cost;
6567 }
6568
6569 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6570    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6571    to be taken.  */
6572
6573 int
6574 aarch64_branch_cost (bool speed_p, bool predictable_p)
6575 {
6576   /* When optimizing for speed, use the cost of unpredictable branches.  */
6577   const struct cpu_branch_cost *branch_costs =
6578     aarch64_tune_params.branch_costs;
6579
6580   if (!speed_p || predictable_p)
6581     return branch_costs->predictable;
6582   else
6583     return branch_costs->unpredictable;
6584 }
6585
6586 /* Return true if the RTX X in mode MODE is a zero or sign extract
6587    usable in an ADD or SUB (extended register) instruction.  */
6588 static bool
6589 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6590 {
6591   /* Catch add with a sign extract.
6592      This is add_<optab><mode>_multp2.  */
6593   if (GET_CODE (x) == SIGN_EXTRACT
6594       || GET_CODE (x) == ZERO_EXTRACT)
6595     {
6596       rtx op0 = XEXP (x, 0);
6597       rtx op1 = XEXP (x, 1);
6598       rtx op2 = XEXP (x, 2);
6599
6600       if (GET_CODE (op0) == MULT
6601           && CONST_INT_P (op1)
6602           && op2 == const0_rtx
6603           && CONST_INT_P (XEXP (op0, 1))
6604           && aarch64_is_extend_from_extract (mode,
6605                                              XEXP (op0, 1),
6606                                              op1))
6607         {
6608           return true;
6609         }
6610     }
6611   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6612      No shift.  */
6613   else if (GET_CODE (x) == SIGN_EXTEND
6614            || GET_CODE (x) == ZERO_EXTEND)
6615     return REG_P (XEXP (x, 0));
6616
6617   return false;
6618 }
6619
6620 static bool
6621 aarch64_frint_unspec_p (unsigned int u)
6622 {
6623   switch (u)
6624     {
6625       case UNSPEC_FRINTZ:
6626       case UNSPEC_FRINTP:
6627       case UNSPEC_FRINTM:
6628       case UNSPEC_FRINTA:
6629       case UNSPEC_FRINTN:
6630       case UNSPEC_FRINTX:
6631       case UNSPEC_FRINTI:
6632         return true;
6633
6634       default:
6635         return false;
6636     }
6637 }
6638
6639 /* Return true iff X is an rtx that will match an extr instruction
6640    i.e. as described in the *extr<mode>5_insn family of patterns.
6641    OP0 and OP1 will be set to the operands of the shifts involved
6642    on success and will be NULL_RTX otherwise.  */
6643
6644 static bool
6645 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6646 {
6647   rtx op0, op1;
6648   machine_mode mode = GET_MODE (x);
6649
6650   *res_op0 = NULL_RTX;
6651   *res_op1 = NULL_RTX;
6652
6653   if (GET_CODE (x) != IOR)
6654     return false;
6655
6656   op0 = XEXP (x, 0);
6657   op1 = XEXP (x, 1);
6658
6659   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6660       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6661     {
6662      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6663       if (GET_CODE (op1) == ASHIFT)
6664         std::swap (op0, op1);
6665
6666       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6667         return false;
6668
6669       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6670       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6671
6672       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6673           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6674         {
6675           *res_op0 = XEXP (op0, 0);
6676           *res_op1 = XEXP (op1, 0);
6677           return true;
6678         }
6679     }
6680
6681   return false;
6682 }
6683
6684 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6685    storing it in *COST.  Result is true if the total cost of the operation
6686    has now been calculated.  */
6687 static bool
6688 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6689 {
6690   rtx inner;
6691   rtx comparator;
6692   enum rtx_code cmpcode;
6693
6694   if (COMPARISON_P (op0))
6695     {
6696       inner = XEXP (op0, 0);
6697       comparator = XEXP (op0, 1);
6698       cmpcode = GET_CODE (op0);
6699     }
6700   else
6701     {
6702       inner = op0;
6703       comparator = const0_rtx;
6704       cmpcode = NE;
6705     }
6706
6707   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6708     {
6709       /* Conditional branch.  */
6710       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6711         return true;
6712       else
6713         {
6714           if (cmpcode == NE || cmpcode == EQ)
6715             {
6716               if (comparator == const0_rtx)
6717                 {
6718                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6719                   if (GET_CODE (inner) == ZERO_EXTRACT)
6720                     /* TBZ/TBNZ.  */
6721                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6722                                        ZERO_EXTRACT, 0, speed);
6723                   else
6724                     /* CBZ/CBNZ.  */
6725                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6726
6727                 return true;
6728               }
6729             }
6730           else if (cmpcode == LT || cmpcode == GE)
6731             {
6732               /* TBZ/TBNZ.  */
6733               if (comparator == const0_rtx)
6734                 return true;
6735             }
6736         }
6737     }
6738   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6739     {
6740       /* CCMP.  */
6741       if (GET_CODE (op1) == COMPARE)
6742         {
6743           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6744           if (XEXP (op1, 1) == const0_rtx)
6745             *cost += 1;
6746           if (speed)
6747             {
6748               machine_mode mode = GET_MODE (XEXP (op1, 0));
6749               const struct cpu_cost_table *extra_cost
6750                 = aarch64_tune_params.insn_extra_cost;
6751
6752               if (GET_MODE_CLASS (mode) == MODE_INT)
6753                 *cost += extra_cost->alu.arith;
6754               else
6755                 *cost += extra_cost->fp[mode == DFmode].compare;
6756             }
6757           return true;
6758         }
6759
6760       /* It's a conditional operation based on the status flags,
6761          so it must be some flavor of CSEL.  */
6762
6763       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6764       if (GET_CODE (op1) == NEG
6765           || GET_CODE (op1) == NOT
6766           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6767         op1 = XEXP (op1, 0);
6768       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6769         {
6770           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6771           op1 = XEXP (op1, 0);
6772           op2 = XEXP (op2, 0);
6773         }
6774
6775       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6776       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6777       return true;
6778     }
6779
6780   /* We don't know what this is, cost all operands.  */
6781   return false;
6782 }
6783
6784 /* Check whether X is a bitfield operation of the form shift + extend that
6785    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6786    operand to which the bitfield operation is applied.  Otherwise return
6787    NULL_RTX.  */
6788
6789 static rtx
6790 aarch64_extend_bitfield_pattern_p (rtx x)
6791 {
6792   rtx_code outer_code = GET_CODE (x);
6793   machine_mode outer_mode = GET_MODE (x);
6794
6795   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6796       && outer_mode != SImode && outer_mode != DImode)
6797     return NULL_RTX;
6798
6799   rtx inner = XEXP (x, 0);
6800   rtx_code inner_code = GET_CODE (inner);
6801   machine_mode inner_mode = GET_MODE (inner);
6802   rtx op = NULL_RTX;
6803
6804   switch (inner_code)
6805     {
6806       case ASHIFT:
6807         if (CONST_INT_P (XEXP (inner, 1))
6808             && (inner_mode == QImode || inner_mode == HImode))
6809           op = XEXP (inner, 0);
6810         break;
6811       case LSHIFTRT:
6812         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6813             && (inner_mode == QImode || inner_mode == HImode))
6814           op = XEXP (inner, 0);
6815         break;
6816       case ASHIFTRT:
6817         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6818             && (inner_mode == QImode || inner_mode == HImode))
6819           op = XEXP (inner, 0);
6820         break;
6821       default:
6822         break;
6823     }
6824
6825   return op;
6826 }
6827
6828 /* Return true if the mask and a shift amount from an RTX of the form
6829    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6830    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6831
6832 bool
6833 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6834 {
6835   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6836          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6837          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6838          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6839 }
6840
6841 /* Calculate the cost of calculating X, storing it in *COST.  Result
6842    is true if the total cost of the operation has now been calculated.  */
6843 static bool
6844 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6845                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6846 {
6847   rtx op0, op1, op2;
6848   const struct cpu_cost_table *extra_cost
6849     = aarch64_tune_params.insn_extra_cost;
6850   int code = GET_CODE (x);
6851   scalar_int_mode int_mode;
6852
6853   /* By default, assume that everything has equivalent cost to the
6854      cheapest instruction.  Any additional costs are applied as a delta
6855      above this default.  */
6856   *cost = COSTS_N_INSNS (1);
6857
6858   switch (code)
6859     {
6860     case SET:
6861       /* The cost depends entirely on the operands to SET.  */
6862       *cost = 0;
6863       op0 = SET_DEST (x);
6864       op1 = SET_SRC (x);
6865
6866       switch (GET_CODE (op0))
6867         {
6868         case MEM:
6869           if (speed)
6870             {
6871               rtx address = XEXP (op0, 0);
6872               if (VECTOR_MODE_P (mode))
6873                 *cost += extra_cost->ldst.storev;
6874               else if (GET_MODE_CLASS (mode) == MODE_INT)
6875                 *cost += extra_cost->ldst.store;
6876               else if (mode == SFmode)
6877                 *cost += extra_cost->ldst.storef;
6878               else if (mode == DFmode)
6879                 *cost += extra_cost->ldst.stored;
6880
6881               *cost +=
6882                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6883                                                      0, speed));
6884             }
6885
6886           *cost += rtx_cost (op1, mode, SET, 1, speed);
6887           return true;
6888
6889         case SUBREG:
6890           if (! REG_P (SUBREG_REG (op0)))
6891             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6892
6893           /* Fall through.  */
6894         case REG:
6895           /* The cost is one per vector-register copied.  */
6896           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6897             {
6898               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6899                               / GET_MODE_SIZE (V4SImode);
6900               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6901             }
6902           /* const0_rtx is in general free, but we will use an
6903              instruction to set a register to 0.  */
6904           else if (REG_P (op1) || op1 == const0_rtx)
6905             {
6906               /* The cost is 1 per register copied.  */
6907               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6908                               / UNITS_PER_WORD;
6909               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6910             }
6911           else
6912             /* Cost is just the cost of the RHS of the set.  */
6913             *cost += rtx_cost (op1, mode, SET, 1, speed);
6914           return true;
6915
6916         case ZERO_EXTRACT:
6917         case SIGN_EXTRACT:
6918           /* Bit-field insertion.  Strip any redundant widening of
6919              the RHS to meet the width of the target.  */
6920           if (GET_CODE (op1) == SUBREG)
6921             op1 = SUBREG_REG (op1);
6922           if ((GET_CODE (op1) == ZERO_EXTEND
6923                || GET_CODE (op1) == SIGN_EXTEND)
6924               && CONST_INT_P (XEXP (op0, 1))
6925               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6926                   >= INTVAL (XEXP (op0, 1))))
6927             op1 = XEXP (op1, 0);
6928
6929           if (CONST_INT_P (op1))
6930             {
6931               /* MOV immediate is assumed to always be cheap.  */
6932               *cost = COSTS_N_INSNS (1);
6933             }
6934           else
6935             {
6936               /* BFM.  */
6937               if (speed)
6938                 *cost += extra_cost->alu.bfi;
6939               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6940             }
6941
6942           return true;
6943
6944         default:
6945           /* We can't make sense of this, assume default cost.  */
6946           *cost = COSTS_N_INSNS (1);
6947           return false;
6948         }
6949       return false;
6950
6951     case CONST_INT:
6952       /* If an instruction can incorporate a constant within the
6953          instruction, the instruction's expression avoids calling
6954          rtx_cost() on the constant.  If rtx_cost() is called on a
6955          constant, then it is usually because the constant must be
6956          moved into a register by one or more instructions.
6957
6958          The exception is constant 0, which can be expressed
6959          as XZR/WZR and is therefore free.  The exception to this is
6960          if we have (set (reg) (const0_rtx)) in which case we must cost
6961          the move.  However, we can catch that when we cost the SET, so
6962          we don't need to consider that here.  */
6963       if (x == const0_rtx)
6964         *cost = 0;
6965       else
6966         {
6967           /* To an approximation, building any other constant is
6968              proportionally expensive to the number of instructions
6969              required to build that constant.  This is true whether we
6970              are compiling for SPEED or otherwise.  */
6971           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6972                                  (NULL_RTX, x, false, mode));
6973         }
6974       return true;
6975
6976     case CONST_DOUBLE:
6977
6978       /* First determine number of instructions to do the move
6979           as an integer constant.  */
6980       if (!aarch64_float_const_representable_p (x)
6981            && !aarch64_can_const_movi_rtx_p (x, mode)
6982            && aarch64_float_const_rtx_p (x))
6983         {
6984           unsigned HOST_WIDE_INT ival;
6985           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6986           gcc_assert (succeed);
6987
6988           machine_mode imode = (mode == HFmode
6989                                 ? SImode
6990                                 : int_mode_for_mode (mode).require ());
6991           int ncost = aarch64_internal_mov_immediate
6992                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6993           *cost += COSTS_N_INSNS (ncost);
6994           return true;
6995         }
6996
6997       if (speed)
6998         {
6999           /* mov[df,sf]_aarch64.  */
7000           if (aarch64_float_const_representable_p (x))
7001             /* FMOV (scalar immediate).  */
7002             *cost += extra_cost->fp[mode == DFmode].fpconst;
7003           else if (!aarch64_float_const_zero_rtx_p (x))
7004             {
7005               /* This will be a load from memory.  */
7006               if (mode == DFmode)
7007                 *cost += extra_cost->ldst.loadd;
7008               else
7009                 *cost += extra_cost->ldst.loadf;
7010             }
7011           else
7012             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
7013                or MOV v0.s[0], wzr - neither of which are modeled by the
7014                cost tables.  Just use the default cost.  */
7015             {
7016             }
7017         }
7018
7019       return true;
7020
7021     case MEM:
7022       if (speed)
7023         {
7024           /* For loads we want the base cost of a load, plus an
7025              approximation for the additional cost of the addressing
7026              mode.  */
7027           rtx address = XEXP (x, 0);
7028           if (VECTOR_MODE_P (mode))
7029             *cost += extra_cost->ldst.loadv;
7030           else if (GET_MODE_CLASS (mode) == MODE_INT)
7031             *cost += extra_cost->ldst.load;
7032           else if (mode == SFmode)
7033             *cost += extra_cost->ldst.loadf;
7034           else if (mode == DFmode)
7035             *cost += extra_cost->ldst.loadd;
7036
7037           *cost +=
7038                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7039                                                      0, speed));
7040         }
7041
7042       return true;
7043
7044     case NEG:
7045       op0 = XEXP (x, 0);
7046
7047       if (VECTOR_MODE_P (mode))
7048         {
7049           if (speed)
7050             {
7051               /* FNEG.  */
7052               *cost += extra_cost->vect.alu;
7053             }
7054           return false;
7055         }
7056
7057       if (GET_MODE_CLASS (mode) == MODE_INT)
7058         {
7059           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7060               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7061             {
7062               /* CSETM.  */
7063               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7064               return true;
7065             }
7066
7067           /* Cost this as SUB wzr, X.  */
7068           op0 = CONST0_RTX (mode);
7069           op1 = XEXP (x, 0);
7070           goto cost_minus;
7071         }
7072
7073       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7074         {
7075           /* Support (neg(fma...)) as a single instruction only if
7076              sign of zeros is unimportant.  This matches the decision
7077              making in aarch64.md.  */
7078           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7079             {
7080               /* FNMADD.  */
7081               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7082               return true;
7083             }
7084           if (GET_CODE (op0) == MULT)
7085             {
7086               /* FNMUL.  */
7087               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7088               return true;
7089             }
7090           if (speed)
7091             /* FNEG.  */
7092             *cost += extra_cost->fp[mode == DFmode].neg;
7093           return false;
7094         }
7095
7096       return false;
7097
7098     case CLRSB:
7099     case CLZ:
7100       if (speed)
7101         {
7102           if (VECTOR_MODE_P (mode))
7103             *cost += extra_cost->vect.alu;
7104           else
7105             *cost += extra_cost->alu.clz;
7106         }
7107
7108       return false;
7109
7110     case COMPARE:
7111       op0 = XEXP (x, 0);
7112       op1 = XEXP (x, 1);
7113
7114       if (op1 == const0_rtx
7115           && GET_CODE (op0) == AND)
7116         {
7117           x = op0;
7118           mode = GET_MODE (op0);
7119           goto cost_logic;
7120         }
7121
7122       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7123         {
7124           /* TODO: A write to the CC flags possibly costs extra, this
7125              needs encoding in the cost tables.  */
7126
7127           mode = GET_MODE (op0);
7128           /* ANDS.  */
7129           if (GET_CODE (op0) == AND)
7130             {
7131               x = op0;
7132               goto cost_logic;
7133             }
7134
7135           if (GET_CODE (op0) == PLUS)
7136             {
7137               /* ADDS (and CMN alias).  */
7138               x = op0;
7139               goto cost_plus;
7140             }
7141
7142           if (GET_CODE (op0) == MINUS)
7143             {
7144               /* SUBS.  */
7145               x = op0;
7146               goto cost_minus;
7147             }
7148
7149           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7150               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7151               && CONST_INT_P (XEXP (op0, 2)))
7152             {
7153               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7154                  Handle it here directly rather than going to cost_logic
7155                  since we know the immediate generated for the TST is valid
7156                  so we can avoid creating an intermediate rtx for it only
7157                  for costing purposes.  */
7158               if (speed)
7159                 *cost += extra_cost->alu.logical;
7160
7161               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7162                                  ZERO_EXTRACT, 0, speed);
7163               return true;
7164             }
7165
7166           if (GET_CODE (op1) == NEG)
7167             {
7168               /* CMN.  */
7169               if (speed)
7170                 *cost += extra_cost->alu.arith;
7171
7172               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7173               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7174               return true;
7175             }
7176
7177           /* CMP.
7178
7179              Compare can freely swap the order of operands, and
7180              canonicalization puts the more complex operation first.
7181              But the integer MINUS logic expects the shift/extend
7182              operation in op1.  */
7183           if (! (REG_P (op0)
7184                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7185           {
7186             op0 = XEXP (x, 1);
7187             op1 = XEXP (x, 0);
7188           }
7189           goto cost_minus;
7190         }
7191
7192       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7193         {
7194           /* FCMP.  */
7195           if (speed)
7196             *cost += extra_cost->fp[mode == DFmode].compare;
7197
7198           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7199             {
7200               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7201               /* FCMP supports constant 0.0 for no extra cost. */
7202               return true;
7203             }
7204           return false;
7205         }
7206
7207       if (VECTOR_MODE_P (mode))
7208         {
7209           /* Vector compare.  */
7210           if (speed)
7211             *cost += extra_cost->vect.alu;
7212
7213           if (aarch64_float_const_zero_rtx_p (op1))
7214             {
7215               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7216                  cost.  */
7217               return true;
7218             }
7219           return false;
7220         }
7221       return false;
7222
7223     case MINUS:
7224       {
7225         op0 = XEXP (x, 0);
7226         op1 = XEXP (x, 1);
7227
7228 cost_minus:
7229         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7230
7231         /* Detect valid immediates.  */
7232         if ((GET_MODE_CLASS (mode) == MODE_INT
7233              || (GET_MODE_CLASS (mode) == MODE_CC
7234                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7235             && CONST_INT_P (op1)
7236             && aarch64_uimm12_shift (INTVAL (op1)))
7237           {
7238             if (speed)
7239               /* SUB(S) (immediate).  */
7240               *cost += extra_cost->alu.arith;
7241             return true;
7242           }
7243
7244         /* Look for SUB (extended register).  */
7245         if (aarch64_rtx_arith_op_extract_p (op1, mode))
7246           {
7247             if (speed)
7248               *cost += extra_cost->alu.extend_arith;
7249
7250             op1 = aarch64_strip_extend (op1, true);
7251             *cost += rtx_cost (op1, VOIDmode,
7252                                (enum rtx_code) GET_CODE (op1), 0, speed);
7253             return true;
7254           }
7255
7256         rtx new_op1 = aarch64_strip_extend (op1, false);
7257
7258         /* Cost this as an FMA-alike operation.  */
7259         if ((GET_CODE (new_op1) == MULT
7260              || aarch64_shift_p (GET_CODE (new_op1)))
7261             && code != COMPARE)
7262           {
7263             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7264                                             (enum rtx_code) code,
7265                                             speed);
7266             return true;
7267           }
7268
7269         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7270
7271         if (speed)
7272           {
7273             if (VECTOR_MODE_P (mode))
7274               {
7275                 /* Vector SUB.  */
7276                 *cost += extra_cost->vect.alu;
7277               }
7278             else if (GET_MODE_CLASS (mode) == MODE_INT)
7279               {
7280                 /* SUB(S).  */
7281                 *cost += extra_cost->alu.arith;
7282               }
7283             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7284               {
7285                 /* FSUB.  */
7286                 *cost += extra_cost->fp[mode == DFmode].addsub;
7287               }
7288           }
7289         return true;
7290       }
7291
7292     case PLUS:
7293       {
7294         rtx new_op0;
7295
7296         op0 = XEXP (x, 0);
7297         op1 = XEXP (x, 1);
7298
7299 cost_plus:
7300         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7301             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7302           {
7303             /* CSINC.  */
7304             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7305             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7306             return true;
7307           }
7308
7309         if (GET_MODE_CLASS (mode) == MODE_INT
7310             && CONST_INT_P (op1)
7311             && aarch64_uimm12_shift (INTVAL (op1)))
7312           {
7313             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7314
7315             if (speed)
7316               /* ADD (immediate).  */
7317               *cost += extra_cost->alu.arith;
7318             return true;
7319           }
7320
7321         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7322
7323         /* Look for ADD (extended register).  */
7324         if (aarch64_rtx_arith_op_extract_p (op0, mode))
7325           {
7326             if (speed)
7327               *cost += extra_cost->alu.extend_arith;
7328
7329             op0 = aarch64_strip_extend (op0, true);
7330             *cost += rtx_cost (op0, VOIDmode,
7331                                (enum rtx_code) GET_CODE (op0), 0, speed);
7332             return true;
7333           }
7334
7335         /* Strip any extend, leave shifts behind as we will
7336            cost them through mult_cost.  */
7337         new_op0 = aarch64_strip_extend (op0, false);
7338
7339         if (GET_CODE (new_op0) == MULT
7340             || aarch64_shift_p (GET_CODE (new_op0)))
7341           {
7342             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7343                                             speed);
7344             return true;
7345           }
7346
7347         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7348
7349         if (speed)
7350           {
7351             if (VECTOR_MODE_P (mode))
7352               {
7353                 /* Vector ADD.  */
7354                 *cost += extra_cost->vect.alu;
7355               }
7356             else if (GET_MODE_CLASS (mode) == MODE_INT)
7357               {
7358                 /* ADD.  */
7359                 *cost += extra_cost->alu.arith;
7360               }
7361             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7362               {
7363                 /* FADD.  */
7364                 *cost += extra_cost->fp[mode == DFmode].addsub;
7365               }
7366           }
7367         return true;
7368       }
7369
7370     case BSWAP:
7371       *cost = COSTS_N_INSNS (1);
7372
7373       if (speed)
7374         {
7375           if (VECTOR_MODE_P (mode))
7376             *cost += extra_cost->vect.alu;
7377           else
7378             *cost += extra_cost->alu.rev;
7379         }
7380       return false;
7381
7382     case IOR:
7383       if (aarch_rev16_p (x))
7384         {
7385           *cost = COSTS_N_INSNS (1);
7386
7387           if (speed)
7388             {
7389               if (VECTOR_MODE_P (mode))
7390                 *cost += extra_cost->vect.alu;
7391               else
7392                 *cost += extra_cost->alu.rev;
7393             }
7394           return true;
7395         }
7396
7397       if (aarch64_extr_rtx_p (x, &op0, &op1))
7398         {
7399           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7400           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7401           if (speed)
7402             *cost += extra_cost->alu.shift;
7403
7404           return true;
7405         }
7406     /* Fall through.  */
7407     case XOR:
7408     case AND:
7409     cost_logic:
7410       op0 = XEXP (x, 0);
7411       op1 = XEXP (x, 1);
7412
7413       if (VECTOR_MODE_P (mode))
7414         {
7415           if (speed)
7416             *cost += extra_cost->vect.alu;
7417           return true;
7418         }
7419
7420       if (code == AND
7421           && GET_CODE (op0) == MULT
7422           && CONST_INT_P (XEXP (op0, 1))
7423           && CONST_INT_P (op1)
7424           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7425                                INTVAL (op1)) != 0)
7426         {
7427           /* This is a UBFM/SBFM.  */
7428           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7429           if (speed)
7430             *cost += extra_cost->alu.bfx;
7431           return true;
7432         }
7433
7434       if (is_int_mode (mode, &int_mode))
7435         {
7436           if (CONST_INT_P (op1))
7437             {
7438               /* We have a mask + shift version of a UBFIZ
7439                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7440               if (GET_CODE (op0) == ASHIFT
7441                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7442                                                          XEXP (op0, 1)))
7443                 {
7444                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
7445                                      (enum rtx_code) code, 0, speed);
7446                   if (speed)
7447                     *cost += extra_cost->alu.bfx;
7448
7449                   return true;
7450                 }
7451               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7452                 {
7453                 /* We possibly get the immediate for free, this is not
7454                    modelled.  */
7455                   *cost += rtx_cost (op0, int_mode,
7456                                      (enum rtx_code) code, 0, speed);
7457                   if (speed)
7458                     *cost += extra_cost->alu.logical;
7459
7460                   return true;
7461                 }
7462             }
7463           else
7464             {
7465               rtx new_op0 = op0;
7466
7467               /* Handle ORN, EON, or BIC.  */
7468               if (GET_CODE (op0) == NOT)
7469                 op0 = XEXP (op0, 0);
7470
7471               new_op0 = aarch64_strip_shift (op0);
7472
7473               /* If we had a shift on op0 then this is a logical-shift-
7474                  by-register/immediate operation.  Otherwise, this is just
7475                  a logical operation.  */
7476               if (speed)
7477                 {
7478                   if (new_op0 != op0)
7479                     {
7480                       /* Shift by immediate.  */
7481                       if (CONST_INT_P (XEXP (op0, 1)))
7482                         *cost += extra_cost->alu.log_shift;
7483                       else
7484                         *cost += extra_cost->alu.log_shift_reg;
7485                     }
7486                   else
7487                     *cost += extra_cost->alu.logical;
7488                 }
7489
7490               /* In both cases we want to cost both operands.  */
7491               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7492                                  0, speed);
7493               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7494                                  1, speed);
7495
7496               return true;
7497             }
7498         }
7499       return false;
7500
7501     case NOT:
7502       x = XEXP (x, 0);
7503       op0 = aarch64_strip_shift (x);
7504
7505       if (VECTOR_MODE_P (mode))
7506         {
7507           /* Vector NOT.  */
7508           *cost += extra_cost->vect.alu;
7509           return false;
7510         }
7511
7512       /* MVN-shifted-reg.  */
7513       if (op0 != x)
7514         {
7515           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7516
7517           if (speed)
7518             *cost += extra_cost->alu.log_shift;
7519
7520           return true;
7521         }
7522       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7523          Handle the second form here taking care that 'a' in the above can
7524          be a shift.  */
7525       else if (GET_CODE (op0) == XOR)
7526         {
7527           rtx newop0 = XEXP (op0, 0);
7528           rtx newop1 = XEXP (op0, 1);
7529           rtx op0_stripped = aarch64_strip_shift (newop0);
7530
7531           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7532           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7533
7534           if (speed)
7535             {
7536               if (op0_stripped != newop0)
7537                 *cost += extra_cost->alu.log_shift;
7538               else
7539                 *cost += extra_cost->alu.logical;
7540             }
7541
7542           return true;
7543         }
7544       /* MVN.  */
7545       if (speed)
7546         *cost += extra_cost->alu.logical;
7547
7548       return false;
7549
7550     case ZERO_EXTEND:
7551
7552       op0 = XEXP (x, 0);
7553       /* If a value is written in SI mode, then zero extended to DI
7554          mode, the operation will in general be free as a write to
7555          a 'w' register implicitly zeroes the upper bits of an 'x'
7556          register.  However, if this is
7557
7558            (set (reg) (zero_extend (reg)))
7559
7560          we must cost the explicit register move.  */
7561       if (mode == DImode
7562           && GET_MODE (op0) == SImode
7563           && outer == SET)
7564         {
7565           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7566
7567         /* If OP_COST is non-zero, then the cost of the zero extend
7568            is effectively the cost of the inner operation.  Otherwise
7569            we have a MOV instruction and we take the cost from the MOV
7570            itself.  This is true independently of whether we are
7571            optimizing for space or time.  */
7572           if (op_cost)
7573             *cost = op_cost;
7574
7575           return true;
7576         }
7577       else if (MEM_P (op0))
7578         {
7579           /* All loads can zero extend to any size for free.  */
7580           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7581           return true;
7582         }
7583
7584       op0 = aarch64_extend_bitfield_pattern_p (x);
7585       if (op0)
7586         {
7587           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7588           if (speed)
7589             *cost += extra_cost->alu.bfx;
7590           return true;
7591         }
7592
7593       if (speed)
7594         {
7595           if (VECTOR_MODE_P (mode))
7596             {
7597               /* UMOV.  */
7598               *cost += extra_cost->vect.alu;
7599             }
7600           else
7601             {
7602               /* We generate an AND instead of UXTB/UXTH.  */
7603               *cost += extra_cost->alu.logical;
7604             }
7605         }
7606       return false;
7607
7608     case SIGN_EXTEND:
7609       if (MEM_P (XEXP (x, 0)))
7610         {
7611           /* LDRSH.  */
7612           if (speed)
7613             {
7614               rtx address = XEXP (XEXP (x, 0), 0);
7615               *cost += extra_cost->ldst.load_sign_extend;
7616
7617               *cost +=
7618                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7619                                                      0, speed));
7620             }
7621           return true;
7622         }
7623
7624       op0 = aarch64_extend_bitfield_pattern_p (x);
7625       if (op0)
7626         {
7627           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7628           if (speed)
7629             *cost += extra_cost->alu.bfx;
7630           return true;
7631         }
7632
7633       if (speed)
7634         {
7635           if (VECTOR_MODE_P (mode))
7636             *cost += extra_cost->vect.alu;
7637           else
7638             *cost += extra_cost->alu.extend;
7639         }
7640       return false;
7641
7642     case ASHIFT:
7643       op0 = XEXP (x, 0);
7644       op1 = XEXP (x, 1);
7645
7646       if (CONST_INT_P (op1))
7647         {
7648           if (speed)
7649             {
7650               if (VECTOR_MODE_P (mode))
7651                 {
7652                   /* Vector shift (immediate).  */
7653                   *cost += extra_cost->vect.alu;
7654                 }
7655               else
7656                 {
7657                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
7658                      aliases.  */
7659                   *cost += extra_cost->alu.shift;
7660                 }
7661             }
7662
7663           /* We can incorporate zero/sign extend for free.  */
7664           if (GET_CODE (op0) == ZERO_EXTEND
7665               || GET_CODE (op0) == SIGN_EXTEND)
7666             op0 = XEXP (op0, 0);
7667
7668           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7669           return true;
7670         }
7671       else
7672         {
7673           if (VECTOR_MODE_P (mode))
7674             {
7675               if (speed)
7676                 /* Vector shift (register).  */
7677                 *cost += extra_cost->vect.alu;
7678             }
7679           else
7680             {
7681               if (speed)
7682                 /* LSLV.  */
7683                 *cost += extra_cost->alu.shift_reg;
7684
7685               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7686                   && CONST_INT_P (XEXP (op1, 1))
7687                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7688                 {
7689                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7690                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7691                      don't recurse into it.  */
7692                   return true;
7693                 }
7694             }
7695           return false;  /* All arguments need to be in registers.  */
7696         }
7697
7698     case ROTATE:
7699     case ROTATERT:
7700     case LSHIFTRT:
7701     case ASHIFTRT:
7702       op0 = XEXP (x, 0);
7703       op1 = XEXP (x, 1);
7704
7705       if (CONST_INT_P (op1))
7706         {
7707           /* ASR (immediate) and friends.  */
7708           if (speed)
7709             {
7710               if (VECTOR_MODE_P (mode))
7711                 *cost += extra_cost->vect.alu;
7712               else
7713                 *cost += extra_cost->alu.shift;
7714             }
7715
7716           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7717           return true;
7718         }
7719       else
7720         {
7721           if (VECTOR_MODE_P (mode))
7722             {
7723               if (speed)
7724                 /* Vector shift (register).  */
7725                 *cost += extra_cost->vect.alu;
7726             }
7727           else
7728             {
7729               if (speed)
7730                 /* ASR (register) and friends.  */
7731                 *cost += extra_cost->alu.shift_reg;
7732
7733               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7734                   && CONST_INT_P (XEXP (op1, 1))
7735                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7736                 {
7737                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7738                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7739                      don't recurse into it.  */
7740                   return true;
7741                 }
7742             }
7743           return false;  /* All arguments need to be in registers.  */
7744         }
7745
7746     case SYMBOL_REF:
7747
7748       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7749           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7750         {
7751           /* LDR.  */
7752           if (speed)
7753             *cost += extra_cost->ldst.load;
7754         }
7755       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7756                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7757         {
7758           /* ADRP, followed by ADD.  */
7759           *cost += COSTS_N_INSNS (1);
7760           if (speed)
7761             *cost += 2 * extra_cost->alu.arith;
7762         }
7763       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7764                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7765         {
7766           /* ADR.  */
7767           if (speed)
7768             *cost += extra_cost->alu.arith;
7769         }
7770
7771       if (flag_pic)
7772         {
7773           /* One extra load instruction, after accessing the GOT.  */
7774           *cost += COSTS_N_INSNS (1);
7775           if (speed)
7776             *cost += extra_cost->ldst.load;
7777         }
7778       return true;
7779
7780     case HIGH:
7781     case LO_SUM:
7782       /* ADRP/ADD (immediate).  */
7783       if (speed)
7784         *cost += extra_cost->alu.arith;
7785       return true;
7786
7787     case ZERO_EXTRACT:
7788     case SIGN_EXTRACT:
7789       /* UBFX/SBFX.  */
7790       if (speed)
7791         {
7792           if (VECTOR_MODE_P (mode))
7793             *cost += extra_cost->vect.alu;
7794           else
7795             *cost += extra_cost->alu.bfx;
7796         }
7797
7798       /* We can trust that the immediates used will be correct (there
7799          are no by-register forms), so we need only cost op0.  */
7800       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7801       return true;
7802
7803     case MULT:
7804       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7805       /* aarch64_rtx_mult_cost always handles recursion to its
7806          operands.  */
7807       return true;
7808
7809     case MOD:
7810     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7811        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7812        an unconditional negate.  This case should only ever be reached through
7813        the set_smod_pow2_cheap check in expmed.c.  */
7814       if (CONST_INT_P (XEXP (x, 1))
7815           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7816           && (mode == SImode || mode == DImode))
7817         {
7818           /* We expand to 4 instructions.  Reset the baseline.  */
7819           *cost = COSTS_N_INSNS (4);
7820
7821           if (speed)
7822             *cost += 2 * extra_cost->alu.logical
7823                      + 2 * extra_cost->alu.arith;
7824
7825           return true;
7826         }
7827
7828     /* Fall-through.  */
7829     case UMOD:
7830       if (speed)
7831         {
7832           /* Slighly prefer UMOD over SMOD.  */
7833           if (VECTOR_MODE_P (mode))
7834             *cost += extra_cost->vect.alu;
7835           else if (GET_MODE_CLASS (mode) == MODE_INT)
7836             *cost += (extra_cost->mult[mode == DImode].add
7837                       + extra_cost->mult[mode == DImode].idiv
7838                       + (code == MOD ? 1 : 0));
7839         }
7840       return false;  /* All arguments need to be in registers.  */
7841
7842     case DIV:
7843     case UDIV:
7844     case SQRT:
7845       if (speed)
7846         {
7847           if (VECTOR_MODE_P (mode))
7848             *cost += extra_cost->vect.alu;
7849           else if (GET_MODE_CLASS (mode) == MODE_INT)
7850             /* There is no integer SQRT, so only DIV and UDIV can get
7851                here.  */
7852             *cost += (extra_cost->mult[mode == DImode].idiv
7853                      /* Slighly prefer UDIV over SDIV.  */
7854                      + (code == DIV ? 1 : 0));
7855           else
7856             *cost += extra_cost->fp[mode == DFmode].div;
7857         }
7858       return false;  /* All arguments need to be in registers.  */
7859
7860     case IF_THEN_ELSE:
7861       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7862                                          XEXP (x, 2), cost, speed);
7863
7864     case EQ:
7865     case NE:
7866     case GT:
7867     case GTU:
7868     case LT:
7869     case LTU:
7870     case GE:
7871     case GEU:
7872     case LE:
7873     case LEU:
7874
7875       return false; /* All arguments must be in registers.  */
7876
7877     case FMA:
7878       op0 = XEXP (x, 0);
7879       op1 = XEXP (x, 1);
7880       op2 = XEXP (x, 2);
7881
7882       if (speed)
7883         {
7884           if (VECTOR_MODE_P (mode))
7885             *cost += extra_cost->vect.alu;
7886           else
7887             *cost += extra_cost->fp[mode == DFmode].fma;
7888         }
7889
7890       /* FMSUB, FNMADD, and FNMSUB are free.  */
7891       if (GET_CODE (op0) == NEG)
7892         op0 = XEXP (op0, 0);
7893
7894       if (GET_CODE (op2) == NEG)
7895         op2 = XEXP (op2, 0);
7896
7897       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7898          and the by-element operand as operand 0.  */
7899       if (GET_CODE (op1) == NEG)
7900         op1 = XEXP (op1, 0);
7901
7902       /* Catch vector-by-element operations.  The by-element operand can
7903          either be (vec_duplicate (vec_select (x))) or just
7904          (vec_select (x)), depending on whether we are multiplying by
7905          a vector or a scalar.
7906
7907          Canonicalization is not very good in these cases, FMA4 will put the
7908          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7909       if (GET_CODE (op0) == VEC_DUPLICATE)
7910         op0 = XEXP (op0, 0);
7911       else if (GET_CODE (op1) == VEC_DUPLICATE)
7912         op1 = XEXP (op1, 0);
7913
7914       if (GET_CODE (op0) == VEC_SELECT)
7915         op0 = XEXP (op0, 0);
7916       else if (GET_CODE (op1) == VEC_SELECT)
7917         op1 = XEXP (op1, 0);
7918
7919       /* If the remaining parameters are not registers,
7920          get the cost to put them into registers.  */
7921       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7922       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7923       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7924       return true;
7925
7926     case FLOAT:
7927     case UNSIGNED_FLOAT:
7928       if (speed)
7929         *cost += extra_cost->fp[mode == DFmode].fromint;
7930       return false;
7931
7932     case FLOAT_EXTEND:
7933       if (speed)
7934         {
7935           if (VECTOR_MODE_P (mode))
7936             {
7937               /*Vector truncate.  */
7938               *cost += extra_cost->vect.alu;
7939             }
7940           else
7941             *cost += extra_cost->fp[mode == DFmode].widen;
7942         }
7943       return false;
7944
7945     case FLOAT_TRUNCATE:
7946       if (speed)
7947         {
7948           if (VECTOR_MODE_P (mode))
7949             {
7950               /*Vector conversion.  */
7951               *cost += extra_cost->vect.alu;
7952             }
7953           else
7954             *cost += extra_cost->fp[mode == DFmode].narrow;
7955         }
7956       return false;
7957
7958     case FIX:
7959     case UNSIGNED_FIX:
7960       x = XEXP (x, 0);
7961       /* Strip the rounding part.  They will all be implemented
7962          by the fcvt* family of instructions anyway.  */
7963       if (GET_CODE (x) == UNSPEC)
7964         {
7965           unsigned int uns_code = XINT (x, 1);
7966
7967           if (uns_code == UNSPEC_FRINTA
7968               || uns_code == UNSPEC_FRINTM
7969               || uns_code == UNSPEC_FRINTN
7970               || uns_code == UNSPEC_FRINTP
7971               || uns_code == UNSPEC_FRINTZ)
7972             x = XVECEXP (x, 0, 0);
7973         }
7974
7975       if (speed)
7976         {
7977           if (VECTOR_MODE_P (mode))
7978             *cost += extra_cost->vect.alu;
7979           else
7980             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7981         }
7982
7983       /* We can combine fmul by a power of 2 followed by a fcvt into a single
7984          fixed-point fcvt.  */
7985       if (GET_CODE (x) == MULT
7986           && ((VECTOR_MODE_P (mode)
7987                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7988               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7989         {
7990           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7991                              0, speed);
7992           return true;
7993         }
7994
7995       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7996       return true;
7997
7998     case ABS:
7999       if (VECTOR_MODE_P (mode))
8000         {
8001           /* ABS (vector).  */
8002           if (speed)
8003             *cost += extra_cost->vect.alu;
8004         }
8005       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8006         {
8007           op0 = XEXP (x, 0);
8008
8009           /* FABD, which is analogous to FADD.  */
8010           if (GET_CODE (op0) == MINUS)
8011             {
8012               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8013               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8014               if (speed)
8015                 *cost += extra_cost->fp[mode == DFmode].addsub;
8016
8017               return true;
8018             }
8019           /* Simple FABS is analogous to FNEG.  */
8020           if (speed)
8021             *cost += extra_cost->fp[mode == DFmode].neg;
8022         }
8023       else
8024         {
8025           /* Integer ABS will either be split to
8026              two arithmetic instructions, or will be an ABS
8027              (scalar), which we don't model.  */
8028           *cost = COSTS_N_INSNS (2);
8029           if (speed)
8030             *cost += 2 * extra_cost->alu.arith;
8031         }
8032       return false;
8033
8034     case SMAX:
8035     case SMIN:
8036       if (speed)
8037         {
8038           if (VECTOR_MODE_P (mode))
8039             *cost += extra_cost->vect.alu;
8040           else
8041             {
8042               /* FMAXNM/FMINNM/FMAX/FMIN.
8043                  TODO: This may not be accurate for all implementations, but
8044                  we do not model this in the cost tables.  */
8045               *cost += extra_cost->fp[mode == DFmode].addsub;
8046             }
8047         }
8048       return false;
8049
8050     case UNSPEC:
8051       /* The floating point round to integer frint* instructions.  */
8052       if (aarch64_frint_unspec_p (XINT (x, 1)))
8053         {
8054           if (speed)
8055             *cost += extra_cost->fp[mode == DFmode].roundint;
8056
8057           return false;
8058         }
8059
8060       if (XINT (x, 1) == UNSPEC_RBIT)
8061         {
8062           if (speed)
8063             *cost += extra_cost->alu.rev;
8064
8065           return false;
8066         }
8067       break;
8068
8069     case TRUNCATE:
8070
8071       /* Decompose <su>muldi3_highpart.  */
8072       if (/* (truncate:DI  */
8073           mode == DImode
8074           /*   (lshiftrt:TI  */
8075           && GET_MODE (XEXP (x, 0)) == TImode
8076           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8077           /*      (mult:TI  */
8078           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8079           /*        (ANY_EXTEND:TI (reg:DI))
8080                     (ANY_EXTEND:TI (reg:DI)))  */
8081           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8082                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8083               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8084                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8085           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8086           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8087           /*     (const_int 64)  */
8088           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8089           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8090         {
8091           /* UMULH/SMULH.  */
8092           if (speed)
8093             *cost += extra_cost->mult[mode == DImode].extend;
8094           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8095                              mode, MULT, 0, speed);
8096           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8097                              mode, MULT, 1, speed);
8098           return true;
8099         }
8100
8101       /* Fall through.  */
8102     default:
8103       break;
8104     }
8105
8106   if (dump_file
8107       && flag_aarch64_verbose_cost)
8108     fprintf (dump_file,
8109       "\nFailed to cost RTX.  Assuming default cost.\n");
8110
8111   return true;
8112 }
8113
8114 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8115    calculated for X.  This cost is stored in *COST.  Returns true
8116    if the total cost of X was calculated.  */
8117 static bool
8118 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8119                    int param, int *cost, bool speed)
8120 {
8121   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8122
8123   if (dump_file
8124       && flag_aarch64_verbose_cost)
8125     {
8126       print_rtl_single (dump_file, x);
8127       fprintf (dump_file, "\n%s cost: %d (%s)\n",
8128                speed ? "Hot" : "Cold",
8129                *cost, result ? "final" : "partial");
8130     }
8131
8132   return result;
8133 }
8134
8135 static int
8136 aarch64_register_move_cost (machine_mode mode,
8137                             reg_class_t from_i, reg_class_t to_i)
8138 {
8139   enum reg_class from = (enum reg_class) from_i;
8140   enum reg_class to = (enum reg_class) to_i;
8141   const struct cpu_regmove_cost *regmove_cost
8142     = aarch64_tune_params.regmove_cost;
8143
8144   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
8145   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8146     to = GENERAL_REGS;
8147
8148   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8149     from = GENERAL_REGS;
8150
8151   /* Moving between GPR and stack cost is the same as GP2GP.  */
8152   if ((from == GENERAL_REGS && to == STACK_REG)
8153       || (to == GENERAL_REGS && from == STACK_REG))
8154     return regmove_cost->GP2GP;
8155
8156   /* To/From the stack register, we move via the gprs.  */
8157   if (to == STACK_REG || from == STACK_REG)
8158     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8159             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8160
8161   if (GET_MODE_SIZE (mode) == 16)
8162     {
8163       /* 128-bit operations on general registers require 2 instructions.  */
8164       if (from == GENERAL_REGS && to == GENERAL_REGS)
8165         return regmove_cost->GP2GP * 2;
8166       else if (from == GENERAL_REGS)
8167         return regmove_cost->GP2FP * 2;
8168       else if (to == GENERAL_REGS)
8169         return regmove_cost->FP2GP * 2;
8170
8171       /* When AdvSIMD instructions are disabled it is not possible to move
8172          a 128-bit value directly between Q registers.  This is handled in
8173          secondary reload.  A general register is used as a scratch to move
8174          the upper DI value and the lower DI value is moved directly,
8175          hence the cost is the sum of three moves. */
8176       if (! TARGET_SIMD)
8177         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8178
8179       return regmove_cost->FP2FP;
8180     }
8181
8182   if (from == GENERAL_REGS && to == GENERAL_REGS)
8183     return regmove_cost->GP2GP;
8184   else if (from == GENERAL_REGS)
8185     return regmove_cost->GP2FP;
8186   else if (to == GENERAL_REGS)
8187     return regmove_cost->FP2GP;
8188
8189   return regmove_cost->FP2FP;
8190 }
8191
8192 static int
8193 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8194                           reg_class_t rclass ATTRIBUTE_UNUSED,
8195                           bool in ATTRIBUTE_UNUSED)
8196 {
8197   return aarch64_tune_params.memmov_cost;
8198 }
8199
8200 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8201    to optimize 1.0/sqrt.  */
8202
8203 static bool
8204 use_rsqrt_p (machine_mode mode)
8205 {
8206   return (!flag_trapping_math
8207           && flag_unsafe_math_optimizations
8208           && ((aarch64_tune_params.approx_modes->recip_sqrt
8209                & AARCH64_APPROX_MODE (mode))
8210               || flag_mrecip_low_precision_sqrt));
8211 }
8212
8213 /* Function to decide when to use the approximate reciprocal square root
8214    builtin.  */
8215
8216 static tree
8217 aarch64_builtin_reciprocal (tree fndecl)
8218 {
8219   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8220
8221   if (!use_rsqrt_p (mode))
8222     return NULL_TREE;
8223   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8224 }
8225
8226 typedef rtx (*rsqrte_type) (rtx, rtx);
8227
8228 /* Select reciprocal square root initial estimate insn depending on machine
8229    mode.  */
8230
8231 static rsqrte_type
8232 get_rsqrte_type (machine_mode mode)
8233 {
8234   switch (mode)
8235   {
8236     case E_DFmode:   return gen_aarch64_rsqrtedf;
8237     case E_SFmode:   return gen_aarch64_rsqrtesf;
8238     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8239     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8240     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8241     default: gcc_unreachable ();
8242   }
8243 }
8244
8245 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8246
8247 /* Select reciprocal square root series step insn depending on machine mode.  */
8248
8249 static rsqrts_type
8250 get_rsqrts_type (machine_mode mode)
8251 {
8252   switch (mode)
8253   {
8254     case E_DFmode:   return gen_aarch64_rsqrtsdf;
8255     case E_SFmode:   return gen_aarch64_rsqrtssf;
8256     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8257     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8258     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8259     default: gcc_unreachable ();
8260   }
8261 }
8262
8263 /* Emit instruction sequence to compute either the approximate square root
8264    or its approximate reciprocal, depending on the flag RECP, and return
8265    whether the sequence was emitted or not.  */
8266
8267 bool
8268 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8269 {
8270   machine_mode mode = GET_MODE (dst);
8271
8272   if (GET_MODE_INNER (mode) == HFmode)
8273     {
8274       gcc_assert (!recp);
8275       return false;
8276     }
8277
8278   machine_mode mmsk
8279     = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)).require (),
8280                        GET_MODE_NUNITS (mode));
8281   if (!recp)
8282     {
8283       if (!(flag_mlow_precision_sqrt
8284             || (aarch64_tune_params.approx_modes->sqrt
8285                 & AARCH64_APPROX_MODE (mode))))
8286         return false;
8287
8288       if (flag_finite_math_only
8289           || flag_trapping_math
8290           || !flag_unsafe_math_optimizations
8291           || optimize_function_for_size_p (cfun))
8292         return false;
8293     }
8294   else
8295     /* Caller assumes we cannot fail.  */
8296     gcc_assert (use_rsqrt_p (mode));
8297
8298
8299   rtx xmsk = gen_reg_rtx (mmsk);
8300   if (!recp)
8301     /* When calculating the approximate square root, compare the
8302        argument with 0.0 and create a mask.  */
8303     emit_insn (gen_rtx_SET (xmsk,
8304                             gen_rtx_NEG (mmsk,
8305                                          gen_rtx_EQ (mmsk, src,
8306                                                      CONST0_RTX (mode)))));
8307
8308   /* Estimate the approximate reciprocal square root.  */
8309   rtx xdst = gen_reg_rtx (mode);
8310   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8311
8312   /* Iterate over the series twice for SF and thrice for DF.  */
8313   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8314
8315   /* Optionally iterate over the series once less for faster performance
8316      while sacrificing the accuracy.  */
8317   if ((recp && flag_mrecip_low_precision_sqrt)
8318       || (!recp && flag_mlow_precision_sqrt))
8319     iterations--;
8320
8321   /* Iterate over the series to calculate the approximate reciprocal square
8322      root.  */
8323   rtx x1 = gen_reg_rtx (mode);
8324   while (iterations--)
8325     {
8326       rtx x2 = gen_reg_rtx (mode);
8327       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8328
8329       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8330
8331       if (iterations > 0)
8332         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8333     }
8334
8335   if (!recp)
8336     {
8337       /* Qualify the approximate reciprocal square root when the argument is
8338          0.0 by squashing the intermediary result to 0.0.  */
8339       rtx xtmp = gen_reg_rtx (mmsk);
8340       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8341                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
8342       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8343
8344       /* Calculate the approximate square root.  */
8345       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8346     }
8347
8348   /* Finalize the approximation.  */
8349   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8350
8351   return true;
8352 }
8353
8354 typedef rtx (*recpe_type) (rtx, rtx);
8355
8356 /* Select reciprocal initial estimate insn depending on machine mode.  */
8357
8358 static recpe_type
8359 get_recpe_type (machine_mode mode)
8360 {
8361   switch (mode)
8362   {
8363     case E_SFmode:   return (gen_aarch64_frecpesf);
8364     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8365     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8366     case E_DFmode:   return (gen_aarch64_frecpedf);
8367     case E_V2DFmode: return (gen_aarch64_frecpev2df);
8368     default:         gcc_unreachable ();
8369   }
8370 }
8371
8372 typedef rtx (*recps_type) (rtx, rtx, rtx);
8373
8374 /* Select reciprocal series step insn depending on machine mode.  */
8375
8376 static recps_type
8377 get_recps_type (machine_mode mode)
8378 {
8379   switch (mode)
8380   {
8381     case E_SFmode:   return (gen_aarch64_frecpssf);
8382     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8383     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8384     case E_DFmode:   return (gen_aarch64_frecpsdf);
8385     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8386     default:         gcc_unreachable ();
8387   }
8388 }
8389
8390 /* Emit the instruction sequence to compute the approximation for the division
8391    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8392
8393 bool
8394 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8395 {
8396   machine_mode mode = GET_MODE (quo);
8397
8398   if (GET_MODE_INNER (mode) == HFmode)
8399     return false;
8400
8401   bool use_approx_division_p = (flag_mlow_precision_div
8402                                 || (aarch64_tune_params.approx_modes->division
8403                                     & AARCH64_APPROX_MODE (mode)));
8404
8405   if (!flag_finite_math_only
8406       || flag_trapping_math
8407       || !flag_unsafe_math_optimizations
8408       || optimize_function_for_size_p (cfun)
8409       || !use_approx_division_p)
8410     return false;
8411
8412   /* Estimate the approximate reciprocal.  */
8413   rtx xrcp = gen_reg_rtx (mode);
8414   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8415
8416   /* Iterate over the series twice for SF and thrice for DF.  */
8417   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8418
8419   /* Optionally iterate over the series once less for faster performance,
8420      while sacrificing the accuracy.  */
8421   if (flag_mlow_precision_div)
8422     iterations--;
8423
8424   /* Iterate over the series to calculate the approximate reciprocal.  */
8425   rtx xtmp = gen_reg_rtx (mode);
8426   while (iterations--)
8427     {
8428       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8429
8430       if (iterations > 0)
8431         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8432     }
8433
8434   if (num != CONST1_RTX (mode))
8435     {
8436       /* As the approximate reciprocal of DEN is already calculated, only
8437          calculate the approximate division when NUM is not 1.0.  */
8438       rtx xnum = force_reg (mode, num);
8439       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8440     }
8441
8442   /* Finalize the approximation.  */
8443   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8444   return true;
8445 }
8446
8447 /* Return the number of instructions that can be issued per cycle.  */
8448 static int
8449 aarch64_sched_issue_rate (void)
8450 {
8451   return aarch64_tune_params.issue_rate;
8452 }
8453
8454 static int
8455 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8456 {
8457   int issue_rate = aarch64_sched_issue_rate ();
8458
8459   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8460 }
8461
8462
8463 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8464    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8465    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8466
8467 static int
8468 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8469                                                     int ready_index)
8470 {
8471   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8472 }
8473
8474
8475 /* Vectorizer cost model target hooks.  */
8476
8477 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8478 static int
8479 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8480                                     tree vectype,
8481                                     int misalign ATTRIBUTE_UNUSED)
8482 {
8483   unsigned elements;
8484   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8485   bool fp = false;
8486
8487   if (vectype != NULL)
8488     fp = FLOAT_TYPE_P (vectype);
8489
8490   switch (type_of_cost)
8491     {
8492       case scalar_stmt:
8493         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8494
8495       case scalar_load:
8496         return costs->scalar_load_cost;
8497
8498       case scalar_store:
8499         return costs->scalar_store_cost;
8500
8501       case vector_stmt:
8502         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8503
8504       case vector_load:
8505         return costs->vec_align_load_cost;
8506
8507       case vector_store:
8508         return costs->vec_store_cost;
8509
8510       case vec_to_scalar:
8511         return costs->vec_to_scalar_cost;
8512
8513       case scalar_to_vec:
8514         return costs->scalar_to_vec_cost;
8515
8516       case unaligned_load:
8517         return costs->vec_unalign_load_cost;
8518
8519       case unaligned_store:
8520         return costs->vec_unalign_store_cost;
8521
8522       case cond_branch_taken:
8523         return costs->cond_taken_branch_cost;
8524
8525       case cond_branch_not_taken:
8526         return costs->cond_not_taken_branch_cost;
8527
8528       case vec_perm:
8529         return costs->vec_permute_cost;
8530
8531       case vec_promote_demote:
8532         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8533
8534       case vec_construct:
8535         elements = TYPE_VECTOR_SUBPARTS (vectype);
8536         return elements / 2 + 1;
8537
8538       default:
8539         gcc_unreachable ();
8540     }
8541 }
8542
8543 /* Implement targetm.vectorize.add_stmt_cost.  */
8544 static unsigned
8545 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8546                        struct _stmt_vec_info *stmt_info, int misalign,
8547                        enum vect_cost_model_location where)
8548 {
8549   unsigned *cost = (unsigned *) data;
8550   unsigned retval = 0;
8551
8552   if (flag_vect_cost_model)
8553     {
8554       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8555       int stmt_cost =
8556             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8557
8558       /* Statements in an inner loop relative to the loop being
8559          vectorized are weighted more heavily.  The value here is
8560          arbitrary and could potentially be improved with analysis.  */
8561       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8562         count *= 50; /*  FIXME  */
8563
8564       retval = (unsigned) (count * stmt_cost);
8565       cost[where] += retval;
8566     }
8567
8568   return retval;
8569 }
8570
8571 static void initialize_aarch64_code_model (struct gcc_options *);
8572
8573 /* Parse the TO_PARSE string and put the architecture struct that it
8574    selects into RES and the architectural features into ISA_FLAGS.
8575    Return an aarch64_parse_opt_result describing the parse result.
8576    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8577
8578 static enum aarch64_parse_opt_result
8579 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8580                     unsigned long *isa_flags)
8581 {
8582   char *ext;
8583   const struct processor *arch;
8584   char *str = (char *) alloca (strlen (to_parse) + 1);
8585   size_t len;
8586
8587   strcpy (str, to_parse);
8588
8589   ext = strchr (str, '+');
8590
8591   if (ext != NULL)
8592     len = ext - str;
8593   else
8594     len = strlen (str);
8595
8596   if (len == 0)
8597     return AARCH64_PARSE_MISSING_ARG;
8598
8599
8600   /* Loop through the list of supported ARCHes to find a match.  */
8601   for (arch = all_architectures; arch->name != NULL; arch++)
8602     {
8603       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8604         {
8605           unsigned long isa_temp = arch->flags;
8606
8607           if (ext != NULL)
8608             {
8609               /* TO_PARSE string contains at least one extension.  */
8610               enum aarch64_parse_opt_result ext_res
8611                 = aarch64_parse_extension (ext, &isa_temp);
8612
8613               if (ext_res != AARCH64_PARSE_OK)
8614                 return ext_res;
8615             }
8616           /* Extension parsing was successful.  Confirm the result
8617              arch and ISA flags.  */
8618           *res = arch;
8619           *isa_flags = isa_temp;
8620           return AARCH64_PARSE_OK;
8621         }
8622     }
8623
8624   /* ARCH name not found in list.  */
8625   return AARCH64_PARSE_INVALID_ARG;
8626 }
8627
8628 /* Parse the TO_PARSE string and put the result tuning in RES and the
8629    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8630    describing the parse result.  If there is an error parsing, RES and
8631    ISA_FLAGS are left unchanged.  */
8632
8633 static enum aarch64_parse_opt_result
8634 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8635                    unsigned long *isa_flags)
8636 {
8637   char *ext;
8638   const struct processor *cpu;
8639   char *str = (char *) alloca (strlen (to_parse) + 1);
8640   size_t len;
8641
8642   strcpy (str, to_parse);
8643
8644   ext = strchr (str, '+');
8645
8646   if (ext != NULL)
8647     len = ext - str;
8648   else
8649     len = strlen (str);
8650
8651   if (len == 0)
8652     return AARCH64_PARSE_MISSING_ARG;
8653
8654
8655   /* Loop through the list of supported CPUs to find a match.  */
8656   for (cpu = all_cores; cpu->name != NULL; cpu++)
8657     {
8658       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8659         {
8660           unsigned long isa_temp = cpu->flags;
8661
8662
8663           if (ext != NULL)
8664             {
8665               /* TO_PARSE string contains at least one extension.  */
8666               enum aarch64_parse_opt_result ext_res
8667                 = aarch64_parse_extension (ext, &isa_temp);
8668
8669               if (ext_res != AARCH64_PARSE_OK)
8670                 return ext_res;
8671             }
8672           /* Extension parsing was successfull.  Confirm the result
8673              cpu and ISA flags.  */
8674           *res = cpu;
8675           *isa_flags = isa_temp;
8676           return AARCH64_PARSE_OK;
8677         }
8678     }
8679
8680   /* CPU name not found in list.  */
8681   return AARCH64_PARSE_INVALID_ARG;
8682 }
8683
8684 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8685    Return an aarch64_parse_opt_result describing the parse result.
8686    If the parsing fails the RES does not change.  */
8687
8688 static enum aarch64_parse_opt_result
8689 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8690 {
8691   const struct processor *cpu;
8692   char *str = (char *) alloca (strlen (to_parse) + 1);
8693
8694   strcpy (str, to_parse);
8695
8696   /* Loop through the list of supported CPUs to find a match.  */
8697   for (cpu = all_cores; cpu->name != NULL; cpu++)
8698     {
8699       if (strcmp (cpu->name, str) == 0)
8700         {
8701           *res = cpu;
8702           return AARCH64_PARSE_OK;
8703         }
8704     }
8705
8706   /* CPU name not found in list.  */
8707   return AARCH64_PARSE_INVALID_ARG;
8708 }
8709
8710 /* Parse TOKEN, which has length LENGTH to see if it is an option
8711    described in FLAG.  If it is, return the index bit for that fusion type.
8712    If not, error (printing OPTION_NAME) and return zero.  */
8713
8714 static unsigned int
8715 aarch64_parse_one_option_token (const char *token,
8716                                 size_t length,
8717                                 const struct aarch64_flag_desc *flag,
8718                                 const char *option_name)
8719 {
8720   for (; flag->name != NULL; flag++)
8721     {
8722       if (length == strlen (flag->name)
8723           && !strncmp (flag->name, token, length))
8724         return flag->flag;
8725     }
8726
8727   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8728   return 0;
8729 }
8730
8731 /* Parse OPTION which is a comma-separated list of flags to enable.
8732    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8733    default state we inherit from the CPU tuning structures.  OPTION_NAME
8734    gives the top-level option we are parsing in the -moverride string,
8735    for use in error messages.  */
8736
8737 static unsigned int
8738 aarch64_parse_boolean_options (const char *option,
8739                                const struct aarch64_flag_desc *flags,
8740                                unsigned int initial_state,
8741                                const char *option_name)
8742 {
8743   const char separator = '.';
8744   const char* specs = option;
8745   const char* ntoken = option;
8746   unsigned int found_flags = initial_state;
8747
8748   while ((ntoken = strchr (specs, separator)))
8749     {
8750       size_t token_length = ntoken - specs;
8751       unsigned token_ops = aarch64_parse_one_option_token (specs,
8752                                                            token_length,
8753                                                            flags,
8754                                                            option_name);
8755       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8756          in the token stream, reset the supported operations.  So:
8757
8758            adrp+add.cmp+branch.none.adrp+add
8759
8760            would have the result of turning on only adrp+add fusion.  */
8761       if (!token_ops)
8762         found_flags = 0;
8763
8764       found_flags |= token_ops;
8765       specs = ++ntoken;
8766     }
8767
8768   /* We ended with a comma, print something.  */
8769   if (!(*specs))
8770     {
8771       error ("%s string ill-formed\n", option_name);
8772       return 0;
8773     }
8774
8775   /* We still have one more token to parse.  */
8776   size_t token_length = strlen (specs);
8777   unsigned token_ops = aarch64_parse_one_option_token (specs,
8778                                                        token_length,
8779                                                        flags,
8780                                                        option_name);
8781    if (!token_ops)
8782      found_flags = 0;
8783
8784   found_flags |= token_ops;
8785   return found_flags;
8786 }
8787
8788 /* Support for overriding instruction fusion.  */
8789
8790 static void
8791 aarch64_parse_fuse_string (const char *fuse_string,
8792                             struct tune_params *tune)
8793 {
8794   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8795                                                      aarch64_fusible_pairs,
8796                                                      tune->fusible_ops,
8797                                                      "fuse=");
8798 }
8799
8800 /* Support for overriding other tuning flags.  */
8801
8802 static void
8803 aarch64_parse_tune_string (const char *tune_string,
8804                             struct tune_params *tune)
8805 {
8806   tune->extra_tuning_flags
8807     = aarch64_parse_boolean_options (tune_string,
8808                                      aarch64_tuning_flags,
8809                                      tune->extra_tuning_flags,
8810                                      "tune=");
8811 }
8812
8813 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8814    we understand.  If it is, extract the option string and handoff to
8815    the appropriate function.  */
8816
8817 void
8818 aarch64_parse_one_override_token (const char* token,
8819                                   size_t length,
8820                                   struct tune_params *tune)
8821 {
8822   const struct aarch64_tuning_override_function *fn
8823     = aarch64_tuning_override_functions;
8824
8825   const char *option_part = strchr (token, '=');
8826   if (!option_part)
8827     {
8828       error ("tuning string missing in option (%s)", token);
8829       return;
8830     }
8831
8832   /* Get the length of the option name.  */
8833   length = option_part - token;
8834   /* Skip the '=' to get to the option string.  */
8835   option_part++;
8836
8837   for (; fn->name != NULL; fn++)
8838     {
8839       if (!strncmp (fn->name, token, length))
8840         {
8841           fn->parse_override (option_part, tune);
8842           return;
8843         }
8844     }
8845
8846   error ("unknown tuning option (%s)",token);
8847   return;
8848 }
8849
8850 /* A checking mechanism for the implementation of the tls size.  */
8851
8852 static void
8853 initialize_aarch64_tls_size (struct gcc_options *opts)
8854 {
8855   if (aarch64_tls_size == 0)
8856     aarch64_tls_size = 24;
8857
8858   switch (opts->x_aarch64_cmodel_var)
8859     {
8860     case AARCH64_CMODEL_TINY:
8861       /* Both the default and maximum TLS size allowed under tiny is 1M which
8862          needs two instructions to address, so we clamp the size to 24.  */
8863       if (aarch64_tls_size > 24)
8864         aarch64_tls_size = 24;
8865       break;
8866     case AARCH64_CMODEL_SMALL:
8867       /* The maximum TLS size allowed under small is 4G.  */
8868       if (aarch64_tls_size > 32)
8869         aarch64_tls_size = 32;
8870       break;
8871     case AARCH64_CMODEL_LARGE:
8872       /* The maximum TLS size allowed under large is 16E.
8873          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8874       if (aarch64_tls_size > 48)
8875         aarch64_tls_size = 48;
8876       break;
8877     default:
8878       gcc_unreachable ();
8879     }
8880
8881   return;
8882 }
8883
8884 /* Parse STRING looking for options in the format:
8885      string     :: option:string
8886      option     :: name=substring
8887      name       :: {a-z}
8888      substring  :: defined by option.  */
8889
8890 static void
8891 aarch64_parse_override_string (const char* input_string,
8892                                struct tune_params* tune)
8893 {
8894   const char separator = ':';
8895   size_t string_length = strlen (input_string) + 1;
8896   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8897   char *string = string_root;
8898   strncpy (string, input_string, string_length);
8899   string[string_length - 1] = '\0';
8900
8901   char* ntoken = string;
8902
8903   while ((ntoken = strchr (string, separator)))
8904     {
8905       size_t token_length = ntoken - string;
8906       /* Make this substring look like a string.  */
8907       *ntoken = '\0';
8908       aarch64_parse_one_override_token (string, token_length, tune);
8909       string = ++ntoken;
8910     }
8911
8912   /* One last option to parse.  */
8913   aarch64_parse_one_override_token (string, strlen (string), tune);
8914   free (string_root);
8915 }
8916
8917
8918 static void
8919 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8920 {
8921   /* The logic here is that if we are disabling all frame pointer generation
8922      then we do not need to disable leaf frame pointer generation as a
8923      separate operation.  But if we are *only* disabling leaf frame pointer
8924      generation then we set flag_omit_frame_pointer to true, but in
8925      aarch64_frame_pointer_required we return false only for leaf functions.
8926
8927      PR 70044: We have to be careful about being called multiple times for the
8928      same function.  Once we have decided to set flag_omit_frame_pointer just
8929      so that we can omit leaf frame pointers, we must then not interpret a
8930      second call as meaning that all frame pointer generation should be
8931      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8932      non-zero value.  */
8933   if (opts->x_flag_omit_frame_pointer == 2)
8934     opts->x_flag_omit_frame_pointer = 0;
8935
8936   if (opts->x_flag_omit_frame_pointer)
8937     opts->x_flag_omit_leaf_frame_pointer = false;
8938   else if (opts->x_flag_omit_leaf_frame_pointer)
8939     opts->x_flag_omit_frame_pointer = 2;
8940
8941   /* If not optimizing for size, set the default
8942      alignment to what the target wants.  */
8943   if (!opts->x_optimize_size)
8944     {
8945       if (opts->x_align_loops <= 0)
8946         opts->x_align_loops = aarch64_tune_params.loop_align;
8947       if (opts->x_align_jumps <= 0)
8948         opts->x_align_jumps = aarch64_tune_params.jump_align;
8949       if (opts->x_align_functions <= 0)
8950         opts->x_align_functions = aarch64_tune_params.function_align;
8951     }
8952
8953   /* We default to no pc-relative literal loads.  */
8954
8955   aarch64_pcrelative_literal_loads = false;
8956
8957   /* If -mpc-relative-literal-loads is set on the command line, this
8958      implies that the user asked for PC relative literal loads.  */
8959   if (opts->x_pcrelative_literal_loads == 1)
8960     aarch64_pcrelative_literal_loads = true;
8961
8962   /* This is PR70113. When building the Linux kernel with
8963      CONFIG_ARM64_ERRATUM_843419, support for relocations
8964      R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8965      removed from the kernel to avoid loading objects with possibly
8966      offending sequences.  Without -mpc-relative-literal-loads we would
8967      generate such relocations, preventing the kernel build from
8968      succeeding.  */
8969   if (opts->x_pcrelative_literal_loads == 2
8970       && TARGET_FIX_ERR_A53_843419)
8971     aarch64_pcrelative_literal_loads = true;
8972
8973   /* In the tiny memory model it makes no sense to disallow PC relative
8974      literal pool loads.  */
8975   if (aarch64_cmodel == AARCH64_CMODEL_TINY
8976       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8977     aarch64_pcrelative_literal_loads = true;
8978
8979   /* When enabling the lower precision Newton series for the square root, also
8980      enable it for the reciprocal square root, since the latter is an
8981      intermediary step for the former.  */
8982   if (flag_mlow_precision_sqrt)
8983     flag_mrecip_low_precision_sqrt = true;
8984 }
8985
8986 /* 'Unpack' up the internal tuning structs and update the options
8987     in OPTS.  The caller must have set up selected_tune and selected_arch
8988     as all the other target-specific codegen decisions are
8989     derived from them.  */
8990
8991 void
8992 aarch64_override_options_internal (struct gcc_options *opts)
8993 {
8994   aarch64_tune_flags = selected_tune->flags;
8995   aarch64_tune = selected_tune->sched_core;
8996   /* Make a copy of the tuning parameters attached to the core, which
8997      we may later overwrite.  */
8998   aarch64_tune_params = *(selected_tune->tune);
8999   aarch64_architecture_version = selected_arch->architecture_version;
9000
9001   if (opts->x_aarch64_override_tune_string)
9002     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9003                                   &aarch64_tune_params);
9004
9005   /* This target defaults to strict volatile bitfields.  */
9006   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9007     opts->x_flag_strict_volatile_bitfields = 1;
9008
9009   initialize_aarch64_code_model (opts);
9010   initialize_aarch64_tls_size (opts);
9011
9012   int queue_depth = 0;
9013   switch (aarch64_tune_params.autoprefetcher_model)
9014     {
9015       case tune_params::AUTOPREFETCHER_OFF:
9016         queue_depth = -1;
9017         break;
9018       case tune_params::AUTOPREFETCHER_WEAK:
9019         queue_depth = 0;
9020         break;
9021       case tune_params::AUTOPREFETCHER_STRONG:
9022         queue_depth = max_insn_queue_index + 1;
9023         break;
9024       default:
9025         gcc_unreachable ();
9026     }
9027
9028   /* We don't mind passing in global_options_set here as we don't use
9029      the *options_set structs anyway.  */
9030   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9031                          queue_depth,
9032                          opts->x_param_values,
9033                          global_options_set.x_param_values);
9034
9035   /* Set up parameters to be used in prefetching algorithm.  Do not
9036      override the defaults unless we are tuning for a core we have
9037      researched values for.  */
9038   if (aarch64_tune_params.prefetch->num_slots > 0)
9039     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9040                            aarch64_tune_params.prefetch->num_slots,
9041                            opts->x_param_values,
9042                            global_options_set.x_param_values);
9043   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9044     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9045                            aarch64_tune_params.prefetch->l1_cache_size,
9046                            opts->x_param_values,
9047                            global_options_set.x_param_values);
9048   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9049     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9050                            aarch64_tune_params.prefetch->l1_cache_line_size,
9051                            opts->x_param_values,
9052                            global_options_set.x_param_values);
9053   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9054     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9055                            aarch64_tune_params.prefetch->l2_cache_size,
9056                            opts->x_param_values,
9057                            global_options_set.x_param_values);
9058
9059   /* Enable sw prefetching at specified optimization level for
9060      CPUS that have prefetch.  Lower optimization level threshold by 1
9061      when profiling is enabled.  */
9062   if (opts->x_flag_prefetch_loop_arrays < 0
9063       && !opts->x_optimize_size
9064       && aarch64_tune_params.prefetch->default_opt_level >= 0
9065       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9066     opts->x_flag_prefetch_loop_arrays = 1;
9067
9068   aarch64_override_options_after_change_1 (opts);
9069 }
9070
9071 /* Print a hint with a suggestion for a core or architecture name that
9072    most closely resembles what the user passed in STR.  ARCH is true if
9073    the user is asking for an architecture name.  ARCH is false if the user
9074    is asking for a core name.  */
9075
9076 static void
9077 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9078 {
9079   auto_vec<const char *> candidates;
9080   const struct processor *entry = arch ? all_architectures : all_cores;
9081   for (; entry->name != NULL; entry++)
9082     candidates.safe_push (entry->name);
9083   char *s;
9084   const char *hint = candidates_list_and_hint (str, s, candidates);
9085   if (hint)
9086     inform (input_location, "valid arguments are: %s;"
9087                              " did you mean %qs?", s, hint);
9088   XDELETEVEC (s);
9089 }
9090
9091 /* Print a hint with a suggestion for a core name that most closely resembles
9092    what the user passed in STR.  */
9093
9094 inline static void
9095 aarch64_print_hint_for_core (const char *str)
9096 {
9097   aarch64_print_hint_for_core_or_arch (str, false);
9098 }
9099
9100 /* Print a hint with a suggestion for an architecture name that most closely
9101    resembles what the user passed in STR.  */
9102
9103 inline static void
9104 aarch64_print_hint_for_arch (const char *str)
9105 {
9106   aarch64_print_hint_for_core_or_arch (str, true);
9107 }
9108
9109 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
9110    specified in STR and throw errors if appropriate.  Put the results if
9111    they are valid in RES and ISA_FLAGS.  Return whether the option is
9112    valid.  */
9113
9114 static bool
9115 aarch64_validate_mcpu (const char *str, const struct processor **res,
9116                        unsigned long *isa_flags)
9117 {
9118   enum aarch64_parse_opt_result parse_res
9119     = aarch64_parse_cpu (str, res, isa_flags);
9120
9121   if (parse_res == AARCH64_PARSE_OK)
9122     return true;
9123
9124   switch (parse_res)
9125     {
9126       case AARCH64_PARSE_MISSING_ARG:
9127         error ("missing cpu name in %<-mcpu=%s%>", str);
9128         break;
9129       case AARCH64_PARSE_INVALID_ARG:
9130         error ("unknown value %qs for -mcpu", str);
9131         aarch64_print_hint_for_core (str);
9132         break;
9133       case AARCH64_PARSE_INVALID_FEATURE:
9134         error ("invalid feature modifier in %<-mcpu=%s%>", str);
9135         break;
9136       default:
9137         gcc_unreachable ();
9138     }
9139
9140   return false;
9141 }
9142
9143 /* Validate a command-line -march option.  Parse the arch and extensions
9144    (if any) specified in STR and throw errors if appropriate.  Put the
9145    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
9146    option is valid.  */
9147
9148 static bool
9149 aarch64_validate_march (const char *str, const struct processor **res,
9150                          unsigned long *isa_flags)
9151 {
9152   enum aarch64_parse_opt_result parse_res
9153     = aarch64_parse_arch (str, res, isa_flags);
9154
9155   if (parse_res == AARCH64_PARSE_OK)
9156     return true;
9157
9158   switch (parse_res)
9159     {
9160       case AARCH64_PARSE_MISSING_ARG:
9161         error ("missing arch name in %<-march=%s%>", str);
9162         break;
9163       case AARCH64_PARSE_INVALID_ARG:
9164         error ("unknown value %qs for -march", str);
9165         aarch64_print_hint_for_arch (str);
9166         break;
9167       case AARCH64_PARSE_INVALID_FEATURE:
9168         error ("invalid feature modifier in %<-march=%s%>", str);
9169         break;
9170       default:
9171         gcc_unreachable ();
9172     }
9173
9174   return false;
9175 }
9176
9177 /* Validate a command-line -mtune option.  Parse the cpu
9178    specified in STR and throw errors if appropriate.  Put the
9179    result, if it is valid, in RES.  Return whether the option is
9180    valid.  */
9181
9182 static bool
9183 aarch64_validate_mtune (const char *str, const struct processor **res)
9184 {
9185   enum aarch64_parse_opt_result parse_res
9186     = aarch64_parse_tune (str, res);
9187
9188   if (parse_res == AARCH64_PARSE_OK)
9189     return true;
9190
9191   switch (parse_res)
9192     {
9193       case AARCH64_PARSE_MISSING_ARG:
9194         error ("missing cpu name in %<-mtune=%s%>", str);
9195         break;
9196       case AARCH64_PARSE_INVALID_ARG:
9197         error ("unknown value %qs for -mtune", str);
9198         aarch64_print_hint_for_core (str);
9199         break;
9200       default:
9201         gcc_unreachable ();
9202     }
9203   return false;
9204 }
9205
9206 /* Return the CPU corresponding to the enum CPU.
9207    If it doesn't specify a cpu, return the default.  */
9208
9209 static const struct processor *
9210 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9211 {
9212   if (cpu != aarch64_none)
9213     return &all_cores[cpu];
9214
9215   /* The & 0x3f is to extract the bottom 6 bits that encode the
9216      default cpu as selected by the --with-cpu GCC configure option
9217      in config.gcc.
9218      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9219      flags mechanism should be reworked to make it more sane.  */
9220   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9221 }
9222
9223 /* Return the architecture corresponding to the enum ARCH.
9224    If it doesn't specify a valid architecture, return the default.  */
9225
9226 static const struct processor *
9227 aarch64_get_arch (enum aarch64_arch arch)
9228 {
9229   if (arch != aarch64_no_arch)
9230     return &all_architectures[arch];
9231
9232   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9233
9234   return &all_architectures[cpu->arch];
9235 }
9236
9237 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
9238    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9239    tuning structs.  In particular it must set selected_tune and
9240    aarch64_isa_flags that define the available ISA features and tuning
9241    decisions.  It must also set selected_arch as this will be used to
9242    output the .arch asm tags for each function.  */
9243
9244 static void
9245 aarch64_override_options (void)
9246 {
9247   unsigned long cpu_isa = 0;
9248   unsigned long arch_isa = 0;
9249   aarch64_isa_flags = 0;
9250
9251   bool valid_cpu = true;
9252   bool valid_tune = true;
9253   bool valid_arch = true;
9254
9255   selected_cpu = NULL;
9256   selected_arch = NULL;
9257   selected_tune = NULL;
9258
9259   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9260      If either of -march or -mtune is given, they override their
9261      respective component of -mcpu.  */
9262   if (aarch64_cpu_string)
9263     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9264                                         &cpu_isa);
9265
9266   if (aarch64_arch_string)
9267     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9268                                           &arch_isa);
9269
9270   if (aarch64_tune_string)
9271     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9272
9273   /* If the user did not specify a processor, choose the default
9274      one for them.  This will be the CPU set during configuration using
9275      --with-cpu, otherwise it is "generic".  */
9276   if (!selected_cpu)
9277     {
9278       if (selected_arch)
9279         {
9280           selected_cpu = &all_cores[selected_arch->ident];
9281           aarch64_isa_flags = arch_isa;
9282           explicit_arch = selected_arch->arch;
9283         }
9284       else
9285         {
9286           /* Get default configure-time CPU.  */
9287           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9288           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9289         }
9290
9291       if (selected_tune)
9292         explicit_tune_core = selected_tune->ident;
9293     }
9294   /* If both -mcpu and -march are specified check that they are architecturally
9295      compatible, warn if they're not and prefer the -march ISA flags.  */
9296   else if (selected_arch)
9297     {
9298       if (selected_arch->arch != selected_cpu->arch)
9299         {
9300           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9301                        all_architectures[selected_cpu->arch].name,
9302                        selected_arch->name);
9303         }
9304       aarch64_isa_flags = arch_isa;
9305       explicit_arch = selected_arch->arch;
9306       explicit_tune_core = selected_tune ? selected_tune->ident
9307                                           : selected_cpu->ident;
9308     }
9309   else
9310     {
9311       /* -mcpu but no -march.  */
9312       aarch64_isa_flags = cpu_isa;
9313       explicit_tune_core = selected_tune ? selected_tune->ident
9314                                           : selected_cpu->ident;
9315       gcc_assert (selected_cpu);
9316       selected_arch = &all_architectures[selected_cpu->arch];
9317       explicit_arch = selected_arch->arch;
9318     }
9319
9320   /* Set the arch as well as we will need it when outputing
9321      the .arch directive in assembly.  */
9322   if (!selected_arch)
9323     {
9324       gcc_assert (selected_cpu);
9325       selected_arch = &all_architectures[selected_cpu->arch];
9326     }
9327
9328   if (!selected_tune)
9329     selected_tune = selected_cpu;
9330
9331 #ifndef HAVE_AS_MABI_OPTION
9332   /* The compiler may have been configured with 2.23.* binutils, which does
9333      not have support for ILP32.  */
9334   if (TARGET_ILP32)
9335     error ("Assembler does not support -mabi=ilp32");
9336 #endif
9337
9338   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9339     sorry ("Return address signing is only supported for -mabi=lp64");
9340
9341   /* Make sure we properly set up the explicit options.  */
9342   if ((aarch64_cpu_string && valid_cpu)
9343        || (aarch64_tune_string && valid_tune))
9344     gcc_assert (explicit_tune_core != aarch64_none);
9345
9346   if ((aarch64_cpu_string && valid_cpu)
9347        || (aarch64_arch_string && valid_arch))
9348     gcc_assert (explicit_arch != aarch64_no_arch);
9349
9350   aarch64_override_options_internal (&global_options);
9351
9352   /* Save these options as the default ones in case we push and pop them later
9353      while processing functions with potential target attributes.  */
9354   target_option_default_node = target_option_current_node
9355       = build_target_option_node (&global_options);
9356 }
9357
9358 /* Implement targetm.override_options_after_change.  */
9359
9360 static void
9361 aarch64_override_options_after_change (void)
9362 {
9363   aarch64_override_options_after_change_1 (&global_options);
9364 }
9365
9366 static struct machine_function *
9367 aarch64_init_machine_status (void)
9368 {
9369   struct machine_function *machine;
9370   machine = ggc_cleared_alloc<machine_function> ();
9371   return machine;
9372 }
9373
9374 void
9375 aarch64_init_expanders (void)
9376 {
9377   init_machine_status = aarch64_init_machine_status;
9378 }
9379
9380 /* A checking mechanism for the implementation of the various code models.  */
9381 static void
9382 initialize_aarch64_code_model (struct gcc_options *opts)
9383 {
9384    if (opts->x_flag_pic)
9385      {
9386        switch (opts->x_aarch64_cmodel_var)
9387          {
9388          case AARCH64_CMODEL_TINY:
9389            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9390            break;
9391          case AARCH64_CMODEL_SMALL:
9392 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9393            aarch64_cmodel = (flag_pic == 2
9394                              ? AARCH64_CMODEL_SMALL_PIC
9395                              : AARCH64_CMODEL_SMALL_SPIC);
9396 #else
9397            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9398 #endif
9399            break;
9400          case AARCH64_CMODEL_LARGE:
9401            sorry ("code model %qs with -f%s", "large",
9402                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9403            break;
9404          default:
9405            gcc_unreachable ();
9406          }
9407      }
9408    else
9409      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9410 }
9411
9412 /* Implement TARGET_OPTION_SAVE.  */
9413
9414 static void
9415 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9416 {
9417   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9418 }
9419
9420 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9421    using the information saved in PTR.  */
9422
9423 static void
9424 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9425 {
9426   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9427   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9428   opts->x_explicit_arch = ptr->x_explicit_arch;
9429   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9430   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9431
9432   aarch64_override_options_internal (opts);
9433 }
9434
9435 /* Implement TARGET_OPTION_PRINT.  */
9436
9437 static void
9438 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9439 {
9440   const struct processor *cpu
9441     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9442   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9443   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9444   std::string extension
9445     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9446
9447   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9448   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9449            arch->name, extension.c_str ());
9450 }
9451
9452 static GTY(()) tree aarch64_previous_fndecl;
9453
9454 void
9455 aarch64_reset_previous_fndecl (void)
9456 {
9457   aarch64_previous_fndecl = NULL;
9458 }
9459
9460 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9461    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9462    make sure optab availability predicates are recomputed when necessary.  */
9463
9464 void
9465 aarch64_save_restore_target_globals (tree new_tree)
9466 {
9467   if (TREE_TARGET_GLOBALS (new_tree))
9468     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9469   else if (new_tree == target_option_default_node)
9470     restore_target_globals (&default_target_globals);
9471   else
9472     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9473 }
9474
9475 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9476    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9477    of the function, if such exists.  This function may be called multiple
9478    times on a single function so use aarch64_previous_fndecl to avoid
9479    setting up identical state.  */
9480
9481 static void
9482 aarch64_set_current_function (tree fndecl)
9483 {
9484   if (!fndecl || fndecl == aarch64_previous_fndecl)
9485     return;
9486
9487   tree old_tree = (aarch64_previous_fndecl
9488                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9489                    : NULL_TREE);
9490
9491   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9492
9493   /* If current function has no attributes but the previous one did,
9494      use the default node.  */
9495   if (!new_tree && old_tree)
9496     new_tree = target_option_default_node;
9497
9498   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9499      the default have been handled by aarch64_save_restore_target_globals from
9500      aarch64_pragma_target_parse.  */
9501   if (old_tree == new_tree)
9502     return;
9503
9504   aarch64_previous_fndecl = fndecl;
9505
9506   /* First set the target options.  */
9507   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9508
9509   aarch64_save_restore_target_globals (new_tree);
9510 }
9511
9512 /* Enum describing the various ways we can handle attributes.
9513    In many cases we can reuse the generic option handling machinery.  */
9514
9515 enum aarch64_attr_opt_type
9516 {
9517   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
9518   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
9519   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
9520   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
9521 };
9522
9523 /* All the information needed to handle a target attribute.
9524    NAME is the name of the attribute.
9525    ATTR_TYPE specifies the type of behavior of the attribute as described
9526    in the definition of enum aarch64_attr_opt_type.
9527    ALLOW_NEG is true if the attribute supports a "no-" form.
9528    HANDLER is the function that takes the attribute string and whether
9529    it is a pragma or attribute and handles the option.  It is needed only
9530    when the ATTR_TYPE is aarch64_attr_custom.
9531    OPT_NUM is the enum specifying the option that the attribute modifies.
9532    This is needed for attributes that mirror the behavior of a command-line
9533    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9534    aarch64_attr_enum.  */
9535
9536 struct aarch64_attribute_info
9537 {
9538   const char *name;
9539   enum aarch64_attr_opt_type attr_type;
9540   bool allow_neg;
9541   bool (*handler) (const char *, const char *);
9542   enum opt_code opt_num;
9543 };
9544
9545 /* Handle the ARCH_STR argument to the arch= target attribute.
9546    PRAGMA_OR_ATTR is used in potential error messages.  */
9547
9548 static bool
9549 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9550 {
9551   const struct processor *tmp_arch = NULL;
9552   enum aarch64_parse_opt_result parse_res
9553     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9554
9555   if (parse_res == AARCH64_PARSE_OK)
9556     {
9557       gcc_assert (tmp_arch);
9558       selected_arch = tmp_arch;
9559       explicit_arch = selected_arch->arch;
9560       return true;
9561     }
9562
9563   switch (parse_res)
9564     {
9565       case AARCH64_PARSE_MISSING_ARG:
9566         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9567         break;
9568       case AARCH64_PARSE_INVALID_ARG:
9569         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9570         aarch64_print_hint_for_arch (str);
9571         break;
9572       case AARCH64_PARSE_INVALID_FEATURE:
9573         error ("invalid feature modifier %qs for 'arch' target %s",
9574                str, pragma_or_attr);
9575         break;
9576       default:
9577         gcc_unreachable ();
9578     }
9579
9580   return false;
9581 }
9582
9583 /* Handle the argument CPU_STR to the cpu= target attribute.
9584    PRAGMA_OR_ATTR is used in potential error messages.  */
9585
9586 static bool
9587 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9588 {
9589   const struct processor *tmp_cpu = NULL;
9590   enum aarch64_parse_opt_result parse_res
9591     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9592
9593   if (parse_res == AARCH64_PARSE_OK)
9594     {
9595       gcc_assert (tmp_cpu);
9596       selected_tune = tmp_cpu;
9597       explicit_tune_core = selected_tune->ident;
9598
9599       selected_arch = &all_architectures[tmp_cpu->arch];
9600       explicit_arch = selected_arch->arch;
9601       return true;
9602     }
9603
9604   switch (parse_res)
9605     {
9606       case AARCH64_PARSE_MISSING_ARG:
9607         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9608         break;
9609       case AARCH64_PARSE_INVALID_ARG:
9610         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9611         aarch64_print_hint_for_core (str);
9612         break;
9613       case AARCH64_PARSE_INVALID_FEATURE:
9614         error ("invalid feature modifier %qs for 'cpu' target %s",
9615                str, pragma_or_attr);
9616         break;
9617       default:
9618         gcc_unreachable ();
9619     }
9620
9621   return false;
9622 }
9623
9624 /* Handle the argument STR to the tune= target attribute.
9625    PRAGMA_OR_ATTR is used in potential error messages.  */
9626
9627 static bool
9628 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9629 {
9630   const struct processor *tmp_tune = NULL;
9631   enum aarch64_parse_opt_result parse_res
9632     = aarch64_parse_tune (str, &tmp_tune);
9633
9634   if (parse_res == AARCH64_PARSE_OK)
9635     {
9636       gcc_assert (tmp_tune);
9637       selected_tune = tmp_tune;
9638       explicit_tune_core = selected_tune->ident;
9639       return true;
9640     }
9641
9642   switch (parse_res)
9643     {
9644       case AARCH64_PARSE_INVALID_ARG:
9645         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9646         aarch64_print_hint_for_core (str);
9647         break;
9648       default:
9649         gcc_unreachable ();
9650     }
9651
9652   return false;
9653 }
9654
9655 /* Parse an architecture extensions target attribute string specified in STR.
9656    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9657    if successful.  Update aarch64_isa_flags to reflect the ISA features
9658    modified.
9659    PRAGMA_OR_ATTR is used in potential error messages.  */
9660
9661 static bool
9662 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9663 {
9664   enum aarch64_parse_opt_result parse_res;
9665   unsigned long isa_flags = aarch64_isa_flags;
9666
9667   /* We allow "+nothing" in the beginning to clear out all architectural
9668      features if the user wants to handpick specific features.  */
9669   if (strncmp ("+nothing", str, 8) == 0)
9670     {
9671       isa_flags = 0;
9672       str += 8;
9673     }
9674
9675   parse_res = aarch64_parse_extension (str, &isa_flags);
9676
9677   if (parse_res == AARCH64_PARSE_OK)
9678     {
9679       aarch64_isa_flags = isa_flags;
9680       return true;
9681     }
9682
9683   switch (parse_res)
9684     {
9685       case AARCH64_PARSE_MISSING_ARG:
9686         error ("missing feature modifier in target %s %qs",
9687                pragma_or_attr, str);
9688         break;
9689
9690       case AARCH64_PARSE_INVALID_FEATURE:
9691         error ("invalid feature modifier in target %s %qs",
9692                pragma_or_attr, str);
9693         break;
9694
9695       default:
9696         gcc_unreachable ();
9697     }
9698
9699  return false;
9700 }
9701
9702 /* The target attributes that we support.  On top of these we also support just
9703    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9704    handled explicitly in aarch64_process_one_target_attr.  */
9705
9706 static const struct aarch64_attribute_info aarch64_attributes[] =
9707 {
9708   { "general-regs-only", aarch64_attr_mask, false, NULL,
9709      OPT_mgeneral_regs_only },
9710   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9711      OPT_mfix_cortex_a53_835769 },
9712   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9713      OPT_mfix_cortex_a53_843419 },
9714   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9715   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9716   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9717      OPT_momit_leaf_frame_pointer },
9718   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9719   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9720      OPT_march_ },
9721   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9722   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9723      OPT_mtune_ },
9724   { "sign-return-address", aarch64_attr_enum, false, NULL,
9725      OPT_msign_return_address_ },
9726   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9727 };
9728
9729 /* Parse ARG_STR which contains the definition of one target attribute.
9730    Show appropriate errors if any or return true if the attribute is valid.
9731    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9732    we're processing a target attribute or pragma.  */
9733
9734 static bool
9735 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9736 {
9737   bool invert = false;
9738
9739   size_t len = strlen (arg_str);
9740
9741   if (len == 0)
9742     {
9743       error ("malformed target %s", pragma_or_attr);
9744       return false;
9745     }
9746
9747   char *str_to_check = (char *) alloca (len + 1);
9748   strcpy (str_to_check, arg_str);
9749
9750   /* Skip leading whitespace.  */
9751   while (*str_to_check == ' ' || *str_to_check == '\t')
9752     str_to_check++;
9753
9754   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9755      It is easier to detect and handle it explicitly here rather than going
9756      through the machinery for the rest of the target attributes in this
9757      function.  */
9758   if (*str_to_check == '+')
9759     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9760
9761   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9762     {
9763       invert = true;
9764       str_to_check += 3;
9765     }
9766   char *arg = strchr (str_to_check, '=');
9767
9768   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9769      and point ARG to "foo".  */
9770   if (arg)
9771     {
9772       *arg = '\0';
9773       arg++;
9774     }
9775   const struct aarch64_attribute_info *p_attr;
9776   bool found = false;
9777   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9778     {
9779       /* If the names don't match up, or the user has given an argument
9780          to an attribute that doesn't accept one, or didn't give an argument
9781          to an attribute that expects one, fail to match.  */
9782       if (strcmp (str_to_check, p_attr->name) != 0)
9783         continue;
9784
9785       found = true;
9786       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9787                               || p_attr->attr_type == aarch64_attr_enum;
9788
9789       if (attr_need_arg_p ^ (arg != NULL))
9790         {
9791           error ("target %s %qs does not accept an argument",
9792                   pragma_or_attr, str_to_check);
9793           return false;
9794         }
9795
9796       /* If the name matches but the attribute does not allow "no-" versions
9797          then we can't match.  */
9798       if (invert && !p_attr->allow_neg)
9799         {
9800           error ("target %s %qs does not allow a negated form",
9801                   pragma_or_attr, str_to_check);
9802           return false;
9803         }
9804
9805       switch (p_attr->attr_type)
9806         {
9807         /* Has a custom handler registered.
9808            For example, cpu=, arch=, tune=.  */
9809           case aarch64_attr_custom:
9810             gcc_assert (p_attr->handler);
9811             if (!p_attr->handler (arg, pragma_or_attr))
9812               return false;
9813             break;
9814
9815           /* Either set or unset a boolean option.  */
9816           case aarch64_attr_bool:
9817             {
9818               struct cl_decoded_option decoded;
9819
9820               generate_option (p_attr->opt_num, NULL, !invert,
9821                                CL_TARGET, &decoded);
9822               aarch64_handle_option (&global_options, &global_options_set,
9823                                       &decoded, input_location);
9824               break;
9825             }
9826           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9827              should know what mask to apply given the option number.  */
9828           case aarch64_attr_mask:
9829             {
9830               struct cl_decoded_option decoded;
9831               /* We only need to specify the option number.
9832                  aarch64_handle_option will know which mask to apply.  */
9833               decoded.opt_index = p_attr->opt_num;
9834               decoded.value = !invert;
9835               aarch64_handle_option (&global_options, &global_options_set,
9836                                       &decoded, input_location);
9837               break;
9838             }
9839           /* Use the option setting machinery to set an option to an enum.  */
9840           case aarch64_attr_enum:
9841             {
9842               gcc_assert (arg);
9843               bool valid;
9844               int value;
9845               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9846                                               &value, CL_TARGET);
9847               if (valid)
9848                 {
9849                   set_option (&global_options, NULL, p_attr->opt_num, value,
9850                               NULL, DK_UNSPECIFIED, input_location,
9851                               global_dc);
9852                 }
9853               else
9854                 {
9855                   error ("target %s %s=%s is not valid",
9856                          pragma_or_attr, str_to_check, arg);
9857                 }
9858               break;
9859             }
9860           default:
9861             gcc_unreachable ();
9862         }
9863     }
9864
9865   /* If we reached here we either have found an attribute and validated
9866      it or didn't match any.  If we matched an attribute but its arguments
9867      were malformed we will have returned false already.  */
9868   return found;
9869 }
9870
9871 /* Count how many times the character C appears in
9872    NULL-terminated string STR.  */
9873
9874 static unsigned int
9875 num_occurences_in_str (char c, char *str)
9876 {
9877   unsigned int res = 0;
9878   while (*str != '\0')
9879     {
9880       if (*str == c)
9881         res++;
9882
9883       str++;
9884     }
9885
9886   return res;
9887 }
9888
9889 /* Parse the tree in ARGS that contains the target attribute information
9890    and update the global target options space.  PRAGMA_OR_ATTR is a string
9891    to be used in error messages, specifying whether this is processing
9892    a target attribute or a target pragma.  */
9893
9894 bool
9895 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9896 {
9897   if (TREE_CODE (args) == TREE_LIST)
9898     {
9899       do
9900         {
9901           tree head = TREE_VALUE (args);
9902           if (head)
9903             {
9904               if (!aarch64_process_target_attr (head, pragma_or_attr))
9905                 return false;
9906             }
9907           args = TREE_CHAIN (args);
9908         } while (args);
9909
9910       return true;
9911     }
9912
9913   if (TREE_CODE (args) != STRING_CST)
9914     {
9915       error ("attribute %<target%> argument not a string");
9916       return false;
9917     }
9918
9919   size_t len = strlen (TREE_STRING_POINTER (args));
9920   char *str_to_check = (char *) alloca (len + 1);
9921   strcpy (str_to_check, TREE_STRING_POINTER (args));
9922
9923   if (len == 0)
9924     {
9925       error ("malformed target %s value", pragma_or_attr);
9926       return false;
9927     }
9928
9929   /* Used to catch empty spaces between commas i.e.
9930      attribute ((target ("attr1,,attr2"))).  */
9931   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9932
9933   /* Handle multiple target attributes separated by ','.  */
9934   char *token = strtok (str_to_check, ",");
9935
9936   unsigned int num_attrs = 0;
9937   while (token)
9938     {
9939       num_attrs++;
9940       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9941         {
9942           error ("target %s %qs is invalid", pragma_or_attr, token);
9943           return false;
9944         }
9945
9946       token = strtok (NULL, ",");
9947     }
9948
9949   if (num_attrs != num_commas + 1)
9950     {
9951       error ("malformed target %s list %qs",
9952               pragma_or_attr, TREE_STRING_POINTER (args));
9953       return false;
9954     }
9955
9956   return true;
9957 }
9958
9959 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9960    process attribute ((target ("..."))).  */
9961
9962 static bool
9963 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9964 {
9965   struct cl_target_option cur_target;
9966   bool ret;
9967   tree old_optimize;
9968   tree new_target, new_optimize;
9969   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9970
9971   /* If what we're processing is the current pragma string then the
9972      target option node is already stored in target_option_current_node
9973      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
9974      having to re-parse the string.  This is especially useful to keep
9975      arm_neon.h compile times down since that header contains a lot
9976      of intrinsics enclosed in pragmas.  */
9977   if (!existing_target && args == current_target_pragma)
9978     {
9979       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9980       return true;
9981     }
9982   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9983
9984   old_optimize = build_optimization_node (&global_options);
9985   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9986
9987   /* If the function changed the optimization levels as well as setting
9988      target options, start with the optimizations specified.  */
9989   if (func_optimize && func_optimize != old_optimize)
9990     cl_optimization_restore (&global_options,
9991                              TREE_OPTIMIZATION (func_optimize));
9992
9993   /* Save the current target options to restore at the end.  */
9994   cl_target_option_save (&cur_target, &global_options);
9995
9996   /* If fndecl already has some target attributes applied to it, unpack
9997      them so that we add this attribute on top of them, rather than
9998      overwriting them.  */
9999   if (existing_target)
10000     {
10001       struct cl_target_option *existing_options
10002         = TREE_TARGET_OPTION (existing_target);
10003
10004       if (existing_options)
10005         cl_target_option_restore (&global_options, existing_options);
10006     }
10007   else
10008     cl_target_option_restore (&global_options,
10009                         TREE_TARGET_OPTION (target_option_current_node));
10010
10011
10012   ret = aarch64_process_target_attr (args, "attribute");
10013
10014   /* Set up any additional state.  */
10015   if (ret)
10016     {
10017       aarch64_override_options_internal (&global_options);
10018       /* Initialize SIMD builtins if we haven't already.
10019          Set current_target_pragma to NULL for the duration so that
10020          the builtin initialization code doesn't try to tag the functions
10021          being built with the attributes specified by any current pragma, thus
10022          going into an infinite recursion.  */
10023       if (TARGET_SIMD)
10024         {
10025           tree saved_current_target_pragma = current_target_pragma;
10026           current_target_pragma = NULL;
10027           aarch64_init_simd_builtins ();
10028           current_target_pragma = saved_current_target_pragma;
10029         }
10030       new_target = build_target_option_node (&global_options);
10031     }
10032   else
10033     new_target = NULL;
10034
10035   new_optimize = build_optimization_node (&global_options);
10036
10037   if (fndecl && ret)
10038     {
10039       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10040
10041       if (old_optimize != new_optimize)
10042         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10043     }
10044
10045   cl_target_option_restore (&global_options, &cur_target);
10046
10047   if (old_optimize != new_optimize)
10048     cl_optimization_restore (&global_options,
10049                              TREE_OPTIMIZATION (old_optimize));
10050   return ret;
10051 }
10052
10053 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
10054    tri-bool options (yes, no, don't care) and the default value is
10055    DEF, determine whether to reject inlining.  */
10056
10057 static bool
10058 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10059                                      int dont_care, int def)
10060 {
10061   /* If the callee doesn't care, always allow inlining.  */
10062   if (callee == dont_care)
10063     return true;
10064
10065   /* If the caller doesn't care, always allow inlining.  */
10066   if (caller == dont_care)
10067     return true;
10068
10069   /* Otherwise, allow inlining if either the callee and caller values
10070      agree, or if the callee is using the default value.  */
10071   return (callee == caller || callee == def);
10072 }
10073
10074 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
10075    to inline CALLEE into CALLER based on target-specific info.
10076    Make sure that the caller and callee have compatible architectural
10077    features.  Then go through the other possible target attributes
10078    and see if they can block inlining.  Try not to reject always_inline
10079    callees unless they are incompatible architecturally.  */
10080
10081 static bool
10082 aarch64_can_inline_p (tree caller, tree callee)
10083 {
10084   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10085   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10086
10087   /* If callee has no option attributes, then it is ok to inline.  */
10088   if (!callee_tree)
10089     return true;
10090
10091   struct cl_target_option *caller_opts
10092         = TREE_TARGET_OPTION (caller_tree ? caller_tree
10093                                            : target_option_default_node);
10094
10095   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10096
10097
10098   /* Callee's ISA flags should be a subset of the caller's.  */
10099   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10100        != callee_opts->x_aarch64_isa_flags)
10101     return false;
10102
10103   /* Allow non-strict aligned functions inlining into strict
10104      aligned ones.  */
10105   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10106        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10107       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10108            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10109     return false;
10110
10111   bool always_inline = lookup_attribute ("always_inline",
10112                                           DECL_ATTRIBUTES (callee));
10113
10114   /* If the architectural features match up and the callee is always_inline
10115      then the other attributes don't matter.  */
10116   if (always_inline)
10117     return true;
10118
10119   if (caller_opts->x_aarch64_cmodel_var
10120       != callee_opts->x_aarch64_cmodel_var)
10121     return false;
10122
10123   if (caller_opts->x_aarch64_tls_dialect
10124       != callee_opts->x_aarch64_tls_dialect)
10125     return false;
10126
10127   /* Honour explicit requests to workaround errata.  */
10128   if (!aarch64_tribools_ok_for_inlining_p (
10129           caller_opts->x_aarch64_fix_a53_err835769,
10130           callee_opts->x_aarch64_fix_a53_err835769,
10131           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10132     return false;
10133
10134   if (!aarch64_tribools_ok_for_inlining_p (
10135           caller_opts->x_aarch64_fix_a53_err843419,
10136           callee_opts->x_aarch64_fix_a53_err843419,
10137           2, TARGET_FIX_ERR_A53_843419))
10138     return false;
10139
10140   /* If the user explicitly specified -momit-leaf-frame-pointer for the
10141      caller and calle and they don't match up, reject inlining.  */
10142   if (!aarch64_tribools_ok_for_inlining_p (
10143           caller_opts->x_flag_omit_leaf_frame_pointer,
10144           callee_opts->x_flag_omit_leaf_frame_pointer,
10145           2, 1))
10146     return false;
10147
10148   /* If the callee has specific tuning overrides, respect them.  */
10149   if (callee_opts->x_aarch64_override_tune_string != NULL
10150       && caller_opts->x_aarch64_override_tune_string == NULL)
10151     return false;
10152
10153   /* If the user specified tuning override strings for the
10154      caller and callee and they don't match up, reject inlining.
10155      We just do a string compare here, we don't analyze the meaning
10156      of the string, as it would be too costly for little gain.  */
10157   if (callee_opts->x_aarch64_override_tune_string
10158       && caller_opts->x_aarch64_override_tune_string
10159       && (strcmp (callee_opts->x_aarch64_override_tune_string,
10160                   caller_opts->x_aarch64_override_tune_string) != 0))
10161     return false;
10162
10163   return true;
10164 }
10165
10166 /* Return true if SYMBOL_REF X binds locally.  */
10167
10168 static bool
10169 aarch64_symbol_binds_local_p (const_rtx x)
10170 {
10171   return (SYMBOL_REF_DECL (x)
10172           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10173           : SYMBOL_REF_LOCAL_P (x));
10174 }
10175
10176 /* Return true if SYMBOL_REF X is thread local */
10177 static bool
10178 aarch64_tls_symbol_p (rtx x)
10179 {
10180   if (! TARGET_HAVE_TLS)
10181     return false;
10182
10183   if (GET_CODE (x) != SYMBOL_REF)
10184     return false;
10185
10186   return SYMBOL_REF_TLS_MODEL (x) != 0;
10187 }
10188
10189 /* Classify a TLS symbol into one of the TLS kinds.  */
10190 enum aarch64_symbol_type
10191 aarch64_classify_tls_symbol (rtx x)
10192 {
10193   enum tls_model tls_kind = tls_symbolic_operand_type (x);
10194
10195   switch (tls_kind)
10196     {
10197     case TLS_MODEL_GLOBAL_DYNAMIC:
10198     case TLS_MODEL_LOCAL_DYNAMIC:
10199       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10200
10201     case TLS_MODEL_INITIAL_EXEC:
10202       switch (aarch64_cmodel)
10203         {
10204         case AARCH64_CMODEL_TINY:
10205         case AARCH64_CMODEL_TINY_PIC:
10206           return SYMBOL_TINY_TLSIE;
10207         default:
10208           return SYMBOL_SMALL_TLSIE;
10209         }
10210
10211     case TLS_MODEL_LOCAL_EXEC:
10212       if (aarch64_tls_size == 12)
10213         return SYMBOL_TLSLE12;
10214       else if (aarch64_tls_size == 24)
10215         return SYMBOL_TLSLE24;
10216       else if (aarch64_tls_size == 32)
10217         return SYMBOL_TLSLE32;
10218       else if (aarch64_tls_size == 48)
10219         return SYMBOL_TLSLE48;
10220       else
10221         gcc_unreachable ();
10222
10223     case TLS_MODEL_EMULATED:
10224     case TLS_MODEL_NONE:
10225       return SYMBOL_FORCE_TO_MEM;
10226
10227     default:
10228       gcc_unreachable ();
10229     }
10230 }
10231
10232 /* Return the method that should be used to access SYMBOL_REF or
10233    LABEL_REF X.  */
10234
10235 enum aarch64_symbol_type
10236 aarch64_classify_symbol (rtx x, rtx offset)
10237 {
10238   if (GET_CODE (x) == LABEL_REF)
10239     {
10240       switch (aarch64_cmodel)
10241         {
10242         case AARCH64_CMODEL_LARGE:
10243           return SYMBOL_FORCE_TO_MEM;
10244
10245         case AARCH64_CMODEL_TINY_PIC:
10246         case AARCH64_CMODEL_TINY:
10247           return SYMBOL_TINY_ABSOLUTE;
10248
10249         case AARCH64_CMODEL_SMALL_SPIC:
10250         case AARCH64_CMODEL_SMALL_PIC:
10251         case AARCH64_CMODEL_SMALL:
10252           return SYMBOL_SMALL_ABSOLUTE;
10253
10254         default:
10255           gcc_unreachable ();
10256         }
10257     }
10258
10259   if (GET_CODE (x) == SYMBOL_REF)
10260     {
10261       if (aarch64_tls_symbol_p (x))
10262         return aarch64_classify_tls_symbol (x);
10263
10264       switch (aarch64_cmodel)
10265         {
10266         case AARCH64_CMODEL_TINY:
10267           /* When we retrieve symbol + offset address, we have to make sure
10268              the offset does not cause overflow of the final address.  But
10269              we have no way of knowing the address of symbol at compile time
10270              so we can't accurately say if the distance between the PC and
10271              symbol + offset is outside the addressible range of +/-1M in the
10272              TINY code model.  So we rely on images not being greater than
10273              1M and cap the offset at 1M and anything beyond 1M will have to
10274              be loaded using an alternative mechanism.  Furthermore if the
10275              symbol is a weak reference to something that isn't known to
10276              resolve to a symbol in this module, then force to memory.  */
10277           if ((SYMBOL_REF_WEAK (x)
10278                && !aarch64_symbol_binds_local_p (x))
10279               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10280             return SYMBOL_FORCE_TO_MEM;
10281           return SYMBOL_TINY_ABSOLUTE;
10282
10283         case AARCH64_CMODEL_SMALL:
10284           /* Same reasoning as the tiny code model, but the offset cap here is
10285              4G.  */
10286           if ((SYMBOL_REF_WEAK (x)
10287                && !aarch64_symbol_binds_local_p (x))
10288               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10289                             HOST_WIDE_INT_C (4294967264)))
10290             return SYMBOL_FORCE_TO_MEM;
10291           return SYMBOL_SMALL_ABSOLUTE;
10292
10293         case AARCH64_CMODEL_TINY_PIC:
10294           if (!aarch64_symbol_binds_local_p (x))
10295             return SYMBOL_TINY_GOT;
10296           return SYMBOL_TINY_ABSOLUTE;
10297
10298         case AARCH64_CMODEL_SMALL_SPIC:
10299         case AARCH64_CMODEL_SMALL_PIC:
10300           if (!aarch64_symbol_binds_local_p (x))
10301             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10302                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10303           return SYMBOL_SMALL_ABSOLUTE;
10304
10305         case AARCH64_CMODEL_LARGE:
10306           /* This is alright even in PIC code as the constant
10307              pool reference is always PC relative and within
10308              the same translation unit.  */
10309           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10310             return SYMBOL_SMALL_ABSOLUTE;
10311           else
10312             return SYMBOL_FORCE_TO_MEM;
10313
10314         default:
10315           gcc_unreachable ();
10316         }
10317     }
10318
10319   /* By default push everything into the constant pool.  */
10320   return SYMBOL_FORCE_TO_MEM;
10321 }
10322
10323 bool
10324 aarch64_constant_address_p (rtx x)
10325 {
10326   return (CONSTANT_P (x) && memory_address_p (DImode, x));
10327 }
10328
10329 bool
10330 aarch64_legitimate_pic_operand_p (rtx x)
10331 {
10332   if (GET_CODE (x) == SYMBOL_REF
10333       || (GET_CODE (x) == CONST
10334           && GET_CODE (XEXP (x, 0)) == PLUS
10335           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10336      return false;
10337
10338   return true;
10339 }
10340
10341 /* Return true if X holds either a quarter-precision or
10342      floating-point +0.0 constant.  */
10343 static bool
10344 aarch64_valid_floating_const (rtx x)
10345 {
10346   if (!CONST_DOUBLE_P (x))
10347     return false;
10348
10349   /* This call determines which constants can be used in mov<mode>
10350      as integer moves instead of constant loads.  */
10351   if (aarch64_float_const_rtx_p (x))
10352     return true;
10353
10354   return aarch64_float_const_representable_p (x);
10355 }
10356
10357 static bool
10358 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10359 {
10360   /* Do not allow vector struct mode constants.  We could support
10361      0 and -1 easily, but they need support in aarch64-simd.md.  */
10362   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10363     return false;
10364
10365   /* For these cases we never want to use a literal load.
10366      As such we have to prevent the compiler from forcing these
10367      to memory.  */
10368   if ((GET_CODE (x) == CONST_VECTOR
10369        && aarch64_simd_valid_immediate (x, mode, false, NULL))
10370       || CONST_INT_P (x)
10371       || aarch64_valid_floating_const (x)
10372       || aarch64_can_const_movi_rtx_p (x, mode)
10373       || aarch64_float_const_rtx_p (x))
10374         return !targetm.cannot_force_const_mem (mode, x);
10375
10376   if (GET_CODE (x) == HIGH
10377       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10378     return true;
10379
10380   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
10381      so spilling them is better than rematerialization.  */
10382   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10383     return true;
10384
10385   return aarch64_constant_address_p (x);
10386 }
10387
10388 rtx
10389 aarch64_load_tp (rtx target)
10390 {
10391   if (!target
10392       || GET_MODE (target) != Pmode
10393       || !register_operand (target, Pmode))
10394     target = gen_reg_rtx (Pmode);
10395
10396   /* Can return in any reg.  */
10397   emit_insn (gen_aarch64_load_tp_hard (target));
10398   return target;
10399 }
10400
10401 /* On AAPCS systems, this is the "struct __va_list".  */
10402 static GTY(()) tree va_list_type;
10403
10404 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10405    Return the type to use as __builtin_va_list.
10406
10407    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10408
10409    struct __va_list
10410    {
10411      void *__stack;
10412      void *__gr_top;
10413      void *__vr_top;
10414      int   __gr_offs;
10415      int   __vr_offs;
10416    };  */
10417
10418 static tree
10419 aarch64_build_builtin_va_list (void)
10420 {
10421   tree va_list_name;
10422   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10423
10424   /* Create the type.  */
10425   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10426   /* Give it the required name.  */
10427   va_list_name = build_decl (BUILTINS_LOCATION,
10428                              TYPE_DECL,
10429                              get_identifier ("__va_list"),
10430                              va_list_type);
10431   DECL_ARTIFICIAL (va_list_name) = 1;
10432   TYPE_NAME (va_list_type) = va_list_name;
10433   TYPE_STUB_DECL (va_list_type) = va_list_name;
10434
10435   /* Create the fields.  */
10436   f_stack = build_decl (BUILTINS_LOCATION,
10437                         FIELD_DECL, get_identifier ("__stack"),
10438                         ptr_type_node);
10439   f_grtop = build_decl (BUILTINS_LOCATION,
10440                         FIELD_DECL, get_identifier ("__gr_top"),
10441                         ptr_type_node);
10442   f_vrtop = build_decl (BUILTINS_LOCATION,
10443                         FIELD_DECL, get_identifier ("__vr_top"),
10444                         ptr_type_node);
10445   f_groff = build_decl (BUILTINS_LOCATION,
10446                         FIELD_DECL, get_identifier ("__gr_offs"),
10447                         integer_type_node);
10448   f_vroff = build_decl (BUILTINS_LOCATION,
10449                         FIELD_DECL, get_identifier ("__vr_offs"),
10450                         integer_type_node);
10451
10452   /* Tell tree-stdarg pass about our internal offset fields.
10453      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10454      purpose to identify whether the code is updating va_list internal
10455      offset fields through irregular way.  */
10456   va_list_gpr_counter_field = f_groff;
10457   va_list_fpr_counter_field = f_vroff;
10458
10459   DECL_ARTIFICIAL (f_stack) = 1;
10460   DECL_ARTIFICIAL (f_grtop) = 1;
10461   DECL_ARTIFICIAL (f_vrtop) = 1;
10462   DECL_ARTIFICIAL (f_groff) = 1;
10463   DECL_ARTIFICIAL (f_vroff) = 1;
10464
10465   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10466   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10467   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10468   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10469   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10470
10471   TYPE_FIELDS (va_list_type) = f_stack;
10472   DECL_CHAIN (f_stack) = f_grtop;
10473   DECL_CHAIN (f_grtop) = f_vrtop;
10474   DECL_CHAIN (f_vrtop) = f_groff;
10475   DECL_CHAIN (f_groff) = f_vroff;
10476
10477   /* Compute its layout.  */
10478   layout_type (va_list_type);
10479
10480   return va_list_type;
10481 }
10482
10483 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10484 static void
10485 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10486 {
10487   const CUMULATIVE_ARGS *cum;
10488   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10489   tree stack, grtop, vrtop, groff, vroff;
10490   tree t;
10491   int gr_save_area_size = cfun->va_list_gpr_size;
10492   int vr_save_area_size = cfun->va_list_fpr_size;
10493   int vr_offset;
10494
10495   cum = &crtl->args.info;
10496   if (cfun->va_list_gpr_size)
10497     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10498                              cfun->va_list_gpr_size);
10499   if (cfun->va_list_fpr_size)
10500     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10501                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10502
10503   if (!TARGET_FLOAT)
10504     {
10505       gcc_assert (cum->aapcs_nvrn == 0);
10506       vr_save_area_size = 0;
10507     }
10508
10509   f_stack = TYPE_FIELDS (va_list_type_node);
10510   f_grtop = DECL_CHAIN (f_stack);
10511   f_vrtop = DECL_CHAIN (f_grtop);
10512   f_groff = DECL_CHAIN (f_vrtop);
10513   f_vroff = DECL_CHAIN (f_groff);
10514
10515   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10516                   NULL_TREE);
10517   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10518                   NULL_TREE);
10519   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10520                   NULL_TREE);
10521   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10522                   NULL_TREE);
10523   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10524                   NULL_TREE);
10525
10526   /* Emit code to initialize STACK, which points to the next varargs stack
10527      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10528      by named arguments.  STACK is 8-byte aligned.  */
10529   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10530   if (cum->aapcs_stack_size > 0)
10531     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10532   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10533   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10534
10535   /* Emit code to initialize GRTOP, the top of the GR save area.
10536      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10537   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10538   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10539   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10540
10541   /* Emit code to initialize VRTOP, the top of the VR save area.
10542      This address is gr_save_area_bytes below GRTOP, rounded
10543      down to the next 16-byte boundary.  */
10544   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10545   vr_offset = ROUND_UP (gr_save_area_size,
10546                         STACK_BOUNDARY / BITS_PER_UNIT);
10547
10548   if (vr_offset)
10549     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10550   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10551   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10552
10553   /* Emit code to initialize GROFF, the offset from GRTOP of the
10554      next GPR argument.  */
10555   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10556               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10557   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10558
10559   /* Likewise emit code to initialize VROFF, the offset from FTOP
10560      of the next VR argument.  */
10561   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10562               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10563   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10564 }
10565
10566 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10567
10568 static tree
10569 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10570                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10571 {
10572   tree addr;
10573   bool indirect_p;
10574   bool is_ha;           /* is HFA or HVA.  */
10575   bool dw_align;        /* double-word align.  */
10576   machine_mode ag_mode = VOIDmode;
10577   int nregs;
10578   machine_mode mode;
10579
10580   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10581   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10582   HOST_WIDE_INT size, rsize, adjust, align;
10583   tree t, u, cond1, cond2;
10584
10585   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10586   if (indirect_p)
10587     type = build_pointer_type (type);
10588
10589   mode = TYPE_MODE (type);
10590
10591   f_stack = TYPE_FIELDS (va_list_type_node);
10592   f_grtop = DECL_CHAIN (f_stack);
10593   f_vrtop = DECL_CHAIN (f_grtop);
10594   f_groff = DECL_CHAIN (f_vrtop);
10595   f_vroff = DECL_CHAIN (f_groff);
10596
10597   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10598                   f_stack, NULL_TREE);
10599   size = int_size_in_bytes (type);
10600   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10601
10602   dw_align = false;
10603   adjust = 0;
10604   if (aarch64_vfp_is_call_or_return_candidate (mode,
10605                                                type,
10606                                                &ag_mode,
10607                                                &nregs,
10608                                                &is_ha))
10609     {
10610       /* TYPE passed in fp/simd registers.  */
10611       if (!TARGET_FLOAT)
10612         aarch64_err_no_fpadvsimd (mode, "varargs");
10613
10614       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10615                       unshare_expr (valist), f_vrtop, NULL_TREE);
10616       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10617                       unshare_expr (valist), f_vroff, NULL_TREE);
10618
10619       rsize = nregs * UNITS_PER_VREG;
10620
10621       if (is_ha)
10622         {
10623           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10624             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10625         }
10626       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10627                && size < UNITS_PER_VREG)
10628         {
10629           adjust = UNITS_PER_VREG - size;
10630         }
10631     }
10632   else
10633     {
10634       /* TYPE passed in general registers.  */
10635       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10636                       unshare_expr (valist), f_grtop, NULL_TREE);
10637       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10638                       unshare_expr (valist), f_groff, NULL_TREE);
10639       rsize = ROUND_UP (size, UNITS_PER_WORD);
10640       nregs = rsize / UNITS_PER_WORD;
10641
10642       if (align > 8)
10643         dw_align = true;
10644
10645       if (BLOCK_REG_PADDING (mode, type, 1) == downward
10646           && size < UNITS_PER_WORD)
10647         {
10648           adjust = UNITS_PER_WORD  - size;
10649         }
10650     }
10651
10652   /* Get a local temporary for the field value.  */
10653   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10654
10655   /* Emit code to branch if off >= 0.  */
10656   t = build2 (GE_EXPR, boolean_type_node, off,
10657               build_int_cst (TREE_TYPE (off), 0));
10658   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10659
10660   if (dw_align)
10661     {
10662       /* Emit: offs = (offs + 15) & -16.  */
10663       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10664                   build_int_cst (TREE_TYPE (off), 15));
10665       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10666                   build_int_cst (TREE_TYPE (off), -16));
10667       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10668     }
10669   else
10670     roundup = NULL;
10671
10672   /* Update ap.__[g|v]r_offs  */
10673   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10674               build_int_cst (TREE_TYPE (off), rsize));
10675   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10676
10677   /* String up.  */
10678   if (roundup)
10679     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10680
10681   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10682   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10683               build_int_cst (TREE_TYPE (f_off), 0));
10684   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10685
10686   /* String up: make sure the assignment happens before the use.  */
10687   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10688   COND_EXPR_ELSE (cond1) = t;
10689
10690   /* Prepare the trees handling the argument that is passed on the stack;
10691      the top level node will store in ON_STACK.  */
10692   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10693   if (align > 8)
10694     {
10695       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10696       t = fold_convert (intDI_type_node, arg);
10697       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10698                   build_int_cst (TREE_TYPE (t), 15));
10699       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10700                   build_int_cst (TREE_TYPE (t), -16));
10701       t = fold_convert (TREE_TYPE (arg), t);
10702       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10703     }
10704   else
10705     roundup = NULL;
10706   /* Advance ap.__stack  */
10707   t = fold_convert (intDI_type_node, arg);
10708   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10709               build_int_cst (TREE_TYPE (t), size + 7));
10710   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10711               build_int_cst (TREE_TYPE (t), -8));
10712   t = fold_convert (TREE_TYPE (arg), t);
10713   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10714   /* String up roundup and advance.  */
10715   if (roundup)
10716     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10717   /* String up with arg */
10718   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10719   /* Big-endianness related address adjustment.  */
10720   if (BLOCK_REG_PADDING (mode, type, 1) == downward
10721       && size < UNITS_PER_WORD)
10722   {
10723     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10724                 size_int (UNITS_PER_WORD - size));
10725     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10726   }
10727
10728   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10729   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10730
10731   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10732   t = off;
10733   if (adjust)
10734     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10735                 build_int_cst (TREE_TYPE (off), adjust));
10736
10737   t = fold_convert (sizetype, t);
10738   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10739
10740   if (is_ha)
10741     {
10742       /* type ha; // treat as "struct {ftype field[n];}"
10743          ... [computing offs]
10744          for (i = 0; i <nregs; ++i, offs += 16)
10745            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10746          return ha;  */
10747       int i;
10748       tree tmp_ha, field_t, field_ptr_t;
10749
10750       /* Declare a local variable.  */
10751       tmp_ha = create_tmp_var_raw (type, "ha");
10752       gimple_add_tmp_var (tmp_ha);
10753
10754       /* Establish the base type.  */
10755       switch (ag_mode)
10756         {
10757         case E_SFmode:
10758           field_t = float_type_node;
10759           field_ptr_t = float_ptr_type_node;
10760           break;
10761         case E_DFmode:
10762           field_t = double_type_node;
10763           field_ptr_t = double_ptr_type_node;
10764           break;
10765         case E_TFmode:
10766           field_t = long_double_type_node;
10767           field_ptr_t = long_double_ptr_type_node;
10768           break;
10769         case E_HFmode:
10770           field_t = aarch64_fp16_type_node;
10771           field_ptr_t = aarch64_fp16_ptr_type_node;
10772           break;
10773         case E_V2SImode:
10774         case E_V4SImode:
10775             {
10776               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10777               field_t = build_vector_type_for_mode (innertype, ag_mode);
10778               field_ptr_t = build_pointer_type (field_t);
10779             }
10780           break;
10781         default:
10782           gcc_assert (0);
10783         }
10784
10785       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10786       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10787       addr = t;
10788       t = fold_convert (field_ptr_t, addr);
10789       t = build2 (MODIFY_EXPR, field_t,
10790                   build1 (INDIRECT_REF, field_t, tmp_ha),
10791                   build1 (INDIRECT_REF, field_t, t));
10792
10793       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10794       for (i = 1; i < nregs; ++i)
10795         {
10796           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10797           u = fold_convert (field_ptr_t, addr);
10798           u = build2 (MODIFY_EXPR, field_t,
10799                       build2 (MEM_REF, field_t, tmp_ha,
10800                               build_int_cst (field_ptr_t,
10801                                              (i *
10802                                               int_size_in_bytes (field_t)))),
10803                       build1 (INDIRECT_REF, field_t, u));
10804           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10805         }
10806
10807       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10808       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10809     }
10810
10811   COND_EXPR_ELSE (cond2) = t;
10812   addr = fold_convert (build_pointer_type (type), cond1);
10813   addr = build_va_arg_indirect_ref (addr);
10814
10815   if (indirect_p)
10816     addr = build_va_arg_indirect_ref (addr);
10817
10818   return addr;
10819 }
10820
10821 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10822
10823 static void
10824 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10825                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10826                                 int no_rtl)
10827 {
10828   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10829   CUMULATIVE_ARGS local_cum;
10830   int gr_saved = cfun->va_list_gpr_size;
10831   int vr_saved = cfun->va_list_fpr_size;
10832
10833   /* The caller has advanced CUM up to, but not beyond, the last named
10834      argument.  Advance a local copy of CUM past the last "real" named
10835      argument, to find out how many registers are left over.  */
10836   local_cum = *cum;
10837   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10838
10839   /* Found out how many registers we need to save.
10840      Honor tree-stdvar analysis results.  */
10841   if (cfun->va_list_gpr_size)
10842     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10843                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10844   if (cfun->va_list_fpr_size)
10845     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10846                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10847
10848   if (!TARGET_FLOAT)
10849     {
10850       gcc_assert (local_cum.aapcs_nvrn == 0);
10851       vr_saved = 0;
10852     }
10853
10854   if (!no_rtl)
10855     {
10856       if (gr_saved > 0)
10857         {
10858           rtx ptr, mem;
10859
10860           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10861           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10862                                - gr_saved * UNITS_PER_WORD);
10863           mem = gen_frame_mem (BLKmode, ptr);
10864           set_mem_alias_set (mem, get_varargs_alias_set ());
10865
10866           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10867                                mem, gr_saved);
10868         }
10869       if (vr_saved > 0)
10870         {
10871           /* We can't use move_block_from_reg, because it will use
10872              the wrong mode, storing D regs only.  */
10873           machine_mode mode = TImode;
10874           int off, i, vr_start;
10875
10876           /* Set OFF to the offset from virtual_incoming_args_rtx of
10877              the first vector register.  The VR save area lies below
10878              the GR one, and is aligned to 16 bytes.  */
10879           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10880                            STACK_BOUNDARY / BITS_PER_UNIT);
10881           off -= vr_saved * UNITS_PER_VREG;
10882
10883           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10884           for (i = 0; i < vr_saved; ++i)
10885             {
10886               rtx ptr, mem;
10887
10888               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10889               mem = gen_frame_mem (mode, ptr);
10890               set_mem_alias_set (mem, get_varargs_alias_set ());
10891               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10892               off += UNITS_PER_VREG;
10893             }
10894         }
10895     }
10896
10897   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10898      any complication of having crtl->args.pretend_args_size changed.  */
10899   cfun->machine->frame.saved_varargs_size
10900     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10901                  STACK_BOUNDARY / BITS_PER_UNIT)
10902        + vr_saved * UNITS_PER_VREG);
10903 }
10904
10905 static void
10906 aarch64_conditional_register_usage (void)
10907 {
10908   int i;
10909   if (!TARGET_FLOAT)
10910     {
10911       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10912         {
10913           fixed_regs[i] = 1;
10914           call_used_regs[i] = 1;
10915         }
10916     }
10917 }
10918
10919 /* Walk down the type tree of TYPE counting consecutive base elements.
10920    If *MODEP is VOIDmode, then set it to the first valid floating point
10921    type.  If a non-floating point type is found, or if a floating point
10922    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10923    otherwise return the count in the sub-tree.  */
10924 static int
10925 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10926 {
10927   machine_mode mode;
10928   HOST_WIDE_INT size;
10929
10930   switch (TREE_CODE (type))
10931     {
10932     case REAL_TYPE:
10933       mode = TYPE_MODE (type);
10934       if (mode != DFmode && mode != SFmode
10935           && mode != TFmode && mode != HFmode)
10936         return -1;
10937
10938       if (*modep == VOIDmode)
10939         *modep = mode;
10940
10941       if (*modep == mode)
10942         return 1;
10943
10944       break;
10945
10946     case COMPLEX_TYPE:
10947       mode = TYPE_MODE (TREE_TYPE (type));
10948       if (mode != DFmode && mode != SFmode
10949           && mode != TFmode && mode != HFmode)
10950         return -1;
10951
10952       if (*modep == VOIDmode)
10953         *modep = mode;
10954
10955       if (*modep == mode)
10956         return 2;
10957
10958       break;
10959
10960     case VECTOR_TYPE:
10961       /* Use V2SImode and V4SImode as representatives of all 64-bit
10962          and 128-bit vector types.  */
10963       size = int_size_in_bytes (type);
10964       switch (size)
10965         {
10966         case 8:
10967           mode = V2SImode;
10968           break;
10969         case 16:
10970           mode = V4SImode;
10971           break;
10972         default:
10973           return -1;
10974         }
10975
10976       if (*modep == VOIDmode)
10977         *modep = mode;
10978
10979       /* Vector modes are considered to be opaque: two vectors are
10980          equivalent for the purposes of being homogeneous aggregates
10981          if they are the same size.  */
10982       if (*modep == mode)
10983         return 1;
10984
10985       break;
10986
10987     case ARRAY_TYPE:
10988       {
10989         int count;
10990         tree index = TYPE_DOMAIN (type);
10991
10992         /* Can't handle incomplete types nor sizes that are not
10993            fixed.  */
10994         if (!COMPLETE_TYPE_P (type)
10995             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10996           return -1;
10997
10998         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10999         if (count == -1
11000             || !index
11001             || !TYPE_MAX_VALUE (index)
11002             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11003             || !TYPE_MIN_VALUE (index)
11004             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11005             || count < 0)
11006           return -1;
11007
11008         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11009                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11010
11011         /* There must be no padding.  */
11012         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11013           return -1;
11014
11015         return count;
11016       }
11017
11018     case RECORD_TYPE:
11019       {
11020         int count = 0;
11021         int sub_count;
11022         tree field;
11023
11024         /* Can't handle incomplete types nor sizes that are not
11025            fixed.  */
11026         if (!COMPLETE_TYPE_P (type)
11027             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11028           return -1;
11029
11030         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11031           {
11032             if (TREE_CODE (field) != FIELD_DECL)
11033               continue;
11034
11035             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11036             if (sub_count < 0)
11037               return -1;
11038             count += sub_count;
11039           }
11040
11041         /* There must be no padding.  */
11042         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11043           return -1;
11044
11045         return count;
11046       }
11047
11048     case UNION_TYPE:
11049     case QUAL_UNION_TYPE:
11050       {
11051         /* These aren't very interesting except in a degenerate case.  */
11052         int count = 0;
11053         int sub_count;
11054         tree field;
11055
11056         /* Can't handle incomplete types nor sizes that are not
11057            fixed.  */
11058         if (!COMPLETE_TYPE_P (type)
11059             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11060           return -1;
11061
11062         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11063           {
11064             if (TREE_CODE (field) != FIELD_DECL)
11065               continue;
11066
11067             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11068             if (sub_count < 0)
11069               return -1;
11070             count = count > sub_count ? count : sub_count;
11071           }
11072
11073         /* There must be no padding.  */
11074         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11075           return -1;
11076
11077         return count;
11078       }
11079
11080     default:
11081       break;
11082     }
11083
11084   return -1;
11085 }
11086
11087 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11088    type as described in AAPCS64 \S 4.1.2.
11089
11090    See the comment above aarch64_composite_type_p for the notes on MODE.  */
11091
11092 static bool
11093 aarch64_short_vector_p (const_tree type,
11094                         machine_mode mode)
11095 {
11096   HOST_WIDE_INT size = -1;
11097
11098   if (type && TREE_CODE (type) == VECTOR_TYPE)
11099     size = int_size_in_bytes (type);
11100   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11101             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11102     size = GET_MODE_SIZE (mode);
11103
11104   return (size == 8 || size == 16);
11105 }
11106
11107 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11108    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
11109    array types.  The C99 floating-point complex types are also considered
11110    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
11111    types, which are GCC extensions and out of the scope of AAPCS64, are
11112    treated as composite types here as well.
11113
11114    Note that MODE itself is not sufficient in determining whether a type
11115    is such a composite type or not.  This is because
11116    stor-layout.c:compute_record_mode may have already changed the MODE
11117    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
11118    structure with only one field may have its MODE set to the mode of the
11119    field.  Also an integer mode whose size matches the size of the
11120    RECORD_TYPE type may be used to substitute the original mode
11121    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
11122    solely relied on.  */
11123
11124 static bool
11125 aarch64_composite_type_p (const_tree type,
11126                           machine_mode mode)
11127 {
11128   if (aarch64_short_vector_p (type, mode))
11129     return false;
11130
11131   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11132     return true;
11133
11134   if (mode == BLKmode
11135       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11136       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11137     return true;
11138
11139   return false;
11140 }
11141
11142 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11143    shall be passed or returned in simd/fp register(s) (providing these
11144    parameter passing registers are available).
11145
11146    Upon successful return, *COUNT returns the number of needed registers,
11147    *BASE_MODE returns the mode of the individual register and when IS_HAF
11148    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11149    floating-point aggregate or a homogeneous short-vector aggregate.  */
11150
11151 static bool
11152 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11153                                          const_tree type,
11154                                          machine_mode *base_mode,
11155                                          int *count,
11156                                          bool *is_ha)
11157 {
11158   machine_mode new_mode = VOIDmode;
11159   bool composite_p = aarch64_composite_type_p (type, mode);
11160
11161   if (is_ha != NULL) *is_ha = false;
11162
11163   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11164       || aarch64_short_vector_p (type, mode))
11165     {
11166       *count = 1;
11167       new_mode = mode;
11168     }
11169   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11170     {
11171       if (is_ha != NULL) *is_ha = true;
11172       *count = 2;
11173       new_mode = GET_MODE_INNER (mode);
11174     }
11175   else if (type && composite_p)
11176     {
11177       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11178
11179       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11180         {
11181           if (is_ha != NULL) *is_ha = true;
11182           *count = ag_count;
11183         }
11184       else
11185         return false;
11186     }
11187   else
11188     return false;
11189
11190   *base_mode = new_mode;
11191   return true;
11192 }
11193
11194 /* Implement TARGET_STRUCT_VALUE_RTX.  */
11195
11196 static rtx
11197 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11198                           int incoming ATTRIBUTE_UNUSED)
11199 {
11200   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11201 }
11202
11203 /* Implements target hook vector_mode_supported_p.  */
11204 static bool
11205 aarch64_vector_mode_supported_p (machine_mode mode)
11206 {
11207   if (TARGET_SIMD
11208       && (mode == V4SImode  || mode == V8HImode
11209           || mode == V16QImode || mode == V2DImode
11210           || mode == V2SImode  || mode == V4HImode
11211           || mode == V8QImode || mode == V2SFmode
11212           || mode == V4SFmode || mode == V2DFmode
11213           || mode == V4HFmode || mode == V8HFmode
11214           || mode == V1DFmode))
11215     return true;
11216
11217   return false;
11218 }
11219
11220 /* Return appropriate SIMD container
11221    for MODE within a vector of WIDTH bits.  */
11222 static machine_mode
11223 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11224 {
11225   gcc_assert (width == 64 || width == 128);
11226   if (TARGET_SIMD)
11227     {
11228       if (width == 128)
11229         switch (mode)
11230           {
11231           case E_DFmode:
11232             return V2DFmode;
11233           case E_SFmode:
11234             return V4SFmode;
11235           case E_HFmode:
11236             return V8HFmode;
11237           case E_SImode:
11238             return V4SImode;
11239           case E_HImode:
11240             return V8HImode;
11241           case E_QImode:
11242             return V16QImode;
11243           case E_DImode:
11244             return V2DImode;
11245           default:
11246             break;
11247           }
11248       else
11249         switch (mode)
11250           {
11251           case E_SFmode:
11252             return V2SFmode;
11253           case E_HFmode:
11254             return V4HFmode;
11255           case E_SImode:
11256             return V2SImode;
11257           case E_HImode:
11258             return V4HImode;
11259           case E_QImode:
11260             return V8QImode;
11261           default:
11262             break;
11263           }
11264     }
11265   return word_mode;
11266 }
11267
11268 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
11269 static machine_mode
11270 aarch64_preferred_simd_mode (scalar_mode mode)
11271 {
11272   return aarch64_simd_container_mode (mode, 128);
11273 }
11274
11275 /* Return the bitmask of possible vector sizes for the vectorizer
11276    to iterate over.  */
11277 static unsigned int
11278 aarch64_autovectorize_vector_sizes (void)
11279 {
11280   return (16 | 8);
11281 }
11282
11283 /* Implement TARGET_MANGLE_TYPE.  */
11284
11285 static const char *
11286 aarch64_mangle_type (const_tree type)
11287 {
11288   /* The AArch64 ABI documents say that "__va_list" has to be
11289      managled as if it is in the "std" namespace.  */
11290   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11291     return "St9__va_list";
11292
11293   /* Half-precision float.  */
11294   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11295     return "Dh";
11296
11297   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
11298      builtin types.  */
11299   if (TYPE_NAME (type) != NULL)
11300     return aarch64_mangle_builtin_type (type);
11301
11302   /* Use the default mangling.  */
11303   return NULL;
11304 }
11305
11306 /* Find the first rtx_insn before insn that will generate an assembly
11307    instruction.  */
11308
11309 static rtx_insn *
11310 aarch64_prev_real_insn (rtx_insn *insn)
11311 {
11312   if (!insn)
11313     return NULL;
11314
11315   do
11316     {
11317       insn = prev_real_insn (insn);
11318     }
11319   while (insn && recog_memoized (insn) < 0);
11320
11321   return insn;
11322 }
11323
11324 static bool
11325 is_madd_op (enum attr_type t1)
11326 {
11327   unsigned int i;
11328   /* A number of these may be AArch32 only.  */
11329   enum attr_type mlatypes[] = {
11330     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11331     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11332     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11333   };
11334
11335   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11336     {
11337       if (t1 == mlatypes[i])
11338         return true;
11339     }
11340
11341   return false;
11342 }
11343
11344 /* Check if there is a register dependency between a load and the insn
11345    for which we hold recog_data.  */
11346
11347 static bool
11348 dep_between_memop_and_curr (rtx memop)
11349 {
11350   rtx load_reg;
11351   int opno;
11352
11353   gcc_assert (GET_CODE (memop) == SET);
11354
11355   if (!REG_P (SET_DEST (memop)))
11356     return false;
11357
11358   load_reg = SET_DEST (memop);
11359   for (opno = 1; opno < recog_data.n_operands; opno++)
11360     {
11361       rtx operand = recog_data.operand[opno];
11362       if (REG_P (operand)
11363           && reg_overlap_mentioned_p (load_reg, operand))
11364         return true;
11365
11366     }
11367   return false;
11368 }
11369
11370
11371 /* When working around the Cortex-A53 erratum 835769,
11372    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11373    instruction and has a preceding memory instruction such that a NOP
11374    should be inserted between them.  */
11375
11376 bool
11377 aarch64_madd_needs_nop (rtx_insn* insn)
11378 {
11379   enum attr_type attr_type;
11380   rtx_insn *prev;
11381   rtx body;
11382
11383   if (!TARGET_FIX_ERR_A53_835769)
11384     return false;
11385
11386   if (!INSN_P (insn) || recog_memoized (insn) < 0)
11387     return false;
11388
11389   attr_type = get_attr_type (insn);
11390   if (!is_madd_op (attr_type))
11391     return false;
11392
11393   prev = aarch64_prev_real_insn (insn);
11394   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11395      Restore recog state to INSN to avoid state corruption.  */
11396   extract_constrain_insn_cached (insn);
11397
11398   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11399     return false;
11400
11401   body = single_set (prev);
11402
11403   /* If the previous insn is a memory op and there is no dependency between
11404      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11405      have a complex memory operation, probably a load/store pair.
11406      Be conservative for now and emit a NOP.  */
11407   if (GET_MODE (recog_data.operand[0]) == DImode
11408       && (!body || !dep_between_memop_and_curr (body)))
11409     return true;
11410
11411   return false;
11412
11413 }
11414
11415
11416 /* Implement FINAL_PRESCAN_INSN.  */
11417
11418 void
11419 aarch64_final_prescan_insn (rtx_insn *insn)
11420 {
11421   if (aarch64_madd_needs_nop (insn))
11422     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11423 }
11424
11425
11426 /* Return the equivalent letter for size.  */
11427 static char
11428 sizetochar (int size)
11429 {
11430   switch (size)
11431     {
11432     case 64: return 'd';
11433     case 32: return 's';
11434     case 16: return 'h';
11435     case 8 : return 'b';
11436     default: gcc_unreachable ();
11437     }
11438 }
11439
11440 /* Return true iff x is a uniform vector of floating-point
11441    constants, and the constant can be represented in
11442    quarter-precision form.  Note, as aarch64_float_const_representable
11443    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11444 static bool
11445 aarch64_vect_float_const_representable_p (rtx x)
11446 {
11447   rtx elt;
11448   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11449           && const_vec_duplicate_p (x, &elt)
11450           && aarch64_float_const_representable_p (elt));
11451 }
11452
11453 /* Return true for valid and false for invalid.  */
11454 bool
11455 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11456                               struct simd_immediate_info *info)
11457 {
11458 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11459   matches = 1;                                          \
11460   for (i = 0; i < idx; i += (STRIDE))                   \
11461     if (!(TEST))                                        \
11462       matches = 0;                                      \
11463   if (matches)                                          \
11464     {                                                   \
11465       immtype = (CLASS);                                \
11466       elsize = (ELSIZE);                                \
11467       eshift = (SHIFT);                                 \
11468       emvn = (NEG);                                     \
11469       break;                                            \
11470     }
11471
11472   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11473   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11474   unsigned char bytes[16];
11475   int immtype = -1, matches;
11476   unsigned int invmask = inverse ? 0xff : 0;
11477   int eshift, emvn;
11478
11479   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11480     {
11481       if (! (aarch64_simd_imm_zero_p (op, mode)
11482              || aarch64_vect_float_const_representable_p (op)))
11483         return false;
11484
11485       if (info)
11486         {
11487           rtx elt = CONST_VECTOR_ELT (op, 0);
11488           scalar_float_mode elt_mode
11489             = as_a <scalar_float_mode> (GET_MODE (elt));
11490
11491           info->value = elt;
11492           info->element_width = GET_MODE_BITSIZE (elt_mode);
11493           info->mvn = false;
11494           info->shift = 0;
11495         }
11496
11497       return true;
11498     }
11499
11500   /* Splat vector constant out into a byte vector.  */
11501   for (i = 0; i < n_elts; i++)
11502     {
11503       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11504          it must be laid out in the vector register in reverse order.  */
11505       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11506       unsigned HOST_WIDE_INT elpart;
11507
11508       gcc_assert (CONST_INT_P (el));
11509       elpart = INTVAL (el);
11510
11511       for (unsigned int byte = 0; byte < innersize; byte++)
11512         {
11513           bytes[idx++] = (elpart & 0xff) ^ invmask;
11514           elpart >>= BITS_PER_UNIT;
11515         }
11516
11517     }
11518
11519   /* Sanity check.  */
11520   gcc_assert (idx == GET_MODE_SIZE (mode));
11521
11522   do
11523     {
11524       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11525              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11526
11527       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11528              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11529
11530       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11531              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11532
11533       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11534              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11535
11536       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11537
11538       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11539
11540       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11541              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11542
11543       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11544              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11545
11546       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11547              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11548
11549       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11550              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11551
11552       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11553
11554       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11555
11556       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11557              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11558
11559       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11560              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11561
11562       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11563              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11564
11565       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11566              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11567
11568       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11569
11570       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11571              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11572     }
11573   while (0);
11574
11575   if (immtype == -1)
11576     return false;
11577
11578   if (info)
11579     {
11580       info->element_width = elsize;
11581       info->mvn = emvn != 0;
11582       info->shift = eshift;
11583
11584       unsigned HOST_WIDE_INT imm = 0;
11585
11586       if (immtype >= 12 && immtype <= 15)
11587         info->msl = true;
11588
11589       /* Un-invert bytes of recognized vector, if necessary.  */
11590       if (invmask != 0)
11591         for (i = 0; i < idx; i++)
11592           bytes[i] ^= invmask;
11593
11594       if (immtype == 17)
11595         {
11596           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11597           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11598
11599           for (i = 0; i < 8; i++)
11600             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11601               << (i * BITS_PER_UNIT);
11602
11603
11604           info->value = GEN_INT (imm);
11605         }
11606       else
11607         {
11608           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11609             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11610
11611           /* Construct 'abcdefgh' because the assembler cannot handle
11612              generic constants.  */
11613           if (info->mvn)
11614             imm = ~imm;
11615           imm = (imm >> info->shift) & 0xff;
11616           info->value = GEN_INT (imm);
11617         }
11618     }
11619
11620   return true;
11621 #undef CHECK
11622 }
11623
11624 /* Check of immediate shift constants are within range.  */
11625 bool
11626 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11627 {
11628   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11629   if (left)
11630     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11631   else
11632     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11633 }
11634
11635 /* Return true if X is a uniform vector where all elements
11636    are either the floating-point constant 0.0 or the
11637    integer constant 0.  */
11638 bool
11639 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11640 {
11641   return x == CONST0_RTX (mode);
11642 }
11643
11644
11645 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11646    operation of width WIDTH at bit position POS.  */
11647
11648 rtx
11649 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11650 {
11651   gcc_assert (CONST_INT_P (width));
11652   gcc_assert (CONST_INT_P (pos));
11653
11654   unsigned HOST_WIDE_INT mask
11655     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11656   return GEN_INT (mask << UINTVAL (pos));
11657 }
11658
11659 bool
11660 aarch64_mov_operand_p (rtx x, machine_mode mode)
11661 {
11662   if (GET_CODE (x) == HIGH
11663       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11664     return true;
11665
11666   if (CONST_INT_P (x))
11667     return true;
11668
11669   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11670     return true;
11671
11672   return aarch64_classify_symbolic_expression (x)
11673     == SYMBOL_TINY_ABSOLUTE;
11674 }
11675
11676 /* Return a const_int vector of VAL.  */
11677 rtx
11678 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11679 {
11680   int nunits = GET_MODE_NUNITS (mode);
11681   rtvec v = rtvec_alloc (nunits);
11682   int i;
11683
11684   rtx cache = GEN_INT (val);
11685
11686   for (i=0; i < nunits; i++)
11687     RTVEC_ELT (v, i) = cache;
11688
11689   return gen_rtx_CONST_VECTOR (mode, v);
11690 }
11691
11692 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11693
11694 bool
11695 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11696 {
11697   machine_mode vmode;
11698
11699   gcc_assert (!VECTOR_MODE_P (mode));
11700   vmode = aarch64_preferred_simd_mode (as_a <scalar_mode> (mode));
11701   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11702   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11703 }
11704
11705 /* Construct and return a PARALLEL RTX vector with elements numbering the
11706    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11707    the vector - from the perspective of the architecture.  This does not
11708    line up with GCC's perspective on lane numbers, so we end up with
11709    different masks depending on our target endian-ness.  The diagram
11710    below may help.  We must draw the distinction when building masks
11711    which select one half of the vector.  An instruction selecting
11712    architectural low-lanes for a big-endian target, must be described using
11713    a mask selecting GCC high-lanes.
11714
11715                  Big-Endian             Little-Endian
11716
11717 GCC             0   1   2   3           3   2   1   0
11718               | x | x | x | x |       | x | x | x | x |
11719 Architecture    3   2   1   0           3   2   1   0
11720
11721 Low Mask:         { 2, 3 }                { 0, 1 }
11722 High Mask:        { 0, 1 }                { 2, 3 }
11723 */
11724
11725 rtx
11726 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11727 {
11728   int nunits = GET_MODE_NUNITS (mode);
11729   rtvec v = rtvec_alloc (nunits / 2);
11730   int high_base = nunits / 2;
11731   int low_base = 0;
11732   int base;
11733   rtx t1;
11734   int i;
11735
11736   if (BYTES_BIG_ENDIAN)
11737     base = high ? low_base : high_base;
11738   else
11739     base = high ? high_base : low_base;
11740
11741   for (i = 0; i < nunits / 2; i++)
11742     RTVEC_ELT (v, i) = GEN_INT (base + i);
11743
11744   t1 = gen_rtx_PARALLEL (mode, v);
11745   return t1;
11746 }
11747
11748 /* Check OP for validity as a PARALLEL RTX vector with elements
11749    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11750    from the perspective of the architecture.  See the diagram above
11751    aarch64_simd_vect_par_cnst_half for more details.  */
11752
11753 bool
11754 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11755                                        bool high)
11756 {
11757   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11758   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11759   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11760   int i = 0;
11761
11762   if (!VECTOR_MODE_P (mode))
11763     return false;
11764
11765   if (count_op != count_ideal)
11766     return false;
11767
11768   for (i = 0; i < count_ideal; i++)
11769     {
11770       rtx elt_op = XVECEXP (op, 0, i);
11771       rtx elt_ideal = XVECEXP (ideal, 0, i);
11772
11773       if (!CONST_INT_P (elt_op)
11774           || INTVAL (elt_ideal) != INTVAL (elt_op))
11775         return false;
11776     }
11777   return true;
11778 }
11779
11780 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11781    HIGH (exclusive).  */
11782 void
11783 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11784                           const_tree exp)
11785 {
11786   HOST_WIDE_INT lane;
11787   gcc_assert (CONST_INT_P (operand));
11788   lane = INTVAL (operand);
11789
11790   if (lane < low || lane >= high)
11791   {
11792     if (exp)
11793       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11794     else
11795       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11796   }
11797 }
11798
11799 /* Return TRUE if OP is a valid vector addressing mode.  */
11800 bool
11801 aarch64_simd_mem_operand_p (rtx op)
11802 {
11803   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11804                         || REG_P (XEXP (op, 0)));
11805 }
11806
11807 /* Emit a register copy from operand to operand, taking care not to
11808    early-clobber source registers in the process.
11809
11810    COUNT is the number of components into which the copy needs to be
11811    decomposed.  */
11812 void
11813 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11814                                 unsigned int count)
11815 {
11816   unsigned int i;
11817   int rdest = REGNO (operands[0]);
11818   int rsrc = REGNO (operands[1]);
11819
11820   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11821       || rdest < rsrc)
11822     for (i = 0; i < count; i++)
11823       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11824                       gen_rtx_REG (mode, rsrc + i));
11825   else
11826     for (i = 0; i < count; i++)
11827       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11828                       gen_rtx_REG (mode, rsrc + count - i - 1));
11829 }
11830
11831 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11832    one of VSTRUCT modes: OI, CI, or XI.  */
11833 int
11834 aarch64_simd_attr_length_rglist (machine_mode mode)
11835 {
11836   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11837 }
11838
11839 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11840    alignment of a vector to 128 bits.  */
11841 static HOST_WIDE_INT
11842 aarch64_simd_vector_alignment (const_tree type)
11843 {
11844   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11845   return MIN (align, 128);
11846 }
11847
11848 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11849 static bool
11850 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11851 {
11852   if (is_packed)
11853     return false;
11854
11855   /* We guarantee alignment for vectors up to 128-bits.  */
11856   if (tree_int_cst_compare (TYPE_SIZE (type),
11857                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11858     return false;
11859
11860   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11861   return true;
11862 }
11863
11864 /* Return true if the vector misalignment factor is supported by the
11865    target.  */
11866 static bool
11867 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11868                                              const_tree type, int misalignment,
11869                                              bool is_packed)
11870 {
11871   if (TARGET_SIMD && STRICT_ALIGNMENT)
11872     {
11873       /* Return if movmisalign pattern is not supported for this mode.  */
11874       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11875         return false;
11876
11877       if (misalignment == -1)
11878         {
11879           /* Misalignment factor is unknown at compile time but we know
11880              it's word aligned.  */
11881           if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11882             {
11883               int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11884
11885               if (element_size != 64)
11886                 return true;
11887             }
11888           return false;
11889         }
11890     }
11891   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11892                                                       is_packed);
11893 }
11894
11895 /* If VALS is a vector constant that can be loaded into a register
11896    using DUP, generate instructions to do so and return an RTX to
11897    assign to the register.  Otherwise return NULL_RTX.  */
11898 static rtx
11899 aarch64_simd_dup_constant (rtx vals)
11900 {
11901   machine_mode mode = GET_MODE (vals);
11902   machine_mode inner_mode = GET_MODE_INNER (mode);
11903   rtx x;
11904
11905   if (!const_vec_duplicate_p (vals, &x))
11906     return NULL_RTX;
11907
11908   /* We can load this constant by using DUP and a constant in a
11909      single ARM register.  This will be cheaper than a vector
11910      load.  */
11911   x = copy_to_mode_reg (inner_mode, x);
11912   return gen_rtx_VEC_DUPLICATE (mode, x);
11913 }
11914
11915
11916 /* Generate code to load VALS, which is a PARALLEL containing only
11917    constants (for vec_init) or CONST_VECTOR, efficiently into a
11918    register.  Returns an RTX to copy into the register, or NULL_RTX
11919    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11920 static rtx
11921 aarch64_simd_make_constant (rtx vals)
11922 {
11923   machine_mode mode = GET_MODE (vals);
11924   rtx const_dup;
11925   rtx const_vec = NULL_RTX;
11926   int n_elts = GET_MODE_NUNITS (mode);
11927   int n_const = 0;
11928   int i;
11929
11930   if (GET_CODE (vals) == CONST_VECTOR)
11931     const_vec = vals;
11932   else if (GET_CODE (vals) == PARALLEL)
11933     {
11934       /* A CONST_VECTOR must contain only CONST_INTs and
11935          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11936          Only store valid constants in a CONST_VECTOR.  */
11937       for (i = 0; i < n_elts; ++i)
11938         {
11939           rtx x = XVECEXP (vals, 0, i);
11940           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11941             n_const++;
11942         }
11943       if (n_const == n_elts)
11944         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11945     }
11946   else
11947     gcc_unreachable ();
11948
11949   if (const_vec != NULL_RTX
11950       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11951     /* Load using MOVI/MVNI.  */
11952     return const_vec;
11953   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11954     /* Loaded using DUP.  */
11955     return const_dup;
11956   else if (const_vec != NULL_RTX)
11957     /* Load from constant pool. We can not take advantage of single-cycle
11958        LD1 because we need a PC-relative addressing mode.  */
11959     return const_vec;
11960   else
11961     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11962        We can not construct an initializer.  */
11963     return NULL_RTX;
11964 }
11965
11966 /* Expand a vector initialisation sequence, such that TARGET is
11967    initialised to contain VALS.  */
11968
11969 void
11970 aarch64_expand_vector_init (rtx target, rtx vals)
11971 {
11972   machine_mode mode = GET_MODE (target);
11973   machine_mode inner_mode = GET_MODE_INNER (mode);
11974   /* The number of vector elements.  */
11975   int n_elts = GET_MODE_NUNITS (mode);
11976   /* The number of vector elements which are not constant.  */
11977   int n_var = 0;
11978   rtx any_const = NULL_RTX;
11979   /* The first element of vals.  */
11980   rtx v0 = XVECEXP (vals, 0, 0);
11981   bool all_same = true;
11982
11983   /* Count the number of variable elements to initialise.  */
11984   for (int i = 0; i < n_elts; ++i)
11985     {
11986       rtx x = XVECEXP (vals, 0, i);
11987       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11988         ++n_var;
11989       else
11990         any_const = x;
11991
11992       all_same &= rtx_equal_p (x, v0);
11993     }
11994
11995   /* No variable elements, hand off to aarch64_simd_make_constant which knows
11996      how best to handle this.  */
11997   if (n_var == 0)
11998     {
11999       rtx constant = aarch64_simd_make_constant (vals);
12000       if (constant != NULL_RTX)
12001         {
12002           emit_move_insn (target, constant);
12003           return;
12004         }
12005     }
12006
12007   /* Splat a single non-constant element if we can.  */
12008   if (all_same)
12009     {
12010       rtx x = copy_to_mode_reg (inner_mode, v0);
12011       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12012       return;
12013     }
12014
12015   enum insn_code icode = optab_handler (vec_set_optab, mode);
12016   gcc_assert (icode != CODE_FOR_nothing);
12017
12018   /* If there are only variable elements, try to optimize
12019      the insertion using dup for the most common element
12020      followed by insertions.  */
12021
12022   /* The algorithm will fill matches[*][0] with the earliest matching element,
12023      and matches[X][1] with the count of duplicate elements (if X is the
12024      earliest element which has duplicates).  */
12025
12026   if (n_var == n_elts && n_elts <= 16)
12027     {
12028       int matches[16][2] = {0};
12029       for (int i = 0; i < n_elts; i++)
12030         {
12031           for (int j = 0; j <= i; j++)
12032             {
12033               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12034                 {
12035                   matches[i][0] = j;
12036                   matches[j][1]++;
12037                   break;
12038                 }
12039             }
12040         }
12041       int maxelement = 0;
12042       int maxv = 0;
12043       for (int i = 0; i < n_elts; i++)
12044         if (matches[i][1] > maxv)
12045           {
12046             maxelement = i;
12047             maxv = matches[i][1];
12048           }
12049
12050       /* Create a duplicate of the most common element.  */
12051       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12052       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12053
12054       /* Insert the rest.  */
12055       for (int i = 0; i < n_elts; i++)
12056         {
12057           rtx x = XVECEXP (vals, 0, i);
12058           if (matches[i][0] == maxelement)
12059             continue;
12060           x = copy_to_mode_reg (inner_mode, x);
12061           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12062         }
12063       return;
12064     }
12065
12066   /* Initialise a vector which is part-variable.  We want to first try
12067      to build those lanes which are constant in the most efficient way we
12068      can.  */
12069   if (n_var != n_elts)
12070     {
12071       rtx copy = copy_rtx (vals);
12072
12073       /* Load constant part of vector.  We really don't care what goes into the
12074          parts we will overwrite, but we're more likely to be able to load the
12075          constant efficiently if it has fewer, larger, repeating parts
12076          (see aarch64_simd_valid_immediate).  */
12077       for (int i = 0; i < n_elts; i++)
12078         {
12079           rtx x = XVECEXP (vals, 0, i);
12080           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12081             continue;
12082           rtx subst = any_const;
12083           for (int bit = n_elts / 2; bit > 0; bit /= 2)
12084             {
12085               /* Look in the copied vector, as more elements are const.  */
12086               rtx test = XVECEXP (copy, 0, i ^ bit);
12087               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12088                 {
12089                   subst = test;
12090                   break;
12091                 }
12092             }
12093           XVECEXP (copy, 0, i) = subst;
12094         }
12095       aarch64_expand_vector_init (target, copy);
12096     }
12097
12098   /* Insert the variable lanes directly.  */
12099   for (int i = 0; i < n_elts; i++)
12100     {
12101       rtx x = XVECEXP (vals, 0, i);
12102       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12103         continue;
12104       x = copy_to_mode_reg (inner_mode, x);
12105       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12106     }
12107 }
12108
12109 static unsigned HOST_WIDE_INT
12110 aarch64_shift_truncation_mask (machine_mode mode)
12111 {
12112   return
12113     (!SHIFT_COUNT_TRUNCATED
12114      || aarch64_vector_mode_supported_p (mode)
12115      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12116 }
12117
12118 /* Select a format to encode pointers in exception handling data.  */
12119 int
12120 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12121 {
12122    int type;
12123    switch (aarch64_cmodel)
12124      {
12125      case AARCH64_CMODEL_TINY:
12126      case AARCH64_CMODEL_TINY_PIC:
12127      case AARCH64_CMODEL_SMALL:
12128      case AARCH64_CMODEL_SMALL_PIC:
12129      case AARCH64_CMODEL_SMALL_SPIC:
12130        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
12131           for everything.  */
12132        type = DW_EH_PE_sdata4;
12133        break;
12134      default:
12135        /* No assumptions here.  8-byte relocs required.  */
12136        type = DW_EH_PE_sdata8;
12137        break;
12138      }
12139    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12140 }
12141
12142 /* The last .arch and .tune assembly strings that we printed.  */
12143 static std::string aarch64_last_printed_arch_string;
12144 static std::string aarch64_last_printed_tune_string;
12145
12146 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
12147    by the function fndecl.  */
12148
12149 void
12150 aarch64_declare_function_name (FILE *stream, const char* name,
12151                                 tree fndecl)
12152 {
12153   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12154
12155   struct cl_target_option *targ_options;
12156   if (target_parts)
12157     targ_options = TREE_TARGET_OPTION (target_parts);
12158   else
12159     targ_options = TREE_TARGET_OPTION (target_option_current_node);
12160   gcc_assert (targ_options);
12161
12162   const struct processor *this_arch
12163     = aarch64_get_arch (targ_options->x_explicit_arch);
12164
12165   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12166   std::string extension
12167     = aarch64_get_extension_string_for_isa_flags (isa_flags,
12168                                                   this_arch->flags);
12169   /* Only update the assembler .arch string if it is distinct from the last
12170      such string we printed.  */
12171   std::string to_print = this_arch->name + extension;
12172   if (to_print != aarch64_last_printed_arch_string)
12173     {
12174       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12175       aarch64_last_printed_arch_string = to_print;
12176     }
12177
12178   /* Print the cpu name we're tuning for in the comments, might be
12179      useful to readers of the generated asm.  Do it only when it changes
12180      from function to function and verbose assembly is requested.  */
12181   const struct processor *this_tune
12182     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12183
12184   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12185     {
12186       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12187                    this_tune->name);
12188       aarch64_last_printed_tune_string = this_tune->name;
12189     }
12190
12191   /* Don't forget the type directive for ELF.  */
12192   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12193   ASM_OUTPUT_LABEL (stream, name);
12194 }
12195
12196 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
12197
12198 static void
12199 aarch64_start_file (void)
12200 {
12201   struct cl_target_option *default_options
12202     = TREE_TARGET_OPTION (target_option_default_node);
12203
12204   const struct processor *default_arch
12205     = aarch64_get_arch (default_options->x_explicit_arch);
12206   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12207   std::string extension
12208     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12209                                                   default_arch->flags);
12210
12211    aarch64_last_printed_arch_string = default_arch->name + extension;
12212    aarch64_last_printed_tune_string = "";
12213    asm_fprintf (asm_out_file, "\t.arch %s\n",
12214                 aarch64_last_printed_arch_string.c_str ());
12215
12216    default_file_start ();
12217 }
12218
12219 /* Emit load exclusive.  */
12220
12221 static void
12222 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12223                              rtx mem, rtx model_rtx)
12224 {
12225   rtx (*gen) (rtx, rtx, rtx);
12226
12227   switch (mode)
12228     {
12229     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12230     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12231     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12232     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12233     default:
12234       gcc_unreachable ();
12235     }
12236
12237   emit_insn (gen (rval, mem, model_rtx));
12238 }
12239
12240 /* Emit store exclusive.  */
12241
12242 static void
12243 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12244                               rtx rval, rtx mem, rtx model_rtx)
12245 {
12246   rtx (*gen) (rtx, rtx, rtx, rtx);
12247
12248   switch (mode)
12249     {
12250     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12251     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12252     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12253     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12254     default:
12255       gcc_unreachable ();
12256     }
12257
12258   emit_insn (gen (bval, rval, mem, model_rtx));
12259 }
12260
12261 /* Mark the previous jump instruction as unlikely.  */
12262
12263 static void
12264 aarch64_emit_unlikely_jump (rtx insn)
12265 {
12266   rtx_insn *jump = emit_jump_insn (insn);
12267   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12268 }
12269
12270 /* Expand a compare and swap pattern.  */
12271
12272 void
12273 aarch64_expand_compare_and_swap (rtx operands[])
12274 {
12275   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12276   machine_mode mode, cmp_mode;
12277   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12278   int idx;
12279   gen_cas_fn gen;
12280   const gen_cas_fn split_cas[] =
12281   {
12282     gen_aarch64_compare_and_swapqi,
12283     gen_aarch64_compare_and_swaphi,
12284     gen_aarch64_compare_and_swapsi,
12285     gen_aarch64_compare_and_swapdi
12286   };
12287   const gen_cas_fn atomic_cas[] =
12288   {
12289     gen_aarch64_compare_and_swapqi_lse,
12290     gen_aarch64_compare_and_swaphi_lse,
12291     gen_aarch64_compare_and_swapsi_lse,
12292     gen_aarch64_compare_and_swapdi_lse
12293   };
12294
12295   bval = operands[0];
12296   rval = operands[1];
12297   mem = operands[2];
12298   oldval = operands[3];
12299   newval = operands[4];
12300   is_weak = operands[5];
12301   mod_s = operands[6];
12302   mod_f = operands[7];
12303   mode = GET_MODE (mem);
12304   cmp_mode = mode;
12305
12306   /* Normally the succ memory model must be stronger than fail, but in the
12307      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12308      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
12309
12310   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12311       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12312     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12313
12314   switch (mode)
12315     {
12316     case E_QImode:
12317     case E_HImode:
12318       /* For short modes, we're going to perform the comparison in SImode,
12319          so do the zero-extension now.  */
12320       cmp_mode = SImode;
12321       rval = gen_reg_rtx (SImode);
12322       oldval = convert_modes (SImode, mode, oldval, true);
12323       /* Fall through.  */
12324
12325     case E_SImode:
12326     case E_DImode:
12327       /* Force the value into a register if needed.  */
12328       if (!aarch64_plus_operand (oldval, mode))
12329         oldval = force_reg (cmp_mode, oldval);
12330       break;
12331
12332     default:
12333       gcc_unreachable ();
12334     }
12335
12336   switch (mode)
12337     {
12338     case E_QImode: idx = 0; break;
12339     case E_HImode: idx = 1; break;
12340     case E_SImode: idx = 2; break;
12341     case E_DImode: idx = 3; break;
12342     default:
12343       gcc_unreachable ();
12344     }
12345   if (TARGET_LSE)
12346     gen = atomic_cas[idx];
12347   else
12348     gen = split_cas[idx];
12349
12350   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12351
12352   if (mode == QImode || mode == HImode)
12353     emit_move_insn (operands[1], gen_lowpart (mode, rval));
12354
12355   x = gen_rtx_REG (CCmode, CC_REGNUM);
12356   x = gen_rtx_EQ (SImode, x, const0_rtx);
12357   emit_insn (gen_rtx_SET (bval, x));
12358 }
12359
12360 /* Test whether the target supports using a atomic load-operate instruction.
12361    CODE is the operation and AFTER is TRUE if the data in memory after the
12362    operation should be returned and FALSE if the data before the operation
12363    should be returned.  Returns FALSE if the operation isn't supported by the
12364    architecture.  */
12365
12366 bool
12367 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12368 {
12369   if (!TARGET_LSE)
12370     return false;
12371
12372   switch (code)
12373     {
12374     case SET:
12375     case AND:
12376     case IOR:
12377     case XOR:
12378     case MINUS:
12379     case PLUS:
12380       return true;
12381     default:
12382       return false;
12383     }
12384 }
12385
12386 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12387    sequence implementing an atomic operation.  */
12388
12389 static void
12390 aarch64_emit_post_barrier (enum memmodel model)
12391 {
12392   const enum memmodel base_model = memmodel_base (model);
12393
12394   if (is_mm_sync (model)
12395       && (base_model == MEMMODEL_ACQUIRE
12396           || base_model == MEMMODEL_ACQ_REL
12397           || base_model == MEMMODEL_SEQ_CST))
12398     {
12399       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12400     }
12401 }
12402
12403 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
12404    for the data in memory.  EXPECTED is the value expected to be in memory.
12405    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
12406    is the memory ordering to use.  */
12407
12408 void
12409 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12410                         rtx expected, rtx desired,
12411                         rtx model)
12412 {
12413   rtx (*gen) (rtx, rtx, rtx, rtx);
12414   machine_mode mode;
12415
12416   mode = GET_MODE (mem);
12417
12418   switch (mode)
12419     {
12420     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12421     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12422     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12423     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12424     default:
12425       gcc_unreachable ();
12426     }
12427
12428   /* Move the expected value into the CAS destination register.  */
12429   emit_insn (gen_rtx_SET (rval, expected));
12430
12431   /* Emit the CAS.  */
12432   emit_insn (gen (rval, mem, desired, model));
12433
12434   /* Compare the expected value with the value loaded by the CAS, to establish
12435      whether the swap was made.  */
12436   aarch64_gen_compare_reg (EQ, rval, expected);
12437 }
12438
12439 /* Split a compare and swap pattern.  */
12440
12441 void
12442 aarch64_split_compare_and_swap (rtx operands[])
12443 {
12444   rtx rval, mem, oldval, newval, scratch;
12445   machine_mode mode;
12446   bool is_weak;
12447   rtx_code_label *label1, *label2;
12448   rtx x, cond;
12449   enum memmodel model;
12450   rtx model_rtx;
12451
12452   rval = operands[0];
12453   mem = operands[1];
12454   oldval = operands[2];
12455   newval = operands[3];
12456   is_weak = (operands[4] != const0_rtx);
12457   model_rtx = operands[5];
12458   scratch = operands[7];
12459   mode = GET_MODE (mem);
12460   model = memmodel_from_int (INTVAL (model_rtx));
12461
12462   /* When OLDVAL is zero and we want the strong version we can emit a tighter
12463     loop:
12464     .label1:
12465         LD[A]XR rval, [mem]
12466         CBNZ    rval, .label2
12467         ST[L]XR scratch, newval, [mem]
12468         CBNZ    scratch, .label1
12469     .label2:
12470         CMP     rval, 0.  */
12471   bool strong_zero_p = !is_weak && oldval == const0_rtx;
12472
12473   label1 = NULL;
12474   if (!is_weak)
12475     {
12476       label1 = gen_label_rtx ();
12477       emit_label (label1);
12478     }
12479   label2 = gen_label_rtx ();
12480
12481   /* The initial load can be relaxed for a __sync operation since a final
12482      barrier will be emitted to stop code hoisting.  */
12483   if (is_mm_sync (model))
12484     aarch64_emit_load_exclusive (mode, rval, mem,
12485                                  GEN_INT (MEMMODEL_RELAXED));
12486   else
12487     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12488
12489   if (strong_zero_p)
12490     {
12491       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12492       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12493                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12494       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12495     }
12496   else
12497     {
12498       cond = aarch64_gen_compare_reg (NE, rval, oldval);
12499       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12500       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12501                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12502       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12503     }
12504
12505   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12506
12507   if (!is_weak)
12508     {
12509       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12510       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12511                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12512       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12513     }
12514   else
12515     {
12516       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12517       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12518       emit_insn (gen_rtx_SET (cond, x));
12519     }
12520
12521   emit_label (label2);
12522   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12523      to set the condition flags.  If this is not used it will be removed by
12524      later passes.  */
12525   if (strong_zero_p)
12526     {
12527       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12528       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12529       emit_insn (gen_rtx_SET (cond, x));
12530     }
12531   /* Emit any final barrier needed for a __sync operation.  */
12532   if (is_mm_sync (model))
12533     aarch64_emit_post_barrier (model);
12534 }
12535
12536 /* Emit a BIC instruction.  */
12537
12538 static void
12539 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12540 {
12541   rtx shift_rtx = GEN_INT (shift);
12542   rtx (*gen) (rtx, rtx, rtx, rtx);
12543
12544   switch (mode)
12545     {
12546     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12547     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12548     default:
12549       gcc_unreachable ();
12550     }
12551
12552   emit_insn (gen (dst, s2, shift_rtx, s1));
12553 }
12554
12555 /* Emit an atomic swap.  */
12556
12557 static void
12558 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12559                           rtx mem, rtx model)
12560 {
12561   rtx (*gen) (rtx, rtx, rtx, rtx);
12562
12563   switch (mode)
12564     {
12565     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12566     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12567     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12568     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12569     default:
12570       gcc_unreachable ();
12571     }
12572
12573   emit_insn (gen (dst, mem, value, model));
12574 }
12575
12576 /* Operations supported by aarch64_emit_atomic_load_op.  */
12577
12578 enum aarch64_atomic_load_op_code
12579 {
12580   AARCH64_LDOP_PLUS,    /* A + B  */
12581   AARCH64_LDOP_XOR,     /* A ^ B  */
12582   AARCH64_LDOP_OR,      /* A | B  */
12583   AARCH64_LDOP_BIC      /* A & ~B  */
12584 };
12585
12586 /* Emit an atomic load-operate.  */
12587
12588 static void
12589 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12590                              machine_mode mode, rtx dst, rtx src,
12591                              rtx mem, rtx model)
12592 {
12593   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12594   const aarch64_atomic_load_op_fn plus[] =
12595   {
12596     gen_aarch64_atomic_loadaddqi,
12597     gen_aarch64_atomic_loadaddhi,
12598     gen_aarch64_atomic_loadaddsi,
12599     gen_aarch64_atomic_loadadddi
12600   };
12601   const aarch64_atomic_load_op_fn eor[] =
12602   {
12603     gen_aarch64_atomic_loadeorqi,
12604     gen_aarch64_atomic_loadeorhi,
12605     gen_aarch64_atomic_loadeorsi,
12606     gen_aarch64_atomic_loadeordi
12607   };
12608   const aarch64_atomic_load_op_fn ior[] =
12609   {
12610     gen_aarch64_atomic_loadsetqi,
12611     gen_aarch64_atomic_loadsethi,
12612     gen_aarch64_atomic_loadsetsi,
12613     gen_aarch64_atomic_loadsetdi
12614   };
12615   const aarch64_atomic_load_op_fn bic[] =
12616   {
12617     gen_aarch64_atomic_loadclrqi,
12618     gen_aarch64_atomic_loadclrhi,
12619     gen_aarch64_atomic_loadclrsi,
12620     gen_aarch64_atomic_loadclrdi
12621   };
12622   aarch64_atomic_load_op_fn gen;
12623   int idx = 0;
12624
12625   switch (mode)
12626     {
12627     case E_QImode: idx = 0; break;
12628     case E_HImode: idx = 1; break;
12629     case E_SImode: idx = 2; break;
12630     case E_DImode: idx = 3; break;
12631     default:
12632       gcc_unreachable ();
12633     }
12634
12635   switch (code)
12636     {
12637     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12638     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12639     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12640     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12641     default:
12642       gcc_unreachable ();
12643     }
12644
12645   emit_insn (gen (dst, mem, src, model));
12646 }
12647
12648 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12649    location to store the data read from memory.  OUT_RESULT is the location to
12650    store the result of the operation.  MEM is the memory location to read and
12651    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12652    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12653    be NULL.  */
12654
12655 void
12656 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12657                          rtx mem, rtx value, rtx model_rtx)
12658 {
12659   machine_mode mode = GET_MODE (mem);
12660   machine_mode wmode = (mode == DImode ? DImode : SImode);
12661   const bool short_mode = (mode < SImode);
12662   aarch64_atomic_load_op_code ldop_code;
12663   rtx src;
12664   rtx x;
12665
12666   if (out_data)
12667     out_data = gen_lowpart (mode, out_data);
12668
12669   if (out_result)
12670     out_result = gen_lowpart (mode, out_result);
12671
12672   /* Make sure the value is in a register, putting it into a destination
12673      register if it needs to be manipulated.  */
12674   if (!register_operand (value, mode)
12675       || code == AND || code == MINUS)
12676     {
12677       src = out_result ? out_result : out_data;
12678       emit_move_insn (src, gen_lowpart (mode, value));
12679     }
12680   else
12681     src = value;
12682   gcc_assert (register_operand (src, mode));
12683
12684   /* Preprocess the data for the operation as necessary.  If the operation is
12685      a SET then emit a swap instruction and finish.  */
12686   switch (code)
12687     {
12688     case SET:
12689       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12690       return;
12691
12692     case MINUS:
12693       /* Negate the value and treat it as a PLUS.  */
12694       {
12695         rtx neg_src;
12696
12697         /* Resize the value if necessary.  */
12698         if (short_mode)
12699           src = gen_lowpart (wmode, src);
12700
12701         neg_src = gen_rtx_NEG (wmode, src);
12702         emit_insn (gen_rtx_SET (src, neg_src));
12703
12704         if (short_mode)
12705           src = gen_lowpart (mode, src);
12706       }
12707       /* Fall-through.  */
12708     case PLUS:
12709       ldop_code = AARCH64_LDOP_PLUS;
12710       break;
12711
12712     case IOR:
12713       ldop_code = AARCH64_LDOP_OR;
12714       break;
12715
12716     case XOR:
12717       ldop_code = AARCH64_LDOP_XOR;
12718       break;
12719
12720     case AND:
12721       {
12722         rtx not_src;
12723
12724         /* Resize the value if necessary.  */
12725         if (short_mode)
12726           src = gen_lowpart (wmode, src);
12727
12728         not_src = gen_rtx_NOT (wmode, src);
12729         emit_insn (gen_rtx_SET (src, not_src));
12730
12731         if (short_mode)
12732           src = gen_lowpart (mode, src);
12733       }
12734       ldop_code = AARCH64_LDOP_BIC;
12735       break;
12736
12737     default:
12738       /* The operation can't be done with atomic instructions.  */
12739       gcc_unreachable ();
12740     }
12741
12742   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12743
12744   /* If necessary, calculate the data in memory after the update by redoing the
12745      operation from values in registers.  */
12746   if (!out_result)
12747     return;
12748
12749   if (short_mode)
12750     {
12751       src = gen_lowpart (wmode, src);
12752       out_data = gen_lowpart (wmode, out_data);
12753       out_result = gen_lowpart (wmode, out_result);
12754     }
12755
12756   x = NULL_RTX;
12757
12758   switch (code)
12759     {
12760     case MINUS:
12761     case PLUS:
12762       x = gen_rtx_PLUS (wmode, out_data, src);
12763       break;
12764     case IOR:
12765       x = gen_rtx_IOR (wmode, out_data, src);
12766       break;
12767     case XOR:
12768       x = gen_rtx_XOR (wmode, out_data, src);
12769       break;
12770     case AND:
12771       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12772       return;
12773     default:
12774       gcc_unreachable ();
12775     }
12776
12777   emit_set_insn (out_result, x);
12778
12779   return;
12780 }
12781
12782 /* Split an atomic operation.  */
12783
12784 void
12785 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12786                          rtx value, rtx model_rtx, rtx cond)
12787 {
12788   machine_mode mode = GET_MODE (mem);
12789   machine_mode wmode = (mode == DImode ? DImode : SImode);
12790   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12791   const bool is_sync = is_mm_sync (model);
12792   rtx_code_label *label;
12793   rtx x;
12794
12795   /* Split the atomic operation into a sequence.  */
12796   label = gen_label_rtx ();
12797   emit_label (label);
12798
12799   if (new_out)
12800     new_out = gen_lowpart (wmode, new_out);
12801   if (old_out)
12802     old_out = gen_lowpart (wmode, old_out);
12803   else
12804     old_out = new_out;
12805   value = simplify_gen_subreg (wmode, value, mode, 0);
12806
12807   /* The initial load can be relaxed for a __sync operation since a final
12808      barrier will be emitted to stop code hoisting.  */
12809  if (is_sync)
12810     aarch64_emit_load_exclusive (mode, old_out, mem,
12811                                  GEN_INT (MEMMODEL_RELAXED));
12812   else
12813     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12814
12815   switch (code)
12816     {
12817     case SET:
12818       new_out = value;
12819       break;
12820
12821     case NOT:
12822       x = gen_rtx_AND (wmode, old_out, value);
12823       emit_insn (gen_rtx_SET (new_out, x));
12824       x = gen_rtx_NOT (wmode, new_out);
12825       emit_insn (gen_rtx_SET (new_out, x));
12826       break;
12827
12828     case MINUS:
12829       if (CONST_INT_P (value))
12830         {
12831           value = GEN_INT (-INTVAL (value));
12832           code = PLUS;
12833         }
12834       /* Fall through.  */
12835
12836     default:
12837       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12838       emit_insn (gen_rtx_SET (new_out, x));
12839       break;
12840     }
12841
12842   aarch64_emit_store_exclusive (mode, cond, mem,
12843                                 gen_lowpart (mode, new_out), model_rtx);
12844
12845   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12846   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12847                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12848   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12849
12850   /* Emit any final barrier needed for a __sync operation.  */
12851   if (is_sync)
12852     aarch64_emit_post_barrier (model);
12853 }
12854
12855 static void
12856 aarch64_init_libfuncs (void)
12857 {
12858    /* Half-precision float operations.  The compiler handles all operations
12859      with NULL libfuncs by converting to SFmode.  */
12860
12861   /* Conversions.  */
12862   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12863   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12864
12865   /* Arithmetic.  */
12866   set_optab_libfunc (add_optab, HFmode, NULL);
12867   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12868   set_optab_libfunc (smul_optab, HFmode, NULL);
12869   set_optab_libfunc (neg_optab, HFmode, NULL);
12870   set_optab_libfunc (sub_optab, HFmode, NULL);
12871
12872   /* Comparisons.  */
12873   set_optab_libfunc (eq_optab, HFmode, NULL);
12874   set_optab_libfunc (ne_optab, HFmode, NULL);
12875   set_optab_libfunc (lt_optab, HFmode, NULL);
12876   set_optab_libfunc (le_optab, HFmode, NULL);
12877   set_optab_libfunc (ge_optab, HFmode, NULL);
12878   set_optab_libfunc (gt_optab, HFmode, NULL);
12879   set_optab_libfunc (unord_optab, HFmode, NULL);
12880 }
12881
12882 /* Target hook for c_mode_for_suffix.  */
12883 static machine_mode
12884 aarch64_c_mode_for_suffix (char suffix)
12885 {
12886   if (suffix == 'q')
12887     return TFmode;
12888
12889   return VOIDmode;
12890 }
12891
12892 /* We can only represent floating point constants which will fit in
12893    "quarter-precision" values.  These values are characterised by
12894    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12895    by:
12896
12897    (-1)^s * (n/16) * 2^r
12898
12899    Where:
12900      's' is the sign bit.
12901      'n' is an integer in the range 16 <= n <= 31.
12902      'r' is an integer in the range -3 <= r <= 4.  */
12903
12904 /* Return true iff X can be represented by a quarter-precision
12905    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12906 bool
12907 aarch64_float_const_representable_p (rtx x)
12908 {
12909   /* This represents our current view of how many bits
12910      make up the mantissa.  */
12911   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12912   int exponent;
12913   unsigned HOST_WIDE_INT mantissa, mask;
12914   REAL_VALUE_TYPE r, m;
12915   bool fail;
12916
12917   if (!CONST_DOUBLE_P (x))
12918     return false;
12919
12920   /* We don't support HFmode constants yet.  */
12921   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12922     return false;
12923
12924   r = *CONST_DOUBLE_REAL_VALUE (x);
12925
12926   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12927      know if we have +zero until we analyse the mantissa, but we
12928      can reject the other invalid values.  */
12929   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12930       || REAL_VALUE_MINUS_ZERO (r))
12931     return false;
12932
12933   /* Extract exponent.  */
12934   r = real_value_abs (&r);
12935   exponent = REAL_EXP (&r);
12936
12937   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12938      highest (sign) bit, with a fixed binary point at bit point_pos.
12939      m1 holds the low part of the mantissa, m2 the high part.
12940      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12941      bits for the mantissa, this can fail (low bits will be lost).  */
12942   real_ldexp (&m, &r, point_pos - exponent);
12943   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12944
12945   /* If the low part of the mantissa has bits set we cannot represent
12946      the value.  */
12947   if (w.ulow () != 0)
12948     return false;
12949   /* We have rejected the lower HOST_WIDE_INT, so update our
12950      understanding of how many bits lie in the mantissa and
12951      look only at the high HOST_WIDE_INT.  */
12952   mantissa = w.elt (1);
12953   point_pos -= HOST_BITS_PER_WIDE_INT;
12954
12955   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12956   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12957   if ((mantissa & mask) != 0)
12958     return false;
12959
12960   /* Having filtered unrepresentable values, we may now remove all
12961      but the highest 5 bits.  */
12962   mantissa >>= point_pos - 5;
12963
12964   /* We cannot represent the value 0.0, so reject it.  This is handled
12965      elsewhere.  */
12966   if (mantissa == 0)
12967     return false;
12968
12969   /* Then, as bit 4 is always set, we can mask it off, leaving
12970      the mantissa in the range [0, 15].  */
12971   mantissa &= ~(1 << 4);
12972   gcc_assert (mantissa <= 15);
12973
12974   /* GCC internally does not use IEEE754-like encoding (where normalized
12975      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
12976      Our mantissa values are shifted 4 places to the left relative to
12977      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12978      by 5 places to correct for GCC's representation.  */
12979   exponent = 5 - exponent;
12980
12981   return (exponent >= 0 && exponent <= 7);
12982 }
12983
12984 char*
12985 aarch64_output_simd_mov_immediate (rtx const_vector,
12986                                    machine_mode mode,
12987                                    unsigned width)
12988 {
12989   bool is_valid;
12990   static char templ[40];
12991   const char *mnemonic;
12992   const char *shift_op;
12993   unsigned int lane_count = 0;
12994   char element_char;
12995
12996   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12997
12998   /* This will return true to show const_vector is legal for use as either
12999      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
13000      also update INFO to show how the immediate should be generated.  */
13001   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13002   gcc_assert (is_valid);
13003
13004   element_char = sizetochar (info.element_width);
13005   lane_count = width / info.element_width;
13006
13007   mode = GET_MODE_INNER (mode);
13008   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13009     {
13010       gcc_assert (info.shift == 0 && ! info.mvn);
13011       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13012          move immediate path.  */
13013       if (aarch64_float_const_zero_rtx_p (info.value))
13014         info.value = GEN_INT (0);
13015       else
13016         {
13017           const unsigned int buf_size = 20;
13018           char float_buf[buf_size] = {'\0'};
13019           real_to_decimal_for_mode (float_buf,
13020                                     CONST_DOUBLE_REAL_VALUE (info.value),
13021                                     buf_size, buf_size, 1, mode);
13022
13023           if (lane_count == 1)
13024             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13025           else
13026             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13027                       lane_count, element_char, float_buf);
13028           return templ;
13029         }
13030     }
13031
13032   mnemonic = info.mvn ? "mvni" : "movi";
13033   shift_op = info.msl ? "msl" : "lsl";
13034
13035   gcc_assert (CONST_INT_P (info.value));
13036   if (lane_count == 1)
13037     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13038               mnemonic, UINTVAL (info.value));
13039   else if (info.shift)
13040     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13041               ", %s %d", mnemonic, lane_count, element_char,
13042               UINTVAL (info.value), shift_op, info.shift);
13043   else
13044     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13045               mnemonic, lane_count, element_char, UINTVAL (info.value));
13046   return templ;
13047 }
13048
13049 char*
13050 aarch64_output_scalar_simd_mov_immediate (rtx immediate,  machine_mode mode)
13051 {
13052
13053   /* If a floating point number was passed and we desire to use it in an
13054      integer mode do the conversion to integer.  */
13055   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13056     {
13057       unsigned HOST_WIDE_INT ival;
13058       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13059           gcc_unreachable ();
13060       immediate = gen_int_mode (ival, mode);
13061     }
13062
13063   machine_mode vmode;
13064   /* use a 64 bit mode for everything except for DI/DF mode, where we use
13065      a 128 bit vector mode.  */
13066   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13067
13068   gcc_assert (!VECTOR_MODE_P (mode));
13069   vmode = aarch64_simd_container_mode (mode, width);
13070   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13071   return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13072 }
13073
13074 /* Split operands into moves from op[1] + op[2] into op[0].  */
13075
13076 void
13077 aarch64_split_combinev16qi (rtx operands[3])
13078 {
13079   unsigned int dest = REGNO (operands[0]);
13080   unsigned int src1 = REGNO (operands[1]);
13081   unsigned int src2 = REGNO (operands[2]);
13082   machine_mode halfmode = GET_MODE (operands[1]);
13083   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13084   rtx destlo, desthi;
13085
13086   gcc_assert (halfmode == V16QImode);
13087
13088   if (src1 == dest && src2 == dest + halfregs)
13089     {
13090       /* No-op move.  Can't split to nothing; emit something.  */
13091       emit_note (NOTE_INSN_DELETED);
13092       return;
13093     }
13094
13095   /* Preserve register attributes for variable tracking.  */
13096   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13097   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13098                                GET_MODE_SIZE (halfmode));
13099
13100   /* Special case of reversed high/low parts.  */
13101   if (reg_overlap_mentioned_p (operands[2], destlo)
13102       && reg_overlap_mentioned_p (operands[1], desthi))
13103     {
13104       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13105       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13106       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13107     }
13108   else if (!reg_overlap_mentioned_p (operands[2], destlo))
13109     {
13110       /* Try to avoid unnecessary moves if part of the result
13111          is in the right place already.  */
13112       if (src1 != dest)
13113         emit_move_insn (destlo, operands[1]);
13114       if (src2 != dest + halfregs)
13115         emit_move_insn (desthi, operands[2]);
13116     }
13117   else
13118     {
13119       if (src2 != dest + halfregs)
13120         emit_move_insn (desthi, operands[2]);
13121       if (src1 != dest)
13122         emit_move_insn (destlo, operands[1]);
13123     }
13124 }
13125
13126 /* vec_perm support.  */
13127
13128 #define MAX_VECT_LEN 16
13129
13130 struct expand_vec_perm_d
13131 {
13132   rtx target, op0, op1;
13133   unsigned char perm[MAX_VECT_LEN];
13134   machine_mode vmode;
13135   unsigned char nelt;
13136   bool one_vector_p;
13137   bool testing_p;
13138 };
13139
13140 /* Generate a variable permutation.  */
13141
13142 static void
13143 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13144 {
13145   machine_mode vmode = GET_MODE (target);
13146   bool one_vector_p = rtx_equal_p (op0, op1);
13147
13148   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13149   gcc_checking_assert (GET_MODE (op0) == vmode);
13150   gcc_checking_assert (GET_MODE (op1) == vmode);
13151   gcc_checking_assert (GET_MODE (sel) == vmode);
13152   gcc_checking_assert (TARGET_SIMD);
13153
13154   if (one_vector_p)
13155     {
13156       if (vmode == V8QImode)
13157         {
13158           /* Expand the argument to a V16QI mode by duplicating it.  */
13159           rtx pair = gen_reg_rtx (V16QImode);
13160           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13161           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13162         }
13163       else
13164         {
13165           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13166         }
13167     }
13168   else
13169     {
13170       rtx pair;
13171
13172       if (vmode == V8QImode)
13173         {
13174           pair = gen_reg_rtx (V16QImode);
13175           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13176           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13177         }
13178       else
13179         {
13180           pair = gen_reg_rtx (OImode);
13181           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13182           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13183         }
13184     }
13185 }
13186
13187 void
13188 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13189 {
13190   machine_mode vmode = GET_MODE (target);
13191   unsigned int nelt = GET_MODE_NUNITS (vmode);
13192   bool one_vector_p = rtx_equal_p (op0, op1);
13193   rtx mask;
13194
13195   /* The TBL instruction does not use a modulo index, so we must take care
13196      of that ourselves.  */
13197   mask = aarch64_simd_gen_const_vector_dup (vmode,
13198       one_vector_p ? nelt - 1 : 2 * nelt - 1);
13199   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13200
13201   /* For big-endian, we also need to reverse the index within the vector
13202      (but not which vector).  */
13203   if (BYTES_BIG_ENDIAN)
13204     {
13205       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
13206       if (!one_vector_p)
13207         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13208       sel = expand_simple_binop (vmode, XOR, sel, mask,
13209                                  NULL, 0, OPTAB_LIB_WIDEN);
13210     }
13211   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13212 }
13213
13214 /* Recognize patterns suitable for the TRN instructions.  */
13215 static bool
13216 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13217 {
13218   unsigned int i, odd, mask, nelt = d->nelt;
13219   rtx out, in0, in1, x;
13220   rtx (*gen) (rtx, rtx, rtx);
13221   machine_mode vmode = d->vmode;
13222
13223   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13224     return false;
13225
13226   /* Note that these are little-endian tests.
13227      We correct for big-endian later.  */
13228   if (d->perm[0] == 0)
13229     odd = 0;
13230   else if (d->perm[0] == 1)
13231     odd = 1;
13232   else
13233     return false;
13234   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13235
13236   for (i = 0; i < nelt; i += 2)
13237     {
13238       if (d->perm[i] != i + odd)
13239         return false;
13240       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13241         return false;
13242     }
13243
13244   /* Success!  */
13245   if (d->testing_p)
13246     return true;
13247
13248   in0 = d->op0;
13249   in1 = d->op1;
13250   if (BYTES_BIG_ENDIAN)
13251     {
13252       x = in0, in0 = in1, in1 = x;
13253       odd = !odd;
13254     }
13255   out = d->target;
13256
13257   if (odd)
13258     {
13259       switch (vmode)
13260         {
13261         case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13262         case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13263         case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13264         case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13265         case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13266         case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13267         case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13268         case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13269         case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13270         case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13271         case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13272         case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13273         default:
13274           return false;
13275         }
13276     }
13277   else
13278     {
13279       switch (vmode)
13280         {
13281         case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13282         case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13283         case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13284         case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13285         case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13286         case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13287         case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13288         case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13289         case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13290         case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13291         case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13292         case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13293         default:
13294           return false;
13295         }
13296     }
13297
13298   emit_insn (gen (out, in0, in1));
13299   return true;
13300 }
13301
13302 /* Recognize patterns suitable for the UZP instructions.  */
13303 static bool
13304 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13305 {
13306   unsigned int i, odd, mask, nelt = d->nelt;
13307   rtx out, in0, in1, x;
13308   rtx (*gen) (rtx, rtx, rtx);
13309   machine_mode vmode = d->vmode;
13310
13311   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13312     return false;
13313
13314   /* Note that these are little-endian tests.
13315      We correct for big-endian later.  */
13316   if (d->perm[0] == 0)
13317     odd = 0;
13318   else if (d->perm[0] == 1)
13319     odd = 1;
13320   else
13321     return false;
13322   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13323
13324   for (i = 0; i < nelt; i++)
13325     {
13326       unsigned elt = (i * 2 + odd) & mask;
13327       if (d->perm[i] != elt)
13328         return false;
13329     }
13330
13331   /* Success!  */
13332   if (d->testing_p)
13333     return true;
13334
13335   in0 = d->op0;
13336   in1 = d->op1;
13337   if (BYTES_BIG_ENDIAN)
13338     {
13339       x = in0, in0 = in1, in1 = x;
13340       odd = !odd;
13341     }
13342   out = d->target;
13343
13344   if (odd)
13345     {
13346       switch (vmode)
13347         {
13348         case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13349         case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13350         case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13351         case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13352         case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13353         case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13354         case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13355         case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13356         case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13357         case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13358         case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13359         case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13360         default:
13361           return false;
13362         }
13363     }
13364   else
13365     {
13366       switch (vmode)
13367         {
13368         case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13369         case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13370         case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13371         case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13372         case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13373         case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13374         case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13375         case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13376         case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13377         case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13378         case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13379         case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13380         default:
13381           return false;
13382         }
13383     }
13384
13385   emit_insn (gen (out, in0, in1));
13386   return true;
13387 }
13388
13389 /* Recognize patterns suitable for the ZIP instructions.  */
13390 static bool
13391 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13392 {
13393   unsigned int i, high, mask, nelt = d->nelt;
13394   rtx out, in0, in1, x;
13395   rtx (*gen) (rtx, rtx, rtx);
13396   machine_mode vmode = d->vmode;
13397
13398   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13399     return false;
13400
13401   /* Note that these are little-endian tests.
13402      We correct for big-endian later.  */
13403   high = nelt / 2;
13404   if (d->perm[0] == high)
13405     /* Do Nothing.  */
13406     ;
13407   else if (d->perm[0] == 0)
13408     high = 0;
13409   else
13410     return false;
13411   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13412
13413   for (i = 0; i < nelt / 2; i++)
13414     {
13415       unsigned elt = (i + high) & mask;
13416       if (d->perm[i * 2] != elt)
13417         return false;
13418       elt = (elt + nelt) & mask;
13419       if (d->perm[i * 2 + 1] != elt)
13420         return false;
13421     }
13422
13423   /* Success!  */
13424   if (d->testing_p)
13425     return true;
13426
13427   in0 = d->op0;
13428   in1 = d->op1;
13429   if (BYTES_BIG_ENDIAN)
13430     {
13431       x = in0, in0 = in1, in1 = x;
13432       high = !high;
13433     }
13434   out = d->target;
13435
13436   if (high)
13437     {
13438       switch (vmode)
13439         {
13440         case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13441         case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13442         case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13443         case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13444         case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13445         case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13446         case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13447         case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13448         case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13449         case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13450         case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13451         case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13452         default:
13453           return false;
13454         }
13455     }
13456   else
13457     {
13458       switch (vmode)
13459         {
13460         case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13461         case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13462         case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13463         case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13464         case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13465         case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13466         case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13467         case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13468         case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13469         case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13470         case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13471         case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13472         default:
13473           return false;
13474         }
13475     }
13476
13477   emit_insn (gen (out, in0, in1));
13478   return true;
13479 }
13480
13481 /* Recognize patterns for the EXT insn.  */
13482
13483 static bool
13484 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13485 {
13486   unsigned int i, nelt = d->nelt;
13487   rtx (*gen) (rtx, rtx, rtx, rtx);
13488   rtx offset;
13489
13490   unsigned int location = d->perm[0]; /* Always < nelt.  */
13491
13492   /* Check if the extracted indices are increasing by one.  */
13493   for (i = 1; i < nelt; i++)
13494     {
13495       unsigned int required = location + i;
13496       if (d->one_vector_p)
13497         {
13498           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13499           required &= (nelt - 1);
13500         }
13501       if (d->perm[i] != required)
13502         return false;
13503     }
13504
13505   switch (d->vmode)
13506     {
13507     case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13508     case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13509     case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13510     case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13511     case E_V2SImode: gen = gen_aarch64_extv2si; break;
13512     case E_V4SImode: gen = gen_aarch64_extv4si; break;
13513     case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13514     case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13515     case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13516     case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13517     case E_V2DImode: gen = gen_aarch64_extv2di; break;
13518     case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13519     default:
13520       return false;
13521     }
13522
13523   /* Success! */
13524   if (d->testing_p)
13525     return true;
13526
13527   /* The case where (location == 0) is a no-op for both big- and little-endian,
13528      and is removed by the mid-end at optimization levels -O1 and higher.  */
13529
13530   if (BYTES_BIG_ENDIAN && (location != 0))
13531     {
13532       /* After setup, we want the high elements of the first vector (stored
13533          at the LSB end of the register), and the low elements of the second
13534          vector (stored at the MSB end of the register). So swap.  */
13535       std::swap (d->op0, d->op1);
13536       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13537       location = nelt - location;
13538     }
13539
13540   offset = GEN_INT (location);
13541   emit_insn (gen (d->target, d->op0, d->op1, offset));
13542   return true;
13543 }
13544
13545 /* Recognize patterns for the REV insns.  */
13546
13547 static bool
13548 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13549 {
13550   unsigned int i, j, diff, nelt = d->nelt;
13551   rtx (*gen) (rtx, rtx);
13552
13553   if (!d->one_vector_p)
13554     return false;
13555
13556   diff = d->perm[0];
13557   switch (diff)
13558     {
13559     case 7:
13560       switch (d->vmode)
13561         {
13562         case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13563         case E_V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13564         default:
13565           return false;
13566         }
13567       break;
13568     case 3:
13569       switch (d->vmode)
13570         {
13571         case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13572         case E_V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13573         case E_V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13574         case E_V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13575         default:
13576           return false;
13577         }
13578       break;
13579     case 1:
13580       switch (d->vmode)
13581         {
13582         case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13583         case E_V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13584         case E_V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13585         case E_V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13586         case E_V4SImode: gen = gen_aarch64_rev64v4si;  break;
13587         case E_V2SImode: gen = gen_aarch64_rev64v2si;  break;
13588         case E_V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13589         case E_V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13590         case E_V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13591         case E_V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13592         default:
13593           return false;
13594         }
13595       break;
13596     default:
13597       return false;
13598     }
13599
13600   for (i = 0; i < nelt ; i += diff + 1)
13601     for (j = 0; j <= diff; j += 1)
13602       {
13603         /* This is guaranteed to be true as the value of diff
13604            is 7, 3, 1 and we should have enough elements in the
13605            queue to generate this.  Getting a vector mask with a
13606            value of diff other than these values implies that
13607            something is wrong by the time we get here.  */
13608         gcc_assert (i + j < nelt);
13609         if (d->perm[i + j] != i + diff - j)
13610           return false;
13611       }
13612
13613   /* Success! */
13614   if (d->testing_p)
13615     return true;
13616
13617   emit_insn (gen (d->target, d->op0));
13618   return true;
13619 }
13620
13621 static bool
13622 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13623 {
13624   rtx (*gen) (rtx, rtx, rtx);
13625   rtx out = d->target;
13626   rtx in0;
13627   machine_mode vmode = d->vmode;
13628   unsigned int i, elt, nelt = d->nelt;
13629   rtx lane;
13630
13631   elt = d->perm[0];
13632   for (i = 1; i < nelt; i++)
13633     {
13634       if (elt != d->perm[i])
13635         return false;
13636     }
13637
13638   /* The generic preparation in aarch64_expand_vec_perm_const_1
13639      swaps the operand order and the permute indices if it finds
13640      d->perm[0] to be in the second operand.  Thus, we can always
13641      use d->op0 and need not do any extra arithmetic to get the
13642      correct lane number.  */
13643   in0 = d->op0;
13644   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13645
13646   switch (vmode)
13647     {
13648     case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13649     case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13650     case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13651     case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13652     case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13653     case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13654     case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13655     case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13656     case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13657     case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13658     case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13659     case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13660     default:
13661       return false;
13662     }
13663
13664   emit_insn (gen (out, in0, lane));
13665   return true;
13666 }
13667
13668 static bool
13669 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13670 {
13671   rtx rperm[MAX_VECT_LEN], sel;
13672   machine_mode vmode = d->vmode;
13673   unsigned int i, nelt = d->nelt;
13674
13675   if (d->testing_p)
13676     return true;
13677
13678   /* Generic code will try constant permutation twice.  Once with the
13679      original mode and again with the elements lowered to QImode.
13680      So wait and don't do the selector expansion ourselves.  */
13681   if (vmode != V8QImode && vmode != V16QImode)
13682     return false;
13683
13684   for (i = 0; i < nelt; ++i)
13685     {
13686       int nunits = GET_MODE_NUNITS (vmode);
13687
13688       /* If big-endian and two vectors we end up with a weird mixed-endian
13689          mode on NEON.  Reverse the index within each word but not the word
13690          itself.  */
13691       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13692                                            : d->perm[i]);
13693     }
13694   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13695   sel = force_reg (vmode, sel);
13696
13697   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13698   return true;
13699 }
13700
13701 static bool
13702 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13703 {
13704   /* The pattern matching functions above are written to look for a small
13705      number to begin the sequence (0, 1, N/2).  If we begin with an index
13706      from the second operand, we can swap the operands.  */
13707   if (d->perm[0] >= d->nelt)
13708     {
13709       unsigned i, nelt = d->nelt;
13710
13711       gcc_assert (nelt == (nelt & -nelt));
13712       for (i = 0; i < nelt; ++i)
13713         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13714
13715       std::swap (d->op0, d->op1);
13716     }
13717
13718   if (TARGET_SIMD)
13719     {
13720       if (aarch64_evpc_rev (d))
13721         return true;
13722       else if (aarch64_evpc_ext (d))
13723         return true;
13724       else if (aarch64_evpc_dup (d))
13725         return true;
13726       else if (aarch64_evpc_zip (d))
13727         return true;
13728       else if (aarch64_evpc_uzp (d))
13729         return true;
13730       else if (aarch64_evpc_trn (d))
13731         return true;
13732       return aarch64_evpc_tbl (d);
13733     }
13734   return false;
13735 }
13736
13737 /* Expand a vec_perm_const pattern.  */
13738
13739 bool
13740 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13741 {
13742   struct expand_vec_perm_d d;
13743   int i, nelt, which;
13744
13745   d.target = target;
13746   d.op0 = op0;
13747   d.op1 = op1;
13748
13749   d.vmode = GET_MODE (target);
13750   gcc_assert (VECTOR_MODE_P (d.vmode));
13751   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13752   d.testing_p = false;
13753
13754   for (i = which = 0; i < nelt; ++i)
13755     {
13756       rtx e = XVECEXP (sel, 0, i);
13757       int ei = INTVAL (e) & (2 * nelt - 1);
13758       which |= (ei < nelt ? 1 : 2);
13759       d.perm[i] = ei;
13760     }
13761
13762   switch (which)
13763     {
13764     default:
13765       gcc_unreachable ();
13766
13767     case 3:
13768       d.one_vector_p = false;
13769       if (!rtx_equal_p (op0, op1))
13770         break;
13771
13772       /* The elements of PERM do not suggest that only the first operand
13773          is used, but both operands are identical.  Allow easier matching
13774          of the permutation by folding the permutation into the single
13775          input vector.  */
13776       /* Fall Through.  */
13777     case 2:
13778       for (i = 0; i < nelt; ++i)
13779         d.perm[i] &= nelt - 1;
13780       d.op0 = op1;
13781       d.one_vector_p = true;
13782       break;
13783
13784     case 1:
13785       d.op1 = op0;
13786       d.one_vector_p = true;
13787       break;
13788     }
13789
13790   return aarch64_expand_vec_perm_const_1 (&d);
13791 }
13792
13793 static bool
13794 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13795                                      const unsigned char *sel)
13796 {
13797   struct expand_vec_perm_d d;
13798   unsigned int i, nelt, which;
13799   bool ret;
13800
13801   d.vmode = vmode;
13802   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13803   d.testing_p = true;
13804   memcpy (d.perm, sel, nelt);
13805
13806   /* Calculate whether all elements are in one vector.  */
13807   for (i = which = 0; i < nelt; ++i)
13808     {
13809       unsigned char e = d.perm[i];
13810       gcc_assert (e < 2 * nelt);
13811       which |= (e < nelt ? 1 : 2);
13812     }
13813
13814   /* If all elements are from the second vector, reindex as if from the
13815      first vector.  */
13816   if (which == 2)
13817     for (i = 0; i < nelt; ++i)
13818       d.perm[i] -= nelt;
13819
13820   /* Check whether the mask can be applied to a single vector.  */
13821   d.one_vector_p = (which != 3);
13822
13823   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13824   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13825   if (!d.one_vector_p)
13826     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13827
13828   start_sequence ();
13829   ret = aarch64_expand_vec_perm_const_1 (&d);
13830   end_sequence ();
13831
13832   return ret;
13833 }
13834
13835 rtx
13836 aarch64_reverse_mask (machine_mode mode)
13837 {
13838   /* We have to reverse each vector because we dont have
13839      a permuted load that can reverse-load according to ABI rules.  */
13840   rtx mask;
13841   rtvec v = rtvec_alloc (16);
13842   int i, j;
13843   int nunits = GET_MODE_NUNITS (mode);
13844   int usize = GET_MODE_UNIT_SIZE (mode);
13845
13846   gcc_assert (BYTES_BIG_ENDIAN);
13847   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13848
13849   for (i = 0; i < nunits; i++)
13850     for (j = 0; j < usize; j++)
13851       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13852   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13853   return force_reg (V16QImode, mask);
13854 }
13855
13856 /* Implement MODES_TIEABLE_P.  In principle we should always return true.
13857    However due to issues with register allocation it is preferable to avoid
13858    tieing integer scalar and FP scalar modes.  Executing integer operations
13859    in general registers is better than treating them as scalar vector
13860    operations.  This reduces latency and avoids redundant int<->FP moves.
13861    So tie modes if they are either the same class, or vector modes with
13862    other vector modes, vector structs or any scalar mode.
13863 */
13864
13865 bool
13866 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13867 {
13868   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13869     return true;
13870
13871   /* We specifically want to allow elements of "structure" modes to
13872      be tieable to the structure.  This more general condition allows
13873      other rarer situations too.  */
13874   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13875     return true;
13876
13877   /* Also allow any scalar modes with vectors.  */
13878   if (aarch64_vector_mode_supported_p (mode1)
13879       || aarch64_vector_mode_supported_p (mode2))
13880     return true;
13881
13882   return false;
13883 }
13884
13885 /* Return a new RTX holding the result of moving POINTER forward by
13886    AMOUNT bytes.  */
13887
13888 static rtx
13889 aarch64_move_pointer (rtx pointer, int amount)
13890 {
13891   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13892
13893   return adjust_automodify_address (pointer, GET_MODE (pointer),
13894                                     next, amount);
13895 }
13896
13897 /* Return a new RTX holding the result of moving POINTER forward by the
13898    size of the mode it points to.  */
13899
13900 static rtx
13901 aarch64_progress_pointer (rtx pointer)
13902 {
13903   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13904
13905   return aarch64_move_pointer (pointer, amount);
13906 }
13907
13908 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13909    MODE bytes.  */
13910
13911 static void
13912 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13913                                               machine_mode mode)
13914 {
13915   rtx reg = gen_reg_rtx (mode);
13916
13917   /* "Cast" the pointers to the correct mode.  */
13918   *src = adjust_address (*src, mode, 0);
13919   *dst = adjust_address (*dst, mode, 0);
13920   /* Emit the memcpy.  */
13921   emit_move_insn (reg, *src);
13922   emit_move_insn (*dst, reg);
13923   /* Move the pointers forward.  */
13924   *src = aarch64_progress_pointer (*src);
13925   *dst = aarch64_progress_pointer (*dst);
13926 }
13927
13928 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13929    we succeed, otherwise return false.  */
13930
13931 bool
13932 aarch64_expand_movmem (rtx *operands)
13933 {
13934   unsigned int n;
13935   rtx dst = operands[0];
13936   rtx src = operands[1];
13937   rtx base;
13938   bool speed_p = !optimize_function_for_size_p (cfun);
13939
13940   /* When optimizing for size, give a better estimate of the length of a
13941      memcpy call, but use the default otherwise.  */
13942   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13943
13944   /* We can't do anything smart if the amount to copy is not constant.  */
13945   if (!CONST_INT_P (operands[2]))
13946     return false;
13947
13948   n = UINTVAL (operands[2]);
13949
13950   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13951      need to make at most two moves.  For cases above 16 bytes it will be one
13952      move for each 16 byte chunk, then at most two additional moves.  */
13953   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13954     return false;
13955
13956   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13957   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13958
13959   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13960   src = adjust_automodify_address (src, VOIDmode, base, 0);
13961
13962   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13963      1-byte chunk.  */
13964   if (n < 4)
13965     {
13966       if (n >= 2)
13967         {
13968           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13969           n -= 2;
13970         }
13971
13972       if (n == 1)
13973         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13974
13975       return true;
13976     }
13977
13978   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
13979      4-byte chunk, partially overlapping with the previously copied chunk.  */
13980   if (n < 8)
13981     {
13982       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13983       n -= 4;
13984       if (n > 0)
13985         {
13986           int move = n - 4;
13987
13988           src = aarch64_move_pointer (src, move);
13989           dst = aarch64_move_pointer (dst, move);
13990           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13991         }
13992       return true;
13993     }
13994
13995   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
13996      them, then (if applicable) an 8-byte chunk.  */
13997   while (n >= 8)
13998     {
13999       if (n / 16)
14000         {
14001           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14002           n -= 16;
14003         }
14004       else
14005         {
14006           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14007           n -= 8;
14008         }
14009     }
14010
14011   /* Finish the final bytes of the copy.  We can always do this in one
14012      instruction.  We either copy the exact amount we need, or partially
14013      overlap with the previous chunk we copied and copy 8-bytes.  */
14014   if (n == 0)
14015     return true;
14016   else if (n == 1)
14017     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14018   else if (n == 2)
14019     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14020   else if (n == 4)
14021     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14022   else
14023     {
14024       if (n == 3)
14025         {
14026           src = aarch64_move_pointer (src, -1);
14027           dst = aarch64_move_pointer (dst, -1);
14028           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14029         }
14030       else
14031         {
14032           int move = n - 8;
14033
14034           src = aarch64_move_pointer (src, move);
14035           dst = aarch64_move_pointer (dst, move);
14036           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14037         }
14038     }
14039
14040   return true;
14041 }
14042
14043 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14044    SImode stores.  Handle the case when the constant has identical
14045    bottom and top halves.  This is beneficial when the two stores can be
14046    merged into an STP and we avoid synthesising potentially expensive
14047    immediates twice.  Return true if such a split is possible.  */
14048
14049 bool
14050 aarch64_split_dimode_const_store (rtx dst, rtx src)
14051 {
14052   rtx lo = gen_lowpart (SImode, src);
14053   rtx hi = gen_highpart_mode (SImode, DImode, src);
14054
14055   bool size_p = optimize_function_for_size_p (cfun);
14056
14057   if (!rtx_equal_p (lo, hi))
14058     return false;
14059
14060   unsigned int orig_cost
14061     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14062   unsigned int lo_cost
14063     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14064
14065   /* We want to transform:
14066      MOV        x1, 49370
14067      MOVK       x1, 0x140, lsl 16
14068      MOVK       x1, 0xc0da, lsl 32
14069      MOVK       x1, 0x140, lsl 48
14070      STR        x1, [x0]
14071    into:
14072      MOV        w1, 49370
14073      MOVK       w1, 0x140, lsl 16
14074      STP        w1, w1, [x0]
14075    So we want to perform this only when we save two instructions
14076    or more.  When optimizing for size, however, accept any code size
14077    savings we can.  */
14078   if (size_p && orig_cost <= lo_cost)
14079     return false;
14080
14081   if (!size_p
14082       && (orig_cost <= lo_cost + 1))
14083     return false;
14084
14085   rtx mem_lo = adjust_address (dst, SImode, 0);
14086   if (!aarch64_mem_pair_operand (mem_lo, SImode))
14087     return false;
14088
14089   rtx tmp_reg = gen_reg_rtx (SImode);
14090   aarch64_expand_mov_immediate (tmp_reg, lo);
14091   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14092   /* Don't emit an explicit store pair as this may not be always profitable.
14093      Let the sched-fusion logic decide whether to merge them.  */
14094   emit_move_insn (mem_lo, tmp_reg);
14095   emit_move_insn (mem_hi, tmp_reg);
14096
14097   return true;
14098 }
14099
14100 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
14101
14102 static unsigned HOST_WIDE_INT
14103 aarch64_asan_shadow_offset (void)
14104 {
14105   return (HOST_WIDE_INT_1 << 36);
14106 }
14107
14108 static bool
14109 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14110                                         unsigned int align,
14111                                         enum by_pieces_operation op,
14112                                         bool speed_p)
14113 {
14114   /* STORE_BY_PIECES can be used when copying a constant string, but
14115      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14116      For now we always fail this and let the move_by_pieces code copy
14117      the string from read-only memory.  */
14118   if (op == STORE_BY_PIECES)
14119     return false;
14120
14121   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14122 }
14123
14124 static rtx
14125 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14126                         int code, tree treeop0, tree treeop1)
14127 {
14128   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14129   rtx op0, op1;
14130   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14131   insn_code icode;
14132   struct expand_operand ops[4];
14133
14134   start_sequence ();
14135   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14136
14137   op_mode = GET_MODE (op0);
14138   if (op_mode == VOIDmode)
14139     op_mode = GET_MODE (op1);
14140
14141   switch (op_mode)
14142     {
14143     case E_QImode:
14144     case E_HImode:
14145     case E_SImode:
14146       cmp_mode = SImode;
14147       icode = CODE_FOR_cmpsi;
14148       break;
14149
14150     case E_DImode:
14151       cmp_mode = DImode;
14152       icode = CODE_FOR_cmpdi;
14153       break;
14154
14155     case E_SFmode:
14156       cmp_mode = SFmode;
14157       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14158       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14159       break;
14160
14161     case E_DFmode:
14162       cmp_mode = DFmode;
14163       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14164       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14165       break;
14166
14167     default:
14168       end_sequence ();
14169       return NULL_RTX;
14170     }
14171
14172   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14173   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14174   if (!op0 || !op1)
14175     {
14176       end_sequence ();
14177       return NULL_RTX;
14178     }
14179   *prep_seq = get_insns ();
14180   end_sequence ();
14181
14182   create_fixed_operand (&ops[0], op0);
14183   create_fixed_operand (&ops[1], op1);
14184
14185   start_sequence ();
14186   if (!maybe_expand_insn (icode, 2, ops))
14187     {
14188       end_sequence ();
14189       return NULL_RTX;
14190     }
14191   *gen_seq = get_insns ();
14192   end_sequence ();
14193
14194   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14195                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14196 }
14197
14198 static rtx
14199 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14200                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
14201 {
14202   rtx op0, op1, target;
14203   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14204   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14205   insn_code icode;
14206   struct expand_operand ops[6];
14207   int aarch64_cond;
14208
14209   push_to_sequence (*prep_seq);
14210   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14211
14212   op_mode = GET_MODE (op0);
14213   if (op_mode == VOIDmode)
14214     op_mode = GET_MODE (op1);
14215
14216   switch (op_mode)
14217     {
14218     case E_QImode:
14219     case E_HImode:
14220     case E_SImode:
14221       cmp_mode = SImode;
14222       icode = CODE_FOR_ccmpsi;
14223       break;
14224
14225     case E_DImode:
14226       cmp_mode = DImode;
14227       icode = CODE_FOR_ccmpdi;
14228       break;
14229
14230     case E_SFmode:
14231       cmp_mode = SFmode;
14232       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14233       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14234       break;
14235
14236     case E_DFmode:
14237       cmp_mode = DFmode;
14238       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14239       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14240       break;
14241
14242     default:
14243       end_sequence ();
14244       return NULL_RTX;
14245     }
14246
14247   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14248   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14249   if (!op0 || !op1)
14250     {
14251       end_sequence ();
14252       return NULL_RTX;
14253     }
14254   *prep_seq = get_insns ();
14255   end_sequence ();
14256
14257   target = gen_rtx_REG (cc_mode, CC_REGNUM);
14258   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14259
14260   if (bit_code != AND)
14261     {
14262       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14263                                                 GET_MODE (XEXP (prev, 0))),
14264                              VOIDmode, XEXP (prev, 0), const0_rtx);
14265       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14266     }
14267
14268   create_fixed_operand (&ops[0], XEXP (prev, 0));
14269   create_fixed_operand (&ops[1], target);
14270   create_fixed_operand (&ops[2], op0);
14271   create_fixed_operand (&ops[3], op1);
14272   create_fixed_operand (&ops[4], prev);
14273   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14274
14275   push_to_sequence (*gen_seq);
14276   if (!maybe_expand_insn (icode, 6, ops))
14277     {
14278       end_sequence ();
14279       return NULL_RTX;
14280     }
14281
14282   *gen_seq = get_insns ();
14283   end_sequence ();
14284
14285   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14286 }
14287
14288 #undef TARGET_GEN_CCMP_FIRST
14289 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14290
14291 #undef TARGET_GEN_CCMP_NEXT
14292 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14293
14294 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
14295    instruction fusion of some sort.  */
14296
14297 static bool
14298 aarch64_macro_fusion_p (void)
14299 {
14300   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14301 }
14302
14303
14304 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
14305    should be kept together during scheduling.  */
14306
14307 static bool
14308 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14309 {
14310   rtx set_dest;
14311   rtx prev_set = single_set (prev);
14312   rtx curr_set = single_set (curr);
14313   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
14314   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14315
14316   if (!aarch64_macro_fusion_p ())
14317     return false;
14318
14319   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14320     {
14321       /* We are trying to match:
14322          prev (mov)  == (set (reg r0) (const_int imm16))
14323          curr (movk) == (set (zero_extract (reg r0)
14324                                            (const_int 16)
14325                                            (const_int 16))
14326                              (const_int imm16_1))  */
14327
14328       set_dest = SET_DEST (curr_set);
14329
14330       if (GET_CODE (set_dest) == ZERO_EXTRACT
14331           && CONST_INT_P (SET_SRC (curr_set))
14332           && CONST_INT_P (SET_SRC (prev_set))
14333           && CONST_INT_P (XEXP (set_dest, 2))
14334           && INTVAL (XEXP (set_dest, 2)) == 16
14335           && REG_P (XEXP (set_dest, 0))
14336           && REG_P (SET_DEST (prev_set))
14337           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14338         {
14339           return true;
14340         }
14341     }
14342
14343   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14344     {
14345
14346       /*  We're trying to match:
14347           prev (adrp) == (set (reg r1)
14348                               (high (symbol_ref ("SYM"))))
14349           curr (add) == (set (reg r0)
14350                              (lo_sum (reg r1)
14351                                      (symbol_ref ("SYM"))))
14352           Note that r0 need not necessarily be the same as r1, especially
14353           during pre-regalloc scheduling.  */
14354
14355       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14356           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14357         {
14358           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14359               && REG_P (XEXP (SET_SRC (curr_set), 0))
14360               && REGNO (XEXP (SET_SRC (curr_set), 0))
14361                  == REGNO (SET_DEST (prev_set))
14362               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14363                               XEXP (SET_SRC (curr_set), 1)))
14364             return true;
14365         }
14366     }
14367
14368   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14369     {
14370
14371       /* We're trying to match:
14372          prev (movk) == (set (zero_extract (reg r0)
14373                                            (const_int 16)
14374                                            (const_int 32))
14375                              (const_int imm16_1))
14376          curr (movk) == (set (zero_extract (reg r0)
14377                                            (const_int 16)
14378                                            (const_int 48))
14379                              (const_int imm16_2))  */
14380
14381       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14382           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14383           && REG_P (XEXP (SET_DEST (prev_set), 0))
14384           && REG_P (XEXP (SET_DEST (curr_set), 0))
14385           && REGNO (XEXP (SET_DEST (prev_set), 0))
14386              == REGNO (XEXP (SET_DEST (curr_set), 0))
14387           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14388           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14389           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14390           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14391           && CONST_INT_P (SET_SRC (prev_set))
14392           && CONST_INT_P (SET_SRC (curr_set)))
14393         return true;
14394
14395     }
14396   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14397     {
14398       /* We're trying to match:
14399           prev (adrp) == (set (reg r0)
14400                               (high (symbol_ref ("SYM"))))
14401           curr (ldr) == (set (reg r1)
14402                              (mem (lo_sum (reg r0)
14403                                              (symbol_ref ("SYM")))))
14404                  or
14405           curr (ldr) == (set (reg r1)
14406                              (zero_extend (mem
14407                                            (lo_sum (reg r0)
14408                                                    (symbol_ref ("SYM"))))))  */
14409       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14410           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14411         {
14412           rtx curr_src = SET_SRC (curr_set);
14413
14414           if (GET_CODE (curr_src) == ZERO_EXTEND)
14415             curr_src = XEXP (curr_src, 0);
14416
14417           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14418               && REG_P (XEXP (XEXP (curr_src, 0), 0))
14419               && REGNO (XEXP (XEXP (curr_src, 0), 0))
14420                  == REGNO (SET_DEST (prev_set))
14421               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14422                               XEXP (SET_SRC (prev_set), 0)))
14423               return true;
14424         }
14425     }
14426
14427   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14428        && aarch_crypto_can_dual_issue (prev, curr))
14429     return true;
14430
14431   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14432       && any_condjump_p (curr))
14433     {
14434       enum attr_type prev_type = get_attr_type (prev);
14435
14436       unsigned int condreg1, condreg2;
14437       rtx cc_reg_1;
14438       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14439       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14440
14441       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14442           && prev
14443           && modified_in_p (cc_reg_1, prev))
14444         {
14445           /* FIXME: this misses some which is considered simple arthematic
14446              instructions for ThunderX.  Simple shifts are missed here.  */
14447           if (prev_type == TYPE_ALUS_SREG
14448               || prev_type == TYPE_ALUS_IMM
14449               || prev_type == TYPE_LOGICS_REG
14450               || prev_type == TYPE_LOGICS_IMM)
14451             return true;
14452         }
14453     }
14454
14455   if (prev_set
14456       && curr_set
14457       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14458       && any_condjump_p (curr))
14459     {
14460       /* We're trying to match:
14461           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14462           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
14463                                                          (const_int 0))
14464                                                  (label_ref ("SYM"))
14465                                                  (pc))  */
14466       if (SET_DEST (curr_set) == (pc_rtx)
14467           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14468           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14469           && REG_P (SET_DEST (prev_set))
14470           && REGNO (SET_DEST (prev_set))
14471              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14472         {
14473           /* Fuse ALU operations followed by conditional branch instruction.  */
14474           switch (get_attr_type (prev))
14475             {
14476             case TYPE_ALU_IMM:
14477             case TYPE_ALU_SREG:
14478             case TYPE_ADC_REG:
14479             case TYPE_ADC_IMM:
14480             case TYPE_ADCS_REG:
14481             case TYPE_ADCS_IMM:
14482             case TYPE_LOGIC_REG:
14483             case TYPE_LOGIC_IMM:
14484             case TYPE_CSEL:
14485             case TYPE_ADR:
14486             case TYPE_MOV_IMM:
14487             case TYPE_SHIFT_REG:
14488             case TYPE_SHIFT_IMM:
14489             case TYPE_BFM:
14490             case TYPE_RBIT:
14491             case TYPE_REV:
14492             case TYPE_EXTEND:
14493               return true;
14494
14495             default:;
14496             }
14497         }
14498     }
14499
14500   return false;
14501 }
14502
14503 /* Return true iff the instruction fusion described by OP is enabled.  */
14504
14505 bool
14506 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14507 {
14508   return (aarch64_tune_params.fusible_ops & op) != 0;
14509 }
14510
14511 /* If MEM is in the form of [base+offset], extract the two parts
14512    of address and set to BASE and OFFSET, otherwise return false
14513    after clearing BASE and OFFSET.  */
14514
14515 bool
14516 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14517 {
14518   rtx addr;
14519
14520   gcc_assert (MEM_P (mem));
14521
14522   addr = XEXP (mem, 0);
14523
14524   if (REG_P (addr))
14525     {
14526       *base = addr;
14527       *offset = const0_rtx;
14528       return true;
14529     }
14530
14531   if (GET_CODE (addr) == PLUS
14532       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14533     {
14534       *base = XEXP (addr, 0);
14535       *offset = XEXP (addr, 1);
14536       return true;
14537     }
14538
14539   *base = NULL_RTX;
14540   *offset = NULL_RTX;
14541
14542   return false;
14543 }
14544
14545 /* Types for scheduling fusion.  */
14546 enum sched_fusion_type
14547 {
14548   SCHED_FUSION_NONE = 0,
14549   SCHED_FUSION_LD_SIGN_EXTEND,
14550   SCHED_FUSION_LD_ZERO_EXTEND,
14551   SCHED_FUSION_LD,
14552   SCHED_FUSION_ST,
14553   SCHED_FUSION_NUM
14554 };
14555
14556 /* If INSN is a load or store of address in the form of [base+offset],
14557    extract the two parts and set to BASE and OFFSET.  Return scheduling
14558    fusion type this INSN is.  */
14559
14560 static enum sched_fusion_type
14561 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14562 {
14563   rtx x, dest, src;
14564   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14565
14566   gcc_assert (INSN_P (insn));
14567   x = PATTERN (insn);
14568   if (GET_CODE (x) != SET)
14569     return SCHED_FUSION_NONE;
14570
14571   src = SET_SRC (x);
14572   dest = SET_DEST (x);
14573
14574   machine_mode dest_mode = GET_MODE (dest);
14575
14576   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14577     return SCHED_FUSION_NONE;
14578
14579   if (GET_CODE (src) == SIGN_EXTEND)
14580     {
14581       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14582       src = XEXP (src, 0);
14583       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14584         return SCHED_FUSION_NONE;
14585     }
14586   else if (GET_CODE (src) == ZERO_EXTEND)
14587     {
14588       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14589       src = XEXP (src, 0);
14590       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14591         return SCHED_FUSION_NONE;
14592     }
14593
14594   if (GET_CODE (src) == MEM && REG_P (dest))
14595     extract_base_offset_in_addr (src, base, offset);
14596   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14597     {
14598       fusion = SCHED_FUSION_ST;
14599       extract_base_offset_in_addr (dest, base, offset);
14600     }
14601   else
14602     return SCHED_FUSION_NONE;
14603
14604   if (*base == NULL_RTX || *offset == NULL_RTX)
14605     fusion = SCHED_FUSION_NONE;
14606
14607   return fusion;
14608 }
14609
14610 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14611
14612    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14613    and PRI are only calculated for these instructions.  For other instruction,
14614    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14615    type instruction fusion can be added by returning different priorities.
14616
14617    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14618
14619 static void
14620 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14621                                int *fusion_pri, int *pri)
14622 {
14623   int tmp, off_val;
14624   rtx base, offset;
14625   enum sched_fusion_type fusion;
14626
14627   gcc_assert (INSN_P (insn));
14628
14629   tmp = max_pri - 1;
14630   fusion = fusion_load_store (insn, &base, &offset);
14631   if (fusion == SCHED_FUSION_NONE)
14632     {
14633       *pri = tmp;
14634       *fusion_pri = tmp;
14635       return;
14636     }
14637
14638   /* Set FUSION_PRI according to fusion type and base register.  */
14639   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14640
14641   /* Calculate PRI.  */
14642   tmp /= 2;
14643
14644   /* INSN with smaller offset goes first.  */
14645   off_val = (int)(INTVAL (offset));
14646   if (off_val >= 0)
14647     tmp -= (off_val & 0xfffff);
14648   else
14649     tmp += ((- off_val) & 0xfffff);
14650
14651   *pri = tmp;
14652   return;
14653 }
14654
14655 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14656    Adjust priority of sha1h instructions so they are scheduled before
14657    other SHA1 instructions.  */
14658
14659 static int
14660 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14661 {
14662   rtx x = PATTERN (insn);
14663
14664   if (GET_CODE (x) == SET)
14665     {
14666       x = SET_SRC (x);
14667
14668       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14669         return priority + 10;
14670     }
14671
14672   return priority;
14673 }
14674
14675 /* Given OPERANDS of consecutive load/store, check if we can merge
14676    them into ldp/stp.  LOAD is true if they are load instructions.
14677    MODE is the mode of memory operands.  */
14678
14679 bool
14680 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14681                                 machine_mode mode)
14682 {
14683   HOST_WIDE_INT offval_1, offval_2, msize;
14684   enum reg_class rclass_1, rclass_2;
14685   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14686
14687   if (load)
14688     {
14689       mem_1 = operands[1];
14690       mem_2 = operands[3];
14691       reg_1 = operands[0];
14692       reg_2 = operands[2];
14693       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14694       if (REGNO (reg_1) == REGNO (reg_2))
14695         return false;
14696     }
14697   else
14698     {
14699       mem_1 = operands[0];
14700       mem_2 = operands[2];
14701       reg_1 = operands[1];
14702       reg_2 = operands[3];
14703     }
14704
14705   /* The mems cannot be volatile.  */
14706   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14707     return false;
14708
14709   /* If we have SImode and slow unaligned ldp,
14710      check the alignment to be at least 8 byte. */
14711   if (mode == SImode
14712       && (aarch64_tune_params.extra_tuning_flags
14713           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14714       && !optimize_size
14715       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14716     return false;
14717
14718   /* Check if the addresses are in the form of [base+offset].  */
14719   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14720   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14721     return false;
14722   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14723   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14724     return false;
14725
14726   /* Check if the bases are same.  */
14727   if (!rtx_equal_p (base_1, base_2))
14728     return false;
14729
14730   offval_1 = INTVAL (offset_1);
14731   offval_2 = INTVAL (offset_2);
14732   msize = GET_MODE_SIZE (mode);
14733   /* Check if the offsets are consecutive.  */
14734   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14735     return false;
14736
14737   /* Check if the addresses are clobbered by load.  */
14738   if (load)
14739     {
14740       if (reg_mentioned_p (reg_1, mem_1))
14741         return false;
14742
14743       /* In increasing order, the last load can clobber the address.  */
14744       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14745       return false;
14746     }
14747
14748   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14749     rclass_1 = FP_REGS;
14750   else
14751     rclass_1 = GENERAL_REGS;
14752
14753   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14754     rclass_2 = FP_REGS;
14755   else
14756     rclass_2 = GENERAL_REGS;
14757
14758   /* Check if the registers are of same class.  */
14759   if (rclass_1 != rclass_2)
14760     return false;
14761
14762   return true;
14763 }
14764
14765 /* Given OPERANDS of consecutive load/store, check if we can merge
14766    them into ldp/stp by adjusting the offset.  LOAD is true if they
14767    are load instructions.  MODE is the mode of memory operands.
14768
14769    Given below consecutive stores:
14770
14771      str  w1, [xb, 0x100]
14772      str  w1, [xb, 0x104]
14773      str  w1, [xb, 0x108]
14774      str  w1, [xb, 0x10c]
14775
14776    Though the offsets are out of the range supported by stp, we can
14777    still pair them after adjusting the offset, like:
14778
14779      add  scratch, xb, 0x100
14780      stp  w1, w1, [scratch]
14781      stp  w1, w1, [scratch, 0x8]
14782
14783    The peephole patterns detecting this opportunity should guarantee
14784    the scratch register is avaliable.  */
14785
14786 bool
14787 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14788                                        machine_mode mode)
14789 {
14790   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14791   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14792   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14793   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14794
14795   if (load)
14796     {
14797       reg_1 = operands[0];
14798       mem_1 = operands[1];
14799       reg_2 = operands[2];
14800       mem_2 = operands[3];
14801       reg_3 = operands[4];
14802       mem_3 = operands[5];
14803       reg_4 = operands[6];
14804       mem_4 = operands[7];
14805       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14806                   && REG_P (reg_3) && REG_P (reg_4));
14807       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14808         return false;
14809     }
14810   else
14811     {
14812       mem_1 = operands[0];
14813       reg_1 = operands[1];
14814       mem_2 = operands[2];
14815       reg_2 = operands[3];
14816       mem_3 = operands[4];
14817       reg_3 = operands[5];
14818       mem_4 = operands[6];
14819       reg_4 = operands[7];
14820     }
14821   /* Skip if memory operand is by itslef valid for ldp/stp.  */
14822   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14823     return false;
14824
14825   /* The mems cannot be volatile.  */
14826   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14827       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14828     return false;
14829
14830   /* Check if the addresses are in the form of [base+offset].  */
14831   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14832   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14833     return false;
14834   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14835   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14836     return false;
14837   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14838   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14839     return false;
14840   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14841   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14842     return false;
14843
14844   /* Check if the bases are same.  */
14845   if (!rtx_equal_p (base_1, base_2)
14846       || !rtx_equal_p (base_2, base_3)
14847       || !rtx_equal_p (base_3, base_4))
14848     return false;
14849
14850   offval_1 = INTVAL (offset_1);
14851   offval_2 = INTVAL (offset_2);
14852   offval_3 = INTVAL (offset_3);
14853   offval_4 = INTVAL (offset_4);
14854   msize = GET_MODE_SIZE (mode);
14855   /* Check if the offsets are consecutive.  */
14856   if ((offval_1 != (offval_2 + msize)
14857        || offval_1 != (offval_3 + msize * 2)
14858        || offval_1 != (offval_4 + msize * 3))
14859       && (offval_4 != (offval_3 + msize)
14860           || offval_4 != (offval_2 + msize * 2)
14861           || offval_4 != (offval_1 + msize * 3)))
14862     return false;
14863
14864   /* Check if the addresses are clobbered by load.  */
14865   if (load)
14866     {
14867       if (reg_mentioned_p (reg_1, mem_1)
14868           || reg_mentioned_p (reg_2, mem_2)
14869           || reg_mentioned_p (reg_3, mem_3))
14870         return false;
14871
14872       /* In increasing order, the last load can clobber the address.  */
14873       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14874         return false;
14875     }
14876
14877   /* If we have SImode and slow unaligned ldp,
14878      check the alignment to be at least 8 byte. */
14879   if (mode == SImode
14880       && (aarch64_tune_params.extra_tuning_flags
14881           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14882       && !optimize_size
14883       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14884     return false;
14885
14886   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14887     rclass_1 = FP_REGS;
14888   else
14889     rclass_1 = GENERAL_REGS;
14890
14891   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14892     rclass_2 = FP_REGS;
14893   else
14894     rclass_2 = GENERAL_REGS;
14895
14896   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14897     rclass_3 = FP_REGS;
14898   else
14899     rclass_3 = GENERAL_REGS;
14900
14901   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14902     rclass_4 = FP_REGS;
14903   else
14904     rclass_4 = GENERAL_REGS;
14905
14906   /* Check if the registers are of same class.  */
14907   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14908     return false;
14909
14910   return true;
14911 }
14912
14913 /* Given OPERANDS of consecutive load/store, this function pairs them
14914    into ldp/stp after adjusting the offset.  It depends on the fact
14915    that addresses of load/store instructions are in increasing order.
14916    MODE is the mode of memory operands.  CODE is the rtl operator
14917    which should be applied to all memory operands, it's SIGN_EXTEND,
14918    ZERO_EXTEND or UNKNOWN.  */
14919
14920 bool
14921 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14922                              machine_mode mode, RTX_CODE code)
14923 {
14924   rtx base, offset, t1, t2;
14925   rtx mem_1, mem_2, mem_3, mem_4;
14926   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14927
14928   if (load)
14929     {
14930       mem_1 = operands[1];
14931       mem_2 = operands[3];
14932       mem_3 = operands[5];
14933       mem_4 = operands[7];
14934     }
14935   else
14936     {
14937       mem_1 = operands[0];
14938       mem_2 = operands[2];
14939       mem_3 = operands[4];
14940       mem_4 = operands[6];
14941       gcc_assert (code == UNKNOWN);
14942     }
14943
14944   extract_base_offset_in_addr (mem_1, &base, &offset);
14945   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14946
14947   /* Adjust offset thus it can fit in ldp/stp instruction.  */
14948   msize = GET_MODE_SIZE (mode);
14949   stp_off_limit = msize * 0x40;
14950   off_val = INTVAL (offset);
14951   abs_off = (off_val < 0) ? -off_val : off_val;
14952   new_off = abs_off % stp_off_limit;
14953   adj_off = abs_off - new_off;
14954
14955   /* Further adjust to make sure all offsets are OK.  */
14956   if ((new_off + msize * 2) >= stp_off_limit)
14957     {
14958       adj_off += stp_off_limit;
14959       new_off -= stp_off_limit;
14960     }
14961
14962   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14963   if (adj_off >= 0x1000)
14964     return false;
14965
14966   if (off_val < 0)
14967     {
14968       adj_off = -adj_off;
14969       new_off = -new_off;
14970     }
14971
14972   /* Create new memory references.  */
14973   mem_1 = change_address (mem_1, VOIDmode,
14974                           plus_constant (DImode, operands[8], new_off));
14975
14976   /* Check if the adjusted address is OK for ldp/stp.  */
14977   if (!aarch64_mem_pair_operand (mem_1, mode))
14978     return false;
14979
14980   msize = GET_MODE_SIZE (mode);
14981   mem_2 = change_address (mem_2, VOIDmode,
14982                           plus_constant (DImode,
14983                                          operands[8],
14984                                          new_off + msize));
14985   mem_3 = change_address (mem_3, VOIDmode,
14986                           plus_constant (DImode,
14987                                          operands[8],
14988                                          new_off + msize * 2));
14989   mem_4 = change_address (mem_4, VOIDmode,
14990                           plus_constant (DImode,
14991                                          operands[8],
14992                                          new_off + msize * 3));
14993
14994   if (code == ZERO_EXTEND)
14995     {
14996       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14997       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14998       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14999       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15000     }
15001   else if (code == SIGN_EXTEND)
15002     {
15003       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15004       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15005       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15006       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15007     }
15008
15009   if (load)
15010     {
15011       operands[1] = mem_1;
15012       operands[3] = mem_2;
15013       operands[5] = mem_3;
15014       operands[7] = mem_4;
15015     }
15016   else
15017     {
15018       operands[0] = mem_1;
15019       operands[2] = mem_2;
15020       operands[4] = mem_3;
15021       operands[6] = mem_4;
15022     }
15023
15024   /* Emit adjusting instruction.  */
15025   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15026   /* Emit ldp/stp instructions.  */
15027   t1 = gen_rtx_SET (operands[0], operands[1]);
15028   t2 = gen_rtx_SET (operands[2], operands[3]);
15029   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15030   t1 = gen_rtx_SET (operands[4], operands[5]);
15031   t2 = gen_rtx_SET (operands[6], operands[7]);
15032   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15033   return true;
15034 }
15035
15036 /* Return 1 if pseudo register should be created and used to hold
15037    GOT address for PIC code.  */
15038
15039 bool
15040 aarch64_use_pseudo_pic_reg (void)
15041 {
15042   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15043 }
15044
15045 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
15046
15047 static int
15048 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15049 {
15050   switch (XINT (x, 1))
15051     {
15052     case UNSPEC_GOTSMALLPIC:
15053     case UNSPEC_GOTSMALLPIC28K:
15054     case UNSPEC_GOTTINYPIC:
15055       return 0;
15056     default:
15057       break;
15058     }
15059
15060   return default_unspec_may_trap_p (x, flags);
15061 }
15062
15063
15064 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15065    return the log2 of that value.  Otherwise return -1.  */
15066
15067 int
15068 aarch64_fpconst_pow_of_2 (rtx x)
15069 {
15070   const REAL_VALUE_TYPE *r;
15071
15072   if (!CONST_DOUBLE_P (x))
15073     return -1;
15074
15075   r = CONST_DOUBLE_REAL_VALUE (x);
15076
15077   if (REAL_VALUE_NEGATIVE (*r)
15078       || REAL_VALUE_ISNAN (*r)
15079       || REAL_VALUE_ISINF (*r)
15080       || !real_isinteger (r, DFmode))
15081     return -1;
15082
15083   return exact_log2 (real_to_integer (r));
15084 }
15085
15086 /* If X is a vector of equal CONST_DOUBLE values and that value is
15087    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
15088
15089 int
15090 aarch64_vec_fpconst_pow_of_2 (rtx x)
15091 {
15092   if (GET_CODE (x) != CONST_VECTOR)
15093     return -1;
15094
15095   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15096     return -1;
15097
15098   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15099   if (firstval <= 0)
15100     return -1;
15101
15102   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15103     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15104       return -1;
15105
15106   return firstval;
15107 }
15108
15109 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15110    to float.
15111
15112    __fp16 always promotes through this hook.
15113    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15114    through the generic excess precision logic rather than here.  */
15115
15116 static tree
15117 aarch64_promoted_type (const_tree t)
15118 {
15119   if (SCALAR_FLOAT_TYPE_P (t)
15120       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15121     return float_type_node;
15122
15123   return NULL_TREE;
15124 }
15125
15126 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
15127
15128 static bool
15129 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15130                            optimization_type opt_type)
15131 {
15132   switch (op)
15133     {
15134     case rsqrt_optab:
15135       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15136
15137     default:
15138       return true;
15139     }
15140 }
15141
15142 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15143    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15144
15145 static bool
15146 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15147 {
15148   return (mode == HFmode
15149           ? true
15150           : default_libgcc_floating_mode_supported_p (mode));
15151 }
15152
15153 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15154    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15155
15156 static bool
15157 aarch64_scalar_mode_supported_p (scalar_mode mode)
15158 {
15159   return (mode == HFmode
15160           ? true
15161           : default_scalar_mode_supported_p (mode));
15162 }
15163
15164 /* Set the value of FLT_EVAL_METHOD.
15165    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15166
15167     0: evaluate all operations and constants, whose semantic type has at
15168        most the range and precision of type float, to the range and
15169        precision of float; evaluate all other operations and constants to
15170        the range and precision of the semantic type;
15171
15172     N, where _FloatN is a supported interchange floating type
15173        evaluate all operations and constants, whose semantic type has at
15174        most the range and precision of _FloatN type, to the range and
15175        precision of the _FloatN type; evaluate all other operations and
15176        constants to the range and precision of the semantic type;
15177
15178    If we have the ARMv8.2-A extensions then we support _Float16 in native
15179    precision, so we should set this to 16.  Otherwise, we support the type,
15180    but want to evaluate expressions in float precision, so set this to
15181    0.  */
15182
15183 static enum flt_eval_method
15184 aarch64_excess_precision (enum excess_precision_type type)
15185 {
15186   switch (type)
15187     {
15188       case EXCESS_PRECISION_TYPE_FAST:
15189       case EXCESS_PRECISION_TYPE_STANDARD:
15190         /* We can calculate either in 16-bit range and precision or
15191            32-bit range and precision.  Make that decision based on whether
15192            we have native support for the ARMv8.2-A 16-bit floating-point
15193            instructions or not.  */
15194         return (TARGET_FP_F16INST
15195                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15196                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15197       case EXCESS_PRECISION_TYPE_IMPLICIT:
15198         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15199       default:
15200         gcc_unreachable ();
15201     }
15202   return FLT_EVAL_METHOD_UNPREDICTABLE;
15203 }
15204
15205 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
15206    scheduled for speculative execution.  Reject the long-running division
15207    and square-root instructions.  */
15208
15209 static bool
15210 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15211 {
15212   switch (get_attr_type (insn))
15213     {
15214       case TYPE_SDIV:
15215       case TYPE_UDIV:
15216       case TYPE_FDIVS:
15217       case TYPE_FDIVD:
15218       case TYPE_FSQRTS:
15219       case TYPE_FSQRTD:
15220       case TYPE_NEON_FP_SQRT_S:
15221       case TYPE_NEON_FP_SQRT_D:
15222       case TYPE_NEON_FP_SQRT_S_Q:
15223       case TYPE_NEON_FP_SQRT_D_Q:
15224       case TYPE_NEON_FP_DIV_S:
15225       case TYPE_NEON_FP_DIV_D:
15226       case TYPE_NEON_FP_DIV_S_Q:
15227       case TYPE_NEON_FP_DIV_D_Q:
15228         return false;
15229       default:
15230         return true;
15231     }
15232 }
15233
15234 /* Target-specific selftests.  */
15235
15236 #if CHECKING_P
15237
15238 namespace selftest {
15239
15240 /* Selftest for the RTL loader.
15241    Verify that the RTL loader copes with a dump from
15242    print_rtx_function.  This is essentially just a test that class
15243    function_reader can handle a real dump, but it also verifies
15244    that lookup_reg_by_dump_name correctly handles hard regs.
15245    The presence of hard reg names in the dump means that the test is
15246    target-specific, hence it is in this file.  */
15247
15248 static void
15249 aarch64_test_loading_full_dump ()
15250 {
15251   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15252
15253   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15254
15255   rtx_insn *insn_1 = get_insn_by_uid (1);
15256   ASSERT_EQ (NOTE, GET_CODE (insn_1));
15257
15258   rtx_insn *insn_15 = get_insn_by_uid (15);
15259   ASSERT_EQ (INSN, GET_CODE (insn_15));
15260   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15261
15262   /* Verify crtl->return_rtx.  */
15263   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15264   ASSERT_EQ (0, REGNO (crtl->return_rtx));
15265   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15266 }
15267
15268 /* Run all target-specific selftests.  */
15269
15270 static void
15271 aarch64_run_selftests (void)
15272 {
15273   aarch64_test_loading_full_dump ();
15274 }
15275
15276 } // namespace selftest
15277
15278 #endif /* #if CHECKING_P */
15279
15280 #undef TARGET_ADDRESS_COST
15281 #define TARGET_ADDRESS_COST aarch64_address_cost
15282
15283 /* This hook will determines whether unnamed bitfields affect the alignment
15284    of the containing structure.  The hook returns true if the structure
15285    should inherit the alignment requirements of an unnamed bitfield's
15286    type.  */
15287 #undef TARGET_ALIGN_ANON_BITFIELD
15288 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15289
15290 #undef TARGET_ASM_ALIGNED_DI_OP
15291 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15292
15293 #undef TARGET_ASM_ALIGNED_HI_OP
15294 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15295
15296 #undef TARGET_ASM_ALIGNED_SI_OP
15297 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15298
15299 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15300 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15301   hook_bool_const_tree_hwi_hwi_const_tree_true
15302
15303 #undef TARGET_ASM_FILE_START
15304 #define TARGET_ASM_FILE_START aarch64_start_file
15305
15306 #undef TARGET_ASM_OUTPUT_MI_THUNK
15307 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15308
15309 #undef TARGET_ASM_SELECT_RTX_SECTION
15310 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15311
15312 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15313 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15314
15315 #undef TARGET_BUILD_BUILTIN_VA_LIST
15316 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15317
15318 #undef TARGET_CALLEE_COPIES
15319 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15320
15321 #undef TARGET_CAN_ELIMINATE
15322 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15323
15324 #undef TARGET_CAN_INLINE_P
15325 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15326
15327 #undef TARGET_CANNOT_FORCE_CONST_MEM
15328 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15329
15330 #undef TARGET_CASE_VALUES_THRESHOLD
15331 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15332
15333 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15334 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15335
15336 /* Only the least significant bit is used for initialization guard
15337    variables.  */
15338 #undef TARGET_CXX_GUARD_MASK_BIT
15339 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15340
15341 #undef TARGET_C_MODE_FOR_SUFFIX
15342 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15343
15344 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15345 #undef  TARGET_DEFAULT_TARGET_FLAGS
15346 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15347 #endif
15348
15349 #undef TARGET_CLASS_MAX_NREGS
15350 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15351
15352 #undef TARGET_BUILTIN_DECL
15353 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15354
15355 #undef TARGET_BUILTIN_RECIPROCAL
15356 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15357
15358 #undef TARGET_C_EXCESS_PRECISION
15359 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15360
15361 #undef  TARGET_EXPAND_BUILTIN
15362 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15363
15364 #undef TARGET_EXPAND_BUILTIN_VA_START
15365 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15366
15367 #undef TARGET_FOLD_BUILTIN
15368 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15369
15370 #undef TARGET_FUNCTION_ARG
15371 #define TARGET_FUNCTION_ARG aarch64_function_arg
15372
15373 #undef TARGET_FUNCTION_ARG_ADVANCE
15374 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15375
15376 #undef TARGET_FUNCTION_ARG_BOUNDARY
15377 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15378
15379 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15380 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15381
15382 #undef TARGET_FUNCTION_VALUE
15383 #define TARGET_FUNCTION_VALUE aarch64_function_value
15384
15385 #undef TARGET_FUNCTION_VALUE_REGNO_P
15386 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15387
15388 #undef TARGET_FRAME_POINTER_REQUIRED
15389 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15390
15391 #undef TARGET_GIMPLE_FOLD_BUILTIN
15392 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15393
15394 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15395 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15396
15397 #undef  TARGET_INIT_BUILTINS
15398 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
15399
15400 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15401 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15402   aarch64_ira_change_pseudo_allocno_class
15403
15404 #undef TARGET_LEGITIMATE_ADDRESS_P
15405 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15406
15407 #undef TARGET_LEGITIMATE_CONSTANT_P
15408 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15409
15410 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15411 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15412   aarch64_legitimize_address_displacement
15413
15414 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15415 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15416
15417 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15418 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15419 aarch64_libgcc_floating_mode_supported_p
15420
15421 #undef TARGET_MANGLE_TYPE
15422 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15423
15424 #undef TARGET_MEMORY_MOVE_COST
15425 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15426
15427 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15428 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15429
15430 #undef TARGET_MUST_PASS_IN_STACK
15431 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15432
15433 /* This target hook should return true if accesses to volatile bitfields
15434    should use the narrowest mode possible.  It should return false if these
15435    accesses should use the bitfield container type.  */
15436 #undef TARGET_NARROW_VOLATILE_BITFIELD
15437 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15438
15439 #undef  TARGET_OPTION_OVERRIDE
15440 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15441
15442 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15443 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15444   aarch64_override_options_after_change
15445
15446 #undef TARGET_OPTION_SAVE
15447 #define TARGET_OPTION_SAVE aarch64_option_save
15448
15449 #undef TARGET_OPTION_RESTORE
15450 #define TARGET_OPTION_RESTORE aarch64_option_restore
15451
15452 #undef TARGET_OPTION_PRINT
15453 #define TARGET_OPTION_PRINT aarch64_option_print
15454
15455 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15456 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15457
15458 #undef TARGET_SET_CURRENT_FUNCTION
15459 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15460
15461 #undef TARGET_PASS_BY_REFERENCE
15462 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15463
15464 #undef TARGET_PREFERRED_RELOAD_CLASS
15465 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15466
15467 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15468 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15469
15470 #undef TARGET_PROMOTED_TYPE
15471 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15472
15473 #undef TARGET_SECONDARY_RELOAD
15474 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15475
15476 #undef TARGET_SHIFT_TRUNCATION_MASK
15477 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15478
15479 #undef TARGET_SETUP_INCOMING_VARARGS
15480 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15481
15482 #undef TARGET_STRUCT_VALUE_RTX
15483 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
15484
15485 #undef TARGET_REGISTER_MOVE_COST
15486 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15487
15488 #undef TARGET_RETURN_IN_MEMORY
15489 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15490
15491 #undef TARGET_RETURN_IN_MSB
15492 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15493
15494 #undef TARGET_RTX_COSTS
15495 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15496
15497 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15498 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15499
15500 #undef TARGET_SCHED_ISSUE_RATE
15501 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15502
15503 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15504 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15505   aarch64_sched_first_cycle_multipass_dfa_lookahead
15506
15507 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15508 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15509   aarch64_first_cycle_multipass_dfa_lookahead_guard
15510
15511 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15512 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15513   aarch64_get_separate_components
15514
15515 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15516 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15517   aarch64_components_for_bb
15518
15519 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15520 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15521   aarch64_disqualify_components
15522
15523 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15524 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15525   aarch64_emit_prologue_components
15526
15527 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15528 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15529   aarch64_emit_epilogue_components
15530
15531 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15532 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15533   aarch64_set_handled_components
15534
15535 #undef TARGET_TRAMPOLINE_INIT
15536 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15537
15538 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15539 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15540
15541 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15542 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15543
15544 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15545 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15546   aarch64_builtin_support_vector_misalignment
15547
15548 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15549 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15550
15551 #undef TARGET_VECTORIZE_ADD_STMT_COST
15552 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15553
15554 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15555 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15556   aarch64_builtin_vectorization_cost
15557
15558 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15559 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15560
15561 #undef TARGET_VECTORIZE_BUILTINS
15562 #define TARGET_VECTORIZE_BUILTINS
15563
15564 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15565 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15566   aarch64_builtin_vectorized_function
15567
15568 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15569 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15570   aarch64_autovectorize_vector_sizes
15571
15572 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15573 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15574   aarch64_atomic_assign_expand_fenv
15575
15576 /* Section anchor support.  */
15577
15578 #undef TARGET_MIN_ANCHOR_OFFSET
15579 #define TARGET_MIN_ANCHOR_OFFSET -256
15580
15581 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15582    byte offset; we can do much more for larger data types, but have no way
15583    to determine the size of the access.  We assume accesses are aligned.  */
15584 #undef TARGET_MAX_ANCHOR_OFFSET
15585 #define TARGET_MAX_ANCHOR_OFFSET 4095
15586
15587 #undef TARGET_VECTOR_ALIGNMENT
15588 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15589
15590 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15591 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15592   aarch64_simd_vector_alignment_reachable
15593
15594 /* vec_perm support.  */
15595
15596 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15597 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15598   aarch64_vectorize_vec_perm_const_ok
15599
15600 #undef TARGET_INIT_LIBFUNCS
15601 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15602
15603 #undef TARGET_FIXED_CONDITION_CODE_REGS
15604 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15605
15606 #undef TARGET_FLAGS_REGNUM
15607 #define TARGET_FLAGS_REGNUM CC_REGNUM
15608
15609 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15610 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15611
15612 #undef TARGET_ASAN_SHADOW_OFFSET
15613 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15614
15615 #undef TARGET_LEGITIMIZE_ADDRESS
15616 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15617
15618 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15619 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15620   aarch64_use_by_pieces_infrastructure_p
15621
15622 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15623 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15624
15625 #undef TARGET_CAN_USE_DOLOOP_P
15626 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15627
15628 #undef TARGET_SCHED_ADJUST_PRIORITY
15629 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15630
15631 #undef TARGET_SCHED_MACRO_FUSION_P
15632 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15633
15634 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15635 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15636
15637 #undef TARGET_SCHED_FUSION_PRIORITY
15638 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15639
15640 #undef TARGET_UNSPEC_MAY_TRAP_P
15641 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15642
15643 #undef TARGET_USE_PSEUDO_PIC_REG
15644 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15645
15646 #undef TARGET_PRINT_OPERAND
15647 #define TARGET_PRINT_OPERAND aarch64_print_operand
15648
15649 #undef TARGET_PRINT_OPERAND_ADDRESS
15650 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15651
15652 #undef TARGET_OPTAB_SUPPORTED_P
15653 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15654
15655 #undef TARGET_OMIT_STRUCT_RETURN_REG
15656 #define TARGET_OMIT_STRUCT_RETURN_REG true
15657
15658 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15659 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15660 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15661
15662 #if CHECKING_P
15663 #undef TARGET_RUN_TARGET_SELFTESTS
15664 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15665 #endif /* #if CHECKING_P */
15666
15667 struct gcc_target targetm = TARGET_INITIALIZER;
15668
15669 #include "gt-aarch64.h"