gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97
98 /* Defined for convenience. */
99 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
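/* For illustration: under the usual LP64 ABI, POINTER_SIZE is 64 bits and
   BITS_PER_UNIT is 8, so POINTER_BYTES is 8; under ILP32 it is 4.  */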
100
101 /* Classifies an address.
102
103 ADDRESS_REG_IMM
104 A simple base register plus immediate offset.
105
106 ADDRESS_REG_WB
107 A base register indexed by immediate offset with writeback.
108
109 ADDRESS_REG_REG
110 A base register indexed by (optionally scaled) register.
111
112 ADDRESS_REG_UXTW
113 A base register indexed by (optionally scaled) zero-extended register.
114
115 ADDRESS_REG_SXTW
116 A base register indexed by (optionally scaled) sign-extended register.
117
118 ADDRESS_LO_SUM
119 A LO_SUM rtx with a base register and "LO12" symbol relocation.
120
121 ADDRESS_SYMBOLIC
122 A constant symbolic address, in pc-relative literal pool. */
123
124 enum aarch64_address_type {
125 ADDRESS_REG_IMM,
126 ADDRESS_REG_WB,
127 ADDRESS_REG_REG,
128 ADDRESS_REG_UXTW,
129 ADDRESS_REG_SXTW,
130 ADDRESS_LO_SUM,
131 ADDRESS_SYMBOLIC
132 };
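/* Rough examples of the addressing forms classified above:
     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:foo]
     ADDRESS_SYMBOLIC   ldr x0, .Lpool_entry   (pc-relative literal)  */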
133
134 struct aarch64_address_info {
135 enum aarch64_address_type type;
136 rtx base;
137 rtx offset;
138 int shift;
139 enum aarch64_symbol_type symbol_type;
140 };
141
142 struct simd_immediate_info
143 {
144 rtx value;
145 int shift;
146 int element_width;
147 bool mvn;
148 bool msl;
149 };
150
151 /* The current code model. */
152 enum aarch64_code_model aarch64_cmodel;
153
154 #ifdef HAVE_AS_TLS
155 #undef TARGET_HAVE_TLS
156 #define TARGET_HAVE_TLS 1
157 #endif
158
159 static bool aarch64_lra_p (void);
160 static bool aarch64_composite_type_p (const_tree, machine_mode);
161 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
162 const_tree,
163 machine_mode *, int *,
164 bool *);
165 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
166 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_override_options_after_change (void);
168 static bool aarch64_vector_mode_supported_p (machine_mode);
169 static unsigned bit_count (unsigned HOST_WIDE_INT);
170 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
171 const unsigned char *sel);
172 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
173
174 /* Major revision number of the ARM Architecture implemented by the target. */
175 unsigned aarch64_architecture_version;
176
177 /* The processor for which instructions should be scheduled. */
178 enum aarch64_processor aarch64_tune = cortexa53;
179
180 /* The current tuning set. */
181 const struct tune_params *aarch64_tune_params;
182
183 /* Mask to specify which instructions we are allowed to generate. */
184 unsigned long aarch64_isa_flags = 0;
185
186 /* Mask to specify which instruction scheduling options should be used. */
187 unsigned long aarch64_tune_flags = 0;
188
189 /* Tuning parameters. */
190
191 #if HAVE_DESIGNATED_INITIALIZERS
192 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
193 #else
194 #define NAMED_PARAM(NAME, VAL) (VAL)
195 #endif
196
197 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
198 __extension__
199 #endif
200
201 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
202 __extension__
203 #endif
204 static const struct cpu_addrcost_table generic_addrcost_table =
205 {
206 #if HAVE_DESIGNATED_INITIALIZERS
207 .addr_scale_costs =
208 #endif
209 {
210 NAMED_PARAM (hi, 0),
211 NAMED_PARAM (si, 0),
212 NAMED_PARAM (di, 0),
213 NAMED_PARAM (ti, 0),
214 },
215 NAMED_PARAM (pre_modify, 0),
216 NAMED_PARAM (post_modify, 0),
217 NAMED_PARAM (register_offset, 0),
218 NAMED_PARAM (register_extend, 0),
219 NAMED_PARAM (imm_offset, 0)
220 };
221
222 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
223 __extension__
224 #endif
225 static const struct cpu_addrcost_table cortexa57_addrcost_table =
226 {
227 #if HAVE_DESIGNATED_INITIALIZERS
228 .addr_scale_costs =
229 #endif
230 {
231 NAMED_PARAM (hi, 1),
232 NAMED_PARAM (si, 0),
233 NAMED_PARAM (di, 0),
234 NAMED_PARAM (ti, 1),
235 },
236 NAMED_PARAM (pre_modify, 0),
237 NAMED_PARAM (post_modify, 0),
238 NAMED_PARAM (register_offset, 0),
239 NAMED_PARAM (register_extend, 0),
240 NAMED_PARAM (imm_offset, 0),
241 };
242
243 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
244 __extension__
245 #endif
246 static const struct cpu_regmove_cost generic_regmove_cost =
247 {
248 NAMED_PARAM (GP2GP, 1),
249 /* Avoid the use of slow int<->fp moves for spilling by setting
250 their cost higher than memmov_cost. */
251 NAMED_PARAM (GP2FP, 5),
252 NAMED_PARAM (FP2GP, 5),
253 NAMED_PARAM (FP2FP, 2)
254 };
255
256 static const struct cpu_regmove_cost cortexa57_regmove_cost =
257 {
258 NAMED_PARAM (GP2GP, 1),
259 /* Avoid the use of slow int<->fp moves for spilling by setting
260 their cost higher than memmov_cost. */
261 NAMED_PARAM (GP2FP, 5),
262 NAMED_PARAM (FP2GP, 5),
263 NAMED_PARAM (FP2FP, 2)
264 };
265
266 static const struct cpu_regmove_cost cortexa53_regmove_cost =
267 {
268 NAMED_PARAM (GP2GP, 1),
269 /* Avoid the use of slow int<->fp moves for spilling by setting
270 their cost higher than memmov_cost. */
271 NAMED_PARAM (GP2FP, 5),
272 NAMED_PARAM (FP2GP, 5),
273 NAMED_PARAM (FP2FP, 2)
274 };
275
276 static const struct cpu_regmove_cost thunderx_regmove_cost =
277 {
278 NAMED_PARAM (GP2GP, 2),
279 NAMED_PARAM (GP2FP, 2),
280 NAMED_PARAM (FP2GP, 6),
281 NAMED_PARAM (FP2FP, 4)
282 };
283
284 /* Generic costs for vector insn classes. */
285 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
286 __extension__
287 #endif
288 static const struct cpu_vector_cost generic_vector_cost =
289 {
290 NAMED_PARAM (scalar_stmt_cost, 1),
291 NAMED_PARAM (scalar_load_cost, 1),
292 NAMED_PARAM (scalar_store_cost, 1),
293 NAMED_PARAM (vec_stmt_cost, 1),
294 NAMED_PARAM (vec_to_scalar_cost, 1),
295 NAMED_PARAM (scalar_to_vec_cost, 1),
296 NAMED_PARAM (vec_align_load_cost, 1),
297 NAMED_PARAM (vec_unalign_load_cost, 1),
298 NAMED_PARAM (vec_unalign_store_cost, 1),
299 NAMED_PARAM (vec_store_cost, 1),
300 NAMED_PARAM (cond_taken_branch_cost, 3),
301 NAMED_PARAM (cond_not_taken_branch_cost, 1)
302 };
303
304 /* Costs for vector insn classes for Cortex-A57. */
305 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
306 __extension__
307 #endif
308 static const struct cpu_vector_cost cortexa57_vector_cost =
309 {
310 NAMED_PARAM (scalar_stmt_cost, 1),
311 NAMED_PARAM (scalar_load_cost, 4),
312 NAMED_PARAM (scalar_store_cost, 1),
313 NAMED_PARAM (vec_stmt_cost, 3),
314 NAMED_PARAM (vec_to_scalar_cost, 8),
315 NAMED_PARAM (scalar_to_vec_cost, 8),
316 NAMED_PARAM (vec_align_load_cost, 5),
317 NAMED_PARAM (vec_unalign_load_cost, 5),
318 NAMED_PARAM (vec_unalign_store_cost, 1),
319 NAMED_PARAM (vec_store_cost, 1),
320 NAMED_PARAM (cond_taken_branch_cost, 1),
321 NAMED_PARAM (cond_not_taken_branch_cost, 1)
322 };
323
324 #define AARCH64_FUSE_NOTHING (0)
325 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
326 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
327 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
328 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
329 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
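/* Each AARCH64_FUSE_* flag names an instruction pair that the scheduler
   tries to keep adjacent so a fusing core can treat it as one macro-op;
   roughly:
     MOV_MOVK     mov  x0, #0x1234      +  movk x0, #0x5678, lsl #16
     ADRP_ADD     adrp x0, sym          +  add  x0, x0, :lo12:sym
     MOVK_MOVK    back-to-back movk writes to the same destination register
     ADRP_LDR     adrp x0, sym          +  ldr  x1, [x0, :lo12:sym]
     CMP_BRANCH   a compare             +  the conditional branch that uses it  */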
330
331 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
332 __extension__
333 #endif
334 static const struct tune_params generic_tunings =
335 {
336 &cortexa57_extra_costs,
337 &generic_addrcost_table,
338 &generic_regmove_cost,
339 &generic_vector_cost,
340 NAMED_PARAM (memmov_cost, 4),
341 NAMED_PARAM (issue_rate, 2),
342 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING),
343 8, /* function_align. */
344 8, /* jump_align. */
345 4, /* loop_align. */
346 2, /* int_reassoc_width. */
347 4, /* fp_reassoc_width. */
348 1 /* vec_reassoc_width. */
349 };
350
351 static const struct tune_params cortexa53_tunings =
352 {
353 &cortexa53_extra_costs,
354 &generic_addrcost_table,
355 &cortexa53_regmove_cost,
356 &generic_vector_cost,
357 NAMED_PARAM (memmov_cost, 4),
358 NAMED_PARAM (issue_rate, 2),
359 NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
360 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR)),
361 8, /* function_align. */
362 8, /* jump_align. */
363 4, /* loop_align. */
364 2, /* int_reassoc_width. */
365 4, /* fp_reassoc_width. */
366 1 /* vec_reassoc_width. */
367 };
368
369 static const struct tune_params cortexa57_tunings =
370 {
371 &cortexa57_extra_costs,
372 &cortexa57_addrcost_table,
373 &cortexa57_regmove_cost,
374 &cortexa57_vector_cost,
375 NAMED_PARAM (memmov_cost, 4),
376 NAMED_PARAM (issue_rate, 3),
377 NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK)),
378 16, /* function_align. */
379 8, /* jump_align. */
380 4, /* loop_align. */
381 2, /* int_reassoc_width. */
382 4, /* fp_reassoc_width. */
383 1 /* vec_reassoc_width. */
384 };
385
386 static const struct tune_params thunderx_tunings =
387 {
388 &thunderx_extra_costs,
389 &generic_addrcost_table,
390 &thunderx_regmove_cost,
391 &generic_vector_cost,
392 NAMED_PARAM (memmov_cost, 6),
393 NAMED_PARAM (issue_rate, 2),
394 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_CMP_BRANCH),
395 8, /* function_align. */
396 8, /* jump_align. */
397 8, /* loop_align. */
398 2, /* int_reassoc_width. */
399 4, /* fp_reassoc_width. */
400 1 /* vec_reassoc_width. */
401 };
402
403 /* A processor implementing AArch64. */
404 struct processor
405 {
406 const char *const name;
407 enum aarch64_processor core;
408 const char *arch;
409 unsigned architecture_version;
410 const unsigned long flags;
411 const struct tune_params *const tune;
412 };
413
414 /* Processor cores implementing AArch64. */
415 static const struct processor all_cores[] =
416 {
417 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS) \
418 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
419 #include "aarch64-cores.def"
420 #undef AARCH64_CORE
421 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
422 {NULL, aarch64_none, NULL, 0, 0, NULL}
423 };
424
425 /* Architectures implementing AArch64. */
426 static const struct processor all_architectures[] =
427 {
428 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
429 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
430 #include "aarch64-arches.def"
431 #undef AARCH64_ARCH
432 {NULL, aarch64_none, NULL, 0, 0, NULL}
433 };
434
435 /* Target specification. These are populated as command-line arguments
436 are processed, or NULL if not specified. */
437 static const struct processor *selected_arch;
438 static const struct processor *selected_cpu;
439 static const struct processor *selected_tune;
440
441 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
442
443 /* An ISA extension in the co-processor and main instruction set space. */
444 struct aarch64_option_extension
445 {
446 const char *const name;
447 const unsigned long flags_on;
448 const unsigned long flags_off;
449 };
450
451 /* ISA extensions in AArch64. */
452 static const struct aarch64_option_extension all_extensions[] =
453 {
454 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
455 {NAME, FLAGS_ON, FLAGS_OFF},
456 #include "aarch64-option-extensions.def"
457 #undef AARCH64_OPT_EXTENSION
458 {NULL, 0, 0}
459 };
460
461 /* Used to track the size of an address when generating a pre/post
462 increment address. */
463 static machine_mode aarch64_memory_reference_mode;
464
465 /* Used to force GTY into this file. */
466 static GTY(()) int gty_dummy;
467
468 /* A table of valid AArch64 "bitmask immediate" values for
469 logical instructions. */
470
471 #define AARCH64_NUM_BITMASKS 5334
472 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
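/* A "bitmask immediate" is a contiguous run of set bits, rotated and then
   replicated across equal-sized elements of the 64-bit value; e.g.
   0x5555555555555555, 0x00ff00ff00ff00ff and 0x0003fffc0003fffc are
   encodable, while 0 and ~0 are not.  Enumerating every such 64-bit
   pattern gives the 5334 entries counted above; the move-immediate code
   later in this file searches this table.  */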
473
474 typedef enum aarch64_cond_code
475 {
476 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
477 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
478 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
479 }
480 aarch64_cc;
481
482 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
483
484 /* The condition codes of the processor, and the inverse function. */
485 static const char * const aarch64_condition_codes[] =
486 {
487 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
488 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
489 };
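/* AARCH64_INVERSE_CONDITION_CODE relies on the table above placing each
   ordinary condition next to its inverse, so flipping bit 0 inverts it:
   e.g. EQ (0) <-> NE (1), CS (2) <-> CC (3), GE (10) <-> LT (11),
   HI <-> LS.  */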
490
491 static unsigned int
492 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
493 {
494 return 2;
495 }
496
497 static int
498 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
499 enum machine_mode mode)
500 {
501 if (VECTOR_MODE_P (mode))
502 return aarch64_tune_params->vec_reassoc_width;
503 if (INTEGRAL_MODE_P (mode))
504 return aarch64_tune_params->int_reassoc_width;
505 if (FLOAT_MODE_P (mode))
506 return aarch64_tune_params->fp_reassoc_width;
507 return 1;
508 }
509
510 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
511 unsigned
512 aarch64_dbx_register_number (unsigned regno)
513 {
514 if (GP_REGNUM_P (regno))
515 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
516 else if (regno == SP_REGNUM)
517 return AARCH64_DWARF_SP;
518 else if (FP_REGNUM_P (regno))
519 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
520
521 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
522 equivalent DWARF register. */
523 return DWARF_FRAME_REGISTERS;
524 }
525
526 /* Return TRUE if MODE is any of the large INT modes. */
527 static bool
528 aarch64_vect_struct_mode_p (machine_mode mode)
529 {
530 return mode == OImode || mode == CImode || mode == XImode;
531 }
532
533 /* Return TRUE if MODE is any of the vector modes. */
534 static bool
535 aarch64_vector_mode_p (machine_mode mode)
536 {
537 return aarch64_vector_mode_supported_p (mode)
538 || aarch64_vect_struct_mode_p (mode);
539 }
540
541 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
542 static bool
543 aarch64_array_mode_supported_p (machine_mode mode,
544 unsigned HOST_WIDE_INT nelems)
545 {
546 if (TARGET_SIMD
547 && AARCH64_VALID_SIMD_QREG_MODE (mode)
548 && (nelems >= 2 && nelems <= 4))
549 return true;
550
551 return false;
552 }
553
554 /* Implement HARD_REGNO_NREGS. */
555
556 int
557 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
558 {
559 switch (aarch64_regno_regclass (regno))
560 {
561 case FP_REGS:
562 case FP_LO_REGS:
563 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
564 default:
565 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
566 }
567 gcc_unreachable ();
568 }
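/* Worked example of the calculation above, with UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16: a 16-byte TImode value occupies one FP/SIMD
   register but two general registers, while a 32-byte OImode value
   occupies two FP/SIMD registers.  */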
569
570 /* Implement HARD_REGNO_MODE_OK. */
571
572 int
573 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
574 {
575 if (GET_MODE_CLASS (mode) == MODE_CC)
576 return regno == CC_REGNUM;
577
578 if (regno == SP_REGNUM)
579 /* The purpose of comparing with ptr_mode is to support the
580 global register variable associated with the stack pointer
581 register via the syntax of asm ("wsp") in ILP32. */
582 return mode == Pmode || mode == ptr_mode;
583
584 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
585 return mode == Pmode;
586
587 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
588 return 1;
589
590 if (FP_REGNUM_P (regno))
591 {
592 if (aarch64_vect_struct_mode_p (mode))
593 return
594 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
595 else
596 return 1;
597 }
598
599 return 0;
600 }
601
602 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
603 machine_mode
604 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
605 machine_mode mode)
606 {
607 /* Handle modes that fit within single registers. */
608 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
609 {
610 if (GET_MODE_SIZE (mode) >= 4)
611 return mode;
612 else
613 return SImode;
614 }
615 /* Fall back to generic for multi-reg and very large modes. */
616 else
617 return choose_hard_reg_mode (regno, nregs, false);
618 }
619
620 /* Return true if calls to DECL should be treated as
621 long-calls (i.e. called via a register). */
622 static bool
623 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
624 {
625 return false;
626 }
627
628 /* Return true if calls to symbol-ref SYM should be treated as
629 long-calls (i.e. called via a register). */
630 bool
631 aarch64_is_long_call_p (rtx sym)
632 {
633 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
634 }
635
636 /* Return true if the offsets to a zero/sign-extract operation
637 represent an expression that matches an extend operation. The
638 operands represent the parameters from
639
640 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
641 bool
642 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
643 rtx extract_imm)
644 {
645 HOST_WIDE_INT mult_val, extract_val;
646
647 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
648 return false;
649
650 mult_val = INTVAL (mult_imm);
651 extract_val = INTVAL (extract_imm);
652
653 if (extract_val > 8
654 && extract_val < GET_MODE_BITSIZE (mode)
655 && exact_log2 (extract_val & ~7) > 0
656 && (extract_val & 7) <= 4
657 && mult_val == (1 << (extract_val & 7)))
658 return true;
659
660 return false;
661 }
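/* For example, in DImode the pair MULT_IMM == 4, EXTRACT_IMM == 34 is
   accepted (34 & ~7 == 32 is a power of two, 34 & 7 == 2 and 4 == 1 << 2):
   the low 34 bits of reg * 4 are the low 32 bits of reg extended and
   shifted left by 2, i.e. the "uxtw #2" / "sxtw #2" extended-register
   form.  */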
662
663 /* Emit an insn that's a simple single-set. Both the operands must be
664 known to be valid. */
665 inline static rtx
666 emit_set_insn (rtx x, rtx y)
667 {
668 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
669 }
670
671 /* X and Y are two things to compare using CODE. Emit the compare insn and
672 return the rtx for register 0 in the proper mode. */
673 rtx
674 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
675 {
676 machine_mode mode = SELECT_CC_MODE (code, x, y);
677 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
678
679 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
680 return cc_reg;
681 }
682
683 /* Build the SYMBOL_REF for __tls_get_addr. */
684
685 static GTY(()) rtx tls_get_addr_libfunc;
686
687 rtx
688 aarch64_tls_get_addr (void)
689 {
690 if (!tls_get_addr_libfunc)
691 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
692 return tls_get_addr_libfunc;
693 }
694
695 /* Return the TLS model to use for ADDR. */
696
697 static enum tls_model
698 tls_symbolic_operand_type (rtx addr)
699 {
700 enum tls_model tls_kind = TLS_MODEL_NONE;
701 rtx sym, addend;
702
703 if (GET_CODE (addr) == CONST)
704 {
705 split_const (addr, &sym, &addend);
706 if (GET_CODE (sym) == SYMBOL_REF)
707 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
708 }
709 else if (GET_CODE (addr) == SYMBOL_REF)
710 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
711
712 return tls_kind;
713 }
714
715 /* We'll allow LO_SUM expressions in our legitimate addresses, so that
716 combine can take care of combining addresses where necessary, but
717 for generation purposes we'll generate the address
718 as:
719 RTL Absolute
720 tmp = hi (symbol_ref); adrp x1, foo
721 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
722 nop
723
724 PIC TLS
725 adrp x1, :got:foo adrp tmp, :tlsgd:foo
726 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
727 bl __tls_get_addr
728 nop
729
730 Load TLS symbol, depending on TLS mechanism and TLS access model.
731
732 Global Dynamic - Traditional TLS:
733 adrp tmp, :tlsgd:imm
734 add dest, tmp, #:tlsgd_lo12:imm
735 bl __tls_get_addr
736
737 Global Dynamic - TLS Descriptors:
738 adrp dest, :tlsdesc:imm
739 ldr tmp, [dest, #:tlsdesc_lo12:imm]
740 add dest, dest, #:tlsdesc_lo12:imm
741 blr tmp
742 mrs tp, tpidr_el0
743 add dest, dest, tp
744
745 Initial Exec:
746 mrs tp, tpidr_el0
747 adrp tmp, :gottprel:imm
748 ldr dest, [tmp, #:gottprel_lo12:imm]
749 add dest, dest, tp
750
751 Local Exec:
752 mrs tp, tpidr_el0
753 add t0, tp, #:tprel_hi12:imm
754 add t0, #:tprel_lo12_nc:imm
755 */
756
757 static void
758 aarch64_load_symref_appropriately (rtx dest, rtx imm,
759 enum aarch64_symbol_type type)
760 {
761 switch (type)
762 {
763 case SYMBOL_SMALL_ABSOLUTE:
764 {
765 /* In ILP32, the mode of dest can be either SImode or DImode. */
766 rtx tmp_reg = dest;
767 machine_mode mode = GET_MODE (dest);
768
769 gcc_assert (mode == Pmode || mode == ptr_mode);
770
771 if (can_create_pseudo_p ())
772 tmp_reg = gen_reg_rtx (mode);
773
774 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
775 emit_insn (gen_add_losym (dest, tmp_reg, imm));
776 return;
777 }
778
779 case SYMBOL_TINY_ABSOLUTE:
780 emit_insn (gen_rtx_SET (Pmode, dest, imm));
781 return;
782
783 case SYMBOL_SMALL_GOT:
784 {
785 /* In ILP32, the mode of dest can be either SImode or DImode,
786 while the got entry is always of SImode size. The mode of
787 dest depends on how dest is used: if dest is assigned to a
788 pointer (e.g. stored in memory), it has SImode; it may have
789 DImode if dest is dereferenced to access the memory.
790 This is why we have to handle three different ldr_got_small
791 patterns here (two patterns for ILP32). */
792 rtx tmp_reg = dest;
793 machine_mode mode = GET_MODE (dest);
794
795 if (can_create_pseudo_p ())
796 tmp_reg = gen_reg_rtx (mode);
797
798 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
799 if (mode == ptr_mode)
800 {
801 if (mode == DImode)
802 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
803 else
804 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
805 }
806 else
807 {
808 gcc_assert (mode == Pmode);
809 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
810 }
811
812 return;
813 }
814
815 case SYMBOL_SMALL_TLSGD:
816 {
817 rtx_insn *insns;
818 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
819
820 start_sequence ();
821 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
822 insns = get_insns ();
823 end_sequence ();
824
825 RTL_CONST_CALL_P (insns) = 1;
826 emit_libcall_block (insns, dest, result, imm);
827 return;
828 }
829
830 case SYMBOL_SMALL_TLSDESC:
831 {
832 machine_mode mode = GET_MODE (dest);
833 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
834 rtx tp;
835
836 gcc_assert (mode == Pmode || mode == ptr_mode);
837
838 /* In ILP32, the got entry is always of SImode size. Unlike
839 small GOT, the dest is fixed at reg 0. */
840 if (TARGET_ILP32)
841 emit_insn (gen_tlsdesc_small_si (imm));
842 else
843 emit_insn (gen_tlsdesc_small_di (imm));
844 tp = aarch64_load_tp (NULL);
845
846 if (mode != Pmode)
847 tp = gen_lowpart (mode, tp);
848
849 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
850 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
851 return;
852 }
853
854 case SYMBOL_SMALL_GOTTPREL:
855 {
856 /* In ILP32, the mode of dest can be either SImode or DImode,
857 while the got entry is always of SImode size. The mode of
858 dest depends on how dest is used: if dest is assigned to a
859 pointer (e.g. stored in memory), it has SImode; it may have
860 DImode if dest is dereferenced to access the memory.
861 This is why we have to handle three different tlsie_small
862 patterns here (two patterns for ILP32). */
863 machine_mode mode = GET_MODE (dest);
864 rtx tmp_reg = gen_reg_rtx (mode);
865 rtx tp = aarch64_load_tp (NULL);
866
867 if (mode == ptr_mode)
868 {
869 if (mode == DImode)
870 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
871 else
872 {
873 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
874 tp = gen_lowpart (mode, tp);
875 }
876 }
877 else
878 {
879 gcc_assert (mode == Pmode);
880 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
881 }
882
883 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
884 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
885 return;
886 }
887
888 case SYMBOL_SMALL_TPREL:
889 {
890 rtx tp = aarch64_load_tp (NULL);
891 emit_insn (gen_tlsle_small (dest, tp, imm));
892 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
893 return;
894 }
895
896 case SYMBOL_TINY_GOT:
897 emit_insn (gen_ldr_got_tiny (dest, imm));
898 return;
899
900 default:
901 gcc_unreachable ();
902 }
903 }
904
905 /* Emit a move from SRC to DEST. Assume that the move expanders can
906 handle all moves if !can_create_pseudo_p (). The distinction is
907 important because, unlike emit_move_insn, the move expanders know
908 how to force Pmode objects into the constant pool even when the
909 constant pool address is not itself legitimate. */
910 static rtx
911 aarch64_emit_move (rtx dest, rtx src)
912 {
913 return (can_create_pseudo_p ()
914 ? emit_move_insn (dest, src)
915 : emit_move_insn_1 (dest, src));
916 }
917
918 /* Split a 128-bit move operation into two 64-bit move operations,
919 taking care to handle partial overlap of register to register
920 copies. Special cases are needed when moving between GP regs and
921 FP regs. SRC can be a register, constant or memory; DST a register
922 or memory. If either operand is memory it must not have any side
923 effects. */
924 void
925 aarch64_split_128bit_move (rtx dst, rtx src)
926 {
927 rtx dst_lo, dst_hi;
928 rtx src_lo, src_hi;
929
930 machine_mode mode = GET_MODE (dst);
931
932 gcc_assert (mode == TImode || mode == TFmode);
933 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
934 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
935
936 if (REG_P (dst) && REG_P (src))
937 {
938 int src_regno = REGNO (src);
939 int dst_regno = REGNO (dst);
940
941 /* Handle FP <-> GP regs. */
942 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
943 {
944 src_lo = gen_lowpart (word_mode, src);
945 src_hi = gen_highpart (word_mode, src);
946
947 if (mode == TImode)
948 {
949 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
950 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
951 }
952 else
953 {
954 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
955 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
956 }
957 return;
958 }
959 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
960 {
961 dst_lo = gen_lowpart (word_mode, dst);
962 dst_hi = gen_highpart (word_mode, dst);
963
964 if (mode == TImode)
965 {
966 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
967 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
968 }
969 else
970 {
971 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
972 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
973 }
974 return;
975 }
976 }
977
978 dst_lo = gen_lowpart (word_mode, dst);
979 dst_hi = gen_highpart (word_mode, dst);
980 src_lo = gen_lowpart (word_mode, src);
981 src_hi = gen_highpart_mode (word_mode, mode, src);
982
983 /* At most one pairing may overlap. */
984 if (reg_overlap_mentioned_p (dst_lo, src_hi))
985 {
986 aarch64_emit_move (dst_hi, src_hi);
987 aarch64_emit_move (dst_lo, src_lo);
988 }
989 else
990 {
991 aarch64_emit_move (dst_lo, src_lo);
992 aarch64_emit_move (dst_hi, src_hi);
993 }
994 }
995
996 bool
997 aarch64_split_128bit_move_p (rtx dst, rtx src)
998 {
999 return (! REG_P (src)
1000 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1001 }
1002
1003 /* Split a complex SIMD combine. */
1004
1005 void
1006 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1007 {
1008 machine_mode src_mode = GET_MODE (src1);
1009 machine_mode dst_mode = GET_MODE (dst);
1010
1011 gcc_assert (VECTOR_MODE_P (dst_mode));
1012
1013 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1014 {
1015 rtx (*gen) (rtx, rtx, rtx);
1016
1017 switch (src_mode)
1018 {
1019 case V8QImode:
1020 gen = gen_aarch64_simd_combinev8qi;
1021 break;
1022 case V4HImode:
1023 gen = gen_aarch64_simd_combinev4hi;
1024 break;
1025 case V2SImode:
1026 gen = gen_aarch64_simd_combinev2si;
1027 break;
1028 case V2SFmode:
1029 gen = gen_aarch64_simd_combinev2sf;
1030 break;
1031 case DImode:
1032 gen = gen_aarch64_simd_combinedi;
1033 break;
1034 case DFmode:
1035 gen = gen_aarch64_simd_combinedf;
1036 break;
1037 default:
1038 gcc_unreachable ();
1039 }
1040
1041 emit_insn (gen (dst, src1, src2));
1042 return;
1043 }
1044 }
1045
1046 /* Split a complex SIMD move. */
1047
1048 void
1049 aarch64_split_simd_move (rtx dst, rtx src)
1050 {
1051 machine_mode src_mode = GET_MODE (src);
1052 machine_mode dst_mode = GET_MODE (dst);
1053
1054 gcc_assert (VECTOR_MODE_P (dst_mode));
1055
1056 if (REG_P (dst) && REG_P (src))
1057 {
1058 rtx (*gen) (rtx, rtx);
1059
1060 gcc_assert (VECTOR_MODE_P (src_mode));
1061
1062 switch (src_mode)
1063 {
1064 case V16QImode:
1065 gen = gen_aarch64_split_simd_movv16qi;
1066 break;
1067 case V8HImode:
1068 gen = gen_aarch64_split_simd_movv8hi;
1069 break;
1070 case V4SImode:
1071 gen = gen_aarch64_split_simd_movv4si;
1072 break;
1073 case V2DImode:
1074 gen = gen_aarch64_split_simd_movv2di;
1075 break;
1076 case V4SFmode:
1077 gen = gen_aarch64_split_simd_movv4sf;
1078 break;
1079 case V2DFmode:
1080 gen = gen_aarch64_split_simd_movv2df;
1081 break;
1082 default:
1083 gcc_unreachable ();
1084 }
1085
1086 emit_insn (gen (dst, src));
1087 return;
1088 }
1089 }
1090
1091 static rtx
1092 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1093 {
1094 if (can_create_pseudo_p ())
1095 return force_reg (mode, value);
1096 else
1097 {
1098 x = aarch64_emit_move (x, value);
1099 return x;
1100 }
1101 }
1102
1103
1104 static rtx
1105 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1106 {
1107 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1108 {
1109 rtx high;
1110 /* Load the full offset into a register. This
1111 might be improvable in the future. */
1112 high = GEN_INT (offset);
1113 offset = 0;
1114 high = aarch64_force_temporary (mode, temp, high);
1115 reg = aarch64_force_temporary (mode, temp,
1116 gen_rtx_PLUS (mode, high, reg));
1117 }
1118 return plus_constant (mode, reg, offset);
1119 }
1120
1121 static int
1122 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1123 machine_mode mode)
1124 {
1125 unsigned HOST_WIDE_INT mask;
1126 int i;
1127 bool first;
1128 unsigned HOST_WIDE_INT val;
1129 bool subtargets;
1130 rtx subtarget;
1131 int one_match, zero_match, first_not_ffff_match;
1132 int num_insns = 0;
1133
1134 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1135 {
1136 if (generate)
1137 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1138 num_insns++;
1139 return num_insns;
1140 }
1141
1142 if (mode == SImode)
1143 {
1144 /* We know we can't do this in 1 insn, and we must be able to do it
1145 in two; so don't mess around looking for sequences that don't buy
1146 us anything. */
1147 if (generate)
1148 {
1149 emit_insn (gen_rtx_SET (VOIDmode, dest,
1150 GEN_INT (INTVAL (imm) & 0xffff)));
1151 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1152 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1153 }
1154 num_insns += 2;
1155 return num_insns;
1156 }
1157
1158 /* Remaining cases are all for DImode. */
1159
1160 val = INTVAL (imm);
1161 subtargets = optimize && can_create_pseudo_p ();
1162
1163 one_match = 0;
1164 zero_match = 0;
1165 mask = 0xffff;
1166 first_not_ffff_match = -1;
1167
1168 for (i = 0; i < 64; i += 16, mask <<= 16)
1169 {
1170 if ((val & mask) == mask)
1171 one_match++;
1172 else
1173 {
1174 if (first_not_ffff_match < 0)
1175 first_not_ffff_match = i;
1176 if ((val & mask) == 0)
1177 zero_match++;
1178 }
1179 }
1180
1181 if (one_match == 2)
1182 {
1183 /* Set one of the quarters and then insert back into result. */
1184 mask = 0xffffll << first_not_ffff_match;
1185 if (generate)
1186 {
1187 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1188 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1189 GEN_INT ((val >> first_not_ffff_match)
1190 & 0xffff)));
1191 }
1192 num_insns += 2;
1193 return num_insns;
1194 }
1195
1196 if (zero_match == 2)
1197 goto simple_sequence;
1198
1199 mask = 0x0ffff0000UL;
1200 for (i = 16; i < 64; i += 16, mask <<= 16)
1201 {
1202 HOST_WIDE_INT comp = mask & ~(mask - 1);
1203
1204 if (aarch64_uimm12_shift (val - (val & mask)))
1205 {
1206 if (generate)
1207 {
1208 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1209 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1210 GEN_INT (val & mask)));
1211 emit_insn (gen_adddi3 (dest, subtarget,
1212 GEN_INT (val - (val & mask))));
1213 }
1214 num_insns += 2;
1215 return num_insns;
1216 }
1217 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1218 {
1219 if (generate)
1220 {
1221 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1222 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1223 GEN_INT ((val + comp) & mask)));
1224 emit_insn (gen_adddi3 (dest, subtarget,
1225 GEN_INT (val - ((val + comp) & mask))));
1226 }
1227 num_insns += 2;
1228 return num_insns;
1229 }
1230 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1231 {
1232 if (generate)
1233 {
1234 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1235 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1236 GEN_INT ((val - comp) | ~mask)));
1237 emit_insn (gen_adddi3 (dest, subtarget,
1238 GEN_INT (val - ((val - comp) | ~mask))));
1239 }
1240 num_insns += 2;
1241 return num_insns;
1242 }
1243 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1244 {
1245 if (generate)
1246 {
1247 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1248 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1249 GEN_INT (val | ~mask)));
1250 emit_insn (gen_adddi3 (dest, subtarget,
1251 GEN_INT (val - (val | ~mask))));
1252 }
1253 num_insns += 2;
1254 return num_insns;
1255 }
1256 }
1257
1258 /* See if we can do it by arithmetically combining two
1259 immediates. */
1260 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1261 {
1262 int j;
1263 mask = 0xffff;
1264
1265 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1266 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1267 {
1268 if (generate)
1269 {
1270 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1271 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1272 GEN_INT (aarch64_bitmasks[i])));
1273 emit_insn (gen_adddi3 (dest, subtarget,
1274 GEN_INT (val - aarch64_bitmasks[i])));
1275 }
1276 num_insns += 2;
1277 return num_insns;
1278 }
1279
1280 for (j = 0; j < 64; j += 16, mask <<= 16)
1281 {
1282 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1283 {
1284 if (generate)
1285 {
1286 emit_insn (gen_rtx_SET (VOIDmode, dest,
1287 GEN_INT (aarch64_bitmasks[i])));
1288 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1289 GEN_INT ((val >> j) & 0xffff)));
1290 }
1291 num_insns += 2;
1292 return num_insns;
1293 }
1294 }
1295 }
1296
1297 /* See if we can do it by logically combining two immediates. */
1298 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1299 {
1300 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1301 {
1302 int j;
1303
1304 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1305 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1306 {
1307 if (generate)
1308 {
1309 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1310 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1311 GEN_INT (aarch64_bitmasks[i])));
1312 emit_insn (gen_iordi3 (dest, subtarget,
1313 GEN_INT (aarch64_bitmasks[j])));
1314 }
1315 num_insns += 2;
1316 return num_insns;
1317 }
1318 }
1319 else if ((val & aarch64_bitmasks[i]) == val)
1320 {
1321 int j;
1322
1323 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1324 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1325 {
1326 if (generate)
1327 {
1328 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1329 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1330 GEN_INT (aarch64_bitmasks[j])));
1331 emit_insn (gen_anddi3 (dest, subtarget,
1332 GEN_INT (aarch64_bitmasks[i])));
1333 }
1334 num_insns += 2;
1335 return num_insns;
1336 }
1337 }
1338 }
1339
1340 if (one_match > zero_match)
1341 {
1342 /* Set either first three quarters or all but the third. */
1343 mask = 0xffffll << (16 - first_not_ffff_match);
1344 if (generate)
1345 emit_insn (gen_rtx_SET (VOIDmode, dest,
1346 GEN_INT (val | mask | 0xffffffff00000000ull)));
1347 num_insns ++;
1348
1349 /* Now insert other two quarters. */
1350 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1351 i < 64; i += 16, mask <<= 16)
1352 {
1353 if ((val & mask) != mask)
1354 {
1355 if (generate)
1356 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1357 GEN_INT ((val >> i) & 0xffff)));
1358 num_insns ++;
1359 }
1360 }
1361 return num_insns;
1362 }
1363
1364 simple_sequence:
1365 first = true;
1366 mask = 0xffff;
1367 for (i = 0; i < 64; i += 16, mask <<= 16)
1368 {
1369 if ((val & mask) != 0)
1370 {
1371 if (first)
1372 {
1373 if (generate)
1374 emit_insn (gen_rtx_SET (VOIDmode, dest,
1375 GEN_INT (val & mask)));
1376 num_insns ++;
1377 first = false;
1378 }
1379 else
1380 {
1381 if (generate)
1382 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1383 GEN_INT ((val >> i) & 0xffff)));
1384 num_insns ++;
1385 }
1386 }
1387 }
1388
1389 return num_insns;
1390 }
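/* A rough trace of the routine above for the DImode constant
   0x0000cafe0000beef: no 16-bit chunk is 0xffff and two chunks are zero,
   so control reaches simple_sequence and two insns are counted/emitted:
       mov  dest, #0xbeef
       movk dest, #0xcafe, lsl #32  */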
1391
1392
1393 void
1394 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1395 {
1396 machine_mode mode = GET_MODE (dest);
1397
1398 gcc_assert (mode == SImode || mode == DImode);
1399
1400 /* Check on what type of symbol it is. */
1401 if (GET_CODE (imm) == SYMBOL_REF
1402 || GET_CODE (imm) == LABEL_REF
1403 || GET_CODE (imm) == CONST)
1404 {
1405 rtx mem, base, offset;
1406 enum aarch64_symbol_type sty;
1407
1408 /* If we have (const (plus symbol offset)), separate out the offset
1409 before we start classifying the symbol. */
1410 split_const (imm, &base, &offset);
1411
1412 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1413 switch (sty)
1414 {
1415 case SYMBOL_FORCE_TO_MEM:
1416 if (offset != const0_rtx
1417 && targetm.cannot_force_const_mem (mode, imm))
1418 {
1419 gcc_assert (can_create_pseudo_p ());
1420 base = aarch64_force_temporary (mode, dest, base);
1421 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1422 aarch64_emit_move (dest, base);
1423 return;
1424 }
1425 mem = force_const_mem (ptr_mode, imm);
1426 gcc_assert (mem);
1427 if (mode != ptr_mode)
1428 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1429 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1430 return;
1431
1432 case SYMBOL_SMALL_TLSGD:
1433 case SYMBOL_SMALL_TLSDESC:
1434 case SYMBOL_SMALL_GOTTPREL:
1435 case SYMBOL_SMALL_GOT:
1436 case SYMBOL_TINY_GOT:
1437 if (offset != const0_rtx)
1438 {
1439 gcc_assert(can_create_pseudo_p ());
1440 base = aarch64_force_temporary (mode, dest, base);
1441 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1442 aarch64_emit_move (dest, base);
1443 return;
1444 }
1445 /* FALLTHRU */
1446
1447 case SYMBOL_SMALL_TPREL:
1448 case SYMBOL_SMALL_ABSOLUTE:
1449 case SYMBOL_TINY_ABSOLUTE:
1450 aarch64_load_symref_appropriately (dest, imm, sty);
1451 return;
1452
1453 default:
1454 gcc_unreachable ();
1455 }
1456 }
1457
1458 if (!CONST_INT_P (imm))
1459 {
1460 if (GET_CODE (imm) == HIGH)
1461 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1462 else
1463 {
1464 rtx mem = force_const_mem (mode, imm);
1465 gcc_assert (mem);
1466 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1467 }
1468
1469 return;
1470 }
1471
1472 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1473 }
1474
1475 static bool
1476 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1477 tree exp ATTRIBUTE_UNUSED)
1478 {
1479 /* Currently, always true. */
1480 return true;
1481 }
1482
1483 /* Implement TARGET_PASS_BY_REFERENCE. */
1484
1485 static bool
1486 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1487 machine_mode mode,
1488 const_tree type,
1489 bool named ATTRIBUTE_UNUSED)
1490 {
1491 HOST_WIDE_INT size;
1492 machine_mode dummymode;
1493 int nregs;
1494
1495 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1496 size = (mode == BLKmode && type)
1497 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1498
1499 /* Aggregates are passed by reference based on their size. */
1500 if (type && AGGREGATE_TYPE_P (type))
1501 {
1502 size = int_size_in_bytes (type);
1503 }
1504
1505 /* Variable sized arguments are always passed by reference. */
1506 if (size < 0)
1507 return true;
1508
1509 /* Can this be a candidate to be passed in fp/simd register(s)? */
1510 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1511 &dummymode, &nregs,
1512 NULL))
1513 return false;
1514
1515 /* Arguments which are variable sized or larger than 2 registers are
1516 passed by reference unless they are a homogeneous floating-point
1517 aggregate. */
1518 return size > 2 * UNITS_PER_WORD;
1519 }
1520
1521 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1522 static bool
1523 aarch64_return_in_msb (const_tree valtype)
1524 {
1525 machine_mode dummy_mode;
1526 int dummy_int;
1527
1528 /* Never happens in little-endian mode. */
1529 if (!BYTES_BIG_ENDIAN)
1530 return false;
1531
1532 /* Only composite types smaller than or equal to 16 bytes can
1533 be potentially returned in registers. */
1534 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1535 || int_size_in_bytes (valtype) <= 0
1536 || int_size_in_bytes (valtype) > 16)
1537 return false;
1538
1539 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1540 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1541 is always passed/returned in the least significant bits of fp/simd
1542 register(s). */
1543 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1544 &dummy_mode, &dummy_int, NULL))
1545 return false;
1546
1547 return true;
1548 }
1549
1550 /* Implement TARGET_FUNCTION_VALUE.
1551 Define how to find the value returned by a function. */
1552
1553 static rtx
1554 aarch64_function_value (const_tree type, const_tree func,
1555 bool outgoing ATTRIBUTE_UNUSED)
1556 {
1557 machine_mode mode;
1558 int unsignedp;
1559 int count;
1560 machine_mode ag_mode;
1561
1562 mode = TYPE_MODE (type);
1563 if (INTEGRAL_TYPE_P (type))
1564 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1565
1566 if (aarch64_return_in_msb (type))
1567 {
1568 HOST_WIDE_INT size = int_size_in_bytes (type);
1569
1570 if (size % UNITS_PER_WORD != 0)
1571 {
1572 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1573 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1574 }
1575 }
1576
1577 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1578 &ag_mode, &count, NULL))
1579 {
1580 if (!aarch64_composite_type_p (type, mode))
1581 {
1582 gcc_assert (count == 1 && mode == ag_mode);
1583 return gen_rtx_REG (mode, V0_REGNUM);
1584 }
1585 else
1586 {
1587 int i;
1588 rtx par;
1589
1590 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1591 for (i = 0; i < count; i++)
1592 {
1593 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1594 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1595 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1596 XVECEXP (par, 0, i) = tmp;
1597 }
1598 return par;
1599 }
1600 }
1601 else
1602 return gen_rtx_REG (mode, R0_REGNUM);
1603 }
1604
1605 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1606 Return true if REGNO is the number of a hard register in which the values
1607 of a called function may come back.
1608
1609 static bool
1610 aarch64_function_value_regno_p (const unsigned int regno)
1611 {
1612 /* Maximum of 16 bytes can be returned in the general registers. Examples
1613 of 16-byte return values are: 128-bit integers and 16-byte small
1614 structures (excluding homogeneous floating-point aggregates). */
1615 if (regno == R0_REGNUM || regno == R1_REGNUM)
1616 return true;
1617
1618 /* Up to four fp/simd registers can return a function value, e.g. a
1619 homogeneous floating-point aggregate having four members. */
1620 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1621 return !TARGET_GENERAL_REGS_ONLY;
1622
1623 return false;
1624 }
1625
1626 /* Implement TARGET_RETURN_IN_MEMORY.
1627
1628 If the type T of the result of a function is such that
1629 void func (T arg)
1630 would require that arg be passed as a value in a register (or set of
1631 registers) according to the parameter passing rules, then the result
1632 is returned in the same registers as would be used for such an
1633 argument. */
1634
1635 static bool
1636 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1637 {
1638 HOST_WIDE_INT size;
1639 machine_mode ag_mode;
1640 int count;
1641
1642 if (!AGGREGATE_TYPE_P (type)
1643 && TREE_CODE (type) != COMPLEX_TYPE
1644 && TREE_CODE (type) != VECTOR_TYPE)
1645 /* Simple scalar types are always returned in registers. */
1646 return false;
1647
1648 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1649 type,
1650 &ag_mode,
1651 &count,
1652 NULL))
1653 return false;
1654
1655 /* Types larger than 2 registers are returned in memory. */
1656 size = int_size_in_bytes (type);
1657 return (size < 0 || size > 2 * UNITS_PER_WORD);
1658 }
1659
1660 static bool
1661 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1662 const_tree type, int *nregs)
1663 {
1664 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1665 return aarch64_vfp_is_call_or_return_candidate (mode,
1666 type,
1667 &pcum->aapcs_vfp_rmode,
1668 nregs,
1669 NULL);
1670 }
1671
1672 /* Given MODE and TYPE of a function argument, return the alignment in
1673 bits. The idea is to suppress any stronger alignment requested by
1674 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1675 This is a helper function for local use only. */
1676
1677 static unsigned int
1678 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1679 {
1680 unsigned int alignment;
1681
1682 if (type)
1683 {
1684 if (!integer_zerop (TYPE_SIZE (type)))
1685 {
1686 if (TYPE_MODE (type) == mode)
1687 alignment = TYPE_ALIGN (type);
1688 else
1689 alignment = GET_MODE_ALIGNMENT (mode);
1690 }
1691 else
1692 alignment = 0;
1693 }
1694 else
1695 alignment = GET_MODE_ALIGNMENT (mode);
1696
1697 return alignment;
1698 }
1699
1700 /* Layout a function argument according to the AAPCS64 rules. The rule
1701 numbers refer to the rule numbers in the AAPCS64. */
1702
1703 static void
1704 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1705 const_tree type,
1706 bool named ATTRIBUTE_UNUSED)
1707 {
1708 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1709 int ncrn, nvrn, nregs;
1710 bool allocate_ncrn, allocate_nvrn;
1711 HOST_WIDE_INT size;
1712
1713 /* We need to do this once per argument. */
1714 if (pcum->aapcs_arg_processed)
1715 return;
1716
1717 pcum->aapcs_arg_processed = true;
1718
1719 /* Size in bytes, rounded up to a multiple of 8 bytes. */
1720 size
1721 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1722 UNITS_PER_WORD);
1723
1724 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1725 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1726 mode,
1727 type,
1728 &nregs);
1729
1730 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1731 The following code thus handles passing by SIMD/FP registers first. */
1732
1733 nvrn = pcum->aapcs_nvrn;
1734
1735 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1736 and homogeneous short-vector aggregates (HVA). */
1737 if (allocate_nvrn)
1738 {
1739 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1740 {
1741 pcum->aapcs_nextnvrn = nvrn + nregs;
1742 if (!aarch64_composite_type_p (type, mode))
1743 {
1744 gcc_assert (nregs == 1);
1745 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1746 }
1747 else
1748 {
1749 rtx par;
1750 int i;
1751 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1752 for (i = 0; i < nregs; i++)
1753 {
1754 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1755 V0_REGNUM + nvrn + i);
1756 tmp = gen_rtx_EXPR_LIST
1757 (VOIDmode, tmp,
1758 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1759 XVECEXP (par, 0, i) = tmp;
1760 }
1761 pcum->aapcs_reg = par;
1762 }
1763 return;
1764 }
1765 else
1766 {
1767 /* C.3 NSRN is set to 8. */
1768 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1769 goto on_stack;
1770 }
1771 }
1772
1773 ncrn = pcum->aapcs_ncrn;
1774 nregs = size / UNITS_PER_WORD;
1775
1776 /* C6 - C9, though the sign and zero extension semantics are
1777 handled elsewhere. This is the case where the argument fits
1778 entirely in general registers. */
1779 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1780 {
1781 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1782
1783 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1784
1785 /* C.8 if the argument has an alignment of 16 then the NGRN is
1786 rounded up to the next even number. */
1787 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1788 {
1789 ++ncrn;
1790 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1791 }
1792 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1793 A reg is still generated for it, but the caller should be smart
1794 enough not to use it. */
1795 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1796 {
1797 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1798 }
1799 else
1800 {
1801 rtx par;
1802 int i;
1803
1804 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1805 for (i = 0; i < nregs; i++)
1806 {
1807 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1808 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1809 GEN_INT (i * UNITS_PER_WORD));
1810 XVECEXP (par, 0, i) = tmp;
1811 }
1812 pcum->aapcs_reg = par;
1813 }
1814
1815 pcum->aapcs_nextncrn = ncrn + nregs;
1816 return;
1817 }
1818
1819 /* C.11 */
1820 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1821
1822 /* The argument is passed on stack; record the needed number of words for
1823 this argument and align the total size if necessary. */
1824 on_stack:
1825 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1826 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1827 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1828 16 / UNITS_PER_WORD);
1829 return;
1830 }
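/* A few illustrative assignments made by the routine above, starting from
   a fresh CUMULATIVE_ARGS: a struct of four floats is an HFA and takes
   v0-v3 (one SFmode register per member); a 16-byte integer struct with
   16-byte alignment arriving at NGRN == 1 is bumped to x2/x3 by rule C.8;
   and an HFA of three doubles arriving when only two FP argument registers
   remain sets NSRN to 8 and is passed entirely on the stack (C.3).  */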
1831
1832 /* Implement TARGET_FUNCTION_ARG. */
1833
1834 static rtx
1835 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1836 const_tree type, bool named)
1837 {
1838 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1839 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1840
1841 if (mode == VOIDmode)
1842 return NULL_RTX;
1843
1844 aarch64_layout_arg (pcum_v, mode, type, named);
1845 return pcum->aapcs_reg;
1846 }
1847
1848 void
1849 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1850 const_tree fntype ATTRIBUTE_UNUSED,
1851 rtx libname ATTRIBUTE_UNUSED,
1852 const_tree fndecl ATTRIBUTE_UNUSED,
1853 unsigned n_named ATTRIBUTE_UNUSED)
1854 {
1855 pcum->aapcs_ncrn = 0;
1856 pcum->aapcs_nvrn = 0;
1857 pcum->aapcs_nextncrn = 0;
1858 pcum->aapcs_nextnvrn = 0;
1859 pcum->pcs_variant = ARM_PCS_AAPCS64;
1860 pcum->aapcs_reg = NULL_RTX;
1861 pcum->aapcs_arg_processed = false;
1862 pcum->aapcs_stack_words = 0;
1863 pcum->aapcs_stack_size = 0;
1864
1865 return;
1866 }
1867
1868 static void
1869 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1870 machine_mode mode,
1871 const_tree type,
1872 bool named)
1873 {
1874 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1875 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1876 {
1877 aarch64_layout_arg (pcum_v, mode, type, named);
1878 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1879 != (pcum->aapcs_stack_words != 0));
1880 pcum->aapcs_arg_processed = false;
1881 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1882 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1883 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1884 pcum->aapcs_stack_words = 0;
1885 pcum->aapcs_reg = NULL_RTX;
1886 }
1887 }
1888
1889 bool
1890 aarch64_function_arg_regno_p (unsigned regno)
1891 {
1892 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1893 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1894 }
1895
1896 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1897 PARM_BOUNDARY bits of alignment, but will be given anything up
1898 to STACK_BOUNDARY bits if the type requires it. This makes sure
1899 that both before and after the layout of each argument, the Next
1900 Stacked Argument Address (NSAA) will have a minimum alignment of
1901 8 bytes. */
1902
1903 static unsigned int
1904 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1905 {
1906 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1907
1908 if (alignment < PARM_BOUNDARY)
1909 alignment = PARM_BOUNDARY;
1910 if (alignment > STACK_BOUNDARY)
1911 alignment = STACK_BOUNDARY;
1912 return alignment;
1913 }
1914
1915 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1916
1917 Return true if an argument passed on the stack should be padded upwards,
1918 i.e. if the least-significant byte of the stack slot has useful data.
1919
1920 Small aggregate types are placed in the lowest memory address.
1921
1922 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1923
1924 bool
1925 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1926 {
1927 /* On little-endian targets, the least significant byte of every stack
1928 argument is passed at the lowest byte address of the stack slot. */
1929 if (!BYTES_BIG_ENDIAN)
1930 return true;
1931
1932 /* Otherwise, integral, floating-point and pointer types are padded downward:
1933 the least significant byte of a stack argument is passed at the highest
1934 byte address of the stack slot. */
1935 if (type
1936 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1937 || POINTER_TYPE_P (type))
1938 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1939 return false;
1940
1941 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1942 return true;
1943 }
1944
1945 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1946
1947 It specifies padding for the last (may also be the only)
1948 element of a block move between registers and memory. Assuming
1949 the block is in memory, padding upward means that the last
1950 element is padded after its most significant byte, while in
1951 downward padding the last element is padded at its least
1952 significant byte side.
1953
1954 Small aggregates and small complex types are always padded
1955 upwards.
1956
1957 We don't need to worry about homogeneous floating-point or
1958 short-vector aggregates; their move is not affected by the
1959 padding direction determined here. Regardless of endianness,
1960 each element of such an aggregate is put in the least
1961 significant bits of a fp/simd register.
1962
1963 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1964 register has useful data, and return the opposite if the most
1965 significant byte does. */
1966
1967 bool
1968 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1969 bool first ATTRIBUTE_UNUSED)
1970 {
1971
1972 /* Small composite types are always padded upward. */
1973 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1974 {
1975 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1976 : GET_MODE_SIZE (mode));
1977 if (size < 2 * UNITS_PER_WORD)
1978 return true;
1979 }
1980
1981 /* Otherwise, use the default padding. */
1982 return !BYTES_BIG_ENDIAN;
1983 }
1984
1985 static machine_mode
1986 aarch64_libgcc_cmp_return_mode (void)
1987 {
1988 return SImode;
1989 }
1990
1991 static bool
1992 aarch64_frame_pointer_required (void)
1993 {
1994 /* In aarch64_override_options_after_change
1995 flag_omit_leaf_frame_pointer turns off the frame pointer by
1996 default. Turn it back on now if we've not got a leaf
1997 function. */
1998 if (flag_omit_leaf_frame_pointer
1999 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2000 return true;
2001
2002 return false;
2003 }
2004
2005 /* Mark the registers that need to be saved by the callee and calculate
2006 the size of the callee-saved registers area and frame record (both FP
2007 and LR may be omitted). */
2008 static void
2009 aarch64_layout_frame (void)
2010 {
2011 HOST_WIDE_INT offset = 0;
2012 int regno;
2013
2014 if (reload_completed && cfun->machine->frame.laid_out)
2015 return;
2016
2017 #define SLOT_NOT_REQUIRED (-2)
2018 #define SLOT_REQUIRED (-1)
2019
2020 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2021 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2022
2023 /* First mark all the registers that really need to be saved... */
2024 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2025 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2026
2027 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2028 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2029
2030 /* ... that includes the eh data registers (if needed)... */
2031 if (crtl->calls_eh_return)
2032 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2033 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2034 = SLOT_REQUIRED;
2035
2036 /* ... and any callee saved register that dataflow says is live. */
2037 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2038 if (df_regs_ever_live_p (regno)
2039 && (regno == R30_REGNUM
2040 || !call_used_regs[regno]))
2041 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2042
2043 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2044 if (df_regs_ever_live_p (regno)
2045 && !call_used_regs[regno])
2046 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2047
2048 if (frame_pointer_needed)
2049 {
2050 /* FP and LR are placed in the linkage record. */
2051 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2052 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2053 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2054 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2055 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2056 offset += 2 * UNITS_PER_WORD;
2057 }
2058
2059 /* Now assign stack slots for them. */
2060 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2061 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2062 {
2063 cfun->machine->frame.reg_offset[regno] = offset;
2064 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2065 cfun->machine->frame.wb_candidate1 = regno;
2066 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2067 cfun->machine->frame.wb_candidate2 = regno;
2068 offset += UNITS_PER_WORD;
2069 }
2070
2071 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2072 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2073 {
2074 cfun->machine->frame.reg_offset[regno] = offset;
2075 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2076 cfun->machine->frame.wb_candidate1 = regno;
2077 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2078 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2079 cfun->machine->frame.wb_candidate2 = regno;
2080 offset += UNITS_PER_WORD;
2081 }
2082
2083 cfun->machine->frame.padding0 =
2084 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2085 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2086
2087 cfun->machine->frame.saved_regs_size = offset;
2088
2089 cfun->machine->frame.hard_fp_offset
2090 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2091 + get_frame_size ()
2092 + cfun->machine->frame.saved_regs_size,
2093 STACK_BOUNDARY / BITS_PER_UNIT);
2094
2095 cfun->machine->frame.frame_size
2096 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2097 + crtl->outgoing_args_size,
2098 STACK_BOUNDARY / BITS_PER_UNIT);
2099
2100 cfun->machine->frame.laid_out = true;
2101 }
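/* Worked example (illustrative, not from the original sources): for a
   function that needs a frame pointer and whose dataflow marks x19,
   x20 and d8 as live, the code above assigns
       reg_offset[x29] = 0, reg_offset[x30] = 8   (the frame record),
       reg_offset[x19] = 16, reg_offset[x20] = 24,
       reg_offset[d8]  = 32,
   giving offset == 40, padding0 == 8 and saved_regs_size == 48 once
   rounded up to the 16-byte stack boundary; x29/x30 become the
   write-back candidates.  */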
2102
2103 static bool
2104 aarch64_register_saved_on_entry (int regno)
2105 {
2106 return cfun->machine->frame.reg_offset[regno] >= 0;
2107 }
2108
2109 static unsigned
2110 aarch64_next_callee_save (unsigned regno, unsigned limit)
2111 {
2112 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2113 regno ++;
2114 return regno;
2115 }
2116
2117 static void
2118 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2119 HOST_WIDE_INT adjustment)
2120 {
2121 rtx base_rtx = stack_pointer_rtx;
2122 rtx insn, reg, mem;
2123
2124 reg = gen_rtx_REG (mode, regno);
2125 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2126 plus_constant (Pmode, base_rtx, -adjustment));
2127 mem = gen_rtx_MEM (mode, mem);
2128
2129 insn = emit_move_insn (mem, reg);
2130 RTX_FRAME_RELATED_P (insn) = 1;
2131 }
2132
2133 static rtx
2134 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2135 HOST_WIDE_INT adjustment)
2136 {
2137 switch (mode)
2138 {
2139 case DImode:
2140 return gen_storewb_pairdi_di (base, base, reg, reg2,
2141 GEN_INT (-adjustment),
2142 GEN_INT (UNITS_PER_WORD - adjustment));
2143 case DFmode:
2144 return gen_storewb_pairdf_di (base, base, reg, reg2,
2145 GEN_INT (-adjustment),
2146 GEN_INT (UNITS_PER_WORD - adjustment));
2147 default:
2148 gcc_unreachable ();
2149 }
2150 }
2151
2152 static void
2153 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2154 unsigned regno2, HOST_WIDE_INT adjustment)
2155 {
2156 rtx_insn *insn;
2157 rtx reg1 = gen_rtx_REG (mode, regno1);
2158 rtx reg2 = gen_rtx_REG (mode, regno2);
2159
2160 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2161 reg2, adjustment));
2162 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2163 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2164 RTX_FRAME_RELATED_P (insn) = 1;
2165 }
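/* For DImode and an adjustment of, say, 32 this expands to the single
   pre-indexed store pair
       stp x29, x30, [sp, #-32]!
   (illustrative; the exact operands depend on the chosen write-back
   candidates).  */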
2166
2167 static rtx
2168 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2169 HOST_WIDE_INT adjustment)
2170 {
2171 switch (mode)
2172 {
2173 case DImode:
2174 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2175 GEN_INT (UNITS_PER_WORD));
2176 case DFmode:
2177 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2178 GEN_INT (UNITS_PER_WORD));
2179 default:
2180 gcc_unreachable ();
2181 }
2182 }
2183
2184 static rtx
2185 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2186 rtx reg2)
2187 {
2188 switch (mode)
2189 {
2190 case DImode:
2191 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2192
2193 case DFmode:
2194 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2195
2196 default:
2197 gcc_unreachable ();
2198 }
2199 }
2200
2201 static rtx
2202 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2203 rtx mem2)
2204 {
2205 switch (mode)
2206 {
2207 case DImode:
2208 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2209
2210 case DFmode:
2211 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2212
2213 default:
2214 gcc_unreachable ();
2215 }
2216 }
2217
2218
2219 static void
2220 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2221 unsigned start, unsigned limit, bool skip_wb)
2222 {
2223 rtx_insn *insn;
2224 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2225 ? gen_frame_mem : gen_rtx_MEM);
2226 unsigned regno;
2227 unsigned regno2;
2228
2229 for (regno = aarch64_next_callee_save (start, limit);
2230 regno <= limit;
2231 regno = aarch64_next_callee_save (regno + 1, limit))
2232 {
2233 rtx reg, mem;
2234 HOST_WIDE_INT offset;
2235
2236 if (skip_wb
2237 && (regno == cfun->machine->frame.wb_candidate1
2238 || regno == cfun->machine->frame.wb_candidate2))
2239 continue;
2240
2241 reg = gen_rtx_REG (mode, regno);
2242 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2243 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2244 offset));
2245
2246 regno2 = aarch64_next_callee_save (regno + 1, limit);
2247
2248 if (regno2 <= limit
2249 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2250 == cfun->machine->frame.reg_offset[regno2]))
2251
2252 {
2253 rtx reg2 = gen_rtx_REG (mode, regno2);
2254 rtx mem2;
2255
2256 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2257 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2258 offset));
2259 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2260 reg2));
2261
2262 /* The first part of a frame-related parallel insn is
2263 always assumed to be relevant to the frame
2264 calculations; subsequent parts are only
2265 frame-related if explicitly marked. */
2266 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2267 regno = regno2;
2268 }
2269 else
2270 insn = emit_move_insn (mem, reg);
2271
2272 RTX_FRAME_RELATED_P (insn) = 1;
2273 }
2274 }
2275
2276 static void
2277 aarch64_restore_callee_saves (machine_mode mode,
2278 HOST_WIDE_INT start_offset, unsigned start,
2279 unsigned limit, bool skip_wb, rtx *cfi_ops)
2280 {
2281 rtx base_rtx = stack_pointer_rtx;
2282 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2283 ? gen_frame_mem : gen_rtx_MEM);
2284 unsigned regno;
2285 unsigned regno2;
2286 HOST_WIDE_INT offset;
2287
2288 for (regno = aarch64_next_callee_save (start, limit);
2289 regno <= limit;
2290 regno = aarch64_next_callee_save (regno + 1, limit))
2291 {
2292 rtx reg, mem;
2293
2294 if (skip_wb
2295 && (regno == cfun->machine->frame.wb_candidate1
2296 || regno == cfun->machine->frame.wb_candidate2))
2297 continue;
2298
2299 reg = gen_rtx_REG (mode, regno);
2300 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2301 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2302
2303 regno2 = aarch64_next_callee_save (regno + 1, limit);
2304
2305 if (regno2 <= limit
2306 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2307 == cfun->machine->frame.reg_offset[regno2]))
2308 {
2309 rtx reg2 = gen_rtx_REG (mode, regno2);
2310 rtx mem2;
2311
2312 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2313 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2314 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2315
2316 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2317 regno = regno2;
2318 }
2319 else
2320 emit_move_insn (reg, mem);
2321 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2322 }
2323 }
2324
2325 /* AArch64 stack frames generated by this compiler look like:
2326
2327 +-------------------------------+
2328 | |
2329 | incoming stack arguments |
2330 | |
2331 +-------------------------------+
2332 | | <-- incoming stack pointer (aligned)
2333 | callee-allocated save area |
2334 | for register varargs |
2335 | |
2336 +-------------------------------+
2337 | local variables | <-- frame_pointer_rtx
2338 | |
2339 +-------------------------------+
2340 | padding0 | \
2341 +-------------------------------+ |
2342 | callee-saved registers | | frame.saved_regs_size
2343 +-------------------------------+ |
2344 | LR' | |
2345 +-------------------------------+ |
2346 | FP' | / <- hard_frame_pointer_rtx (aligned)
2347 +-------------------------------+
2348 | dynamic allocation |
2349 +-------------------------------+
2350 | padding |
2351 +-------------------------------+
2352 | outgoing stack arguments | <-- arg_pointer
2353 | |
2354 +-------------------------------+
2355 | | <-- stack_pointer_rtx (aligned)
2356
2357 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2358 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2359 unchanged. */
2360
2361 /* Generate the prologue instructions for entry into a function.
2362 Establish the stack frame by decreasing the stack pointer with a
2363 properly calculated size and, if necessary, create a frame record
2364 filled with the values of LR and previous frame pointer. The
2365 current FP is also set up if it is in use. */
2366
2367 void
2368 aarch64_expand_prologue (void)
2369 {
2370 /* sub sp, sp, #<frame_size>
2371 stp {fp, lr}, [sp, #<frame_size> - 16]
2372 add fp, sp, #<frame_size> - hardfp_offset
2373 stp {cs_reg}, [fp, #-16] etc.
2374
2375 sub sp, sp, <final_adjustment_if_any>
2376 */
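/* A concrete instance of the sequence above (illustrative only): for a
   48-byte frame with no locals and no outgoing arguments this becomes
   roughly
       stp x29, x30, [sp, #-48]!
       add x29, sp, #0
       str x19, [sp, #16]
   so the write-back store pair both allocates the frame and saves the
   frame record in one instruction.  */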
2377 HOST_WIDE_INT frame_size, offset;
2378 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2379 HOST_WIDE_INT hard_fp_offset;
2380 rtx_insn *insn;
2381
2382 aarch64_layout_frame ();
2383
2384 offset = frame_size = cfun->machine->frame.frame_size;
2385 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2386 fp_offset = frame_size - hard_fp_offset;
2387
2388 if (flag_stack_usage_info)
2389 current_function_static_stack_size = frame_size;
2390
2391 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
2392 if (offset >= 512)
2393 {
2394 /* When the frame has a large size, an initial decrease is done on
2395 the stack pointer to jump over the callee-allocated save area for
2396 register varargs, the local variable area and/or the callee-saved
2397 register area. This will allow the pre-index write-back
2398 store pair instructions to be used for setting up the stack frame
2399 efficiently. */
2400 offset = hard_fp_offset;
2401 if (offset >= 512)
2402 offset = cfun->machine->frame.saved_regs_size;
2403
2404 frame_size -= (offset + crtl->outgoing_args_size);
2405 fp_offset = 0;
2406
2407 if (frame_size >= 0x1000000)
2408 {
2409 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2410 emit_move_insn (op0, GEN_INT (-frame_size));
2411 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2412
2413 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2414 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2415 plus_constant (Pmode, stack_pointer_rtx,
2416 -frame_size)));
2417 RTX_FRAME_RELATED_P (insn) = 1;
2418 }
2419 else if (frame_size > 0)
2420 {
2421 int hi_ofs = frame_size & 0xfff000;
2422 int lo_ofs = frame_size & 0x000fff;
2423
2424 if (hi_ofs)
2425 {
2426 insn = emit_insn (gen_add2_insn
2427 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2428 RTX_FRAME_RELATED_P (insn) = 1;
2429 }
2430 if (lo_ofs)
2431 {
2432 insn = emit_insn (gen_add2_insn
2433 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2434 RTX_FRAME_RELATED_P (insn) = 1;
2435 }
2436 }
2437 }
2438 else
2439 frame_size = -1;
2440
2441 if (offset > 0)
2442 {
2443 bool skip_wb = false;
2444
2445 if (frame_pointer_needed)
2446 {
2447 skip_wb = true;
2448
2449 if (fp_offset)
2450 {
2451 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2452 GEN_INT (-offset)));
2453 RTX_FRAME_RELATED_P (insn) = 1;
2454
2455 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2456 R30_REGNUM, false);
2457 }
2458 else
2459 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2460
2461 /* Set up frame pointer to point to the location of the
2462 previous frame pointer on the stack. */
2463 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2464 stack_pointer_rtx,
2465 GEN_INT (fp_offset)));
2466 RTX_FRAME_RELATED_P (insn) = 1;
2467 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2468 }
2469 else
2470 {
2471 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2472 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2473
2474 if (fp_offset
2475 || reg1 == FIRST_PSEUDO_REGISTER
2476 || (reg2 == FIRST_PSEUDO_REGISTER
2477 && offset >= 256))
2478 {
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2480 GEN_INT (-offset)));
2481 RTX_FRAME_RELATED_P (insn) = 1;
2482 }
2483 else
2484 {
2485 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2486
2487 skip_wb = true;
2488
2489 if (reg2 == FIRST_PSEUDO_REGISTER)
2490 aarch64_pushwb_single_reg (mode1, reg1, offset);
2491 else
2492 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2493 }
2494 }
2495
2496 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2497 skip_wb);
2498 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2499 skip_wb);
2500 }
2501
2502 /* When offset >= 512,
2503 sub sp, sp, #<outgoing_args_size> */
2504 if (frame_size > -1)
2505 {
2506 if (crtl->outgoing_args_size > 0)
2507 {
2508 insn = emit_insn (gen_add2_insn
2509 (stack_pointer_rtx,
2510 GEN_INT (- crtl->outgoing_args_size)));
2511 RTX_FRAME_RELATED_P (insn) = 1;
2512 }
2513 }
2514 }
2515
2516 /* Return TRUE if we can use a simple_return insn.
2517
2518 This function checks whether the callee-saved stack is empty, which
2519 means no restore actions are needed. The pro_and_epilogue pass uses
2520 this to check whether the shrink-wrapping optimization is feasible. */
2521
2522 bool
2523 aarch64_use_return_insn_p (void)
2524 {
2525 if (!reload_completed)
2526 return false;
2527
2528 if (crtl->profile)
2529 return false;
2530
2531 aarch64_layout_frame ();
2532
2533 return cfun->machine->frame.frame_size == 0;
2534 }
2535
2536 /* Generate the epilogue instructions for returning from a function. */
2537 void
2538 aarch64_expand_epilogue (bool for_sibcall)
2539 {
2540 HOST_WIDE_INT frame_size, offset;
2541 HOST_WIDE_INT fp_offset;
2542 HOST_WIDE_INT hard_fp_offset;
2543 rtx_insn *insn;
2544 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2545 bool need_barrier_p = (get_frame_size () != 0
2546 || cfun->machine->frame.saved_varargs_size);
2547
2548 aarch64_layout_frame ();
2549
2550 offset = frame_size = cfun->machine->frame.frame_size;
2551 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2552 fp_offset = frame_size - hard_fp_offset;
2553
2554 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
2555 if (offset >= 512)
2556 {
2557 offset = hard_fp_offset;
2558 if (offset >= 512)
2559 offset = cfun->machine->frame.saved_regs_size;
2560
2561 frame_size -= (offset + crtl->outgoing_args_size);
2562 fp_offset = 0;
2563 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2564 {
2565 insn = emit_insn (gen_add2_insn
2566 (stack_pointer_rtx,
2567 GEN_INT (crtl->outgoing_args_size)));
2568 RTX_FRAME_RELATED_P (insn) = 1;
2569 }
2570 }
2571 else
2572 frame_size = -1;
2573
2574 /* If there were outgoing arguments or we've done dynamic stack
2575 allocation, then restore the stack pointer from the frame
2576 pointer. This is at most one insn and more efficient than using
2577 GCC's internal mechanism. */
2578 if (frame_pointer_needed
2579 && (crtl->outgoing_args_size || cfun->calls_alloca))
2580 {
2581 if (cfun->calls_alloca)
2582 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2583
2584 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2585 hard_frame_pointer_rtx,
2586 GEN_INT (0)));
2587 offset = offset - fp_offset;
2588 }
2589
2590 if (offset > 0)
2591 {
2592 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2593 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2594 bool skip_wb = true;
2595 rtx cfi_ops = NULL;
2596
2597 if (frame_pointer_needed)
2598 fp_offset = 0;
2599 else if (fp_offset
2600 || reg1 == FIRST_PSEUDO_REGISTER
2601 || (reg2 == FIRST_PSEUDO_REGISTER
2602 && offset >= 256))
2603 skip_wb = false;
2604
2605 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2606 skip_wb, &cfi_ops);
2607 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2608 skip_wb, &cfi_ops);
2609
2610 if (need_barrier_p)
2611 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2612
2613 if (skip_wb)
2614 {
2615 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2616 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2617
2618 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2619 if (reg2 == FIRST_PSEUDO_REGISTER)
2620 {
2621 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2622 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2623 mem = gen_rtx_MEM (mode1, mem);
2624 insn = emit_move_insn (rreg1, mem);
2625 }
2626 else
2627 {
2628 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2629
2630 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2631 insn = emit_insn (aarch64_gen_loadwb_pair
2632 (mode1, stack_pointer_rtx, rreg1,
2633 rreg2, offset));
2634 }
2635 }
2636 else
2637 {
2638 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2639 GEN_INT (offset)));
2640 }
2641
2642 /* Reset the CFA to be SP + FRAME_SIZE. */
2643 rtx new_cfa = stack_pointer_rtx;
2644 if (frame_size > 0)
2645 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2646 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2647 REG_NOTES (insn) = cfi_ops;
2648 RTX_FRAME_RELATED_P (insn) = 1;
2649 }
2650
2651 if (frame_size > 0)
2652 {
2653 if (need_barrier_p)
2654 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2655
2656 if (frame_size >= 0x1000000)
2657 {
2658 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2659 emit_move_insn (op0, GEN_INT (frame_size));
2660 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2661 }
2662 else
2663 {
2664 int hi_ofs = frame_size & 0xfff000;
2665 int lo_ofs = frame_size & 0x000fff;
2666
2667 if (hi_ofs && lo_ofs)
2668 {
2669 insn = emit_insn (gen_add2_insn
2670 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2671 RTX_FRAME_RELATED_P (insn) = 1;
2672 frame_size = lo_ofs;
2673 }
2674 insn = emit_insn (gen_add2_insn
2675 (stack_pointer_rtx, GEN_INT (frame_size)));
2676 }
2677
2678 /* Reset the CFA to be SP + 0. */
2679 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2680 RTX_FRAME_RELATED_P (insn) = 1;
2681 }
2682
2683 /* Stack adjustment for exception handler. */
2684 if (crtl->calls_eh_return)
2685 {
2686 /* We need to unwind the stack by the offset computed by
2687 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2688 to be SP; letting the CFA move during this adjustment
2689 is just as correct as retaining the CFA from the body
2690 of the function. Therefore, do nothing special. */
2691 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2692 }
2693
2694 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2695 if (!for_sibcall)
2696 emit_jump_insn (ret_rtx);
2697 }
2698
2699 /* Return the place to copy the exception unwinding return address to.
2700 This will probably be a stack slot, but could (in theory) be the
2701 return register. */
2702 rtx
2703 aarch64_final_eh_return_addr (void)
2704 {
2705 HOST_WIDE_INT fp_offset;
2706
2707 aarch64_layout_frame ();
2708
2709 fp_offset = cfun->machine->frame.frame_size
2710 - cfun->machine->frame.hard_fp_offset;
2711
2712 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2713 return gen_rtx_REG (DImode, LR_REGNUM);
2714
2715 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2716 result in a store to save LR introduced by builtin_eh_return () being
2717 incorrectly deleted because the alias is not detected.
2718 So in the calculation of the address to copy the exception unwinding
2719 return address to, we distinguish two cases.
2720 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2721 we return a SP-relative location since all the addresses are SP-relative
2722 in this case. This prevents the store from being optimized away.
2723 If the fp_offset is not 0, then the addresses will be FP-relative and
2724 therefore we return a FP-relative location. */
2725
2726 if (frame_pointer_needed)
2727 {
2728 if (fp_offset)
2729 return gen_frame_mem (DImode,
2730 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2731 else
2732 return gen_frame_mem (DImode,
2733 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2734 }
2735
2736 /* If FP is not needed, we calculate the location of LR, which would be
2737 at the top of the saved registers block. */
2738
2739 return gen_frame_mem (DImode,
2740 plus_constant (Pmode,
2741 stack_pointer_rtx,
2742 fp_offset
2743 + cfun->machine->frame.saved_regs_size
2744 - 2 * UNITS_PER_WORD));
2745 }
2746
2747 /* Possibly output code to build up a constant in a register. For
2748 the benefit of the costs infrastructure, returns the number of
2749 instructions which would be emitted. GENERATE inhibits or
2750 enables code generation. */
2751
2752 static int
2753 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2754 {
2755 int insns = 0;
2756
2757 if (aarch64_bitmask_imm (val, DImode))
2758 {
2759 if (generate)
2760 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2761 insns = 1;
2762 }
2763 else
2764 {
2765 int i;
2766 int ncount = 0;
2767 int zcount = 0;
2768 HOST_WIDE_INT valp = val >> 16;
2769 HOST_WIDE_INT valm;
2770 HOST_WIDE_INT tval;
2771
2772 for (i = 16; i < 64; i += 16)
2773 {
2774 valm = (valp & 0xffff);
2775
2776 if (valm != 0)
2777 ++ zcount;
2778
2779 if (valm != 0xffff)
2780 ++ ncount;
2781
2782 valp >>= 16;
2783 }
2784
2785 /* zcount contains the number of additional MOVK instructions
2786 required if the constant is built up with an initial MOVZ instruction,
2787 while ncount is the number of MOVK instructions required if starting
2788 with a MOVN instruction. Choose the sequence that requires the
2789 fewest instructions, preferring the MOVZ sequence when both counts
2790 are the same. */
2791 if (ncount < zcount)
2792 {
2793 if (generate)
2794 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2795 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2796 tval = 0xffff;
2797 insns++;
2798 }
2799 else
2800 {
2801 if (generate)
2802 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2803 GEN_INT (val & 0xffff));
2804 tval = 0;
2805 insns++;
2806 }
2807
2808 val >>= 16;
2809
2810 for (i = 16; i < 64; i += 16)
2811 {
2812 if ((val & 0xffff) != tval)
2813 {
2814 if (generate)
2815 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2816 GEN_INT (i),
2817 GEN_INT (val & 0xffff)));
2818 insns++;
2819 }
2820 val >>= 16;
2821 }
2822 }
2823 return insns;
2824 }
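/* Worked example (illustrative): for VAL == 0x1234ffffffff5678 the
   16-bit chunks above the low one are 0xffff, 0xffff and 0x1234, so
   zcount == 3 and ncount == 1.  The MOVN-based sequence wins and only
   two instructions are emitted:
       movn xN, #0xa987                 (loads 0xffffffffffff5678)
       movk xN, #0x1234, lsl #48
   */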
2825
2826 static void
2827 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2828 {
2829 HOST_WIDE_INT mdelta = delta;
2830 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2831 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2832
2833 if (mdelta < 0)
2834 mdelta = -mdelta;
2835
2836 if (mdelta >= 4096 * 4096)
2837 {
2838 (void) aarch64_build_constant (scratchreg, delta, true);
2839 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2840 }
2841 else if (mdelta > 0)
2842 {
2843 if (mdelta >= 4096)
2844 {
2845 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2846 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2847 if (delta < 0)
2848 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2849 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2850 else
2851 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2852 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2853 }
2854 if (mdelta % 4096 != 0)
2855 {
2856 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2857 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2858 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2859 }
2860 }
2861 }
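/* Illustrative expansion (values made up): for DELTA == 0x3456 the
   code above emits roughly
       mov  xscratch, #3                (0x3456 / 4096)
       add  xdst, xdst, xscratch, lsl #12
       add  xdst, xdst, #0x456          (0x3456 % 4096)
   while a delta of 0x1000000 (4096 * 4096) or more falls back to
   aarch64_build_constant plus a single register-register add.  */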
2862
2863 /* Output code to add DELTA to the first argument, and then jump
2864 to FUNCTION. Used for C++ multiple inheritance. */
2865 static void
2866 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2867 HOST_WIDE_INT delta,
2868 HOST_WIDE_INT vcall_offset,
2869 tree function)
2870 {
2871 /* The this pointer is always in x0. Note that this differs from
2872 Arm where the this pointer may be bumped to r1 if r0 is required
2873 to return a pointer to an aggregate. On AArch64 a result value
2874 pointer will be in x8. */
2875 int this_regno = R0_REGNUM;
2876 rtx this_rtx, temp0, temp1, addr, funexp;
2877 rtx_insn *insn;
2878
2879 reload_completed = 1;
2880 emit_note (NOTE_INSN_PROLOGUE_END);
2881
2882 if (vcall_offset == 0)
2883 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2884 else
2885 {
2886 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2887
2888 this_rtx = gen_rtx_REG (Pmode, this_regno);
2889 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2890 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2891
2892 addr = this_rtx;
2893 if (delta != 0)
2894 {
2895 if (delta >= -256 && delta < 256)
2896 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2897 plus_constant (Pmode, this_rtx, delta));
2898 else
2899 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2900 }
2901
2902 if (Pmode == ptr_mode)
2903 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2904 else
2905 aarch64_emit_move (temp0,
2906 gen_rtx_ZERO_EXTEND (Pmode,
2907 gen_rtx_MEM (ptr_mode, addr)));
2908
2909 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2910 addr = plus_constant (Pmode, temp0, vcall_offset);
2911 else
2912 {
2913 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2914 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2915 }
2916
2917 if (Pmode == ptr_mode)
2918 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2919 else
2920 aarch64_emit_move (temp1,
2921 gen_rtx_SIGN_EXTEND (Pmode,
2922 gen_rtx_MEM (ptr_mode, addr)));
2923
2924 emit_insn (gen_add2_insn (this_rtx, temp1));
2925 }
2926
2927 /* Generate a tail call to the target function. */
2928 if (!TREE_USED (function))
2929 {
2930 assemble_external (function);
2931 TREE_USED (function) = 1;
2932 }
2933 funexp = XEXP (DECL_RTL (function), 0);
2934 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2935 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2936 SIBLING_CALL_P (insn) = 1;
2937
2938 insn = get_insns ();
2939 shorten_branches (insn);
2940 final_start_function (insn, file, 1);
2941 final (insn, file, 1);
2942 final_end_function ();
2943
2944 /* Stop pretending to be a post-reload pass. */
2945 reload_completed = 0;
2946 }
2947
2948 static bool
2949 aarch64_tls_referenced_p (rtx x)
2950 {
2951 if (!TARGET_HAVE_TLS)
2952 return false;
2953 subrtx_iterator::array_type array;
2954 FOR_EACH_SUBRTX (iter, array, x, ALL)
2955 {
2956 const_rtx x = *iter;
2957 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2958 return true;
2959 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2960 TLS offsets, not real symbol references. */
2961 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2962 iter.skip_subrtxes ();
2963 }
2964 return false;
2965 }
2966
2967
2968 static int
2969 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2970 {
2971 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2972 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2973
2974 if (*imm1 < *imm2)
2975 return -1;
2976 if (*imm1 > *imm2)
2977 return +1;
2978 return 0;
2979 }
2980
2981
2982 static void
2983 aarch64_build_bitmask_table (void)
2984 {
2985 unsigned HOST_WIDE_INT mask, imm;
2986 unsigned int log_e, e, s, r;
2987 unsigned int nimms = 0;
2988
2989 for (log_e = 1; log_e <= 6; log_e++)
2990 {
2991 e = 1 << log_e;
2992 if (e == 64)
2993 mask = ~(HOST_WIDE_INT) 0;
2994 else
2995 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2996 for (s = 1; s < e; s++)
2997 {
2998 for (r = 0; r < e; r++)
2999 {
3000 /* Set S consecutive bits to 1 (S < 64). */
3001 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3002 /* Rotate right by R. */
3003 if (r != 0)
3004 imm = ((imm >> r) | (imm << (e - r))) & mask;
3005 /* Replicate the constant to the full SIMD width; the cases below deliberately fall through. */
3006 switch (log_e) {
3007 case 1: imm |= (imm << 2);
3008 case 2: imm |= (imm << 4);
3009 case 3: imm |= (imm << 8);
3010 case 4: imm |= (imm << 16);
3011 case 5: imm |= (imm << 32);
3012 case 6:
3013 break;
3014 default:
3015 gcc_unreachable ();
3016 }
3017 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3018 aarch64_bitmasks[nimms++] = imm;
3019 }
3020 }
3021 }
3022
3023 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3024 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3025 aarch64_bitmasks_cmp);
3026 }
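/* Illustrative sketch (not part of the original sources, kept disabled):
   the helper below rebuilds a single bitmask immediate from an element
   size E, run length S and rotation R, mirroring the loop above.  For
   E == 8, S == 3, R == 1 it returns 0x8383838383838383.  */
#if 0
static unsigned HOST_WIDE_INT
aarch64_bitmask_example (unsigned int e, unsigned int s, unsigned int r)
{
  unsigned HOST_WIDE_INT mask
    = (e == 64 ? ~(unsigned HOST_WIDE_INT) 0
       : ((unsigned HOST_WIDE_INT) 1 << e) - 1);
  /* S consecutive low bits set...  */
  unsigned HOST_WIDE_INT imm = ((unsigned HOST_WIDE_INT) 1 << s) - 1;
  /* ...rotated right by R within the E-bit element...  */
  if (r != 0)
    imm = ((imm >> r) | (imm << (e - r))) & mask;
  /* ...and replicated to fill all 64 bits.  */
  for (unsigned int width = e; width < 64; width *= 2)
    imm |= imm << width;
  return imm;
}
#endif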
3027
3028
3029 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3030 a left shift of 0 or 12 bits. */
3031 bool
3032 aarch64_uimm12_shift (HOST_WIDE_INT val)
3033 {
3034 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3035 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3036 );
3037 }
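/* For example, 0xabc and 0xabc000 both satisfy this test (shift 0 and
   shift 12 respectively), whereas 0xabc00 does not, since its set bits
   straddle the two 12-bit windows.  */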
3038
3039
3040 /* Return true if val is an immediate that can be loaded into a
3041 register by a MOVZ instruction. */
3042 static bool
3043 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3044 {
3045 if (GET_MODE_SIZE (mode) > 4)
3046 {
3047 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3048 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3049 return 1;
3050 }
3051 else
3052 {
3053 /* Ignore sign extension. */
3054 val &= (HOST_WIDE_INT) 0xffffffff;
3055 }
3056 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3057 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3058 }
3059
3060
3061 /* Return true if val is a valid bitmask immediate. */
3062 bool
3063 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3064 {
3065 if (GET_MODE_SIZE (mode) < 8)
3066 {
3067 /* Replicate bit pattern. */
3068 val &= (HOST_WIDE_INT) 0xffffffff;
3069 val |= val << 32;
3070 }
3071 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3072 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3073 }
3074
3075
3076 /* Return true if val is an immediate that can be loaded into a
3077 register in a single instruction. */
3078 bool
3079 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3080 {
3081 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3082 return 1;
3083 return aarch64_bitmask_imm (val, mode);
3084 }
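/* A few illustrative immediates: 0xf00d0000 is accepted via
   aarch64_movw_imm (a MOVZ with a 16-bit shift), 0xffffffffffff1234
   via the inverted test (a single MOVN), and 0x00ff00ff00ff00ff via
   aarch64_bitmask_imm (a logical-immediate move).  */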
3085
3086 static bool
3087 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3088 {
3089 rtx base, offset;
3090
3091 if (GET_CODE (x) == HIGH)
3092 return true;
3093
3094 split_const (x, &base, &offset);
3095 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3096 {
3097 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3098 != SYMBOL_FORCE_TO_MEM)
3099 return true;
3100 else
3101 /* Avoid generating a 64-bit relocation in ILP32; leave
3102 to aarch64_expand_mov_immediate to handle it properly. */
3103 return mode != ptr_mode;
3104 }
3105
3106 return aarch64_tls_referenced_p (x);
3107 }
3108
3109 /* Return true if register REGNO is a valid index register.
3110 STRICT_P is true if REG_OK_STRICT is in effect. */
3111
3112 bool
3113 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3114 {
3115 if (!HARD_REGISTER_NUM_P (regno))
3116 {
3117 if (!strict_p)
3118 return true;
3119
3120 if (!reg_renumber)
3121 return false;
3122
3123 regno = reg_renumber[regno];
3124 }
3125 return GP_REGNUM_P (regno);
3126 }
3127
3128 /* Return true if register REGNO is a valid base register.
3129 STRICT_P is true if REG_OK_STRICT is in effect. */
3130
3131 bool
3132 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3133 {
3134 if (!HARD_REGISTER_NUM_P (regno))
3135 {
3136 if (!strict_p)
3137 return true;
3138
3139 if (!reg_renumber)
3140 return false;
3141
3142 regno = reg_renumber[regno];
3143 }
3144
3145 /* The fake registers will be eliminated to either the stack or
3146 hard frame pointer, both of which are usually valid base registers.
3147 Reload deals with the cases where the eliminated form isn't valid. */
3148 return (GP_REGNUM_P (regno)
3149 || regno == SP_REGNUM
3150 || regno == FRAME_POINTER_REGNUM
3151 || regno == ARG_POINTER_REGNUM);
3152 }
3153
3154 /* Return true if X is a valid base register.
3155 STRICT_P is true if REG_OK_STRICT is in effect. */
3156
3157 static bool
3158 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3159 {
3160 if (!strict_p && GET_CODE (x) == SUBREG)
3161 x = SUBREG_REG (x);
3162
3163 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3164 }
3165
3166 /* Return true if address offset is a valid index. If it is, fill in INFO
3167 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3168
3169 static bool
3170 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3171 machine_mode mode, bool strict_p)
3172 {
3173 enum aarch64_address_type type;
3174 rtx index;
3175 int shift;
3176
3177 /* (reg:P) */
3178 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3179 && GET_MODE (x) == Pmode)
3180 {
3181 type = ADDRESS_REG_REG;
3182 index = x;
3183 shift = 0;
3184 }
3185 /* (sign_extend:DI (reg:SI)) */
3186 else if ((GET_CODE (x) == SIGN_EXTEND
3187 || GET_CODE (x) == ZERO_EXTEND)
3188 && GET_MODE (x) == DImode
3189 && GET_MODE (XEXP (x, 0)) == SImode)
3190 {
3191 type = (GET_CODE (x) == SIGN_EXTEND)
3192 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3193 index = XEXP (x, 0);
3194 shift = 0;
3195 }
3196 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3197 else if (GET_CODE (x) == MULT
3198 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3199 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3200 && GET_MODE (XEXP (x, 0)) == DImode
3201 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3202 && CONST_INT_P (XEXP (x, 1)))
3203 {
3204 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3205 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3206 index = XEXP (XEXP (x, 0), 0);
3207 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3208 }
3209 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3210 else if (GET_CODE (x) == ASHIFT
3211 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3212 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3213 && GET_MODE (XEXP (x, 0)) == DImode
3214 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3215 && CONST_INT_P (XEXP (x, 1)))
3216 {
3217 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3218 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3219 index = XEXP (XEXP (x, 0), 0);
3220 shift = INTVAL (XEXP (x, 1));
3221 }
3222 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3223 else if ((GET_CODE (x) == SIGN_EXTRACT
3224 || GET_CODE (x) == ZERO_EXTRACT)
3225 && GET_MODE (x) == DImode
3226 && GET_CODE (XEXP (x, 0)) == MULT
3227 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3228 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3229 {
3230 type = (GET_CODE (x) == SIGN_EXTRACT)
3231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3232 index = XEXP (XEXP (x, 0), 0);
3233 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3234 if (INTVAL (XEXP (x, 1)) != 32 + shift
3235 || INTVAL (XEXP (x, 2)) != 0)
3236 shift = -1;
3237 }
3238 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3239 (const_int 0xffffffff<<shift)) */
3240 else if (GET_CODE (x) == AND
3241 && GET_MODE (x) == DImode
3242 && GET_CODE (XEXP (x, 0)) == MULT
3243 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3244 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3245 && CONST_INT_P (XEXP (x, 1)))
3246 {
3247 type = ADDRESS_REG_UXTW;
3248 index = XEXP (XEXP (x, 0), 0);
3249 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3250 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3251 shift = -1;
3252 }
3253 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3254 else if ((GET_CODE (x) == SIGN_EXTRACT
3255 || GET_CODE (x) == ZERO_EXTRACT)
3256 && GET_MODE (x) == DImode
3257 && GET_CODE (XEXP (x, 0)) == ASHIFT
3258 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3259 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3260 {
3261 type = (GET_CODE (x) == SIGN_EXTRACT)
3262 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3263 index = XEXP (XEXP (x, 0), 0);
3264 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3265 if (INTVAL (XEXP (x, 1)) != 32 + shift
3266 || INTVAL (XEXP (x, 2)) != 0)
3267 shift = -1;
3268 }
3269 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3270 (const_int 0xffffffff<<shift)) */
3271 else if (GET_CODE (x) == AND
3272 && GET_MODE (x) == DImode
3273 && GET_CODE (XEXP (x, 0)) == ASHIFT
3274 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3275 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3276 && CONST_INT_P (XEXP (x, 1)))
3277 {
3278 type = ADDRESS_REG_UXTW;
3279 index = XEXP (XEXP (x, 0), 0);
3280 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3281 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3282 shift = -1;
3283 }
3284 /* (mult:P (reg:P) (const_int scale)) */
3285 else if (GET_CODE (x) == MULT
3286 && GET_MODE (x) == Pmode
3287 && GET_MODE (XEXP (x, 0)) == Pmode
3288 && CONST_INT_P (XEXP (x, 1)))
3289 {
3290 type = ADDRESS_REG_REG;
3291 index = XEXP (x, 0);
3292 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3293 }
3294 /* (ashift:P (reg:P) (const_int shift)) */
3295 else if (GET_CODE (x) == ASHIFT
3296 && GET_MODE (x) == Pmode
3297 && GET_MODE (XEXP (x, 0)) == Pmode
3298 && CONST_INT_P (XEXP (x, 1)))
3299 {
3300 type = ADDRESS_REG_REG;
3301 index = XEXP (x, 0);
3302 shift = INTVAL (XEXP (x, 1));
3303 }
3304 else
3305 return false;
3306
3307 if (GET_CODE (index) == SUBREG)
3308 index = SUBREG_REG (index);
3309
3310 if ((shift == 0
3311 || (shift > 0 && shift <= 3
3312 && (1 << shift) == GET_MODE_SIZE (mode)))
3313 && REG_P (index)
3314 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3315 {
3316 info->type = type;
3317 info->offset = index;
3318 info->shift = shift;
3319 return true;
3320 }
3321
3322 return false;
3323 }
3324
3325 bool
3326 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3327 {
3328 return (offset >= -64 * GET_MODE_SIZE (mode)
3329 && offset < 64 * GET_MODE_SIZE (mode)
3330 && offset % GET_MODE_SIZE (mode) == 0);
3331 }
3332
3333 static inline bool
3334 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3335 HOST_WIDE_INT offset)
3336 {
3337 return offset >= -256 && offset < 256;
3338 }
3339
3340 static inline bool
3341 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3342 {
3343 return (offset >= 0
3344 && offset < 4096 * GET_MODE_SIZE (mode)
3345 && offset % GET_MODE_SIZE (mode) == 0);
3346 }
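/* Taking DImode (8-byte) accesses as an example: the 7-bit signed
   scaled form covers multiples of 8 in [-512, 504], the 9-bit signed
   unscaled form covers any offset in [-256, 255], and the 12-bit
   unsigned scaled form covers multiples of 8 in [0, 32760].  */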
3347
3348 /* Return true if X is a valid address for machine mode MODE. If it is,
3349 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3350 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3351
3352 static bool
3353 aarch64_classify_address (struct aarch64_address_info *info,
3354 rtx x, machine_mode mode,
3355 RTX_CODE outer_code, bool strict_p)
3356 {
3357 enum rtx_code code = GET_CODE (x);
3358 rtx op0, op1;
3359 bool allow_reg_index_p =
3360 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3361 || aarch64_vector_mode_supported_p (mode));
3362 /* Don't support anything other than POST_INC or REG addressing for
3363 AdvSIMD. */
3364 if (aarch64_vect_struct_mode_p (mode)
3365 && (code != POST_INC && code != REG))
3366 return false;
3367
3368 switch (code)
3369 {
3370 case REG:
3371 case SUBREG:
3372 info->type = ADDRESS_REG_IMM;
3373 info->base = x;
3374 info->offset = const0_rtx;
3375 return aarch64_base_register_rtx_p (x, strict_p);
3376
3377 case PLUS:
3378 op0 = XEXP (x, 0);
3379 op1 = XEXP (x, 1);
3380
3381 if (! strict_p
3382 && REG_P (op0)
3383 && (op0 == virtual_stack_vars_rtx
3384 || op0 == frame_pointer_rtx
3385 || op0 == arg_pointer_rtx)
3386 && CONST_INT_P (op1))
3387 {
3388 info->type = ADDRESS_REG_IMM;
3389 info->base = op0;
3390 info->offset = op1;
3391
3392 return true;
3393 }
3394
3395 if (GET_MODE_SIZE (mode) != 0
3396 && CONST_INT_P (op1)
3397 && aarch64_base_register_rtx_p (op0, strict_p))
3398 {
3399 HOST_WIDE_INT offset = INTVAL (op1);
3400
3401 info->type = ADDRESS_REG_IMM;
3402 info->base = op0;
3403 info->offset = op1;
3404
3405 /* TImode and TFmode values are allowed in both pairs of X
3406 registers and individual Q registers. The available
3407 address modes are:
3408 X,X: 7-bit signed scaled offset
3409 Q: 9-bit signed offset
3410 We conservatively require an offset representable in either mode.
3411 */
3412 if (mode == TImode || mode == TFmode)
3413 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3414 && offset_9bit_signed_unscaled_p (mode, offset));
3415
3416 if (outer_code == PARALLEL)
3417 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3418 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3419 else
3420 return (offset_9bit_signed_unscaled_p (mode, offset)
3421 || offset_12bit_unsigned_scaled_p (mode, offset));
3422 }
3423
3424 if (allow_reg_index_p)
3425 {
3426 /* Look for base + (scaled/extended) index register. */
3427 if (aarch64_base_register_rtx_p (op0, strict_p)
3428 && aarch64_classify_index (info, op1, mode, strict_p))
3429 {
3430 info->base = op0;
3431 return true;
3432 }
3433 if (aarch64_base_register_rtx_p (op1, strict_p)
3434 && aarch64_classify_index (info, op0, mode, strict_p))
3435 {
3436 info->base = op1;
3437 return true;
3438 }
3439 }
3440
3441 return false;
3442
3443 case POST_INC:
3444 case POST_DEC:
3445 case PRE_INC:
3446 case PRE_DEC:
3447 info->type = ADDRESS_REG_WB;
3448 info->base = XEXP (x, 0);
3449 info->offset = NULL_RTX;
3450 return aarch64_base_register_rtx_p (info->base, strict_p);
3451
3452 case POST_MODIFY:
3453 case PRE_MODIFY:
3454 info->type = ADDRESS_REG_WB;
3455 info->base = XEXP (x, 0);
3456 if (GET_CODE (XEXP (x, 1)) == PLUS
3457 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3458 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3459 && aarch64_base_register_rtx_p (info->base, strict_p))
3460 {
3461 HOST_WIDE_INT offset;
3462 info->offset = XEXP (XEXP (x, 1), 1);
3463 offset = INTVAL (info->offset);
3464
3465 /* TImode and TFmode values are allowed in both pairs of X
3466 registers and individual Q registers. The available
3467 address modes are:
3468 X,X: 7-bit signed scaled offset
3469 Q: 9-bit signed offset
3470 We conservatively require an offset representable in either mode.
3471 */
3472 if (mode == TImode || mode == TFmode)
3473 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3474 && offset_9bit_signed_unscaled_p (mode, offset));
3475
3476 if (outer_code == PARALLEL)
3477 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3478 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3479 else
3480 return offset_9bit_signed_unscaled_p (mode, offset);
3481 }
3482 return false;
3483
3484 case CONST:
3485 case SYMBOL_REF:
3486 case LABEL_REF:
3487 /* load literal: pc-relative constant pool entry. Only supported
3488 for SI mode or larger. */
3489 info->type = ADDRESS_SYMBOLIC;
3490 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3491 {
3492 rtx sym, addend;
3493
3494 split_const (x, &sym, &addend);
3495 return (GET_CODE (sym) == LABEL_REF
3496 || (GET_CODE (sym) == SYMBOL_REF
3497 && CONSTANT_POOL_ADDRESS_P (sym)));
3498 }
3499 return false;
3500
3501 case LO_SUM:
3502 info->type = ADDRESS_LO_SUM;
3503 info->base = XEXP (x, 0);
3504 info->offset = XEXP (x, 1);
3505 if (allow_reg_index_p
3506 && aarch64_base_register_rtx_p (info->base, strict_p))
3507 {
3508 rtx sym, offs;
3509 split_const (info->offset, &sym, &offs);
3510 if (GET_CODE (sym) == SYMBOL_REF
3511 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3512 == SYMBOL_SMALL_ABSOLUTE))
3513 {
3514 /* The symbol and offset must be aligned to the access size. */
3515 unsigned int align;
3516 unsigned int ref_size;
3517
3518 if (CONSTANT_POOL_ADDRESS_P (sym))
3519 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3520 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3521 {
3522 tree exp = SYMBOL_REF_DECL (sym);
3523 align = TYPE_ALIGN (TREE_TYPE (exp));
3524 align = CONSTANT_ALIGNMENT (exp, align);
3525 }
3526 else if (SYMBOL_REF_DECL (sym))
3527 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3528 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3529 && SYMBOL_REF_BLOCK (sym) != NULL)
3530 align = SYMBOL_REF_BLOCK (sym)->alignment;
3531 else
3532 align = BITS_PER_UNIT;
3533
3534 ref_size = GET_MODE_SIZE (mode);
3535 if (ref_size == 0)
3536 ref_size = GET_MODE_SIZE (DImode);
3537
3538 return ((INTVAL (offs) & (ref_size - 1)) == 0
3539 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3540 }
3541 }
3542 return false;
3543
3544 default:
3545 return false;
3546 }
3547 }
3548
3549 bool
3550 aarch64_symbolic_address_p (rtx x)
3551 {
3552 rtx offset;
3553
3554 split_const (x, &x, &offset);
3555 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3556 }
3557
3558 /* Classify the base of symbolic expression X, given that X appears in
3559 context CONTEXT. */
3560
3561 enum aarch64_symbol_type
3562 aarch64_classify_symbolic_expression (rtx x,
3563 enum aarch64_symbol_context context)
3564 {
3565 rtx offset;
3566
3567 split_const (x, &x, &offset);
3568 return aarch64_classify_symbol (x, offset, context);
3569 }
3570
3571
3572 /* Return TRUE if X is a legitimate address for accessing memory in
3573 mode MODE. */
3574 static bool
3575 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3576 {
3577 struct aarch64_address_info addr;
3578
3579 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3580 }
3581
3582 /* Return TRUE if X is a legitimate address for accessing memory in
3583 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3584 pair operation. */
3585 bool
3586 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3587 RTX_CODE outer_code, bool strict_p)
3588 {
3589 struct aarch64_address_info addr;
3590
3591 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3592 }
3593
3594 /* Return TRUE if rtx X is immediate constant 0.0 */
3595 bool
3596 aarch64_float_const_zero_rtx_p (rtx x)
3597 {
3598 REAL_VALUE_TYPE r;
3599
3600 if (GET_MODE (x) == VOIDmode)
3601 return false;
3602
3603 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3604 if (REAL_VALUE_MINUS_ZERO (r))
3605 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3606 return REAL_VALUES_EQUAL (r, dconst0);
3607 }
3608
3609 /* Return the fixed registers used for condition codes. */
3610
3611 static bool
3612 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3613 {
3614 *p1 = CC_REGNUM;
3615 *p2 = INVALID_REGNUM;
3616 return true;
3617 }
3618
3619 /* Emit call insn with PAT and do aarch64-specific handling. */
3620
3621 void
3622 aarch64_emit_call_insn (rtx pat)
3623 {
3624 rtx insn = emit_call_insn (pat);
3625
3626 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3627 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3628 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3629 }
3630
3631 machine_mode
3632 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3633 {
3634 /* All floating point compares return CCFP if it is an equality
3635 comparison, and CCFPE otherwise. */
3636 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3637 {
3638 switch (code)
3639 {
3640 case EQ:
3641 case NE:
3642 case UNORDERED:
3643 case ORDERED:
3644 case UNLT:
3645 case UNLE:
3646 case UNGT:
3647 case UNGE:
3648 case UNEQ:
3649 case LTGT:
3650 return CCFPmode;
3651
3652 case LT:
3653 case LE:
3654 case GT:
3655 case GE:
3656 return CCFPEmode;
3657
3658 default:
3659 gcc_unreachable ();
3660 }
3661 }
3662
3663 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3664 && y == const0_rtx
3665 && (code == EQ || code == NE || code == LT || code == GE)
3666 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3667 || GET_CODE (x) == NEG))
3668 return CC_NZmode;
3669
3670 /* A compare with a shifted operand. Because of canonicalization,
3671 the comparison will have to be swapped when we emit the assembly
3672 code. */
3673 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3674 && (REG_P (y) || GET_CODE (y) == SUBREG)
3675 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3676 || GET_CODE (x) == LSHIFTRT
3677 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3678 return CC_SWPmode;
3679
3680 /* Similarly for a negated operand, but we can only do this for
3681 equalities. */
3682 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3683 && (REG_P (y) || GET_CODE (y) == SUBREG)
3684 && (code == EQ || code == NE)
3685 && GET_CODE (x) == NEG)
3686 return CC_Zmode;
3687
3688 /* A compare of a mode narrower than SI mode against zero can be done
3689 by extending the value in the comparison. */
3690 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3691 && y == const0_rtx)
3692 /* Only use sign-extension if we really need it. */
3693 return ((code == GT || code == GE || code == LE || code == LT)
3694 ? CC_SESWPmode : CC_ZESWPmode);
3695
3696 /* For everything else, return CCmode. */
3697 return CCmode;
3698 }
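/* For instance, a comparison such as (compare (ashift x 2) y) is given
   CC_SWPmode here; when the assembly is emitted the operands are
   swapped and the shift is folded into the comparison, giving
   something like "cmp y, x, lsl #2" with the condition swapped
   accordingly (e.g. GT becomes LT).  Illustrative only.  */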
3699
3700 static int
3701 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3702
3703 int
3704 aarch64_get_condition_code (rtx x)
3705 {
3706 machine_mode mode = GET_MODE (XEXP (x, 0));
3707 enum rtx_code comp_code = GET_CODE (x);
3708
3709 if (GET_MODE_CLASS (mode) != MODE_CC)
3710 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3711 return aarch64_get_condition_code_1 (mode, comp_code);
3712 }
3713
3714 static int
3715 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3716 {
3717 int ne = -1, eq = -1;
3718 switch (mode)
3719 {
3720 case CCFPmode:
3721 case CCFPEmode:
3722 switch (comp_code)
3723 {
3724 case GE: return AARCH64_GE;
3725 case GT: return AARCH64_GT;
3726 case LE: return AARCH64_LS;
3727 case LT: return AARCH64_MI;
3728 case NE: return AARCH64_NE;
3729 case EQ: return AARCH64_EQ;
3730 case ORDERED: return AARCH64_VC;
3731 case UNORDERED: return AARCH64_VS;
3732 case UNLT: return AARCH64_LT;
3733 case UNLE: return AARCH64_LE;
3734 case UNGT: return AARCH64_HI;
3735 case UNGE: return AARCH64_PL;
3736 default: return -1;
3737 }
3738 break;
3739
3740 case CC_DNEmode:
3741 ne = AARCH64_NE;
3742 eq = AARCH64_EQ;
3743 break;
3744
3745 case CC_DEQmode:
3746 ne = AARCH64_EQ;
3747 eq = AARCH64_NE;
3748 break;
3749
3750 case CC_DGEmode:
3751 ne = AARCH64_GE;
3752 eq = AARCH64_LT;
3753 break;
3754
3755 case CC_DLTmode:
3756 ne = AARCH64_LT;
3757 eq = AARCH64_GE;
3758 break;
3759
3760 case CC_DGTmode:
3761 ne = AARCH64_GT;
3762 eq = AARCH64_LE;
3763 break;
3764
3765 case CC_DLEmode:
3766 ne = AARCH64_LE;
3767 eq = AARCH64_GT;
3768 break;
3769
3770 case CC_DGEUmode:
3771 ne = AARCH64_CS;
3772 eq = AARCH64_CC;
3773 break;
3774
3775 case CC_DLTUmode:
3776 ne = AARCH64_CC;
3777 eq = AARCH64_CS;
3778 break;
3779
3780 case CC_DGTUmode:
3781 ne = AARCH64_HI;
3782 eq = AARCH64_LS;
3783 break;
3784
3785 case CC_DLEUmode:
3786 ne = AARCH64_LS;
3787 eq = AARCH64_HI;
3788 break;
3789
3790 case CCmode:
3791 switch (comp_code)
3792 {
3793 case NE: return AARCH64_NE;
3794 case EQ: return AARCH64_EQ;
3795 case GE: return AARCH64_GE;
3796 case GT: return AARCH64_GT;
3797 case LE: return AARCH64_LE;
3798 case LT: return AARCH64_LT;
3799 case GEU: return AARCH64_CS;
3800 case GTU: return AARCH64_HI;
3801 case LEU: return AARCH64_LS;
3802 case LTU: return AARCH64_CC;
3803 default: return -1;
3804 }
3805 break;
3806
3807 case CC_SWPmode:
3808 case CC_ZESWPmode:
3809 case CC_SESWPmode:
3810 switch (comp_code)
3811 {
3812 case NE: return AARCH64_NE;
3813 case EQ: return AARCH64_EQ;
3814 case GE: return AARCH64_LE;
3815 case GT: return AARCH64_LT;
3816 case LE: return AARCH64_GE;
3817 case LT: return AARCH64_GT;
3818 case GEU: return AARCH64_LS;
3819 case GTU: return AARCH64_CC;
3820 case LEU: return AARCH64_CS;
3821 case LTU: return AARCH64_HI;
3822 default: return -1;
3823 }
3824 break;
3825
3826 case CC_NZmode:
3827 switch (comp_code)
3828 {
3829 case NE: return AARCH64_NE;
3830 case EQ: return AARCH64_EQ;
3831 case GE: return AARCH64_PL;
3832 case LT: return AARCH64_MI;
3833 default: return -1;
3834 }
3835 break;
3836
3837 case CC_Zmode:
3838 switch (comp_code)
3839 {
3840 case NE: return AARCH64_NE;
3841 case EQ: return AARCH64_EQ;
3842 default: return -1;
3843 }
3844 break;
3845
3846 default:
3847 return -1;
3848 break;
3849 }
3850
3851 if (comp_code == NE)
3852 return ne;
3853
3854 if (comp_code == EQ)
3855 return eq;
3856
3857 return -1;
3858 }
3859
3860 bool
3861 aarch64_const_vec_all_same_in_range_p (rtx x,
3862 HOST_WIDE_INT minval,
3863 HOST_WIDE_INT maxval)
3864 {
3865 HOST_WIDE_INT firstval;
3866 int count, i;
3867
3868 if (GET_CODE (x) != CONST_VECTOR
3869 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3870 return false;
3871
3872 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3873 if (firstval < minval || firstval > maxval)
3874 return false;
3875
3876 count = CONST_VECTOR_NUNITS (x);
3877 for (i = 1; i < count; i++)
3878 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3879 return false;
3880
3881 return true;
3882 }
3883
3884 bool
3885 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3886 {
3887 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3888 }
3889
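/* Return the number of set bits in VALUE.  Each iteration clears the
   lowest set bit (VALUE &= VALUE - 1), so the loop runs once per set
   bit; e.g. for VALUE == 0xb0 (three bits set) it iterates three times.  */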
3890 static unsigned
3891 bit_count (unsigned HOST_WIDE_INT value)
3892 {
3893 unsigned count = 0;
3894
3895 while (value)
3896 {
3897 count++;
3898 value &= value - 1;
3899 }
3900
3901 return count;
3902 }
3903
3904 /* N Z C V. */
3905 #define AARCH64_CC_V 1
3906 #define AARCH64_CC_C (1 << 1)
3907 #define AARCH64_CC_Z (1 << 2)
3908 #define AARCH64_CC_N (1 << 3)
3909
3910 /* N Z C V flags for ccmp. The first value is for an AND op and the
3911 second is for an IOR op. Indexed by AARCH64_COND_CODE. */
3912 static const int aarch64_nzcv_codes[][2] =
3913 {
3914 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3915 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3916 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3917 {0, AARCH64_CC_C}, /* CC, C == 0. */
3918 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3919 {0, AARCH64_CC_N}, /* PL, N == 0. */
3920 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3921 {0, AARCH64_CC_V}, /* VC, V == 0. */
3922 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
3923 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3924 {0, AARCH64_CC_V}, /* GE, N == V. */
3925 {AARCH64_CC_V, 0}, /* LT, N != V. */
3926 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3927 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3928 {0, 0}, /* AL, Any. */
3929 {0, 0}, /* NV, Any. */
3930 };
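/* For illustration: this table is consumed by the 'K' and 'k' operand
   modifiers below, which print element [0] and element [1] respectively
   for the comparison's AARCH64_COND_CODE.  E.g. for an EQ comparison,
   '%K' prints 4 (AARCH64_CC_Z) and '%k' prints 0; for GE, '%K' prints 0
   and '%k' prints 1 (AARCH64_CC_V).  These values are used as the #nzcv
   immediate of a CCMP instruction.  */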
3931
3932 int
3933 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3934 {
3935 switch (mode)
3936 {
3937 case CC_DNEmode:
3938 return NE;
3939
3940 case CC_DEQmode:
3941 return EQ;
3942
3943 case CC_DLEmode:
3944 return LE;
3945
3946 case CC_DGTmode:
3947 return GT;
3948
3949 case CC_DLTmode:
3950 return LT;
3951
3952 case CC_DGEmode:
3953 return GE;
3954
3955 case CC_DLEUmode:
3956 return LEU;
3957
3958 case CC_DGTUmode:
3959 return GTU;
3960
3961 case CC_DLTUmode:
3962 return LTU;
3963
3964 case CC_DGEUmode:
3965 return GEU;
3966
3967 default:
3968 gcc_unreachable ();
3969 }
3970 }
3971
3972
3973 void
3974 aarch64_print_operand (FILE *f, rtx x, char code)
3975 {
3976 switch (code)
3977 {
3978 /* An integer or symbol address without a preceding # sign. */
3979 case 'c':
3980 switch (GET_CODE (x))
3981 {
3982 case CONST_INT:
3983 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3984 break;
3985
3986 case SYMBOL_REF:
3987 output_addr_const (f, x);
3988 break;
3989
3990 case CONST:
3991 if (GET_CODE (XEXP (x, 0)) == PLUS
3992 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3993 {
3994 output_addr_const (f, x);
3995 break;
3996 }
3997 /* Fall through. */
3998
3999 default:
4000 output_operand_lossage ("unsupported operand for code '%c'", code);
4001 }
4002 break;
4003
4004 case 'e':
4005 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4006 {
4007 int n;
4008
4009 if (!CONST_INT_P (x)
4010 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4011 {
4012 output_operand_lossage ("invalid operand for '%%%c'", code);
4013 return;
4014 }
4015
4016 switch (n)
4017 {
4018 case 3:
4019 fputc ('b', f);
4020 break;
4021 case 4:
4022 fputc ('h', f);
4023 break;
4024 case 5:
4025 fputc ('w', f);
4026 break;
4027 default:
4028 output_operand_lossage ("invalid operand for '%%%c'", code);
4029 return;
4030 }
4031 }
4032 break;
4033
4034 case 'p':
4035 {
4036 int n;
4037
4038 /* Print N such that 2^N == X. */
4039 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4040 {
4041 output_operand_lossage ("invalid operand for '%%%c'", code);
4042 return;
4043 }
4044
4045 asm_fprintf (f, "%d", n);
4046 }
4047 break;
4048
4049 case 'P':
4050 /* Print the number of non-zero bits in X (a const_int). */
4051 if (!CONST_INT_P (x))
4052 {
4053 output_operand_lossage ("invalid operand for '%%%c'", code);
4054 return;
4055 }
4056
4057 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4058 break;
4059
4060 case 'H':
4061 /* Print the higher numbered register of a pair (TImode) of regs. */
4062 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4063 {
4064 output_operand_lossage ("invalid operand for '%%%c'", code);
4065 return;
4066 }
4067
4068 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4069 break;
4070
4071 case 'm':
4072 {
4073 int cond_code;
4074 /* Print a condition (eq, ne, etc). */
4075
4076 /* CONST_TRUE_RTX means always -- that's the default. */
4077 if (x == const_true_rtx)
4078 return;
4079
4080 if (!COMPARISON_P (x))
4081 {
4082 output_operand_lossage ("invalid operand for '%%%c'", code);
4083 return;
4084 }
4085
4086 cond_code = aarch64_get_condition_code (x);
4087 gcc_assert (cond_code >= 0);
4088 fputs (aarch64_condition_codes[cond_code], f);
4089 }
4090 break;
4091
4092 case 'M':
4093 {
4094 int cond_code;
4095 /* Print the inverse of a condition (eq <-> ne, etc). */
4096
4097 /* CONST_TRUE_RTX means never -- that's the default. */
4098 if (x == const_true_rtx)
4099 {
4100 fputs ("nv", f);
4101 return;
4102 }
4103
4104 if (!COMPARISON_P (x))
4105 {
4106 output_operand_lossage ("invalid operand for '%%%c'", code);
4107 return;
4108 }
4109 cond_code = aarch64_get_condition_code (x);
4110 gcc_assert (cond_code >= 0);
4111 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4112 (cond_code)], f);
4113 }
4114 break;
4115
4116 case 'b':
4117 case 'h':
4118 case 's':
4119 case 'd':
4120 case 'q':
4121 /* Print a scalar FP/SIMD register name. */
4122 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4123 {
4124 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4125 return;
4126 }
4127 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4128 break;
4129
4130 case 'S':
4131 case 'T':
4132 case 'U':
4133 case 'V':
4134 /* Print the first FP/SIMD register name in a list. */
4135 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4136 {
4137 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4138 return;
4139 }
4140 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4141 break;
4142
4143 case 'X':
4144 /* Print bottom 16 bits of integer constant in hex. */
4145 if (!CONST_INT_P (x))
4146 {
4147 output_operand_lossage ("invalid operand for '%%%c'", code);
4148 return;
4149 }
4150 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4151 break;
4152
4153 case 'w':
4154 case 'x':
4155 /* Print a general register name or the zero register (32-bit or
4156 64-bit). */
4157 if (x == const0_rtx
4158 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4159 {
4160 asm_fprintf (f, "%czr", code);
4161 break;
4162 }
4163
4164 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4165 {
4166 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4167 break;
4168 }
4169
4170 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4171 {
4172 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4173 break;
4174 }
4175
4176 /* Fall through. */
4177
4178 case 0:
4179 /* Print a normal operand, if it's a general register, then we
4180 assume DImode. */
4181 if (x == NULL)
4182 {
4183 output_operand_lossage ("missing operand");
4184 return;
4185 }
4186
4187 switch (GET_CODE (x))
4188 {
4189 case REG:
4190 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4191 break;
4192
4193 case MEM:
4194 aarch64_memory_reference_mode = GET_MODE (x);
4195 output_address (XEXP (x, 0));
4196 break;
4197
4198 case LABEL_REF:
4199 case SYMBOL_REF:
4200 output_addr_const (asm_out_file, x);
4201 break;
4202
4203 case CONST_INT:
4204 asm_fprintf (f, "%wd", INTVAL (x));
4205 break;
4206
4207 case CONST_VECTOR:
4208 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4209 {
4210 gcc_assert (
4211 aarch64_const_vec_all_same_in_range_p (x,
4212 HOST_WIDE_INT_MIN,
4213 HOST_WIDE_INT_MAX));
4214 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4215 }
4216 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4217 {
4218 fputc ('0', f);
4219 }
4220 else
4221 gcc_unreachable ();
4222 break;
4223
4224 case CONST_DOUBLE:
4225 /* CONST_DOUBLE can represent a double-width integer.
4226 In this case, the mode of x is VOIDmode. */
4227 if (GET_MODE (x) == VOIDmode)
4228 ; /* Do Nothing. */
4229 else if (aarch64_float_const_zero_rtx_p (x))
4230 {
4231 fputc ('0', f);
4232 break;
4233 }
4234 else if (aarch64_float_const_representable_p (x))
4235 {
4236 #define buf_size 20
4237 char float_buf[buf_size] = {'\0'};
4238 REAL_VALUE_TYPE r;
4239 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4240 real_to_decimal_for_mode (float_buf, &r,
4241 buf_size, buf_size,
4242 1, GET_MODE (x));
4243 asm_fprintf (asm_out_file, "%s", float_buf);
4244 break;
4245 #undef buf_size
4246 }
4247 output_operand_lossage ("invalid constant");
4248 return;
4249 default:
4250 output_operand_lossage ("invalid operand");
4251 return;
4252 }
4253 break;
4254
4255 case 'A':
4256 if (GET_CODE (x) == HIGH)
4257 x = XEXP (x, 0);
4258
4259 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4260 {
4261 case SYMBOL_SMALL_GOT:
4262 asm_fprintf (asm_out_file, ":got:");
4263 break;
4264
4265 case SYMBOL_SMALL_TLSGD:
4266 asm_fprintf (asm_out_file, ":tlsgd:");
4267 break;
4268
4269 case SYMBOL_SMALL_TLSDESC:
4270 asm_fprintf (asm_out_file, ":tlsdesc:");
4271 break;
4272
4273 case SYMBOL_SMALL_GOTTPREL:
4274 asm_fprintf (asm_out_file, ":gottprel:");
4275 break;
4276
4277 case SYMBOL_SMALL_TPREL:
4278 asm_fprintf (asm_out_file, ":tprel:");
4279 break;
4280
4281 case SYMBOL_TINY_GOT:
4282 gcc_unreachable ();
4283 break;
4284
4285 default:
4286 break;
4287 }
4288 output_addr_const (asm_out_file, x);
4289 break;
4290
4291 case 'L':
4292 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4293 {
4294 case SYMBOL_SMALL_GOT:
4295 asm_fprintf (asm_out_file, ":lo12:");
4296 break;
4297
4298 case SYMBOL_SMALL_TLSGD:
4299 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4300 break;
4301
4302 case SYMBOL_SMALL_TLSDESC:
4303 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4304 break;
4305
4306 case SYMBOL_SMALL_GOTTPREL:
4307 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4308 break;
4309
4310 case SYMBOL_SMALL_TPREL:
4311 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4312 break;
4313
4314 case SYMBOL_TINY_GOT:
4315 asm_fprintf (asm_out_file, ":got:");
4316 break;
4317
4318 default:
4319 break;
4320 }
4321 output_addr_const (asm_out_file, x);
4322 break;
4323
4324 case 'G':
4325
4326 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4327 {
4328 case SYMBOL_SMALL_TPREL:
4329 asm_fprintf (asm_out_file, ":tprel_hi12:");
4330 break;
4331 default:
4332 break;
4333 }
4334 output_addr_const (asm_out_file, x);
4335 break;
4336
4337 case 'K':
4338 {
4339 int cond_code;
4340 /* Print nzcv. */
4341
4342 if (!COMPARISON_P (x))
4343 {
4344 output_operand_lossage ("invalid operand for '%%%c'", code);
4345 return;
4346 }
4347
4348 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4349 gcc_assert (cond_code >= 0);
4350 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4351 }
4352 break;
4353
4354 case 'k':
4355 {
4356 int cond_code;
4357 /* Print nzcv. */
4358
4359 if (!COMPARISON_P (x))
4360 {
4361 output_operand_lossage ("invalid operand for '%%%c'", code);
4362 return;
4363 }
4364
4365 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4366 gcc_assert (cond_code >= 0);
4367 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4368 }
4369 break;
4370
4371 default:
4372 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4373 return;
4374 }
4375 }
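/* As a rough illustration (operand values chosen arbitrarily): '%w' and
   '%x' of the first general register print "w0" and "x0", and print
   "wzr" / "xzr" for a zero constant; '%d' of the first FP/SIMD register
   prints "d0"; '%X' of (const_int 0x12345678) prints "0x5678"; '%p' of
   (const_int 8) prints "3".  */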
4376
4377 void
4378 aarch64_print_operand_address (FILE *f, rtx x)
4379 {
4380 struct aarch64_address_info addr;
4381
4382 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4383 MEM, true))
4384 switch (addr.type)
4385 {
4386 case ADDRESS_REG_IMM:
4387 if (addr.offset == const0_rtx)
4388 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4389 else
4390 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4391 INTVAL (addr.offset));
4392 return;
4393
4394 case ADDRESS_REG_REG:
4395 if (addr.shift == 0)
4396 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4397 reg_names [REGNO (addr.offset)]);
4398 else
4399 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4400 reg_names [REGNO (addr.offset)], addr.shift);
4401 return;
4402
4403 case ADDRESS_REG_UXTW:
4404 if (addr.shift == 0)
4405 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4406 REGNO (addr.offset) - R0_REGNUM);
4407 else
4408 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4409 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4410 return;
4411
4412 case ADDRESS_REG_SXTW:
4413 if (addr.shift == 0)
4414 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4415 REGNO (addr.offset) - R0_REGNUM);
4416 else
4417 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4418 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4419 return;
4420
4421 case ADDRESS_REG_WB:
4422 switch (GET_CODE (x))
4423 {
4424 case PRE_INC:
4425 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4426 GET_MODE_SIZE (aarch64_memory_reference_mode));
4427 return;
4428 case POST_INC:
4429 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4430 GET_MODE_SIZE (aarch64_memory_reference_mode));
4431 return;
4432 case PRE_DEC:
4433 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4434 GET_MODE_SIZE (aarch64_memory_reference_mode));
4435 return;
4436 case POST_DEC:
4437 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4438 GET_MODE_SIZE (aarch64_memory_reference_mode));
4439 return;
4440 case PRE_MODIFY:
4441 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4442 INTVAL (addr.offset));
4443 return;
4444 case POST_MODIFY:
4445 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4446 INTVAL (addr.offset));
4447 return;
4448 default:
4449 break;
4450 }
4451 break;
4452
4453 case ADDRESS_LO_SUM:
4454 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4455 output_addr_const (f, addr.offset);
4456 asm_fprintf (f, "]");
4457 return;
4458
4459 case ADDRESS_SYMBOLIC:
4460 break;
4461 }
4462
4463 output_addr_const (f, x);
4464 }
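/* For illustration (hypothetical register numbers and symbol), the cases
   above emit address syntax such as: "[x0]" and "[x0, 16]"
   (ADDRESS_REG_IMM), "[x0, x1, lsl 3]" (ADDRESS_REG_REG),
   "[x0, w1, sxtw 2]" (ADDRESS_REG_SXTW), "[x0, 16]!" and "[x0], 16"
   (ADDRESS_REG_WB), and "[x0, #:lo12:sym]" (ADDRESS_LO_SUM).  */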
4465
4466 bool
4467 aarch64_label_mentioned_p (rtx x)
4468 {
4469 const char *fmt;
4470 int i;
4471
4472 if (GET_CODE (x) == LABEL_REF)
4473 return true;
4474
4475 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4476 referencing instruction, but they are constant offsets, not
4477 symbols. */
4478 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4479 return false;
4480
4481 fmt = GET_RTX_FORMAT (GET_CODE (x));
4482 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4483 {
4484 if (fmt[i] == 'E')
4485 {
4486 int j;
4487
4488 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4489 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4490 return true;
4491 }
4492 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4493 return true;
4494 }
4495
4496 return false;
4497 }
4498
4499 /* Implement REGNO_REG_CLASS. */
4500
4501 enum reg_class
4502 aarch64_regno_regclass (unsigned regno)
4503 {
4504 if (GP_REGNUM_P (regno))
4505 return GENERAL_REGS;
4506
4507 if (regno == SP_REGNUM)
4508 return STACK_REG;
4509
4510 if (regno == FRAME_POINTER_REGNUM
4511 || regno == ARG_POINTER_REGNUM)
4512 return POINTER_REGS;
4513
4514 if (FP_REGNUM_P (regno))
4515 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4516
4517 return NO_REGS;
4518 }
4519
4520 static rtx
4521 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4522 {
4523 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4524 where mask is selected by alignment and size of the offset.
4525 We try to pick as large a range for the offset as possible to
4526 maximize the chance of a CSE. However, for aligned addresses
4527 we limit the range to 4k so that structures with different sized
4528 elements are likely to use the same base. */
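/* A worked example (illustrative values): for an SImode access to
   (plus X (const_int 0x3004)) the offset is a multiple of the access
   size, so base_offset = 0x3004 & ~0xfff = 0x3000.  We emit X + 0x3000
   into a fresh base register and return (plus base 4), keeping the
   residual offset within the 4k range described above.  */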
4529
4530 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4531 {
4532 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4533 HOST_WIDE_INT base_offset;
4534
4535 /* Does it look like we'll need a load/store-pair operation? */
4536 if (GET_MODE_SIZE (mode) > 16
4537 || mode == TImode)
4538 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4539 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4540 /* For offsets that aren't a multiple of the access size, the limit is
4541 -256...255. */
4542 else if (offset & (GET_MODE_SIZE (mode) - 1))
4543 base_offset = (offset + 0x100) & ~0x1ff;
4544 else
4545 base_offset = offset & ~0xfff;
4546
4547 if (base_offset == 0)
4548 return x;
4549
4550 offset -= base_offset;
4551 rtx base_reg = gen_reg_rtx (Pmode);
4552 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4553 NULL_RTX);
4554 emit_move_insn (base_reg, val);
4555 x = plus_constant (Pmode, base_reg, offset);
4556 }
4557
4558 return x;
4559 }
4560
4561 /* Try a machine-dependent way of reloading an illegitimate address
4562 operand. If we find one, push the reload and return the new rtx. */
4563
4564 rtx
4565 aarch64_legitimize_reload_address (rtx *x_p,
4566 machine_mode mode,
4567 int opnum, int type,
4568 int ind_levels ATTRIBUTE_UNUSED)
4569 {
4570 rtx x = *x_p;
4571
4572 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4573 if (aarch64_vect_struct_mode_p (mode)
4574 && GET_CODE (x) == PLUS
4575 && REG_P (XEXP (x, 0))
4576 && CONST_INT_P (XEXP (x, 1)))
4577 {
4578 rtx orig_rtx = x;
4579 x = copy_rtx (x);
4580 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4581 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4582 opnum, (enum reload_type) type);
4583 return x;
4584 }
4585
4586 /* We must recognize output that we have already generated ourselves. */
4587 if (GET_CODE (x) == PLUS
4588 && GET_CODE (XEXP (x, 0)) == PLUS
4589 && REG_P (XEXP (XEXP (x, 0), 0))
4590 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4591 && CONST_INT_P (XEXP (x, 1)))
4592 {
4593 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4594 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4595 opnum, (enum reload_type) type);
4596 return x;
4597 }
4598
4599 /* We wish to handle large displacements off a base register by splitting
4600 the addend across an add and the mem insn. This can cut the number of
4601 extra insns needed from 3 to 1. It is only useful for load/store of a
4602 single register with 12 bit offset field. */
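/* A worked example (illustrative values): for an SImode access at
   (plus hard-reg (const_int 0x3004)), low = 0x004 and high = 0x3000;
   high is a valid shifted 12-bit immediate, so the address is rewritten
   as (plus (plus reg 0x3000) 4) and the inner PLUS is pushed as a
   reload, leaving the low 12-bit part in the memory reference.  */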
4603 if (GET_CODE (x) == PLUS
4604 && REG_P (XEXP (x, 0))
4605 && CONST_INT_P (XEXP (x, 1))
4606 && HARD_REGISTER_P (XEXP (x, 0))
4607 && mode != TImode
4608 && mode != TFmode
4609 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4610 {
4611 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4612 HOST_WIDE_INT low = val & 0xfff;
4613 HOST_WIDE_INT high = val - low;
4614 HOST_WIDE_INT offs;
4615 rtx cst;
4616 machine_mode xmode = GET_MODE (x);
4617
4618 /* In ILP32, xmode can be either DImode or SImode. */
4619 gcc_assert (xmode == DImode || xmode == SImode);
4620
4621 /* Leave BLKmode (zero-size) offsets to the generic reload code, since
4622 we cannot ascertain BLKmode alignment. */
4623 if (GET_MODE_SIZE (mode) == 0)
4624 return NULL_RTX;
4625
4626 offs = low % GET_MODE_SIZE (mode);
4627
4628 /* Align misaligned offset by adjusting high part to compensate. */
4629 if (offs != 0)
4630 {
4631 if (aarch64_uimm12_shift (high + offs))
4632 {
4633 /* Align down. */
4634 low = low - offs;
4635 high = high + offs;
4636 }
4637 else
4638 {
4639 /* Align up. */
4640 offs = GET_MODE_SIZE (mode) - offs;
4641 low = low + offs;
4642 high = high + (low & 0x1000) - offs;
4643 low &= 0xfff;
4644 }
4645 }
4646
4647 /* Check for overflow. */
4648 if (high + low != val)
4649 return NULL_RTX;
4650
4651 cst = GEN_INT (high);
4652 if (!aarch64_uimm12_shift (high))
4653 cst = force_const_mem (xmode, cst);
4654
4655 /* Reload high part into base reg, leaving the low part
4656 in the mem instruction.
4657 Note that replacing this gen_rtx_PLUS with plus_constant is
4658 wrong in this case because we rely on the
4659 (plus (plus reg c1) c2) structure being preserved so that
4660 XEXP (*p, 0) in push_reload below uses the correct term. */
4661 x = gen_rtx_PLUS (xmode,
4662 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4663 GEN_INT (low));
4664
4665 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4666 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4667 opnum, (enum reload_type) type);
4668 return x;
4669 }
4670
4671 return NULL_RTX;
4672 }
4673
4674
4675 static reg_class_t
4676 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4677 reg_class_t rclass,
4678 machine_mode mode,
4679 secondary_reload_info *sri)
4680 {
4681 /* Without the TARGET_SIMD instructions we cannot move a Q register
4682 to a Q register directly. We need a scratch. */
4683 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4684 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4685 && reg_class_subset_p (rclass, FP_REGS))
4686 {
4687 if (mode == TFmode)
4688 sri->icode = CODE_FOR_aarch64_reload_movtf;
4689 else if (mode == TImode)
4690 sri->icode = CODE_FOR_aarch64_reload_movti;
4691 return NO_REGS;
4692 }
4693
4694 /* A TFmode or TImode memory access should be handled via an FP_REG
4695 because AArch64 has richer addressing modes for LDR/STR instructions
4696 than for LDP/STP instructions. */
4697 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4698 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4699 return FP_REGS;
4700
4701 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
4702 return GENERAL_REGS;
4703
4704 return NO_REGS;
4705 }
4706
4707 static bool
4708 aarch64_can_eliminate (const int from, const int to)
4709 {
4710 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4711 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4712
4713 if (frame_pointer_needed)
4714 {
4715 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4716 return true;
4717 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4718 return false;
4719 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4720 && !cfun->calls_alloca)
4721 return true;
4722 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4723 return true;
4724
4725 return false;
4726 }
4727 else
4728 {
4729 /* If we decided that we didn't need a leaf frame pointer but then used
4730 LR in the function, then we'll want a frame pointer after all, so
4731 prevent this elimination to ensure a frame pointer is used. */
4732 if (to == STACK_POINTER_REGNUM
4733 && flag_omit_leaf_frame_pointer
4734 && df_regs_ever_live_p (LR_REGNUM))
4735 return false;
4736 }
4737
4738 return true;
4739 }
4740
4741 HOST_WIDE_INT
4742 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4743 {
4744 aarch64_layout_frame ();
4745
4746 if (to == HARD_FRAME_POINTER_REGNUM)
4747 {
4748 if (from == ARG_POINTER_REGNUM)
4749 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4750
4751 if (from == FRAME_POINTER_REGNUM)
4752 return (cfun->machine->frame.hard_fp_offset
4753 - cfun->machine->frame.saved_varargs_size);
4754 }
4755
4756 if (to == STACK_POINTER_REGNUM)
4757 {
4758 if (from == FRAME_POINTER_REGNUM)
4759 return (cfun->machine->frame.frame_size
4760 - cfun->machine->frame.saved_varargs_size);
4761 }
4762
4763 return cfun->machine->frame.frame_size;
4764 }
4765
4766 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4767 previous frame. */
4768
4769 rtx
4770 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4771 {
4772 if (count != 0)
4773 return const0_rtx;
4774 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4775 }
4776
4777
4778 static void
4779 aarch64_asm_trampoline_template (FILE *f)
4780 {
4781 if (TARGET_ILP32)
4782 {
4783 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4784 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4785 }
4786 else
4787 {
4788 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4789 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4790 }
4791 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4792 assemble_aligned_integer (4, const0_rtx);
4793 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4794 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4795 }
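/* For reference, with the default register choices (IP1 == x17 and the
   static chain register x18) the LP64 template above assembles to
   roughly:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// pad the code out to 16 bytes
	.xword	0		// overwritten with fnaddr and the chain
	.xword	0		// by aarch64_trampoline_init below

   The first load reads the doubleword at offset 16, the second the one
   at offset 24.  */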
4796
4797 static void
4798 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4799 {
4800 rtx fnaddr, mem, a_tramp;
4801 const int tramp_code_sz = 16;
4802
4803 /* We don't need to copy the trailing D-words; we fill those in below. */
4804 emit_block_move (m_tramp, assemble_trampoline_template (),
4805 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4806 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4807 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4808 if (GET_MODE (fnaddr) != ptr_mode)
4809 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4810 emit_move_insn (mem, fnaddr);
4811
4812 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4813 emit_move_insn (mem, chain_value);
4814
4815 /* XXX We should really define a "clear_cache" pattern and use
4816 gen_clear_cache(). */
4817 a_tramp = XEXP (m_tramp, 0);
4818 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4819 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4820 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4821 ptr_mode);
4822 }
4823
4824 static unsigned char
4825 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4826 {
4827 switch (regclass)
4828 {
4829 case CALLER_SAVE_REGS:
4830 case POINTER_REGS:
4831 case GENERAL_REGS:
4832 case ALL_REGS:
4833 case FP_REGS:
4834 case FP_LO_REGS:
4835 return
4836 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4837 (GET_MODE_SIZE (mode) + 7) / 8;
4838 case STACK_REG:
4839 return 1;
4840
4841 case NO_REGS:
4842 return 0;
4843
4844 default:
4845 break;
4846 }
4847 gcc_unreachable ();
4848 }
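/* E.g. DImode needs one register in any of the classes above, TImode
   needs two GENERAL_REGS, and a 128-bit vector mode such as V4SImode
   needs a single FP/SIMD register.  */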
4849
4850 static reg_class_t
4851 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4852 {
4853 if (regclass == POINTER_REGS)
4854 return GENERAL_REGS;
4855
4856 if (regclass == STACK_REG)
4857 {
4858 if (REG_P (x)
4859 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4860 return regclass;
4861
4862 return NO_REGS;
4863 }
4864
4865 /* If it's an integer immediate that MOVI can't handle, then
4866 FP_REGS is not an option, so we return NO_REGS instead. */
4867 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4868 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4869 return NO_REGS;
4870
4871 /* Register elimination can result in a request for
4872 SP+constant->FP_REGS. We cannot support such operations, which
4873 use SP as the source and an FP_REG as the destination, so reject
4874 them right now. */
4875 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4876 {
4877 rtx lhs = XEXP (x, 0);
4878
4879 /* Look through a possible SUBREG introduced by ILP32. */
4880 if (GET_CODE (lhs) == SUBREG)
4881 lhs = SUBREG_REG (lhs);
4882
4883 gcc_assert (REG_P (lhs));
4884 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4885 POINTER_REGS));
4886 return NO_REGS;
4887 }
4888
4889 return regclass;
4890 }
4891
4892 void
4893 aarch64_asm_output_labelref (FILE* f, const char *name)
4894 {
4895 asm_fprintf (f, "%U%s", name);
4896 }
4897
4898 static void
4899 aarch64_elf_asm_constructor (rtx symbol, int priority)
4900 {
4901 if (priority == DEFAULT_INIT_PRIORITY)
4902 default_ctor_section_asm_out_constructor (symbol, priority);
4903 else
4904 {
4905 section *s;
4906 char buf[18];
4907 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4908 s = get_section (buf, SECTION_WRITE, NULL);
4909 switch_to_section (s);
4910 assemble_align (POINTER_SIZE);
4911 assemble_aligned_integer (POINTER_BYTES, symbol);
4912 }
4913 }
4914
4915 static void
4916 aarch64_elf_asm_destructor (rtx symbol, int priority)
4917 {
4918 if (priority == DEFAULT_INIT_PRIORITY)
4919 default_dtor_section_asm_out_destructor (symbol, priority);
4920 else
4921 {
4922 section *s;
4923 char buf[18];
4924 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4925 s = get_section (buf, SECTION_WRITE, NULL);
4926 switch_to_section (s);
4927 assemble_align (POINTER_SIZE);
4928 assemble_aligned_integer (POINTER_BYTES, symbol);
4929 }
4930 }
4931
4932 const char*
4933 aarch64_output_casesi (rtx *operands)
4934 {
4935 char buf[100];
4936 char label[100];
4937 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
4938 int index;
4939 static const char *const patterns[4][2] =
4940 {
4941 {
4942 "ldrb\t%w3, [%0,%w1,uxtw]",
4943 "add\t%3, %4, %w3, sxtb #2"
4944 },
4945 {
4946 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4947 "add\t%3, %4, %w3, sxth #2"
4948 },
4949 {
4950 "ldr\t%w3, [%0,%w1,uxtw #2]",
4951 "add\t%3, %4, %w3, sxtw #2"
4952 },
4953 /* We assume that DImode is only generated when not optimizing and
4954 that we don't really need 64-bit address offsets. That would
4955 imply an object file with 8GB of code in a single function! */
4956 {
4957 "ldr\t%w3, [%0,%w1,uxtw #2]",
4958 "add\t%3, %4, %w3, sxtw #2"
4959 }
4960 };
4961
4962 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4963
4964 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4965
4966 gcc_assert (index >= 0 && index <= 3);
4967
4968 /* Need to implement table size reduction, by changing the code below. */
4969 output_asm_insn (patterns[index][0], operands);
4970 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4971 snprintf (buf, sizeof (buf),
4972 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4973 output_asm_insn (buf, operands);
4974 output_asm_insn (patterns[index][1], operands);
4975 output_asm_insn ("br\t%3", operands);
4976 assemble_label (asm_out_file, label);
4977 return "";
4978 }
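/* For illustration (hypothetical operands: table base in x0, index in
   w1, scratch registers x3 and x4), a HImode dispatch table emits:

	ldrh	w3, [x0,w1,uxtw #1]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxth #2
	br	x3
   .Lrtx<N>:

   i.e. load the table entry, scale it by 4, add it to the anchor label
   and branch.  */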
4979
4980
4981 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4982 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4983 operator. */
4984
4985 int
4986 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4987 {
4988 if (shift >= 0 && shift <= 3)
4989 {
4990 int size;
4991 for (size = 8; size <= 32; size *= 2)
4992 {
4993 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4994 if (mask == bits << shift)
4995 return size;
4996 }
4997 }
4998 return 0;
4999 }
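/* E.g. aarch64_uxt_size (1, 0x1fe) == 8: the mask is 0xff shifted left
   by one, so the operand can use a UXTB extend with LSL #1; whereas
   aarch64_uxt_size (0, 0x1ff) == 0 because 0x1ff is not a byte,
   halfword or word mask.  */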
5000
5001 static bool
5002 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5003 const_rtx x ATTRIBUTE_UNUSED)
5004 {
5005 /* We can't use blocks for constants when we're using a per-function
5006 constant pool. */
5007 return false;
5008 }
5009
5010 static section *
5011 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5012 rtx x ATTRIBUTE_UNUSED,
5013 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5014 {
5015 /* Force all constant pool entries into the current function section. */
5016 return function_section (current_function_decl);
5017 }
5018
5019
5020 /* Costs. */
5021
5022 /* Helper function for rtx cost calculation. Strip a shift expression
5023 from X. Returns the inner operand if successful, or the original
5024 expression on failure. */
5025 static rtx
5026 aarch64_strip_shift (rtx x)
5027 {
5028 rtx op = x;
5029
5030 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5031 we can convert both to ROR during final output. */
5032 if ((GET_CODE (op) == ASHIFT
5033 || GET_CODE (op) == ASHIFTRT
5034 || GET_CODE (op) == LSHIFTRT
5035 || GET_CODE (op) == ROTATERT
5036 || GET_CODE (op) == ROTATE)
5037 && CONST_INT_P (XEXP (op, 1)))
5038 return XEXP (op, 0);
5039
5040 if (GET_CODE (op) == MULT
5041 && CONST_INT_P (XEXP (op, 1))
5042 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5043 return XEXP (op, 0);
5044
5045 return x;
5046 }
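/* E.g. (ashift (reg) (const_int 3)) and (mult (reg) (const_int 8)) both
   strip to (reg); the MULT form appears where a shift has been
   canonicalized to a multiply by a power of two.  */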
5047
5048 /* Helper function for rtx cost calculation. Strip an extend
5049 expression from X. Returns the inner operand if successful, or the
5050 original expression on failure. We deal with a number of possible
5051 canonicalization variations here. */
5052 static rtx
5053 aarch64_strip_extend (rtx x)
5054 {
5055 rtx op = x;
5056
5057 /* Zero and sign extraction of a widened value. */
5058 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5059 && XEXP (op, 2) == const0_rtx
5060 && GET_CODE (XEXP (op, 0)) == MULT
5061 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5062 XEXP (op, 1)))
5063 return XEXP (XEXP (op, 0), 0);
5064
5065 /* It can also be represented (for zero-extend) as an AND with an
5066 immediate. */
5067 if (GET_CODE (op) == AND
5068 && GET_CODE (XEXP (op, 0)) == MULT
5069 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5070 && CONST_INT_P (XEXP (op, 1))
5071 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5072 INTVAL (XEXP (op, 1))) != 0)
5073 return XEXP (XEXP (op, 0), 0);
5074
5075 /* Now handle extended register, as this may also have an optional
5076 left shift by 1..4. */
5077 if (GET_CODE (op) == ASHIFT
5078 && CONST_INT_P (XEXP (op, 1))
5079 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5080 op = XEXP (op, 0);
5081
5082 if (GET_CODE (op) == ZERO_EXTEND
5083 || GET_CODE (op) == SIGN_EXTEND)
5084 op = XEXP (op, 0);
5085
5086 if (op != x)
5087 return op;
5088
5089 return x;
5090 }
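/* E.g. (zero_extend (reg)) strips to (reg), as does
   (and (mult (reg) (const_int 4)) (const_int 0x3fc)), which is the
   AND-mask form of a zero-extended byte scaled by 4.  */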
5091
5092 /* Helper function for rtx cost calculation. Calculate the cost of
5093 a MULT, which may be part of a multiply-accumulate rtx. Return
5094 the calculated cost of the expression, recursing manually in to
5095 operands where needed. */
5096
5097 static int
5098 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5099 {
5100 rtx op0, op1;
5101 const struct cpu_cost_table *extra_cost
5102 = aarch64_tune_params->insn_extra_cost;
5103 int cost = 0;
5104 bool maybe_fma = (outer == PLUS || outer == MINUS);
5105 machine_mode mode = GET_MODE (x);
5106
5107 gcc_checking_assert (code == MULT);
5108
5109 op0 = XEXP (x, 0);
5110 op1 = XEXP (x, 1);
5111
5112 if (VECTOR_MODE_P (mode))
5113 mode = GET_MODE_INNER (mode);
5114
5115 /* Integer multiply/fma. */
5116 if (GET_MODE_CLASS (mode) == MODE_INT)
5117 {
5118 /* The multiply will be canonicalized as a shift; cost it as such. */
5119 if (CONST_INT_P (op1)
5120 && exact_log2 (INTVAL (op1)) > 0)
5121 {
5122 if (speed)
5123 {
5124 if (maybe_fma)
5125 /* ADD (shifted register). */
5126 cost += extra_cost->alu.arith_shift;
5127 else
5128 /* LSL (immediate). */
5129 cost += extra_cost->alu.shift;
5130 }
5131
5132 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5133
5134 return cost;
5135 }
5136
5137 /* Integer multiplies or FMAs have zero/sign extending variants. */
5138 if ((GET_CODE (op0) == ZERO_EXTEND
5139 && GET_CODE (op1) == ZERO_EXTEND)
5140 || (GET_CODE (op0) == SIGN_EXTEND
5141 && GET_CODE (op1) == SIGN_EXTEND))
5142 {
5143 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5144 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5145
5146 if (speed)
5147 {
5148 if (maybe_fma)
5149 /* MADD/SMADDL/UMADDL. */
5150 cost += extra_cost->mult[0].extend_add;
5151 else
5152 /* MUL/SMULL/UMULL. */
5153 cost += extra_cost->mult[0].extend;
5154 }
5155
5156 return cost;
5157 }
5158
5159 /* This is either an integer multiply or an FMA. In both cases
5160 we want to recurse and cost the operands. */
5161 cost += rtx_cost (op0, MULT, 0, speed)
5162 + rtx_cost (op1, MULT, 1, speed);
5163
5164 if (speed)
5165 {
5166 if (maybe_fma)
5167 /* MADD. */
5168 cost += extra_cost->mult[mode == DImode].add;
5169 else
5170 /* MUL. */
5171 cost += extra_cost->mult[mode == DImode].simple;
5172 }
5173
5174 return cost;
5175 }
5176 else
5177 {
5178 if (speed)
5179 {
5180 /* Floating-point FMA/FMUL can also support negations of the
5181 operands. */
5182 if (GET_CODE (op0) == NEG)
5183 op0 = XEXP (op0, 0);
5184 if (GET_CODE (op1) == NEG)
5185 op1 = XEXP (op1, 0);
5186
5187 if (maybe_fma)
5188 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5189 cost += extra_cost->fp[mode == DFmode].fma;
5190 else
5191 /* FMUL/FNMUL. */
5192 cost += extra_cost->fp[mode == DFmode].mult;
5193 }
5194
5195 cost += rtx_cost (op0, MULT, 0, speed)
5196 + rtx_cost (op1, MULT, 1, speed);
5197 return cost;
5198 }
5199 }
5200
5201 static int
5202 aarch64_address_cost (rtx x,
5203 machine_mode mode,
5204 addr_space_t as ATTRIBUTE_UNUSED,
5205 bool speed)
5206 {
5207 enum rtx_code c = GET_CODE (x);
5208 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5209 struct aarch64_address_info info;
5210 int cost = 0;
5211 info.shift = 0;
5212
5213 if (!aarch64_classify_address (&info, x, mode, c, false))
5214 {
5215 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5216 {
5217 /* This is a CONST or SYMBOL ref which will be split
5218 in a different way depending on the code model in use.
5219 Cost it through the generic infrastructure. */
5220 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5221 /* Divide through by the cost of one instruction to
5222 bring it to the same units as the address costs. */
5223 cost_symbol_ref /= COSTS_N_INSNS (1);
5224 /* The cost is then the cost of preparing the address,
5225 followed by an immediate (possibly 0) offset. */
5226 return cost_symbol_ref + addr_cost->imm_offset;
5227 }
5228 else
5229 {
5230 /* This is most likely a jump table from a case
5231 statement. */
5232 return addr_cost->register_offset;
5233 }
5234 }
5235
5236 switch (info.type)
5237 {
5238 case ADDRESS_LO_SUM:
5239 case ADDRESS_SYMBOLIC:
5240 case ADDRESS_REG_IMM:
5241 cost += addr_cost->imm_offset;
5242 break;
5243
5244 case ADDRESS_REG_WB:
5245 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5246 cost += addr_cost->pre_modify;
5247 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5248 cost += addr_cost->post_modify;
5249 else
5250 gcc_unreachable ();
5251
5252 break;
5253
5254 case ADDRESS_REG_REG:
5255 cost += addr_cost->register_offset;
5256 break;
5257
5258 case ADDRESS_REG_UXTW:
5259 case ADDRESS_REG_SXTW:
5260 cost += addr_cost->register_extend;
5261 break;
5262
5263 default:
5264 gcc_unreachable ();
5265 }
5266
5267
5268 if (info.shift > 0)
5269 {
5270 /* For the sake of calculating the cost of the shifted register
5271 component, we can treat same sized modes in the same way. */
5272 switch (GET_MODE_BITSIZE (mode))
5273 {
5274 case 16:
5275 cost += addr_cost->addr_scale_costs.hi;
5276 break;
5277
5278 case 32:
5279 cost += addr_cost->addr_scale_costs.si;
5280 break;
5281
5282 case 64:
5283 cost += addr_cost->addr_scale_costs.di;
5284 break;
5285
5286 /* We can't tell, or this is a 128-bit vector. */
5287 default:
5288 cost += addr_cost->addr_scale_costs.ti;
5289 break;
5290 }
5291 }
5292
5293 return cost;
5294 }
5295
5296 /* Return true if the RTX X in mode MODE is a zero or sign extract
5297 usable in an ADD or SUB (extended register) instruction. */
5298 static bool
5299 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5300 {
5301 /* Catch add with a sign extract.
5302 This is add_<optab><mode>_multp2. */
5303 if (GET_CODE (x) == SIGN_EXTRACT
5304 || GET_CODE (x) == ZERO_EXTRACT)
5305 {
5306 rtx op0 = XEXP (x, 0);
5307 rtx op1 = XEXP (x, 1);
5308 rtx op2 = XEXP (x, 2);
5309
5310 if (GET_CODE (op0) == MULT
5311 && CONST_INT_P (op1)
5312 && op2 == const0_rtx
5313 && CONST_INT_P (XEXP (op0, 1))
5314 && aarch64_is_extend_from_extract (mode,
5315 XEXP (op0, 1),
5316 op1))
5317 {
5318 return true;
5319 }
5320 }
5321
5322 return false;
5323 }
5324
5325 static bool
5326 aarch64_frint_unspec_p (unsigned int u)
5327 {
5328 switch (u)
5329 {
5330 case UNSPEC_FRINTZ:
5331 case UNSPEC_FRINTP:
5332 case UNSPEC_FRINTM:
5333 case UNSPEC_FRINTA:
5334 case UNSPEC_FRINTN:
5335 case UNSPEC_FRINTX:
5336 case UNSPEC_FRINTI:
5337 return true;
5338
5339 default:
5340 return false;
5341 }
5342 }
5343
5344 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5345 storing it in *COST. Result is true if the total cost of the operation
5346 has now been calculated. */
5347 static bool
5348 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5349 {
5350 rtx inner;
5351 rtx comparator;
5352 enum rtx_code cmpcode;
5353
5354 if (COMPARISON_P (op0))
5355 {
5356 inner = XEXP (op0, 0);
5357 comparator = XEXP (op0, 1);
5358 cmpcode = GET_CODE (op0);
5359 }
5360 else
5361 {
5362 inner = op0;
5363 comparator = const0_rtx;
5364 cmpcode = NE;
5365 }
5366
5367 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5368 {
5369 /* Conditional branch. */
5370 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5371 return true;
5372 else
5373 {
5374 if (cmpcode == NE || cmpcode == EQ)
5375 {
5376 if (comparator == const0_rtx)
5377 {
5378 /* TBZ/TBNZ/CBZ/CBNZ. */
5379 if (GET_CODE (inner) == ZERO_EXTRACT)
5380 /* TBZ/TBNZ. */
5381 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5382 0, speed);
5383 else
5384 /* CBZ/CBNZ. */
5385 *cost += rtx_cost (inner, cmpcode, 0, speed);
5386
5387 return true;
5388 }
5389 }
5390 else if (cmpcode == LT || cmpcode == GE)
5391 {
5392 /* TBZ/TBNZ. */
5393 if (comparator == const0_rtx)
5394 return true;
5395 }
5396 }
5397 }
5398 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5399 {
5400 /* It's a conditional operation based on the status flags,
5401 so it must be some flavor of CSEL. */
5402
5403 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5404 if (GET_CODE (op1) == NEG
5405 || GET_CODE (op1) == NOT
5406 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5407 op1 = XEXP (op1, 0);
5408
5409 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5410 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5411 return true;
5412 }
5413
5414 /* We don't know what this is, cost all operands. */
5415 return false;
5416 }
5417
5418 /* Calculate the cost of calculating X, storing it in *COST. Result
5419 is true if the total cost of the operation has now been calculated. */
5420 static bool
5421 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5422 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5423 {
5424 rtx op0, op1, op2;
5425 const struct cpu_cost_table *extra_cost
5426 = aarch64_tune_params->insn_extra_cost;
5427 machine_mode mode = GET_MODE (x);
5428
5429 /* By default, assume that everything has equivalent cost to the
5430 cheapest instruction. Any additional costs are applied as a delta
5431 above this default. */
5432 *cost = COSTS_N_INSNS (1);
5433
5434 /* TODO: The cost infrastructure currently does not handle
5435 vector operations. Assume that all vector operations
5436 are equally expensive. */
5437 if (VECTOR_MODE_P (mode))
5438 {
5439 if (speed)
5440 *cost += extra_cost->vect.alu;
5441 return true;
5442 }
5443
5444 switch (code)
5445 {
5446 case SET:
5447 /* The cost depends entirely on the operands to SET. */
5448 *cost = 0;
5449 op0 = SET_DEST (x);
5450 op1 = SET_SRC (x);
5451
5452 switch (GET_CODE (op0))
5453 {
5454 case MEM:
5455 if (speed)
5456 {
5457 rtx address = XEXP (op0, 0);
5458 if (GET_MODE_CLASS (mode) == MODE_INT)
5459 *cost += extra_cost->ldst.store;
5460 else if (mode == SFmode)
5461 *cost += extra_cost->ldst.storef;
5462 else if (mode == DFmode)
5463 *cost += extra_cost->ldst.stored;
5464
5465 *cost +=
5466 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5467 0, speed));
5468 }
5469
5470 *cost += rtx_cost (op1, SET, 1, speed);
5471 return true;
5472
5473 case SUBREG:
5474 if (! REG_P (SUBREG_REG (op0)))
5475 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5476
5477 /* Fall through. */
5478 case REG:
5479 /* const0_rtx is in general free, but we will use an
5480 instruction to set a register to 0. */
5481 if (REG_P (op1) || op1 == const0_rtx)
5482 {
5483 /* The cost is 1 per register copied. */
5484 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5485 / UNITS_PER_WORD;
5486 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5487 }
5488 else
5489 /* Cost is just the cost of the RHS of the set. */
5490 *cost += rtx_cost (op1, SET, 1, speed);
5491 return true;
5492
5493 case ZERO_EXTRACT:
5494 case SIGN_EXTRACT:
5495 /* Bit-field insertion. Strip any redundant widening of
5496 the RHS to meet the width of the target. */
5497 if (GET_CODE (op1) == SUBREG)
5498 op1 = SUBREG_REG (op1);
5499 if ((GET_CODE (op1) == ZERO_EXTEND
5500 || GET_CODE (op1) == SIGN_EXTEND)
5501 && CONST_INT_P (XEXP (op0, 1))
5502 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5503 >= INTVAL (XEXP (op0, 1))))
5504 op1 = XEXP (op1, 0);
5505
5506 if (CONST_INT_P (op1))
5507 {
5508 /* MOV immediate is assumed to always be cheap. */
5509 *cost = COSTS_N_INSNS (1);
5510 }
5511 else
5512 {
5513 /* BFM. */
5514 if (speed)
5515 *cost += extra_cost->alu.bfi;
5516 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5517 }
5518
5519 return true;
5520
5521 default:
5522 /* We can't make sense of this, assume default cost. */
5523 *cost = COSTS_N_INSNS (1);
5524 return false;
5525 }
5526 return false;
5527
5528 case CONST_INT:
5529 /* If an instruction can incorporate a constant within the
5530 instruction, the instruction's expression avoids calling
5531 rtx_cost() on the constant. If rtx_cost() is called on a
5532 constant, then it is usually because the constant must be
5533 moved into a register by one or more instructions.
5534
5535 The exception is constant 0, which can be expressed
5536 as XZR/WZR and is therefore free. The exception to this is
5537 if we have (set (reg) (const0_rtx)) in which case we must cost
5538 the move. However, we can catch that when we cost the SET, so
5539 we don't need to consider that here. */
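/* E.g. a constant that needs a MOVZ plus three MOVKs to materialize is
   costed as COSTS_N_INSNS (4), while one that fits a single MOV costs
   COSTS_N_INSNS (1).  */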
5540 if (x == const0_rtx)
5541 *cost = 0;
5542 else
5543 {
5544 /* To an approximation, building any other constant is
5545 proportionally expensive to the number of instructions
5546 required to build that constant. This is true whether we
5547 are compiling for SPEED or otherwise. */
5548 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5549 (NULL_RTX, x, false, mode));
5550 }
5551 return true;
5552
5553 case CONST_DOUBLE:
5554 if (speed)
5555 {
5556 /* mov[df,sf]_aarch64. */
5557 if (aarch64_float_const_representable_p (x))
5558 /* FMOV (scalar immediate). */
5559 *cost += extra_cost->fp[mode == DFmode].fpconst;
5560 else if (!aarch64_float_const_zero_rtx_p (x))
5561 {
5562 /* This will be a load from memory. */
5563 if (mode == DFmode)
5564 *cost += extra_cost->ldst.loadd;
5565 else
5566 *cost += extra_cost->ldst.loadf;
5567 }
5568 else
5569 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5570 or MOV v0.s[0], wzr - neither of which is modeled by the
5571 cost tables. Just use the default cost. */
5572 {
5573 }
5574 }
5575
5576 return true;
5577
5578 case MEM:
5579 if (speed)
5580 {
5581 /* For loads we want the base cost of a load, plus an
5582 approximation for the additional cost of the addressing
5583 mode. */
5584 rtx address = XEXP (x, 0);
5585 if (GET_MODE_CLASS (mode) == MODE_INT)
5586 *cost += extra_cost->ldst.load;
5587 else if (mode == SFmode)
5588 *cost += extra_cost->ldst.loadf;
5589 else if (mode == DFmode)
5590 *cost += extra_cost->ldst.loadd;
5591
5592 *cost +=
5593 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5594 0, speed));
5595 }
5596
5597 return true;
5598
5599 case NEG:
5600 op0 = XEXP (x, 0);
5601
5602 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5603 {
5604 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5605 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5606 {
5607 /* CSETM. */
5608 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5609 return true;
5610 }
5611
5612 /* Cost this as SUB wzr, X. */
5613 op0 = CONST0_RTX (GET_MODE (x));
5614 op1 = XEXP (x, 0);
5615 goto cost_minus;
5616 }
5617
5618 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5619 {
5620 /* Support (neg(fma...)) as a single instruction only if
5621 sign of zeros is unimportant. This matches the decision
5622 making in aarch64.md. */
5623 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5624 {
5625 /* FNMADD. */
5626 *cost = rtx_cost (op0, NEG, 0, speed);
5627 return true;
5628 }
5629 if (speed)
5630 /* FNEG. */
5631 *cost += extra_cost->fp[mode == DFmode].neg;
5632 return false;
5633 }
5634
5635 return false;
5636
5637 case CLRSB:
5638 case CLZ:
5639 if (speed)
5640 *cost += extra_cost->alu.clz;
5641
5642 return false;
5643
5644 case COMPARE:
5645 op0 = XEXP (x, 0);
5646 op1 = XEXP (x, 1);
5647
5648 if (op1 == const0_rtx
5649 && GET_CODE (op0) == AND)
5650 {
5651 x = op0;
5652 goto cost_logic;
5653 }
5654
5655 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5656 {
5657 /* TODO: A write to the CC flags possibly costs extra; this
5658 needs encoding in the cost tables. */
5659
5660 /* CC_ZESWPmode supports zero extend for free. */
5661 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5662 op0 = XEXP (op0, 0);
5663
5664 /* ANDS. */
5665 if (GET_CODE (op0) == AND)
5666 {
5667 x = op0;
5668 goto cost_logic;
5669 }
5670
5671 if (GET_CODE (op0) == PLUS)
5672 {
5673 /* ADDS (and CMN alias). */
5674 x = op0;
5675 goto cost_plus;
5676 }
5677
5678 if (GET_CODE (op0) == MINUS)
5679 {
5680 /* SUBS. */
5681 x = op0;
5682 goto cost_minus;
5683 }
5684
5685 if (GET_CODE (op1) == NEG)
5686 {
5687 /* CMN. */
5688 if (speed)
5689 *cost += extra_cost->alu.arith;
5690
5691 *cost += rtx_cost (op0, COMPARE, 0, speed);
5692 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5693 return true;
5694 }
5695
5696 /* CMP.
5697
5698 Compare can freely swap the order of operands, and
5699 canonicalization puts the more complex operation first.
5700 But the integer MINUS logic expects the shift/extend
5701 operation in op1. */
5702 if (! (REG_P (op0)
5703 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5704 {
5705 op0 = XEXP (x, 1);
5706 op1 = XEXP (x, 0);
5707 }
5708 goto cost_minus;
5709 }
5710
5711 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5712 {
5713 /* FCMP. */
5714 if (speed)
5715 *cost += extra_cost->fp[mode == DFmode].compare;
5716
5717 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5718 {
5719 /* FCMP supports constant 0.0 for no extra cost. */
5720 return true;
5721 }
5722 return false;
5723 }
5724
5725 return false;
5726
5727 case MINUS:
5728 {
5729 op0 = XEXP (x, 0);
5730 op1 = XEXP (x, 1);
5731
5732 cost_minus:
5733 /* Detect valid immediates. */
5734 if ((GET_MODE_CLASS (mode) == MODE_INT
5735 || (GET_MODE_CLASS (mode) == MODE_CC
5736 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5737 && CONST_INT_P (op1)
5738 && aarch64_uimm12_shift (INTVAL (op1)))
5739 {
5740 *cost += rtx_cost (op0, MINUS, 0, speed);
5741
5742 if (speed)
5743 /* SUB(S) (immediate). */
5744 *cost += extra_cost->alu.arith;
5745 return true;
5746
5747 }
5748
5749 /* Look for SUB (extended register). */
5750 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5751 {
5752 if (speed)
5753 *cost += extra_cost->alu.arith_shift;
5754
5755 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5756 (enum rtx_code) GET_CODE (op1),
5757 0, speed);
5758 return true;
5759 }
5760
5761 rtx new_op1 = aarch64_strip_extend (op1);
5762
5763 /* Cost this as an FMA-alike operation. */
5764 if ((GET_CODE (new_op1) == MULT
5765 || GET_CODE (new_op1) == ASHIFT)
5766 && code != COMPARE)
5767 {
5768 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5769 (enum rtx_code) code,
5770 speed);
5771 *cost += rtx_cost (op0, MINUS, 0, speed);
5772 return true;
5773 }
5774
5775 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5776
5777 if (speed)
5778 {
5779 if (GET_MODE_CLASS (mode) == MODE_INT)
5780 /* SUB(S). */
5781 *cost += extra_cost->alu.arith;
5782 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5783 /* FSUB. */
5784 *cost += extra_cost->fp[mode == DFmode].addsub;
5785 }
5786 return true;
5787 }
5788
5789 case PLUS:
5790 {
5791 rtx new_op0;
5792
5793 op0 = XEXP (x, 0);
5794 op1 = XEXP (x, 1);
5795
5796 cost_plus:
5797 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5798 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5799 {
5800 /* CSINC. */
5801 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5802 *cost += rtx_cost (op1, PLUS, 1, speed);
5803 return true;
5804 }
5805
5806 if (GET_MODE_CLASS (mode) == MODE_INT
5807 && CONST_INT_P (op1)
5808 && aarch64_uimm12_shift (INTVAL (op1)))
5809 {
5810 *cost += rtx_cost (op0, PLUS, 0, speed);
5811
5812 if (speed)
5813 /* ADD (immediate). */
5814 *cost += extra_cost->alu.arith;
5815 return true;
5816 }
5817
5818 /* Look for ADD (extended register). */
5819 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5820 {
5821 if (speed)
5822 *cost += extra_cost->alu.arith_shift;
5823
5824 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5825 (enum rtx_code) GET_CODE (op0),
5826 0, speed);
5827 return true;
5828 }
5829
5830 /* Strip any extend, but leave shifts behind, as we will
5831 cost them through mult_cost. */
5832 new_op0 = aarch64_strip_extend (op0);
5833
5834 if (GET_CODE (new_op0) == MULT
5835 || GET_CODE (new_op0) == ASHIFT)
5836 {
5837 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5838 speed);
5839 *cost += rtx_cost (op1, PLUS, 1, speed);
5840 return true;
5841 }
5842
5843 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5844 + rtx_cost (op1, PLUS, 1, speed));
5845
5846 if (speed)
5847 {
5848 if (GET_MODE_CLASS (mode) == MODE_INT)
5849 /* ADD. */
5850 *cost += extra_cost->alu.arith;
5851 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5852 /* FADD. */
5853 *cost += extra_cost->fp[mode == DFmode].addsub;
5854 }
5855 return true;
5856 }
5857
5858 case BSWAP:
5859 *cost = COSTS_N_INSNS (1);
5860
5861 if (speed)
5862 *cost += extra_cost->alu.rev;
5863
5864 return false;
5865
5866 case IOR:
5867 if (aarch_rev16_p (x))
5868 {
5869 *cost = COSTS_N_INSNS (1);
5870
5871 if (speed)
5872 *cost += extra_cost->alu.rev;
5873
5874 return true;
5875 }
5876 /* Fall through. */
5877 case XOR:
5878 case AND:
5879 cost_logic:
5880 op0 = XEXP (x, 0);
5881 op1 = XEXP (x, 1);
5882
5883 if (code == AND
5884 && GET_CODE (op0) == MULT
5885 && CONST_INT_P (XEXP (op0, 1))
5886 && CONST_INT_P (op1)
5887 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5888 INTVAL (op1)) != 0)
5889 {
5890 /* This is a UBFM/SBFM. */
5891 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5892 if (speed)
5893 *cost += extra_cost->alu.bfx;
5894 return true;
5895 }
5896
5897 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5898 {
5899 /* We possibly get the immediate for free; this is not
5900 modelled. */
5901 if (CONST_INT_P (op1)
5902 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5903 {
5904 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5905
5906 if (speed)
5907 *cost += extra_cost->alu.logical;
5908
5909 return true;
5910 }
5911 else
5912 {
5913 rtx new_op0 = op0;
5914
5915 /* Handle ORN, EON, or BIC. */
5916 if (GET_CODE (op0) == NOT)
5917 op0 = XEXP (op0, 0);
5918
5919 new_op0 = aarch64_strip_shift (op0);
5920
5921 /* If we had a shift on op0 then this is a logical-shift-
5922 by-register/immediate operation. Otherwise, this is just
5923 a logical operation. */
5924 if (speed)
5925 {
5926 if (new_op0 != op0)
5927 {
5928 /* Shift by immediate. */
5929 if (CONST_INT_P (XEXP (op0, 1)))
5930 *cost += extra_cost->alu.log_shift;
5931 else
5932 *cost += extra_cost->alu.log_shift_reg;
5933 }
5934 else
5935 *cost += extra_cost->alu.logical;
5936 }
5937
5938 /* In both cases we want to cost both operands. */
5939 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5940 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5941
5942 return true;
5943 }
5944 }
5945 return false;
5946
5947 case NOT:
5948 /* MVN. */
5949 if (speed)
5950 *cost += extra_cost->alu.logical;
5951
5952 /* The logical instruction could have the shifted register form,
5953 but the cost is the same if the shift is processed as a separate
5954 instruction, so we don't bother with it here. */
5955 return false;
5956
5957 case ZERO_EXTEND:
5958
5959 op0 = XEXP (x, 0);
5960 /* If a value is written in SI mode, then zero extended to DI
5961 mode, the operation will in general be free as a write to
5962 a 'w' register implicitly zeroes the upper bits of an 'x'
5963 register. However, if this is
5964
5965 (set (reg) (zero_extend (reg)))
5966
5967 we must cost the explicit register move. */
5968 if (mode == DImode
5969 && GET_MODE (op0) == SImode
5970 && outer == SET)
5971 {
5972 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5973
5974 if (!op_cost && speed)
5975 /* MOV. */
5976 *cost += extra_cost->alu.extend;
5977 else
5978 /* Free, the cost is that of the SI mode operation. */
5979 *cost = op_cost;
5980
5981 return true;
5982 }
5983 else if (MEM_P (XEXP (x, 0)))
5984 {
5985 /* All loads can zero extend to any size for free. */
5986 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5987 return true;
5988 }
5989
5990 /* UXTB/UXTH. */
5991 if (speed)
5992 *cost += extra_cost->alu.extend;
5993
5994 return false;
5995
5996 case SIGN_EXTEND:
5997 if (MEM_P (XEXP (x, 0)))
5998 {
5999 /* LDRSH. */
6000 if (speed)
6001 {
6002 rtx address = XEXP (XEXP (x, 0), 0);
6003 *cost += extra_cost->ldst.load_sign_extend;
6004
6005 *cost +=
6006 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6007 0, speed));
6008 }
6009 return true;
6010 }
6011
6012 if (speed)
6013 *cost += extra_cost->alu.extend;
6014 return false;
6015
6016 case ASHIFT:
6017 op0 = XEXP (x, 0);
6018 op1 = XEXP (x, 1);
6019
6020 if (CONST_INT_P (op1))
6021 {
6022 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6023 aliases. */
6024 if (speed)
6025 *cost += extra_cost->alu.shift;
6026
6027 /* We can incorporate zero/sign extend for free. */
6028 if (GET_CODE (op0) == ZERO_EXTEND
6029 || GET_CODE (op0) == SIGN_EXTEND)
6030 op0 = XEXP (op0, 0);
6031
6032 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6033 return true;
6034 }
6035 else
6036 {
6037 /* LSLV. */
6038 if (speed)
6039 *cost += extra_cost->alu.shift_reg;
6040
6041 return false; /* All arguments need to be in registers. */
6042 }
6043
6044 case ROTATE:
6045 case ROTATERT:
6046 case LSHIFTRT:
6047 case ASHIFTRT:
6048 op0 = XEXP (x, 0);
6049 op1 = XEXP (x, 1);
6050
6051 if (CONST_INT_P (op1))
6052 {
6053 /* ASR (immediate) and friends. */
6054 if (speed)
6055 *cost += extra_cost->alu.shift;
6056
6057 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6058 return true;
6059 }
6060 else
6061 {
6062
6063 /* ASR (register) and friends. */
6064 if (speed)
6065 *cost += extra_cost->alu.shift_reg;
6066
6067 return false; /* All arguments need to be in registers. */
6068 }
6069
6070 case SYMBOL_REF:
6071
6072 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6073 {
6074 /* LDR. */
6075 if (speed)
6076 *cost += extra_cost->ldst.load;
6077 }
6078 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6079 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6080 {
6081 /* ADRP, followed by ADD. */
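 /* For example, a small-model address is typically formed as
 adrp x0, sym; add x0, x0, :lo12:sym. */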
6082 *cost += COSTS_N_INSNS (1);
6083 if (speed)
6084 *cost += 2 * extra_cost->alu.arith;
6085 }
6086 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6087 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6088 {
6089 /* ADR. */
6090 if (speed)
6091 *cost += extra_cost->alu.arith;
6092 }
6093
6094 if (flag_pic)
6095 {
6096 /* One extra load instruction, after accessing the GOT. */
6097 *cost += COSTS_N_INSNS (1);
6098 if (speed)
6099 *cost += extra_cost->ldst.load;
6100 }
6101 return true;
6102
6103 case HIGH:
6104 case LO_SUM:
6105 /* ADRP/ADD (immediate). */
6106 if (speed)
6107 *cost += extra_cost->alu.arith;
6108 return true;
6109
6110 case ZERO_EXTRACT:
6111 case SIGN_EXTRACT:
6112 /* UBFX/SBFX. */
6113 if (speed)
6114 *cost += extra_cost->alu.bfx;
6115
6116 /* We can trust that the immediates used will be correct (there
6117 are no by-register forms), so we need only cost op0. */
6118 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6119 return true;
6120
6121 case MULT:
6122 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6123 /* aarch64_rtx_mult_cost always handles recursion to its
6124 operands. */
6125 return true;
6126
6127 case MOD:
6128 case UMOD:
6129 if (speed)
6130 {
6131 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6132 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6133 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6134 else if (GET_MODE (x) == DFmode)
6135 *cost += (extra_cost->fp[1].mult
6136 + extra_cost->fp[1].div);
6137 else if (GET_MODE (x) == SFmode)
6138 *cost += (extra_cost->fp[0].mult
6139 + extra_cost->fp[0].div);
6140 }
6141 return false; /* All arguments need to be in registers. */
6142
6143 case DIV:
6144 case UDIV:
6145 case SQRT:
6146 if (speed)
6147 {
6148 if (GET_MODE_CLASS (mode) == MODE_INT)
6149 /* There is no integer SQRT, so only DIV and UDIV can get
6150 here. */
6151 *cost += extra_cost->mult[mode == DImode].idiv;
6152 else
6153 *cost += extra_cost->fp[mode == DFmode].div;
6154 }
6155 return false; /* All arguments need to be in registers. */
6156
6157 case IF_THEN_ELSE:
6158 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6159 XEXP (x, 2), cost, speed);
6160
6161 case EQ:
6162 case NE:
6163 case GT:
6164 case GTU:
6165 case LT:
6166 case LTU:
6167 case GE:
6168 case GEU:
6169 case LE:
6170 case LEU:
6171
6172 return false; /* All arguments must be in registers. */
6173
6174 case FMA:
6175 op0 = XEXP (x, 0);
6176 op1 = XEXP (x, 1);
6177 op2 = XEXP (x, 2);
6178
6179 if (speed)
6180 *cost += extra_cost->fp[mode == DFmode].fma;
6181
6182 /* FMSUB, FNMADD, and FNMSUB are free. */
6183 if (GET_CODE (op0) == NEG)
6184 op0 = XEXP (op0, 0);
6185
6186 if (GET_CODE (op2) == NEG)
6187 op2 = XEXP (op2, 0);
6188
6189 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6190 and the by-element operand as operand 0. */
6191 if (GET_CODE (op1) == NEG)
6192 op1 = XEXP (op1, 0);
6193
6194 /* Catch vector-by-element operations. The by-element operand can
6195 either be (vec_duplicate (vec_select (x))) or just
6196 (vec_select (x)), depending on whether we are multiplying by
6197 a vector or a scalar.
6198
6199 Canonicalization is not very good in these cases: FMA4 will put the
6200 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6201 if (GET_CODE (op0) == VEC_DUPLICATE)
6202 op0 = XEXP (op0, 0);
6203 else if (GET_CODE (op1) == VEC_DUPLICATE)
6204 op1 = XEXP (op1, 0);
6205
6206 if (GET_CODE (op0) == VEC_SELECT)
6207 op0 = XEXP (op0, 0);
6208 else if (GET_CODE (op1) == VEC_SELECT)
6209 op1 = XEXP (op1, 0);
6210
6211 /* If the remaining parameters are not registers,
6212 get the cost to put them into registers. */
6213 *cost += rtx_cost (op0, FMA, 0, speed);
6214 *cost += rtx_cost (op1, FMA, 1, speed);
6215 *cost += rtx_cost (op2, FMA, 2, speed);
6216 return true;
6217
6218 case FLOAT_EXTEND:
6219 if (speed)
6220 *cost += extra_cost->fp[mode == DFmode].widen;
6221 return false;
6222
6223 case FLOAT_TRUNCATE:
6224 if (speed)
6225 *cost += extra_cost->fp[mode == DFmode].narrow;
6226 return false;
6227
6228 case FIX:
6229 case UNSIGNED_FIX:
6230 x = XEXP (x, 0);
6231 /* Strip the rounding part. They will all be implemented
6232 by the fcvt* family of instructions anyway. */
6233 if (GET_CODE (x) == UNSPEC)
6234 {
6235 unsigned int uns_code = XINT (x, 1);
6236
6237 if (uns_code == UNSPEC_FRINTA
6238 || uns_code == UNSPEC_FRINTM
6239 || uns_code == UNSPEC_FRINTN
6240 || uns_code == UNSPEC_FRINTP
6241 || uns_code == UNSPEC_FRINTZ)
6242 x = XVECEXP (x, 0, 0);
6243 }
6244
6245 if (speed)
6246 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6247
6248 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6249 return true;
6250
6251 case ABS:
6252 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6253 {
6254 /* FABS and FNEG are analogous. */
6255 if (speed)
6256 *cost += extra_cost->fp[mode == DFmode].neg;
6257 }
6258 else
6259 {
6260 /* Integer ABS will either be split to
6261 two arithmetic instructions, or will be an ABS
6262 (scalar), which we don't model. */
6263 *cost = COSTS_N_INSNS (2);
6264 if (speed)
6265 *cost += 2 * extra_cost->alu.arith;
6266 }
6267 return false;
6268
6269 case SMAX:
6270 case SMIN:
6271 if (speed)
6272 {
6273 /* FMAXNM/FMINNM/FMAX/FMIN.
6274 TODO: This may not be accurate for all implementations, but
6275 we do not model this in the cost tables. */
6276 *cost += extra_cost->fp[mode == DFmode].addsub;
6277 }
6278 return false;
6279
6280 case UNSPEC:
6281 /* The floating point round to integer frint* instructions. */
6282 if (aarch64_frint_unspec_p (XINT (x, 1)))
6283 {
6284 if (speed)
6285 *cost += extra_cost->fp[mode == DFmode].roundint;
6286
6287 return false;
6288 }
6289
6290 if (XINT (x, 1) == UNSPEC_RBIT)
6291 {
6292 if (speed)
6293 *cost += extra_cost->alu.rev;
6294
6295 return false;
6296 }
6297 break;
6298
6299 case TRUNCATE:
6300
6301 /* Decompose <su>muldi3_highpart. */
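 /* That is, the high 64 bits of a 64x64->128-bit multiply, which is
 implemented with UMULH/SMULH. The expected RTL shape is spelled
 out in the condition below. */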
6302 if (/* (truncate:DI */
6303 mode == DImode
6304 /* (lshiftrt:TI */
6305 && GET_MODE (XEXP (x, 0)) == TImode
6306 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6307 /* (mult:TI */
6308 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6309 /* (ANY_EXTEND:TI (reg:DI))
6310 (ANY_EXTEND:TI (reg:DI))) */
6311 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6312 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6313 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6314 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6315 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6316 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6317 /* (const_int 64) */
6318 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6319 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6320 {
6321 /* UMULH/SMULH. */
6322 if (speed)
6323 *cost += extra_cost->mult[mode == DImode].extend;
6324 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6325 MULT, 0, speed);
6326 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6327 MULT, 1, speed);
6328 return true;
6329 }
6330
6331 /* Fall through. */
6332 default:
6333 break;
6334 }
6335
6336 if (dump_file && (dump_flags & TDF_DETAILS))
6337 fprintf (dump_file,
6338 "\nFailed to cost RTX. Assuming default cost.\n");
6339
6340 return true;
6341 }
6342
6343 /* Wrapper around aarch64_rtx_costs, dumps the partial or total cost
6344 calculated for X. This cost is stored in *COST. Returns true
6345 if the total cost of X was calculated. */
6346 static bool
6347 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6348 int param, int *cost, bool speed)
6349 {
6350 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6351
6352 if (dump_file && (dump_flags & TDF_DETAILS))
6353 {
6354 print_rtl_single (dump_file, x);
6355 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6356 speed ? "Hot" : "Cold",
6357 *cost, result ? "final" : "partial");
6358 }
6359
6360 return result;
6361 }
6362
6363 static int
6364 aarch64_register_move_cost (machine_mode mode,
6365 reg_class_t from_i, reg_class_t to_i)
6366 {
6367 enum reg_class from = (enum reg_class) from_i;
6368 enum reg_class to = (enum reg_class) to_i;
6369 const struct cpu_regmove_cost *regmove_cost
6370 = aarch64_tune_params->regmove_cost;
6371
6372 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6373 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6374 to = GENERAL_REGS;
6375
6376 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6377 from = GENERAL_REGS;
6378
6379 /* Moving between GPR and stack cost is the same as GP2GP. */
6380 if ((from == GENERAL_REGS && to == STACK_REG)
6381 || (to == GENERAL_REGS && from == STACK_REG))
6382 return regmove_cost->GP2GP;
6383
6384 /* To/From the stack register, we move via the gprs. */
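 /* For example, an FP_REGS <-> STACK_REG move of a DImode value costs
 FP2GP + GP2GP via the recursion below. */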
6385 if (to == STACK_REG || from == STACK_REG)
6386 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6387 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6388
6389 if (GET_MODE_SIZE (mode) == 16)
6390 {
6391 /* 128-bit operations on general registers require 2 instructions. */
6392 if (from == GENERAL_REGS && to == GENERAL_REGS)
6393 return regmove_cost->GP2GP * 2;
6394 else if (from == GENERAL_REGS)
6395 return regmove_cost->GP2FP * 2;
6396 else if (to == GENERAL_REGS)
6397 return regmove_cost->FP2GP * 2;
6398
6399 /* When AdvSIMD instructions are disabled it is not possible to move
6400 a 128-bit value directly between Q registers. This is handled in
6401 secondary reload. A general register is used as a scratch to move
6402 the upper DI value and the lower DI value is moved directly,
6403 hence the cost is the sum of three moves. */
6404 if (! TARGET_SIMD)
6405 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6406
6407 return regmove_cost->FP2FP;
6408 }
6409
6410 if (from == GENERAL_REGS && to == GENERAL_REGS)
6411 return regmove_cost->GP2GP;
6412 else if (from == GENERAL_REGS)
6413 return regmove_cost->GP2FP;
6414 else if (to == GENERAL_REGS)
6415 return regmove_cost->FP2GP;
6416
6417 return regmove_cost->FP2FP;
6418 }
6419
6420 static int
6421 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6422 reg_class_t rclass ATTRIBUTE_UNUSED,
6423 bool in ATTRIBUTE_UNUSED)
6424 {
6425 return aarch64_tune_params->memmov_cost;
6426 }
6427
6428 /* Return the number of instructions that can be issued per cycle. */
6429 static int
6430 aarch64_sched_issue_rate (void)
6431 {
6432 return aarch64_tune_params->issue_rate;
6433 }
6434
6435 /* Vectorizer cost model target hooks. */
6436
6437 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6438 static int
6439 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6440 tree vectype,
6441 int misalign ATTRIBUTE_UNUSED)
6442 {
6443 unsigned elements;
6444
6445 switch (type_of_cost)
6446 {
6447 case scalar_stmt:
6448 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6449
6450 case scalar_load:
6451 return aarch64_tune_params->vec_costs->scalar_load_cost;
6452
6453 case scalar_store:
6454 return aarch64_tune_params->vec_costs->scalar_store_cost;
6455
6456 case vector_stmt:
6457 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6458
6459 case vector_load:
6460 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6461
6462 case vector_store:
6463 return aarch64_tune_params->vec_costs->vec_store_cost;
6464
6465 case vec_to_scalar:
6466 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6467
6468 case scalar_to_vec:
6469 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6470
6471 case unaligned_load:
6472 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6473
6474 case unaligned_store:
6475 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6476
6477 case cond_branch_taken:
6478 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6479
6480 case cond_branch_not_taken:
6481 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6482
6483 case vec_perm:
6484 case vec_promote_demote:
6485 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6486
6487 case vec_construct:
6488 elements = TYPE_VECTOR_SUBPARTS (vectype);
6489 return elements / 2 + 1;
6490
6491 default:
6492 gcc_unreachable ();
6493 }
6494 }
6495
6496 /* Implement targetm.vectorize.add_stmt_cost. */
6497 static unsigned
6498 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6499 struct _stmt_vec_info *stmt_info, int misalign,
6500 enum vect_cost_model_location where)
6501 {
6502 unsigned *cost = (unsigned *) data;
6503 unsigned retval = 0;
6504
6505 if (flag_vect_cost_model)
6506 {
6507 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6508 int stmt_cost =
6509 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6510
6511 /* Statements in an inner loop relative to the loop being
6512 vectorized are weighted more heavily. The value here is
6513 a function (linear for now) of the loop nest level. */
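 /* For example, a statement whose containing loop sits at nest
 depth 3 contributes 3 * stmt_cost to the body cost. */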
6514 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6515 {
6516 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6517 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6518 unsigned nest_level = loop_depth (loop);
6519
6520 count *= nest_level;
6521 }
6522
6523 retval = (unsigned) (count * stmt_cost);
6524 cost[where] += retval;
6525 }
6526
6527 return retval;
6528 }
6529
6530 static void initialize_aarch64_code_model (void);
6531
6532 /* Parse the architecture extension string. */
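 /* For example, "+crc+nocrypto" enables the CRC extension and disables the
 crypto extension: each '+'-separated token is looked up in all_extensions,
 and a leading "no" clears the bits in the extension's flags_off mask
 instead of setting flags_on. */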
6533
6534 static void
6535 aarch64_parse_extension (char *str)
6536 {
6537 /* The extension string is parsed left to right. */
6538 const struct aarch64_option_extension *opt = NULL;
6539
6540 /* Flag to say whether we are adding or removing an extension. */
6541 int adding_ext = -1;
6542
6543 while (str != NULL && *str != 0)
6544 {
6545 char *ext;
6546 size_t len;
6547
6548 str++;
6549 ext = strchr (str, '+');
6550
6551 if (ext != NULL)
6552 len = ext - str;
6553 else
6554 len = strlen (str);
6555
6556 if (len >= 2 && strncmp (str, "no", 2) == 0)
6557 {
6558 adding_ext = 0;
6559 len -= 2;
6560 str += 2;
6561 }
6562 else if (len > 0)
6563 adding_ext = 1;
6564
6565 if (len == 0)
6566 {
6567 error ("missing feature modifier after %qs", adding_ext ? "+"
6568 : "+no");
6569 return;
6570 }
6571
6572 /* Scan over the extensions table trying to find an exact match. */
6573 for (opt = all_extensions; opt->name != NULL; opt++)
6574 {
6575 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6576 {
6577 /* Add or remove the extension. */
6578 if (adding_ext)
6579 aarch64_isa_flags |= opt->flags_on;
6580 else
6581 aarch64_isa_flags &= ~(opt->flags_off);
6582 break;
6583 }
6584 }
6585
6586 if (opt->name == NULL)
6587 {
6588 /* Extension not found in list. */
6589 error ("unknown feature modifier %qs", str);
6590 return;
6591 }
6592
6593 str = ext;
6594 };
6595
6596 return;
6597 }
6598
6599 /* Parse the ARCH string. */
6600
6601 static void
6602 aarch64_parse_arch (void)
6603 {
6604 char *ext;
6605 const struct processor *arch;
6606 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6607 size_t len;
6608
6609 strcpy (str, aarch64_arch_string);
6610
6611 ext = strchr (str, '+');
6612
6613 if (ext != NULL)
6614 len = ext - str;
6615 else
6616 len = strlen (str);
6617
6618 if (len == 0)
6619 {
6620 error ("missing arch name in -march=%qs", str);
6621 return;
6622 }
6623
6624 /* Loop through the list of supported ARCHs to find a match. */
6625 for (arch = all_architectures; arch->name != NULL; arch++)
6626 {
6627 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6628 {
6629 selected_arch = arch;
6630 aarch64_isa_flags = selected_arch->flags;
6631
6632 if (!selected_cpu)
6633 selected_cpu = &all_cores[selected_arch->core];
6634
6635 if (ext != NULL)
6636 {
6637 /* ARCH string contains at least one extension. */
6638 aarch64_parse_extension (ext);
6639 }
6640
6641 if (strcmp (selected_arch->arch, selected_cpu->arch))
6642 {
6643 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6644 selected_cpu->name, selected_arch->name);
6645 }
6646
6647 return;
6648 }
6649 }
6650
6651 /* ARCH name not found in list. */
6652 error ("unknown value %qs for -march", str);
6653 return;
6654 }
6655
6656 /* Parse the CPU string. */
6657
6658 static void
6659 aarch64_parse_cpu (void)
6660 {
6661 char *ext;
6662 const struct processor *cpu;
6663 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6664 size_t len;
6665
6666 strcpy (str, aarch64_cpu_string);
6667
6668 ext = strchr (str, '+');
6669
6670 if (ext != NULL)
6671 len = ext - str;
6672 else
6673 len = strlen (str);
6674
6675 if (len == 0)
6676 {
6677 error ("missing cpu name in -mcpu=%qs", str);
6678 return;
6679 }
6680
6681 /* Loop through the list of supported CPUs to find a match. */
6682 for (cpu = all_cores; cpu->name != NULL; cpu++)
6683 {
6684 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6685 {
6686 selected_cpu = cpu;
6687 aarch64_isa_flags = selected_cpu->flags;
6688
6689 if (ext != NULL)
6690 {
6691 /* CPU string contains at least one extension. */
6692 aarch64_parse_extension (ext);
6693 }
6694
6695 return;
6696 }
6697 }
6698
6699 /* CPU name not found in list. */
6700 error ("unknown value %qs for -mcpu", str);
6701 return;
6702 }
6703
6704 /* Parse the TUNE string. */
6705
6706 static void
6707 aarch64_parse_tune (void)
6708 {
6709 const struct processor *cpu;
6710 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6711 strcpy (str, aarch64_tune_string);
6712
6713 /* Loop through the list of supported CPUs to find a match. */
6714 for (cpu = all_cores; cpu->name != NULL; cpu++)
6715 {
6716 if (strcmp (cpu->name, str) == 0)
6717 {
6718 selected_tune = cpu;
6719 return;
6720 }
6721 }
6722
6723 /* CPU name not found in list. */
6724 error ("unknown value %qs for -mtune", str);
6725 return;
6726 }
6727
6728
6729 /* Implement TARGET_OPTION_OVERRIDE. */
6730
6731 static void
6732 aarch64_override_options (void)
6733 {
6734 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6735 If either of -march or -mtune is given, they override their
6736 respective component of -mcpu.
6737
6738 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6739 with -march: if -mcpu is not present on the command line, -march
6740 must set a sensible default CPU. */
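 /* For example, -mcpu=cortex-a57 selects both the Cortex-A57 ISA flags and
 Cortex-A57 tuning, while -mcpu=cortex-a57 -mtune=cortex-a53 keeps the
 Cortex-A57 ISA flags but tunes for Cortex-A53. */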
6741 if (aarch64_cpu_string)
6742 {
6743 aarch64_parse_cpu ();
6744 }
6745
6746 if (aarch64_arch_string)
6747 {
6748 aarch64_parse_arch ();
6749 }
6750
6751 if (aarch64_tune_string)
6752 {
6753 aarch64_parse_tune ();
6754 }
6755
6756 #ifndef HAVE_AS_MABI_OPTION
6757 /* The compiler may have been configured with 2.23.* binutils, which does
6758 not have support for ILP32. */
6759 if (TARGET_ILP32)
6760 error ("Assembler does not support -mabi=ilp32");
6761 #endif
6762
6763 initialize_aarch64_code_model ();
6764
6765 aarch64_build_bitmask_table ();
6766
6767 /* This target defaults to strict volatile bitfields. */
6768 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6769 flag_strict_volatile_bitfields = 1;
6770
6771 /* If the user did not specify a processor, choose the default
6772 one for them. This will be the CPU set during configuration using
6773 --with-cpu, otherwise it is "generic". */
6774 if (!selected_cpu)
6775 {
6776 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6777 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6778 }
6779
6780 gcc_assert (selected_cpu);
6781
6782 if (!selected_tune)
6783 selected_tune = selected_cpu;
6784
6785 aarch64_tune_flags = selected_tune->flags;
6786 aarch64_tune = selected_tune->core;
6787 aarch64_tune_params = selected_tune->tune;
6788 aarch64_architecture_version = selected_cpu->architecture_version;
6789
6790 if (aarch64_fix_a53_err835769 == 2)
6791 {
6792 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6793 aarch64_fix_a53_err835769 = 1;
6794 #else
6795 aarch64_fix_a53_err835769 = 0;
6796 #endif
6797 }
6798
6799 /* If not optimizing for size, set the default
6800 alignment to what the target wants. */
6801 if (!optimize_size)
6802 {
6803 if (align_loops <= 0)
6804 align_loops = aarch64_tune_params->loop_align;
6805 if (align_jumps <= 0)
6806 align_jumps = aarch64_tune_params->jump_align;
6807 if (align_functions <= 0)
6808 align_functions = aarch64_tune_params->function_align;
6809 }
6810
6811 aarch64_override_options_after_change ();
6812 }
6813
6814 /* Implement targetm.override_options_after_change. */
6815
6816 static void
6817 aarch64_override_options_after_change (void)
6818 {
6819 if (flag_omit_frame_pointer)
6820 flag_omit_leaf_frame_pointer = false;
6821 else if (flag_omit_leaf_frame_pointer)
6822 flag_omit_frame_pointer = true;
6823 }
6824
6825 static struct machine_function *
6826 aarch64_init_machine_status (void)
6827 {
6828 struct machine_function *machine;
6829 machine = ggc_cleared_alloc<machine_function> ();
6830 return machine;
6831 }
6832
6833 void
6834 aarch64_init_expanders (void)
6835 {
6836 init_machine_status = aarch64_init_machine_status;
6837 }
6838
6839 /* Select the code model, mapping -mcmodel= to its PIC variant when -fpic is in effect. */
6840 static void
6841 initialize_aarch64_code_model (void)
6842 {
6843 if (flag_pic)
6844 {
6845 switch (aarch64_cmodel_var)
6846 {
6847 case AARCH64_CMODEL_TINY:
6848 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6849 break;
6850 case AARCH64_CMODEL_SMALL:
6851 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6852 break;
6853 case AARCH64_CMODEL_LARGE:
6854 sorry ("code model %qs with -f%s", "large",
6855 flag_pic > 1 ? "PIC" : "pic");
6856 default:
6857 gcc_unreachable ();
6858 }
6859 }
6860 else
6861 aarch64_cmodel = aarch64_cmodel_var;
6862 }
6863
6864 /* Return true if SYMBOL_REF X binds locally. */
6865
6866 static bool
6867 aarch64_symbol_binds_local_p (const_rtx x)
6868 {
6869 return (SYMBOL_REF_DECL (x)
6870 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6871 : SYMBOL_REF_LOCAL_P (x));
6872 }
6873
6874 /* Return true if SYMBOL_REF X is thread local. */
6875 static bool
6876 aarch64_tls_symbol_p (rtx x)
6877 {
6878 if (! TARGET_HAVE_TLS)
6879 return false;
6880
6881 if (GET_CODE (x) != SYMBOL_REF)
6882 return false;
6883
6884 return SYMBOL_REF_TLS_MODEL (x) != 0;
6885 }
6886
6887 /* Classify a TLS symbol into one of the TLS kinds. */
6888 enum aarch64_symbol_type
6889 aarch64_classify_tls_symbol (rtx x)
6890 {
6891 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6892
6893 switch (tls_kind)
6894 {
6895 case TLS_MODEL_GLOBAL_DYNAMIC:
6896 case TLS_MODEL_LOCAL_DYNAMIC:
6897 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6898
6899 case TLS_MODEL_INITIAL_EXEC:
6900 return SYMBOL_SMALL_GOTTPREL;
6901
6902 case TLS_MODEL_LOCAL_EXEC:
6903 return SYMBOL_SMALL_TPREL;
6904
6905 case TLS_MODEL_EMULATED:
6906 case TLS_MODEL_NONE:
6907 return SYMBOL_FORCE_TO_MEM;
6908
6909 default:
6910 gcc_unreachable ();
6911 }
6912 }
6913
6914 /* Return the method that should be used to access SYMBOL_REF or
6915 LABEL_REF X in context CONTEXT. */
6916
6917 enum aarch64_symbol_type
6918 aarch64_classify_symbol (rtx x, rtx offset,
6919 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6920 {
6921 if (GET_CODE (x) == LABEL_REF)
6922 {
6923 switch (aarch64_cmodel)
6924 {
6925 case AARCH64_CMODEL_LARGE:
6926 return SYMBOL_FORCE_TO_MEM;
6927
6928 case AARCH64_CMODEL_TINY_PIC:
6929 case AARCH64_CMODEL_TINY:
6930 return SYMBOL_TINY_ABSOLUTE;
6931
6932 case AARCH64_CMODEL_SMALL_PIC:
6933 case AARCH64_CMODEL_SMALL:
6934 return SYMBOL_SMALL_ABSOLUTE;
6935
6936 default:
6937 gcc_unreachable ();
6938 }
6939 }
6940
6941 if (GET_CODE (x) == SYMBOL_REF)
6942 {
6943 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6944 return SYMBOL_FORCE_TO_MEM;
6945
6946 if (aarch64_tls_symbol_p (x))
6947 return aarch64_classify_tls_symbol (x);
6948
6949 switch (aarch64_cmodel)
6950 {
6951 case AARCH64_CMODEL_TINY:
6952 /* When we retrieve a symbol + offset address, we have to make sure
6953 the offset does not cause overflow of the final address. But
6954 we have no way of knowing the address of the symbol at compile time,
6955 so we can't accurately say if the distance between the PC and
6956 symbol + offset is outside the addressable range of +/-1M in the
6957 TINY code model. So we rely on images not being greater than
6958 1M and cap the offset at 1M; anything beyond 1M will have to
6959 be loaded using an alternative mechanism. */
6960 if (SYMBOL_REF_WEAK (x)
6961 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
6962 return SYMBOL_FORCE_TO_MEM;
6963 return SYMBOL_TINY_ABSOLUTE;
6964
6965 case AARCH64_CMODEL_SMALL:
6966 /* Same reasoning as the tiny code model, but the offset cap here is
6967 4G. */
6968 if (SYMBOL_REF_WEAK (x)
6969 || INTVAL (offset) < (HOST_WIDE_INT) -4294967263
6970 || INTVAL (offset) > (HOST_WIDE_INT) 4294967264)
6971 return SYMBOL_FORCE_TO_MEM;
6972 return SYMBOL_SMALL_ABSOLUTE;
6973
6974 case AARCH64_CMODEL_TINY_PIC:
6975 if (!aarch64_symbol_binds_local_p (x))
6976 return SYMBOL_TINY_GOT;
6977 return SYMBOL_TINY_ABSOLUTE;
6978
6979 case AARCH64_CMODEL_SMALL_PIC:
6980 if (!aarch64_symbol_binds_local_p (x))
6981 return SYMBOL_SMALL_GOT;
6982 return SYMBOL_SMALL_ABSOLUTE;
6983
6984 default:
6985 gcc_unreachable ();
6986 }
6987 }
6988
6989 /* By default push everything into the constant pool. */
6990 return SYMBOL_FORCE_TO_MEM;
6991 }
6992
6993 bool
6994 aarch64_constant_address_p (rtx x)
6995 {
6996 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6997 }
6998
6999 bool
7000 aarch64_legitimate_pic_operand_p (rtx x)
7001 {
7002 if (GET_CODE (x) == SYMBOL_REF
7003 || (GET_CODE (x) == CONST
7004 && GET_CODE (XEXP (x, 0)) == PLUS
7005 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7006 return false;
7007
7008 return true;
7009 }
7010
7011 /* Return true if X holds a floating-point constant that is either
7012 representable as a quarter-precision (FMOV) immediate or is +0.0. */
7013 static bool
7014 aarch64_valid_floating_const (machine_mode mode, rtx x)
7015 {
7016 if (!CONST_DOUBLE_P (x))
7017 return false;
7018
7019 /* TODO: We could handle moving 0.0 to a TFmode register,
7020 but first we would like to refactor the movtf_aarch64
7021 pattern to be more amenable to splitting moves properly and
7022 correctly gating on TARGET_SIMD. For now, reject all
7023 constants that are not for SFmode or DFmode registers. */
7024 if (!(mode == SFmode || mode == DFmode))
7025 return false;
7026
7027 if (aarch64_float_const_zero_rtx_p (x))
7028 return true;
7029 return aarch64_float_const_representable_p (x);
7030 }
7031
7032 static bool
7033 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7034 {
7035 /* Do not allow vector struct mode constants. We could support
7036 0 and -1 easily, but they need support in aarch64-simd.md. */
7037 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7038 return false;
7039
7040 /* This could probably go away because
7041 we now decompose CONST_INTs according to expand_mov_immediate. */
7042 if ((GET_CODE (x) == CONST_VECTOR
7043 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7044 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7045 return !targetm.cannot_force_const_mem (mode, x);
7046
7047 if (GET_CODE (x) == HIGH
7048 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7049 return true;
7050
7051 return aarch64_constant_address_p (x);
7052 }
7053
7054 rtx
7055 aarch64_load_tp (rtx target)
7056 {
7057 if (!target
7058 || GET_MODE (target) != Pmode
7059 || !register_operand (target, Pmode))
7060 target = gen_reg_rtx (Pmode);
7061
7062 /* Can return in any reg. */
7063 emit_insn (gen_aarch64_load_tp_hard (target));
7064 return target;
7065 }
7066
7067 /* On AAPCS systems, this is the "struct __va_list". */
7068 static GTY(()) tree va_list_type;
7069
7070 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7071 Return the type to use as __builtin_va_list.
7072
7073 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7074
7075 struct __va_list
7076 {
7077 void *__stack;
7078 void *__gr_top;
7079 void *__vr_top;
7080 int __gr_offs;
7081 int __vr_offs;
7082 }; */
7083
7084 static tree
7085 aarch64_build_builtin_va_list (void)
7086 {
7087 tree va_list_name;
7088 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7089
7090 /* Create the type. */
7091 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7092 /* Give it the required name. */
7093 va_list_name = build_decl (BUILTINS_LOCATION,
7094 TYPE_DECL,
7095 get_identifier ("__va_list"),
7096 va_list_type);
7097 DECL_ARTIFICIAL (va_list_name) = 1;
7098 TYPE_NAME (va_list_type) = va_list_name;
7099 TYPE_STUB_DECL (va_list_type) = va_list_name;
7100
7101 /* Create the fields. */
7102 f_stack = build_decl (BUILTINS_LOCATION,
7103 FIELD_DECL, get_identifier ("__stack"),
7104 ptr_type_node);
7105 f_grtop = build_decl (BUILTINS_LOCATION,
7106 FIELD_DECL, get_identifier ("__gr_top"),
7107 ptr_type_node);
7108 f_vrtop = build_decl (BUILTINS_LOCATION,
7109 FIELD_DECL, get_identifier ("__vr_top"),
7110 ptr_type_node);
7111 f_groff = build_decl (BUILTINS_LOCATION,
7112 FIELD_DECL, get_identifier ("__gr_offs"),
7113 integer_type_node);
7114 f_vroff = build_decl (BUILTINS_LOCATION,
7115 FIELD_DECL, get_identifier ("__vr_offs"),
7116 integer_type_node);
7117
7118 DECL_ARTIFICIAL (f_stack) = 1;
7119 DECL_ARTIFICIAL (f_grtop) = 1;
7120 DECL_ARTIFICIAL (f_vrtop) = 1;
7121 DECL_ARTIFICIAL (f_groff) = 1;
7122 DECL_ARTIFICIAL (f_vroff) = 1;
7123
7124 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7125 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7126 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7127 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7128 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7129
7130 TYPE_FIELDS (va_list_type) = f_stack;
7131 DECL_CHAIN (f_stack) = f_grtop;
7132 DECL_CHAIN (f_grtop) = f_vrtop;
7133 DECL_CHAIN (f_vrtop) = f_groff;
7134 DECL_CHAIN (f_groff) = f_vroff;
7135
7136 /* Compute its layout. */
7137 layout_type (va_list_type);
7138
7139 return va_list_type;
7140 }
7141
7142 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7143 static void
7144 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7145 {
7146 const CUMULATIVE_ARGS *cum;
7147 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7148 tree stack, grtop, vrtop, groff, vroff;
7149 tree t;
7150 int gr_save_area_size;
7151 int vr_save_area_size;
7152 int vr_offset;
7153
7154 cum = &crtl->args.info;
7155 gr_save_area_size
7156 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7157 vr_save_area_size
7158 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7159
7160 if (TARGET_GENERAL_REGS_ONLY)
7161 {
7162 if (cum->aapcs_nvrn > 0)
7163 sorry ("%qs and floating point or vector arguments",
7164 "-mgeneral-regs-only");
7165 vr_save_area_size = 0;
7166 }
7167
7168 f_stack = TYPE_FIELDS (va_list_type_node);
7169 f_grtop = DECL_CHAIN (f_stack);
7170 f_vrtop = DECL_CHAIN (f_grtop);
7171 f_groff = DECL_CHAIN (f_vrtop);
7172 f_vroff = DECL_CHAIN (f_groff);
7173
7174 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7175 NULL_TREE);
7176 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7177 NULL_TREE);
7178 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7179 NULL_TREE);
7180 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7181 NULL_TREE);
7182 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7183 NULL_TREE);
7184
7185 /* Emit code to initialize STACK, which points to the next varargs stack
7186 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7187 by named arguments. STACK is 8-byte aligned. */
7188 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7189 if (cum->aapcs_stack_size > 0)
7190 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7191 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7192 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7193
7194 /* Emit code to initialize GRTOP, the top of the GR save area.
7195 virtual_incoming_args_rtx should have been 16-byte aligned. */
7196 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7197 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7198 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7199
7200 /* Emit code to initialize VRTOP, the top of the VR save area.
7201 This address is gr_save_area_bytes below GRTOP, rounded
7202 down to the next 16-byte boundary. */
7203 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7204 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7205 STACK_BOUNDARY / BITS_PER_UNIT);
7206
7207 if (vr_offset)
7208 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7209 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7210 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7211
7212 /* Emit code to initialize GROFF, the offset from GRTOP of the
7213 next GPR argument. */
7214 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7215 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7216 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7217
7218 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7219 of the next VR argument. */
7220 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7221 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7222 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7223 }
7224
7225 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7226
7227 static tree
7228 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7229 gimple_seq *post_p ATTRIBUTE_UNUSED)
7230 {
7231 tree addr;
7232 bool indirect_p;
7233 bool is_ha; /* is HFA or HVA. */
7234 bool dw_align; /* double-word align. */
7235 machine_mode ag_mode = VOIDmode;
7236 int nregs;
7237 machine_mode mode;
7238
7239 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7240 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7241 HOST_WIDE_INT size, rsize, adjust, align;
7242 tree t, u, cond1, cond2;
7243
7244 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7245 if (indirect_p)
7246 type = build_pointer_type (type);
7247
7248 mode = TYPE_MODE (type);
7249
7250 f_stack = TYPE_FIELDS (va_list_type_node);
7251 f_grtop = DECL_CHAIN (f_stack);
7252 f_vrtop = DECL_CHAIN (f_grtop);
7253 f_groff = DECL_CHAIN (f_vrtop);
7254 f_vroff = DECL_CHAIN (f_groff);
7255
7256 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7257 f_stack, NULL_TREE);
7258 size = int_size_in_bytes (type);
7259 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7260
7261 dw_align = false;
7262 adjust = 0;
7263 if (aarch64_vfp_is_call_or_return_candidate (mode,
7264 type,
7265 &ag_mode,
7266 &nregs,
7267 &is_ha))
7268 {
7269 /* TYPE passed in fp/simd registers. */
7270 if (TARGET_GENERAL_REGS_ONLY)
7271 sorry ("%qs and floating point or vector arguments",
7272 "-mgeneral-regs-only");
7273
7274 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7275 unshare_expr (valist), f_vrtop, NULL_TREE);
7276 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7277 unshare_expr (valist), f_vroff, NULL_TREE);
7278
7279 rsize = nregs * UNITS_PER_VREG;
7280
7281 if (is_ha)
7282 {
7283 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7284 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7285 }
7286 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7287 && size < UNITS_PER_VREG)
7288 {
7289 adjust = UNITS_PER_VREG - size;
7290 }
7291 }
7292 else
7293 {
7294 /* TYPE passed in general registers. */
7295 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7296 unshare_expr (valist), f_grtop, NULL_TREE);
7297 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7298 unshare_expr (valist), f_groff, NULL_TREE);
7299 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7300 nregs = rsize / UNITS_PER_WORD;
7301
7302 if (align > 8)
7303 dw_align = true;
7304
7305 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7306 && size < UNITS_PER_WORD)
7307 {
7308 adjust = UNITS_PER_WORD - size;
7309 }
7310 }
7311
7312 /* Get a local temporary for the field value. */
7313 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7314
7315 /* Emit code to branch if off >= 0. */
7316 t = build2 (GE_EXPR, boolean_type_node, off,
7317 build_int_cst (TREE_TYPE (off), 0));
7318 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7319
7320 if (dw_align)
7321 {
7322 /* Emit: offs = (offs + 15) & -16. */
7323 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7324 build_int_cst (TREE_TYPE (off), 15));
7325 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7326 build_int_cst (TREE_TYPE (off), -16));
7327 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7328 }
7329 else
7330 roundup = NULL;
7331
7332 /* Update ap.__[g|v]r_offs */
7333 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7334 build_int_cst (TREE_TYPE (off), rsize));
7335 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7336
7337 /* String up. */
7338 if (roundup)
7339 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7340
7341 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7342 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7343 build_int_cst (TREE_TYPE (f_off), 0));
7344 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7345
7346 /* String up: make sure the assignment happens before the use. */
7347 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7348 COND_EXPR_ELSE (cond1) = t;
7349
7350 /* Prepare the trees handling the argument that is passed on the stack;
7351 the top-level node will be stored in ON_STACK. */
7352 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7353 if (align > 8)
7354 {
7355 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7356 t = fold_convert (intDI_type_node, arg);
7357 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7358 build_int_cst (TREE_TYPE (t), 15));
7359 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7360 build_int_cst (TREE_TYPE (t), -16));
7361 t = fold_convert (TREE_TYPE (arg), t);
7362 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7363 }
7364 else
7365 roundup = NULL;
7366 /* Advance ap.__stack */
7367 t = fold_convert (intDI_type_node, arg);
7368 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7369 build_int_cst (TREE_TYPE (t), size + 7));
7370 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7371 build_int_cst (TREE_TYPE (t), -8));
7372 t = fold_convert (TREE_TYPE (arg), t);
7373 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7374 /* String up roundup and advance. */
7375 if (roundup)
7376 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7377 /* String up with arg */
7378 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7379 /* Big-endianness related address adjustment. */
7380 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7381 && size < UNITS_PER_WORD)
7382 {
7383 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7384 size_int (UNITS_PER_WORD - size));
7385 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7386 }
7387
7388 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7389 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7390
7391 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7392 t = off;
7393 if (adjust)
7394 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7395 build_int_cst (TREE_TYPE (off), adjust));
7396
7397 t = fold_convert (sizetype, t);
7398 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7399
7400 if (is_ha)
7401 {
7402 /* type ha; // treat as "struct {ftype field[n];}"
7403 ... [computing offs]
7404 for (i = 0; i < nregs; ++i, offs += 16)
7405 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7406 return ha; */
7407 int i;
7408 tree tmp_ha, field_t, field_ptr_t;
7409
7410 /* Declare a local variable. */
7411 tmp_ha = create_tmp_var_raw (type, "ha");
7412 gimple_add_tmp_var (tmp_ha);
7413
7414 /* Establish the base type. */
7415 switch (ag_mode)
7416 {
7417 case SFmode:
7418 field_t = float_type_node;
7419 field_ptr_t = float_ptr_type_node;
7420 break;
7421 case DFmode:
7422 field_t = double_type_node;
7423 field_ptr_t = double_ptr_type_node;
7424 break;
7425 case TFmode:
7426 field_t = long_double_type_node;
7427 field_ptr_t = long_double_ptr_type_node;
7428 break;
7429 /* Half-precision and quad-precision types are not fully supported yet.
7430 Enable the following code once support is complete and the correct
7431 type node for __fp16 * has been found. */
7432 #if 0
7433 case HFmode:
7434 field_t = float_type_node;
7435 field_ptr_t = float_ptr_type_node;
7436 break;
7437 #endif
7438 case V2SImode:
7439 case V4SImode:
7440 {
7441 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7442 field_t = build_vector_type_for_mode (innertype, ag_mode);
7443 field_ptr_t = build_pointer_type (field_t);
7444 }
7445 break;
7446 default:
7447 gcc_assert (0);
7448 }
7449
7450 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7451 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7452 addr = t;
7453 t = fold_convert (field_ptr_t, addr);
7454 t = build2 (MODIFY_EXPR, field_t,
7455 build1 (INDIRECT_REF, field_t, tmp_ha),
7456 build1 (INDIRECT_REF, field_t, t));
7457
7458 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7459 for (i = 1; i < nregs; ++i)
7460 {
7461 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7462 u = fold_convert (field_ptr_t, addr);
7463 u = build2 (MODIFY_EXPR, field_t,
7464 build2 (MEM_REF, field_t, tmp_ha,
7465 build_int_cst (field_ptr_t,
7466 (i *
7467 int_size_in_bytes (field_t)))),
7468 build1 (INDIRECT_REF, field_t, u));
7469 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7470 }
7471
7472 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7473 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7474 }
7475
7476 COND_EXPR_ELSE (cond2) = t;
7477 addr = fold_convert (build_pointer_type (type), cond1);
7478 addr = build_va_arg_indirect_ref (addr);
7479
7480 if (indirect_p)
7481 addr = build_va_arg_indirect_ref (addr);
7482
7483 return addr;
7484 }
7485
7486 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7487
7488 static void
7489 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7490 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7491 int no_rtl)
7492 {
7493 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7494 CUMULATIVE_ARGS local_cum;
7495 int gr_saved, vr_saved;
7496
7497 /* The caller has advanced CUM up to, but not beyond, the last named
7498 argument. Advance a local copy of CUM past the last "real" named
7499 argument, to find out how many registers are left over. */
7500 local_cum = *cum;
7501 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7502
7503 /* Find out how many registers we need to save. */
7504 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7505 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7506
7507 if (TARGET_GENERAL_REGS_ONLY)
7508 {
7509 if (local_cum.aapcs_nvrn > 0)
7510 sorry ("%qs and floating point or vector arguments",
7511 "-mgeneral-regs-only");
7512 vr_saved = 0;
7513 }
7514
7515 if (!no_rtl)
7516 {
7517 if (gr_saved > 0)
7518 {
7519 rtx ptr, mem;
7520
7521 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7522 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7523 - gr_saved * UNITS_PER_WORD);
7524 mem = gen_frame_mem (BLKmode, ptr);
7525 set_mem_alias_set (mem, get_varargs_alias_set ());
7526
7527 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7528 mem, gr_saved);
7529 }
7530 if (vr_saved > 0)
7531 {
7532 /* We can't use move_block_from_reg, because it will use
7533 the wrong mode, storing D regs only. */
7534 machine_mode mode = TImode;
7535 int off, i;
7536
7537 /* Set OFF to the offset from virtual_incoming_args_rtx of
7538 the first vector register. The VR save area lies below
7539 the GR one, and is aligned to 16 bytes. */
7540 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7541 STACK_BOUNDARY / BITS_PER_UNIT);
7542 off -= vr_saved * UNITS_PER_VREG;
7543
7544 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7545 {
7546 rtx ptr, mem;
7547
7548 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7549 mem = gen_frame_mem (mode, ptr);
7550 set_mem_alias_set (mem, get_varargs_alias_set ());
7551 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7552 off += UNITS_PER_VREG;
7553 }
7554 }
7555 }
7556
7557 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7558 any complication of having crtl->args.pretend_args_size changed. */
7559 cfun->machine->frame.saved_varargs_size
7560 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7561 STACK_BOUNDARY / BITS_PER_UNIT)
7562 + vr_saved * UNITS_PER_VREG);
7563 }
7564
7565 static void
7566 aarch64_conditional_register_usage (void)
7567 {
7568 int i;
7569 if (!TARGET_FLOAT)
7570 {
7571 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7572 {
7573 fixed_regs[i] = 1;
7574 call_used_regs[i] = 1;
7575 }
7576 }
7577 }
7578
7579 /* Walk down the type tree of TYPE counting consecutive base elements.
7580 If *MODEP is VOIDmode, then set it to the first valid floating point
7581 type. If a non-floating point type is found, or if a floating point
7582 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7583 otherwise return the count in the sub-tree. */
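 /* For example, struct { double x, y; } yields 2 with *MODEP set to DFmode,
 while struct { double x; float y; } mixes element types and yields -1. */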
7584 static int
7585 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7586 {
7587 machine_mode mode;
7588 HOST_WIDE_INT size;
7589
7590 switch (TREE_CODE (type))
7591 {
7592 case REAL_TYPE:
7593 mode = TYPE_MODE (type);
7594 if (mode != DFmode && mode != SFmode && mode != TFmode)
7595 return -1;
7596
7597 if (*modep == VOIDmode)
7598 *modep = mode;
7599
7600 if (*modep == mode)
7601 return 1;
7602
7603 break;
7604
7605 case COMPLEX_TYPE:
7606 mode = TYPE_MODE (TREE_TYPE (type));
7607 if (mode != DFmode && mode != SFmode && mode != TFmode)
7608 return -1;
7609
7610 if (*modep == VOIDmode)
7611 *modep = mode;
7612
7613 if (*modep == mode)
7614 return 2;
7615
7616 break;
7617
7618 case VECTOR_TYPE:
7619 /* Use V2SImode and V4SImode as representatives of all 64-bit
7620 and 128-bit vector types. */
7621 size = int_size_in_bytes (type);
7622 switch (size)
7623 {
7624 case 8:
7625 mode = V2SImode;
7626 break;
7627 case 16:
7628 mode = V4SImode;
7629 break;
7630 default:
7631 return -1;
7632 }
7633
7634 if (*modep == VOIDmode)
7635 *modep = mode;
7636
7637 /* Vector modes are considered to be opaque: two vectors are
7638 equivalent for the purposes of being homogeneous aggregates
7639 if they are the same size. */
7640 if (*modep == mode)
7641 return 1;
7642
7643 break;
7644
7645 case ARRAY_TYPE:
7646 {
7647 int count;
7648 tree index = TYPE_DOMAIN (type);
7649
7650 /* Can't handle incomplete types nor sizes that are not
7651 fixed. */
7652 if (!COMPLETE_TYPE_P (type)
7653 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7654 return -1;
7655
7656 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7657 if (count == -1
7658 || !index
7659 || !TYPE_MAX_VALUE (index)
7660 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7661 || !TYPE_MIN_VALUE (index)
7662 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7663 || count < 0)
7664 return -1;
7665
7666 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7667 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7668
7669 /* There must be no padding. */
7670 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7671 return -1;
7672
7673 return count;
7674 }
7675
7676 case RECORD_TYPE:
7677 {
7678 int count = 0;
7679 int sub_count;
7680 tree field;
7681
7682 /* Can't handle incomplete types nor sizes that are not
7683 fixed. */
7684 if (!COMPLETE_TYPE_P (type)
7685 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7686 return -1;
7687
7688 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7689 {
7690 if (TREE_CODE (field) != FIELD_DECL)
7691 continue;
7692
7693 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7694 if (sub_count < 0)
7695 return -1;
7696 count += sub_count;
7697 }
7698
7699 /* There must be no padding. */
7700 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7701 return -1;
7702
7703 return count;
7704 }
7705
7706 case UNION_TYPE:
7707 case QUAL_UNION_TYPE:
7708 {
7709 /* These aren't very interesting except in a degenerate case. */
7710 int count = 0;
7711 int sub_count;
7712 tree field;
7713
7714 /* Can't handle incomplete types nor sizes that are not
7715 fixed. */
7716 if (!COMPLETE_TYPE_P (type)
7717 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7718 return -1;
7719
7720 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7721 {
7722 if (TREE_CODE (field) != FIELD_DECL)
7723 continue;
7724
7725 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7726 if (sub_count < 0)
7727 return -1;
7728 count = count > sub_count ? count : sub_count;
7729 }
7730
7731 /* There must be no padding. */
7732 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7733 return -1;
7734
7735 return count;
7736 }
7737
7738 default:
7739 break;
7740 }
7741
7742 return -1;
7743 }
7744
7745 /* Return true if we use LRA instead of reload pass. */
7746 static bool
7747 aarch64_lra_p (void)
7748 {
7749 return aarch64_lra_flag;
7750 }
7751
7752 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7753 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7754 array types. The C99 floating-point complex types are also considered
7755 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7756 types, which are GCC extensions and out of the scope of AAPCS64, are
7757 treated as composite types here as well.
7758
7759 Note that MODE itself is not sufficient in determining whether a type
7760 is such a composite type or not. This is because
7761 stor-layout.c:compute_record_mode may have already changed the MODE
7762 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7763 structure with only one field may have its MODE set to the mode of the
7764 field. Also an integer mode whose size matches the size of the
7765 RECORD_TYPE type may be used to substitute the original mode
7766 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7767 solely relied on. */
7768
7769 static bool
7770 aarch64_composite_type_p (const_tree type,
7771 machine_mode mode)
7772 {
7773 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7774 return true;
7775
7776 if (mode == BLKmode
7777 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7778 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7779 return true;
7780
7781 return false;
7782 }
7783
7784 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7785 type as described in AAPCS64 \S 4.1.2.
7786
7787 See the comment above aarch64_composite_type_p for the notes on MODE. */
7788
7789 static bool
7790 aarch64_short_vector_p (const_tree type,
7791 machine_mode mode)
7792 {
7793 HOST_WIDE_INT size = -1;
7794
7795 if (type && TREE_CODE (type) == VECTOR_TYPE)
7796 size = int_size_in_bytes (type);
7797 else if (!aarch64_composite_type_p (type, mode)
7798 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7799 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7800 size = GET_MODE_SIZE (mode);
7801
7802 return (size == 8 || size == 16);
7803 }
7804
7805 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7806 shall be passed or returned in simd/fp register(s) (providing these
7807 parameter passing registers are available).
7808
7809 Upon successful return, *COUNT returns the number of needed registers,
7810 *BASE_MODE returns the mode of the individual register and, when IS_HA
7811 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7812 floating-point aggregate or a homogeneous short-vector aggregate. */
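 /* For example, struct { float a, b, c; } is a homogeneous floating-point
 aggregate: *COUNT is set to 3, *BASE_MODE to SFmode and *IS_HA to true. */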
7813
7814 static bool
7815 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7816 const_tree type,
7817 machine_mode *base_mode,
7818 int *count,
7819 bool *is_ha)
7820 {
7821 machine_mode new_mode = VOIDmode;
7822 bool composite_p = aarch64_composite_type_p (type, mode);
7823
7824 if (is_ha != NULL) *is_ha = false;
7825
7826 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7827 || aarch64_short_vector_p (type, mode))
7828 {
7829 *count = 1;
7830 new_mode = mode;
7831 }
7832 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7833 {
7834 if (is_ha != NULL) *is_ha = true;
7835 *count = 2;
7836 new_mode = GET_MODE_INNER (mode);
7837 }
7838 else if (type && composite_p)
7839 {
7840 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7841
7842 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7843 {
7844 if (is_ha != NULL) *is_ha = true;
7845 *count = ag_count;
7846 }
7847 else
7848 return false;
7849 }
7850 else
7851 return false;
7852
7853 *base_mode = new_mode;
7854 return true;
7855 }
7856
7857 /* Implement TARGET_STRUCT_VALUE_RTX. */
7858
7859 static rtx
7860 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7861 int incoming ATTRIBUTE_UNUSED)
7862 {
7863 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7864 }
7865
7866 /* Implements target hook vector_mode_supported_p. */
7867 static bool
7868 aarch64_vector_mode_supported_p (machine_mode mode)
7869 {
7870 if (TARGET_SIMD
7871 && (mode == V4SImode || mode == V8HImode
7872 || mode == V16QImode || mode == V2DImode
7873 || mode == V2SImode || mode == V4HImode
7874 || mode == V8QImode || mode == V2SFmode
7875 || mode == V4SFmode || mode == V2DFmode
7876 || mode == V1DFmode))
7877 return true;
7878
7879 return false;
7880 }
7881
7882 /* Return appropriate SIMD container
7883 for MODE within a vector of WIDTH bits. */
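 /* For example, SImode maps to V4SImode for a 128-bit container and to
 V2SImode for a 64-bit one; unsupported modes fall back to word_mode. */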
7884 static machine_mode
7885 aarch64_simd_container_mode (machine_mode mode, unsigned width)
7886 {
7887 gcc_assert (width == 64 || width == 128);
7888 if (TARGET_SIMD)
7889 {
7890 if (width == 128)
7891 switch (mode)
7892 {
7893 case DFmode:
7894 return V2DFmode;
7895 case SFmode:
7896 return V4SFmode;
7897 case SImode:
7898 return V4SImode;
7899 case HImode:
7900 return V8HImode;
7901 case QImode:
7902 return V16QImode;
7903 case DImode:
7904 return V2DImode;
7905 default:
7906 break;
7907 }
7908 else
7909 switch (mode)
7910 {
7911 case SFmode:
7912 return V2SFmode;
7913 case SImode:
7914 return V2SImode;
7915 case HImode:
7916 return V4HImode;
7917 case QImode:
7918 return V8QImode;
7919 default:
7920 break;
7921 }
7922 }
7923 return word_mode;
7924 }
7925
7926 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7927 static machine_mode
7928 aarch64_preferred_simd_mode (machine_mode mode)
7929 {
7930 return aarch64_simd_container_mode (mode, 128);
7931 }
7932
7933 /* Return the bitmask of possible vector sizes for the vectorizer
7934 to iterate over. */
7935 static unsigned int
7936 aarch64_autovectorize_vector_sizes (void)
7937 {
7938 return (16 | 8);
7939 }
7940
7941 /* Implement TARGET_MANGLE_TYPE. */
7942
7943 static const char *
7944 aarch64_mangle_type (const_tree type)
7945 {
7946 /* The AArch64 ABI documents say that "__va_list" has to be
7947 mangled as if it is in the "std" namespace. */
7948 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7949 return "St9__va_list";
7950
7951 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
7952 builtin types. */
7953 if (TYPE_NAME (type) != NULL)
7954 return aarch64_mangle_builtin_type (type);
7955
7956 /* Use the default mangling. */
7957 return NULL;
7958 }
7959
7960
7961 /* Return true if the rtx_insn contains a MEM RTX somewhere
7962 in it. */
7963
7964 static bool
7965 has_memory_op (rtx_insn *mem_insn)
7966 {
7967 subrtx_iterator::array_type array;
7968 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
7969 if (MEM_P (*iter))
7970 return true;
7971
7972 return false;
7973 }
7974
7975 /* Find the first rtx_insn before insn that will generate an assembly
7976 instruction. */
7977
7978 static rtx_insn *
7979 aarch64_prev_real_insn (rtx_insn *insn)
7980 {
7981 if (!insn)
7982 return NULL;
7983
7984 do
7985 {
7986 insn = prev_real_insn (insn);
7987 }
7988 while (insn && recog_memoized (insn) < 0);
7989
7990 return insn;
7991 }
7992
7993 static bool
7994 is_madd_op (enum attr_type t1)
7995 {
7996 unsigned int i;
7997 /* A number of these may be AArch32 only. */
7998 enum attr_type mlatypes[] = {
7999 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8000 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8001 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8002 };
8003
8004 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8005 {
8006 if (t1 == mlatypes[i])
8007 return true;
8008 }
8009
8010 return false;
8011 }
8012
8013 /* Check if there is a register dependency between a load and the insn
8014 for which we hold recog_data. */
8015
8016 static bool
8017 dep_between_memop_and_curr (rtx memop)
8018 {
8019 rtx load_reg;
8020 int opno;
8021
8022 gcc_assert (GET_CODE (memop) == SET);
8023
8024 if (!REG_P (SET_DEST (memop)))
8025 return false;
8026
8027 load_reg = SET_DEST (memop);
8028 for (opno = 1; opno < recog_data.n_operands; opno++)
8029 {
8030 rtx operand = recog_data.operand[opno];
8031 if (REG_P (operand)
8032 && reg_overlap_mentioned_p (load_reg, operand))
8033 return true;
8034
8035 }
8036 return false;
8037 }
8038
8039
8040 /* When working around the Cortex-A53 erratum 835769,
8041 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8042 instruction and has a preceding memory instruction such that a NOP
8043 should be inserted between them. */
8044
8045 bool
8046 aarch64_madd_needs_nop (rtx_insn* insn)
8047 {
8048 enum attr_type attr_type;
8049 rtx_insn *prev;
8050 rtx body;
8051
8052 if (!aarch64_fix_a53_err835769)
8053 return false;
8054
8055 if (recog_memoized (insn) < 0)
8056 return false;
8057
8058 attr_type = get_attr_type (insn);
8059 if (!is_madd_op (attr_type))
8060 return false;
8061
8062 prev = aarch64_prev_real_insn (insn);
8063 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8064 Restore recog state to INSN to avoid state corruption. */
8065 extract_constrain_insn_cached (insn);
8066
8067 if (!prev || !has_memory_op (prev))
8068 return false;
8069
8070 body = single_set (prev);
8071
8072 /* If the previous insn is a memory op and there is no dependency between
8073 it and the DImode madd, emit a NOP between them. If body is NULL then we
8074 have a complex memory operation, probably a load/store pair.
8075 Be conservative for now and emit a NOP. */
8076 if (GET_MODE (recog_data.operand[0]) == DImode
8077 && (!body || !dep_between_memop_and_curr (body)))
8078 return true;
8079
8080 return false;
8081
8082 }
8083
8084
8085 /* Implement FINAL_PRESCAN_INSN. */
8086
8087 void
8088 aarch64_final_prescan_insn (rtx_insn *insn)
8089 {
8090 if (aarch64_madd_needs_nop (insn))
8091 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8092 }
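
/* As a concrete, illustrative example of the sequence being guarded
   against: on an affected Cortex-A53, code such as

       ldr   x1, [x2]
       madd  x3, x4, x5, x6

   can trigger erratum 835769, so when -mfix-cortex-a53-835769 is in
   effect the prescan above outputs

       ldr   x1, [x2]
       nop                     // between mem op and mult-accumulate
       madd  x3, x4, x5, x6

   (register numbers are illustrative only).  */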
8093
8094
8095 /* Return the equivalent letter for size. */
8096 static char
8097 sizetochar (int size)
8098 {
8099 switch (size)
8100 {
8101 case 64: return 'd';
8102 case 32: return 's';
8103 case 16: return 'h';
8104 case 8 : return 'b';
8105 default: gcc_unreachable ();
8106 }
8107 }
8108
8109 /* Return true iff x is a uniform vector of floating-point
8110 constants, and the constant can be represented in
8111 quarter-precision form. Note, as aarch64_float_const_representable_p
8112 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8113 static bool
8114 aarch64_vect_float_const_representable_p (rtx x)
8115 {
8116 int i = 0;
8117 REAL_VALUE_TYPE r0, ri;
8118 rtx x0, xi;
8119
8120 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8121 return false;
8122
8123 x0 = CONST_VECTOR_ELT (x, 0);
8124 if (!CONST_DOUBLE_P (x0))
8125 return false;
8126
8127 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8128
8129 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8130 {
8131 xi = CONST_VECTOR_ELT (x, i);
8132 if (!CONST_DOUBLE_P (xi))
8133 return false;
8134
8135 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8136 if (!REAL_VALUES_EQUAL (r0, ri))
8137 return false;
8138 }
8139
8140 return aarch64_float_const_representable_p (x0);
8141 }
8142
8143 /* Return true for valid and false for invalid. */
8144 bool
8145 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8146 struct simd_immediate_info *info)
8147 {
8148 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8149 matches = 1; \
8150 for (i = 0; i < idx; i += (STRIDE)) \
8151 if (!(TEST)) \
8152 matches = 0; \
8153 if (matches) \
8154 { \
8155 immtype = (CLASS); \
8156 elsize = (ELSIZE); \
8157 eshift = (SHIFT); \
8158 emvn = (NEG); \
8159 break; \
8160 }
8161
8162 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8163 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8164 unsigned char bytes[16];
8165 int immtype = -1, matches;
8166 unsigned int invmask = inverse ? 0xff : 0;
8167 int eshift, emvn;
8168
8169 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8170 {
8171 if (! (aarch64_simd_imm_zero_p (op, mode)
8172 || aarch64_vect_float_const_representable_p (op)))
8173 return false;
8174
8175 if (info)
8176 {
8177 info->value = CONST_VECTOR_ELT (op, 0);
8178 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8179 info->mvn = false;
8180 info->shift = 0;
8181 }
8182
8183 return true;
8184 }
8185
8186 /* Splat vector constant out into a byte vector. */
8187 for (i = 0; i < n_elts; i++)
8188 {
8189 /* The vector is provided in GCC endian-neutral fashion. For aarch64_be,
8190 it must be laid out in the vector register in reverse order. */
8191 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8192 unsigned HOST_WIDE_INT elpart;
8193 unsigned int part, parts;
8194
8195 if (CONST_INT_P (el))
8196 {
8197 elpart = INTVAL (el);
8198 parts = 1;
8199 }
8200 else if (GET_CODE (el) == CONST_DOUBLE)
8201 {
8202 elpart = CONST_DOUBLE_LOW (el);
8203 parts = 2;
8204 }
8205 else
8206 gcc_unreachable ();
8207
8208 for (part = 0; part < parts; part++)
8209 {
8210 unsigned int byte;
8211 for (byte = 0; byte < innersize; byte++)
8212 {
8213 bytes[idx++] = (elpart & 0xff) ^ invmask;
8214 elpart >>= BITS_PER_UNIT;
8215 }
8216 if (GET_CODE (el) == CONST_DOUBLE)
8217 elpart = CONST_DOUBLE_HIGH (el);
8218 }
8219 }
8220
8221 /* Sanity check. */
8222 gcc_assert (idx == GET_MODE_SIZE (mode));
8223
8224 do
8225 {
8226 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8227 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8228
8229 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8230 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8231
8232 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8233 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8234
8235 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8236 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8237
8238 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8239
8240 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8241
8242 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8243 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8244
8245 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8246 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8247
8248 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8249 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8250
8251 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8252 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8253
8254 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8255
8256 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8257
8258 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8259 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8260
8261 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8262 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8263
8264 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8265 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8266
8267 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8268 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8269
8270 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8271
8272 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8273 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8274 }
8275 while (0);
8276
8277 if (immtype == -1)
8278 return false;
8279
8280 if (info)
8281 {
8282 info->element_width = elsize;
8283 info->mvn = emvn != 0;
8284 info->shift = eshift;
8285
8286 unsigned HOST_WIDE_INT imm = 0;
8287
8288 if (immtype >= 12 && immtype <= 15)
8289 info->msl = true;
8290
8291 /* Un-invert bytes of recognized vector, if necessary. */
8292 if (invmask != 0)
8293 for (i = 0; i < idx; i++)
8294 bytes[i] ^= invmask;
8295
8296 if (immtype == 17)
8297 {
8298 /* FIXME: Broken on 32-bit H_W_I hosts. */
8299 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8300
8301 for (i = 0; i < 8; i++)
8302 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8303 << (i * BITS_PER_UNIT);
8304
8305
8306 info->value = GEN_INT (imm);
8307 }
8308 else
8309 {
8310 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8311 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8312
8313 /* Construct 'abcdefgh' because the assembler cannot handle
8314 generic constants. */
8315 if (info->mvn)
8316 imm = ~imm;
8317 imm = (imm >> info->shift) & 0xff;
8318 info->value = GEN_INT (imm);
8319 }
8320 }
8321
8322 return true;
8323 #undef CHECK
8324 }
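
/* A worked example of the recognition above: for the V4SImode constant
   vector { 0xff0000, 0xff0000, 0xff0000, 0xff0000 } the byte splat is
   { 0x00, 0x00, 0xff, 0x00, ... }, which matches the CHECK entry with
   CLASS 2 (ELSIZE 32, SHIFT 16, no negation).  INFO then describes the
   immediate as 0xff shifted left by 16, i.e. something like
   "movi v0.4s, 0xff, lsl 16" once printed (destination register
   illustrative).  */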
8325
8326 /* Check if immediate shift constants are within range. */
8327 bool
8328 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8329 {
8330 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8331 if (left)
8332 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8333 else
8334 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8335 }
8336
8337 /* Return true if X is a uniform vector where all elements
8338 are either the floating-point constant 0.0 or the
8339 integer constant 0. */
8340 bool
8341 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8342 {
8343 return x == CONST0_RTX (mode);
8344 }
8345
8346 bool
8347 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8348 {
8349 HOST_WIDE_INT imm = INTVAL (x);
8350 int i;
8351
8352 for (i = 0; i < 8; i++)
8353 {
8354 unsigned int byte = imm & 0xff;
8355 if (byte != 0xff && byte != 0)
8356 return false;
8357 imm >>= 8;
8358 }
8359
8360 return true;
8361 }
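
/* For instance, 0xff00ff00ff00ff00 passes the check above (every byte
   is either 0x00 or 0xff) and so is a valid 64-bit bytemask immediate
   for MOVI, whereas 0x1234 is rejected because its bytes 0x34 and 0x12
   are neither all-zeros nor all-ones.  */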
8362
8363 bool
8364 aarch64_mov_operand_p (rtx x,
8365 enum aarch64_symbol_context context,
8366 machine_mode mode)
8367 {
8368 if (GET_CODE (x) == HIGH
8369 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8370 return true;
8371
8372 if (CONST_INT_P (x))
8373 return true;
8374
8375 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8376 return true;
8377
8378 return aarch64_classify_symbolic_expression (x, context)
8379 == SYMBOL_TINY_ABSOLUTE;
8380 }
8381
8382 /* Return a const_int vector of VAL. */
8383 rtx
8384 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8385 {
8386 int nunits = GET_MODE_NUNITS (mode);
8387 rtvec v = rtvec_alloc (nunits);
8388 int i;
8389
8390 for (i = 0; i < nunits; i++)
8391 RTVEC_ELT (v, i) = GEN_INT (val);
8392
8393 return gen_rtx_CONST_VECTOR (mode, v);
8394 }
8395
8396 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8397
8398 bool
8399 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8400 {
8401 machine_mode vmode;
8402
8403 gcc_assert (!VECTOR_MODE_P (mode));
8404 vmode = aarch64_preferred_simd_mode (mode);
8405 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8406 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8407 }
8408
8409 /* Construct and return a PARALLEL RTX vector with elements numbering the
8410 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8411 the vector - from the perspective of the architecture. This does not
8412 line up with GCC's perspective on lane numbers, so we end up with
8413 different masks depending on our target endian-ness. The diagram
8414 below may help. We must draw the distinction when building masks
8415 which select one half of the vector. An instruction selecting
8416 architectural low-lanes for a big-endian target, must be described using
8417 a mask selecting GCC high-lanes.
8418
8419 Big-Endian Little-Endian
8420
8421 GCC 0 1 2 3 3 2 1 0
8422 | x | x | x | x | | x | x | x | x |
8423 Architecture 3 2 1 0 3 2 1 0
8424
8425 Low Mask: { 2, 3 } { 0, 1 }
8426 High Mask: { 0, 1 } { 2, 3 }
8427 */
8428
8429 rtx
8430 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8431 {
8432 int nunits = GET_MODE_NUNITS (mode);
8433 rtvec v = rtvec_alloc (nunits / 2);
8434 int high_base = nunits / 2;
8435 int low_base = 0;
8436 int base;
8437 rtx t1;
8438 int i;
8439
8440 if (BYTES_BIG_ENDIAN)
8441 base = high ? low_base : high_base;
8442 else
8443 base = high ? high_base : low_base;
8444
8445 for (i = 0; i < nunits / 2; i++)
8446 RTVEC_ELT (v, i) = GEN_INT (base + i);
8447
8448 t1 = gen_rtx_PARALLEL (mode, v);
8449 return t1;
8450 }
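
/* For example, for V4SImode on a little-endian target,
   aarch64_simd_vect_par_cnst_half (V4SImode, true) produces
   (parallel [(const_int 2) (const_int 3)]) -- the architectural high
   half -- while on big-endian the same request produces
   (parallel [(const_int 0) (const_int 1)]), as per the diagram above.  */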
8451
8452 /* Check OP for validity as a PARALLEL RTX vector with elements
8453 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8454 from the perspective of the architecture. See the diagram above
8455 aarch64_simd_vect_par_cnst_half for more details. */
8456
8457 bool
8458 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8459 bool high)
8460 {
8461 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8462 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8463 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8464 int i = 0;
8465
8466 if (!VECTOR_MODE_P (mode))
8467 return false;
8468
8469 if (count_op != count_ideal)
8470 return false;
8471
8472 for (i = 0; i < count_ideal; i++)
8473 {
8474 rtx elt_op = XVECEXP (op, 0, i);
8475 rtx elt_ideal = XVECEXP (ideal, 0, i);
8476
8477 if (!CONST_INT_P (elt_op)
8478 || INTVAL (elt_ideal) != INTVAL (elt_op))
8479 return false;
8480 }
8481 return true;
8482 }
8483
8484 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8485 HIGH (exclusive). */
8486 void
8487 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8488 const_tree exp)
8489 {
8490 HOST_WIDE_INT lane;
8491 gcc_assert (CONST_INT_P (operand));
8492 lane = INTVAL (operand);
8493
8494 if (lane < low || lane >= high)
8495 {
8496 if (exp)
8497 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8498 else
8499 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8500 }
8501 }
8502
8503 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
8504 registers). */
8505 void
8506 aarch64_simd_emit_pair_result_insn (machine_mode mode,
8507 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8508 rtx op1)
8509 {
8510 rtx mem = gen_rtx_MEM (mode, destaddr);
8511 rtx tmp1 = gen_reg_rtx (mode);
8512 rtx tmp2 = gen_reg_rtx (mode);
8513
8514 emit_insn (intfn (tmp1, op1, tmp2));
8515
8516 emit_move_insn (mem, tmp1);
8517 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8518 emit_move_insn (mem, tmp2);
8519 }
8520
8521 /* Return TRUE if OP is a valid vector addressing mode. */
8522 bool
8523 aarch64_simd_mem_operand_p (rtx op)
8524 {
8525 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8526 || REG_P (XEXP (op, 0)));
8527 }
8528
8529 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
8530 not to early-clobber SRC registers in the process.
8531
8532 We assume that the operands described by SRC and DEST represent a
8533 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
8534 number of components into which the copy has been decomposed. */
8535 void
8536 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
8537 rtx *src, unsigned int count)
8538 {
8539 unsigned int i;
8540
8541 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8542 || REGNO (operands[0]) < REGNO (operands[1]))
8543 {
8544 for (i = 0; i < count; i++)
8545 {
8546 operands[2 * i] = dest[i];
8547 operands[2 * i + 1] = src[i];
8548 }
8549 }
8550 else
8551 {
8552 for (i = 0; i < count; i++)
8553 {
8554 operands[2 * i] = dest[count - i - 1];
8555 operands[2 * i + 1] = src[count - i - 1];
8556 }
8557 }
8558 }
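
/* As an illustration: copying the register pair {q1, q2} into {q2, q3}
   overlaps, and the destination REGNO is higher than the source REGNO,
   so the loop above orders the component moves in reverse (q3 <- q2
   first, then q2 <- q1); emitting them forwards would overwrite q2
   before it had been read as a source.  */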
8559
8560 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8561 one of VSTRUCT modes: OI, CI or XI. */
8562 int
8563 aarch64_simd_attr_length_move (rtx_insn *insn)
8564 {
8565 machine_mode mode;
8566
8567 extract_insn_cached (insn);
8568
8569 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8570 {
8571 mode = GET_MODE (recog_data.operand[0]);
8572 switch (mode)
8573 {
8574 case OImode:
8575 return 8;
8576 case CImode:
8577 return 12;
8578 case XImode:
8579 return 16;
8580 default:
8581 gcc_unreachable ();
8582 }
8583 }
8584 return 4;
8585 }
8586
8587 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8588 alignment of a vector to 128 bits. */
8589 static HOST_WIDE_INT
8590 aarch64_simd_vector_alignment (const_tree type)
8591 {
8592 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8593 return MIN (align, 128);
8594 }
8595
8596 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8597 static bool
8598 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8599 {
8600 if (is_packed)
8601 return false;
8602
8603 /* We guarantee alignment for vectors up to 128 bits. */
8604 if (tree_int_cst_compare (TYPE_SIZE (type),
8605 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8606 return false;
8607
8608 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8609 return true;
8610 }
8611
8612 /* If VALS is a vector constant that can be loaded into a register
8613 using DUP, generate instructions to do so and return an RTX to
8614 assign to the register. Otherwise return NULL_RTX. */
8615 static rtx
8616 aarch64_simd_dup_constant (rtx vals)
8617 {
8618 machine_mode mode = GET_MODE (vals);
8619 machine_mode inner_mode = GET_MODE_INNER (mode);
8620 int n_elts = GET_MODE_NUNITS (mode);
8621 bool all_same = true;
8622 rtx x;
8623 int i;
8624
8625 if (GET_CODE (vals) != CONST_VECTOR)
8626 return NULL_RTX;
8627
8628 for (i = 1; i < n_elts; ++i)
8629 {
8630 x = CONST_VECTOR_ELT (vals, i);
8631 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8632 all_same = false;
8633 }
8634
8635 if (!all_same)
8636 return NULL_RTX;
8637
8638 /* We can load this constant by using DUP and a constant in a
8639 single ARM register. This will be cheaper than a vector
8640 load. */
8641 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8642 return gen_rtx_VEC_DUPLICATE (mode, x);
8643 }
8644
8645
8646 /* Generate code to load VALS, which is a PARALLEL containing only
8647 constants (for vec_init) or CONST_VECTOR, efficiently into a
8648 register. Returns an RTX to copy into the register, or NULL_RTX
8649 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8650 static rtx
8651 aarch64_simd_make_constant (rtx vals)
8652 {
8653 machine_mode mode = GET_MODE (vals);
8654 rtx const_dup;
8655 rtx const_vec = NULL_RTX;
8656 int n_elts = GET_MODE_NUNITS (mode);
8657 int n_const = 0;
8658 int i;
8659
8660 if (GET_CODE (vals) == CONST_VECTOR)
8661 const_vec = vals;
8662 else if (GET_CODE (vals) == PARALLEL)
8663 {
8664 /* A CONST_VECTOR must contain only CONST_INTs and
8665 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8666 Only store valid constants in a CONST_VECTOR. */
8667 for (i = 0; i < n_elts; ++i)
8668 {
8669 rtx x = XVECEXP (vals, 0, i);
8670 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8671 n_const++;
8672 }
8673 if (n_const == n_elts)
8674 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8675 }
8676 else
8677 gcc_unreachable ();
8678
8679 if (const_vec != NULL_RTX
8680 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8681 /* Load using MOVI/MVNI. */
8682 return const_vec;
8683 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8684 /* Loaded using DUP. */
8685 return const_dup;
8686 else if (const_vec != NULL_RTX)
8687 /* Load from constant pool. We can not take advantage of single-cycle
8688 LD1 because we need a PC-relative addressing mode. */
8689 return const_vec;
8690 else
8691 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8692 We can not construct an initializer. */
8693 return NULL_RTX;
8694 }
8695
8696 void
8697 aarch64_expand_vector_init (rtx target, rtx vals)
8698 {
8699 machine_mode mode = GET_MODE (target);
8700 machine_mode inner_mode = GET_MODE_INNER (mode);
8701 int n_elts = GET_MODE_NUNITS (mode);
8702 int n_var = 0, one_var = -1;
8703 bool all_same = true;
8704 rtx x, mem;
8705 int i;
8706
8707 x = XVECEXP (vals, 0, 0);
8708 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8709 n_var = 1, one_var = 0;
8710
8711 for (i = 1; i < n_elts; ++i)
8712 {
8713 x = XVECEXP (vals, 0, i);
8714 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8715 ++n_var, one_var = i;
8716
8717 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8718 all_same = false;
8719 }
8720
8721 if (n_var == 0)
8722 {
8723 rtx constant = aarch64_simd_make_constant (vals);
8724 if (constant != NULL_RTX)
8725 {
8726 emit_move_insn (target, constant);
8727 return;
8728 }
8729 }
8730
8731 /* Splat a single non-constant element if we can. */
8732 if (all_same)
8733 {
8734 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8735 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8736 return;
8737 }
8738
8739 /* One field is non-constant. Load constant then overwrite varying
8740 field. This is more efficient than using the stack. */
8741 if (n_var == 1)
8742 {
8743 rtx copy = copy_rtx (vals);
8744 rtx index = GEN_INT (one_var);
8745 enum insn_code icode;
8746
8747 /* Load constant part of vector, substitute neighboring value for
8748 varying element. */
8749 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8750 aarch64_expand_vector_init (target, copy);
8751
8752 /* Insert variable. */
8753 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8754 icode = optab_handler (vec_set_optab, mode);
8755 gcc_assert (icode != CODE_FOR_nothing);
8756 emit_insn (GEN_FCN (icode) (target, x, index));
8757 return;
8758 }
8759
8760 /* Construct the vector in memory one field at a time
8761 and load the whole vector. */
8762 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8763 for (i = 0; i < n_elts; i++)
8764 emit_move_insn (adjust_address_nv (mem, inner_mode,
8765 i * GET_MODE_SIZE (inner_mode)),
8766 XVECEXP (vals, 0, i));
8767 emit_move_insn (target, mem);
8768
8769 }
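
/* To illustrate the n_var == 1 path above: initialising a V4SImode
   vector from { x, 1, 2, 3 }, where only x is non-constant, first
   materialises the constant vector { 1, 1, 2, 3 } (the varying slot
   borrows its neighbour's value) and then uses the vec_set pattern to
   insert x into lane 0, avoiding a round trip through the stack.  */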
8770
8771 static unsigned HOST_WIDE_INT
8772 aarch64_shift_truncation_mask (machine_mode mode)
8773 {
8774 return
8775 (aarch64_vector_mode_supported_p (mode)
8776 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8777 }
8778
8779 #ifndef TLS_SECTION_ASM_FLAG
8780 #define TLS_SECTION_ASM_FLAG 'T'
8781 #endif
8782
8783 void
8784 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8785 tree decl ATTRIBUTE_UNUSED)
8786 {
8787 char flagchars[10], *f = flagchars;
8788
8789 /* If we have already declared this section, we can use an
8790 abbreviated form to switch back to it -- unless this section is
8791 part of a COMDAT groups, in which case GAS requires the full
8792 declaration every time. */
8793 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8794 && (flags & SECTION_DECLARED))
8795 {
8796 fprintf (asm_out_file, "\t.section\t%s\n", name);
8797 return;
8798 }
8799
8800 if (!(flags & SECTION_DEBUG))
8801 *f++ = 'a';
8802 if (flags & SECTION_WRITE)
8803 *f++ = 'w';
8804 if (flags & SECTION_CODE)
8805 *f++ = 'x';
8806 if (flags & SECTION_SMALL)
8807 *f++ = 's';
8808 if (flags & SECTION_MERGE)
8809 *f++ = 'M';
8810 if (flags & SECTION_STRINGS)
8811 *f++ = 'S';
8812 if (flags & SECTION_TLS)
8813 *f++ = TLS_SECTION_ASM_FLAG;
8814 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8815 *f++ = 'G';
8816 *f = '\0';
8817
8818 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8819
8820 if (!(flags & SECTION_NOTYPE))
8821 {
8822 const char *type;
8823 const char *format;
8824
8825 if (flags & SECTION_BSS)
8826 type = "nobits";
8827 else
8828 type = "progbits";
8829
8830 #ifdef TYPE_OPERAND_FMT
8831 format = "," TYPE_OPERAND_FMT;
8832 #else
8833 format = ",@%s";
8834 #endif
8835
8836 fprintf (asm_out_file, format, type);
8837
8838 if (flags & SECTION_ENTSIZE)
8839 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8840 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8841 {
8842 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8843 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8844 else
8845 fprintf (asm_out_file, ",%s,comdat",
8846 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8847 }
8848 }
8849
8850 putc ('\n', asm_out_file);
8851 }
8852
8853 /* Select a format to encode pointers in exception handling data. */
8854 int
8855 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8856 {
8857 int type;
8858 switch (aarch64_cmodel)
8859 {
8860 case AARCH64_CMODEL_TINY:
8861 case AARCH64_CMODEL_TINY_PIC:
8862 case AARCH64_CMODEL_SMALL:
8863 case AARCH64_CMODEL_SMALL_PIC:
8864 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8865 for everything. */
8866 type = DW_EH_PE_sdata4;
8867 break;
8868 default:
8869 /* No assumptions here. 8-byte relocs required. */
8870 type = DW_EH_PE_sdata8;
8871 break;
8872 }
8873 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8874 }
8875
8876 /* Emit load exclusive. */
8877
8878 static void
8879 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
8880 rtx mem, rtx model_rtx)
8881 {
8882 rtx (*gen) (rtx, rtx, rtx);
8883
8884 switch (mode)
8885 {
8886 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8887 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8888 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8889 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8890 default:
8891 gcc_unreachable ();
8892 }
8893
8894 emit_insn (gen (rval, mem, model_rtx));
8895 }
8896
8897 /* Emit store exclusive. */
8898
8899 static void
8900 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
8901 rtx rval, rtx mem, rtx model_rtx)
8902 {
8903 rtx (*gen) (rtx, rtx, rtx, rtx);
8904
8905 switch (mode)
8906 {
8907 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8908 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8909 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8910 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8911 default:
8912 gcc_unreachable ();
8913 }
8914
8915 emit_insn (gen (bval, rval, mem, model_rtx));
8916 }
8917
8918 /* Mark the previous jump instruction as unlikely. */
8919
8920 static void
8921 aarch64_emit_unlikely_jump (rtx insn)
8922 {
8923 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8924
8925 insn = emit_jump_insn (insn);
8926 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8927 }
8928
8929 /* Expand a compare and swap pattern. */
8930
8931 void
8932 aarch64_expand_compare_and_swap (rtx operands[])
8933 {
8934 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8935 machine_mode mode, cmp_mode;
8936 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8937
8938 bval = operands[0];
8939 rval = operands[1];
8940 mem = operands[2];
8941 oldval = operands[3];
8942 newval = operands[4];
8943 is_weak = operands[5];
8944 mod_s = operands[6];
8945 mod_f = operands[7];
8946 mode = GET_MODE (mem);
8947 cmp_mode = mode;
8948
8949 /* Normally the succ memory model must be stronger than fail, but in the
8950 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8951 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8952
8953 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8954 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8955 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8956
8957 switch (mode)
8958 {
8959 case QImode:
8960 case HImode:
8961 /* For short modes, we're going to perform the comparison in SImode,
8962 so do the zero-extension now. */
8963 cmp_mode = SImode;
8964 rval = gen_reg_rtx (SImode);
8965 oldval = convert_modes (SImode, mode, oldval, true);
8966 /* Fall through. */
8967
8968 case SImode:
8969 case DImode:
8970 /* Force the value into a register if needed. */
8971 if (!aarch64_plus_operand (oldval, mode))
8972 oldval = force_reg (cmp_mode, oldval);
8973 break;
8974
8975 default:
8976 gcc_unreachable ();
8977 }
8978
8979 switch (mode)
8980 {
8981 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8982 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8983 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8984 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8985 default:
8986 gcc_unreachable ();
8987 }
8988
8989 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8990
8991 if (mode == QImode || mode == HImode)
8992 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8993
8994 x = gen_rtx_REG (CCmode, CC_REGNUM);
8995 x = gen_rtx_EQ (SImode, x, const0_rtx);
8996 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8997 }
8998
8999 /* Split a compare and swap pattern. */
9000
9001 void
9002 aarch64_split_compare_and_swap (rtx operands[])
9003 {
9004 rtx rval, mem, oldval, newval, scratch;
9005 machine_mode mode;
9006 bool is_weak;
9007 rtx_code_label *label1, *label2;
9008 rtx x, cond;
9009
9010 rval = operands[0];
9011 mem = operands[1];
9012 oldval = operands[2];
9013 newval = operands[3];
9014 is_weak = (operands[4] != const0_rtx);
9015 scratch = operands[7];
9016 mode = GET_MODE (mem);
9017
9018 label1 = NULL;
9019 if (!is_weak)
9020 {
9021 label1 = gen_label_rtx ();
9022 emit_label (label1);
9023 }
9024 label2 = gen_label_rtx ();
9025
9026 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9027
9028 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9029 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9030 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9031 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9032 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9033
9034 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9035
9036 if (!is_weak)
9037 {
9038 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9039 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9040 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9041 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9042 }
9043 else
9044 {
9045 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9046 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9047 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9048 }
9049
9050 emit_label (label2);
9051 }
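
/* The split above yields, for a strong DImode compare-and-swap with
   acquire-release semantics, a loop of roughly this shape (labels and
   register numbers are illustrative):

     .L1: ldaxr  x0, [x1]         // load-acquire exclusive
          cmp    x0, x2           // compare with expected value
          bne    .L2              // mismatch: fail
          stlxr  w3, x4, [x1]     // store-release exclusive of new value
          cbnz   w3, .L1          // exclusivity lost: retry
     .L2:

   For a weak compare-and-swap the retry branch is omitted and the
   exclusive-store status is instead compared against zero.  */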
9052
9053 /* Split an atomic operation. */
9054
9055 void
9056 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9057 rtx value, rtx model_rtx, rtx cond)
9058 {
9059 machine_mode mode = GET_MODE (mem);
9060 machine_mode wmode = (mode == DImode ? DImode : SImode);
9061 rtx_code_label *label;
9062 rtx x;
9063
9064 label = gen_label_rtx ();
9065 emit_label (label);
9066
9067 if (new_out)
9068 new_out = gen_lowpart (wmode, new_out);
9069 if (old_out)
9070 old_out = gen_lowpart (wmode, old_out);
9071 else
9072 old_out = new_out;
9073 value = simplify_gen_subreg (wmode, value, mode, 0);
9074
9075 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9076
9077 switch (code)
9078 {
9079 case SET:
9080 new_out = value;
9081 break;
9082
9083 case NOT:
9084 x = gen_rtx_AND (wmode, old_out, value);
9085 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9086 x = gen_rtx_NOT (wmode, new_out);
9087 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9088 break;
9089
9090 case MINUS:
9091 if (CONST_INT_P (value))
9092 {
9093 value = GEN_INT (-INTVAL (value));
9094 code = PLUS;
9095 }
9096 /* Fall through. */
9097
9098 default:
9099 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9100 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9101 break;
9102 }
9103
9104 aarch64_emit_store_exclusive (mode, cond, mem,
9105 gen_lowpart (mode, new_out), model_rtx);
9106
9107 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9108 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9109 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9110 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9111 }
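
/* For example, a relaxed atomic fetch-and-add on a DImode location is
   split into a loop of roughly this form (registers illustrative):

     .L1: ldxr   x0, [x2]         // old value
          add    x1, x0, x3       // apply the operation
          stxr   w4, x1, [x2]     // try to store the result
          cbnz   w4, .L1          // retry if the exclusive store failed  */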
9112
9113 static void
9114 aarch64_print_extension (void)
9115 {
9116 const struct aarch64_option_extension *opt = NULL;
9117
9118 for (opt = all_extensions; opt->name != NULL; opt++)
9119 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9120 asm_fprintf (asm_out_file, "+%s", opt->name);
9121
9122 asm_fprintf (asm_out_file, "\n");
9123 }
9124
9125 static void
9126 aarch64_start_file (void)
9127 {
9128 if (selected_arch)
9129 {
9130 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9131 aarch64_print_extension ();
9132 }
9133 else if (selected_cpu)
9134 {
9135 const char *truncated_name
9136 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9137 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9138 aarch64_print_extension ();
9139 }
9140 default_file_start ();
9141 }
9142
9143 /* Target hook for c_mode_for_suffix. */
9144 static machine_mode
9145 aarch64_c_mode_for_suffix (char suffix)
9146 {
9147 if (suffix == 'q')
9148 return TFmode;
9149
9150 return VOIDmode;
9151 }
9152
9153 /* We can only represent floating point constants which will fit in
9154 "quarter-precision" values. These values are characterised by
9155 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
9156 by:
9157
9158 (-1)^s * (n/16) * 2^r
9159
9160 Where:
9161 's' is the sign bit.
9162 'n' is an integer in the range 16 <= n <= 31.
9163 'r' is an integer in the range -3 <= r <= 4. */
9164
9165 /* Return true iff X can be represented by a quarter-precision
9166 floating point immediate operand. Note, we cannot represent 0.0. */
9167 bool
9168 aarch64_float_const_representable_p (rtx x)
9169 {
9170 /* This represents our current view of how many bits
9171 make up the mantissa. */
9172 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9173 int exponent;
9174 unsigned HOST_WIDE_INT mantissa, mask;
9175 REAL_VALUE_TYPE r, m;
9176 bool fail;
9177
9178 if (!CONST_DOUBLE_P (x))
9179 return false;
9180
9181 if (GET_MODE (x) == VOIDmode)
9182 return false;
9183
9184 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9185
9186 /* We cannot represent infinities, NaNs or +/-zero. We won't
9187 know if we have +zero until we analyse the mantissa, but we
9188 can reject the other invalid values. */
9189 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9190 || REAL_VALUE_MINUS_ZERO (r))
9191 return false;
9192
9193 /* Extract exponent. */
9194 r = real_value_abs (&r);
9195 exponent = REAL_EXP (&r);
9196
9197 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9198 highest (sign) bit, with a fixed binary point at bit point_pos.
9199 The low element of W holds the low part of the mantissa, the high
9199 element the high part.
9200 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9201 bits for the mantissa, this can fail (low bits will be lost). */
9202 real_ldexp (&m, &r, point_pos - exponent);
9203 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9204
9205 /* If the low part of the mantissa has bits set we cannot represent
9206 the value. */
9207 if (w.elt (0) != 0)
9208 return false;
9209 /* We have rejected the lower HOST_WIDE_INT, so update our
9210 understanding of how many bits lie in the mantissa and
9211 look only at the high HOST_WIDE_INT. */
9212 mantissa = w.elt (1);
9213 point_pos -= HOST_BITS_PER_WIDE_INT;
9214
9215 /* We can only represent values with a mantissa of the form 1.xxxx. */
9216 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9217 if ((mantissa & mask) != 0)
9218 return false;
9219
9220 /* Having filtered unrepresentable values, we may now remove all
9221 but the highest 5 bits. */
9222 mantissa >>= point_pos - 5;
9223
9224 /* We cannot represent the value 0.0, so reject it. This is handled
9225 elsewhere. */
9226 if (mantissa == 0)
9227 return false;
9228
9229 /* Then, as bit 4 is always set, we can mask it off, leaving
9230 the mantissa in the range [0, 15]. */
9231 mantissa &= ~(1 << 4);
9232 gcc_assert (mantissa <= 15);
9233
9234 /* GCC internally does not use IEEE754-like encoding (where normalized
9235 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9236 Our mantissa values are shifted 4 places to the left relative to
9237 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9238 by 5 places to correct for GCC's representation. */
9239 exponent = 5 - exponent;
9240
9241 return (exponent >= 0 && exponent <= 7);
9242 }
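
/* Worked examples of the encoding described above: 0.5 is representable
   because 0.5 = (16/16) * 2^-1 with n = 16 and r = -1, and 1.5 is
   representable as (24/16) * 2^0; 0.1, by contrast, has no exact
   (n/16) * 2^r form, and 0.0 is rejected explicitly, so both must be
   materialised some other way.  */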
9243
9244 char*
9245 aarch64_output_simd_mov_immediate (rtx const_vector,
9246 machine_mode mode,
9247 unsigned width)
9248 {
9249 bool is_valid;
9250 static char templ[40];
9251 const char *mnemonic;
9252 const char *shift_op;
9253 unsigned int lane_count = 0;
9254 char element_char;
9255
9256 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9257
9258 /* This will return true to show const_vector is legal for use as either
9259 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9260 also update INFO to show how the immediate should be generated. */
9261 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9262 gcc_assert (is_valid);
9263
9264 element_char = sizetochar (info.element_width);
9265 lane_count = width / info.element_width;
9266
9267 mode = GET_MODE_INNER (mode);
9268 if (mode == SFmode || mode == DFmode)
9269 {
9270 gcc_assert (info.shift == 0 && ! info.mvn);
9271 if (aarch64_float_const_zero_rtx_p (info.value))
9272 info.value = GEN_INT (0);
9273 else
9274 {
9275 #define buf_size 20
9276 REAL_VALUE_TYPE r;
9277 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9278 char float_buf[buf_size] = {'\0'};
9279 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9280 #undef buf_size
9281
9282 if (lane_count == 1)
9283 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9284 else
9285 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9286 lane_count, element_char, float_buf);
9287 return templ;
9288 }
9289 }
9290
9291 mnemonic = info.mvn ? "mvni" : "movi";
9292 shift_op = info.msl ? "msl" : "lsl";
9293
9294 if (lane_count == 1)
9295 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9296 mnemonic, UINTVAL (info.value));
9297 else if (info.shift)
9298 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9299 ", %s %d", mnemonic, lane_count, element_char,
9300 UINTVAL (info.value), shift_op, info.shift);
9301 else
9302 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9303 mnemonic, lane_count, element_char, UINTVAL (info.value));
9304 return templ;
9305 }
9306
9307 char*
9308 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9309 machine_mode mode)
9310 {
9311 machine_mode vmode;
9312
9313 gcc_assert (!VECTOR_MODE_P (mode));
9314 vmode = aarch64_simd_container_mode (mode, 64);
9315 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9316 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9317 }
9318
9319 /* Split operands into moves from op[1] + op[2] into op[0]. */
9320
9321 void
9322 aarch64_split_combinev16qi (rtx operands[3])
9323 {
9324 unsigned int dest = REGNO (operands[0]);
9325 unsigned int src1 = REGNO (operands[1]);
9326 unsigned int src2 = REGNO (operands[2]);
9327 machine_mode halfmode = GET_MODE (operands[1]);
9328 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9329 rtx destlo, desthi;
9330
9331 gcc_assert (halfmode == V16QImode);
9332
9333 if (src1 == dest && src2 == dest + halfregs)
9334 {
9335 /* No-op move. Can't split to nothing; emit something. */
9336 emit_note (NOTE_INSN_DELETED);
9337 return;
9338 }
9339
9340 /* Preserve register attributes for variable tracking. */
9341 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9342 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9343 GET_MODE_SIZE (halfmode));
9344
9345 /* Special case of reversed high/low parts. */
9346 if (reg_overlap_mentioned_p (operands[2], destlo)
9347 && reg_overlap_mentioned_p (operands[1], desthi))
9348 {
9349 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9350 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9351 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9352 }
9353 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9354 {
9355 /* Try to avoid unnecessary moves if part of the result
9356 is in the right place already. */
9357 if (src1 != dest)
9358 emit_move_insn (destlo, operands[1]);
9359 if (src2 != dest + halfregs)
9360 emit_move_insn (desthi, operands[2]);
9361 }
9362 else
9363 {
9364 if (src2 != dest + halfregs)
9365 emit_move_insn (desthi, operands[2]);
9366 if (src1 != dest)
9367 emit_move_insn (destlo, operands[1]);
9368 }
9369 }
9370
9371 /* vec_perm support. */
9372
9373 #define MAX_VECT_LEN 16
9374
9375 struct expand_vec_perm_d
9376 {
9377 rtx target, op0, op1;
9378 unsigned char perm[MAX_VECT_LEN];
9379 machine_mode vmode;
9380 unsigned char nelt;
9381 bool one_vector_p;
9382 bool testing_p;
9383 };
9384
9385 /* Generate a variable permutation. */
9386
9387 static void
9388 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9389 {
9390 machine_mode vmode = GET_MODE (target);
9391 bool one_vector_p = rtx_equal_p (op0, op1);
9392
9393 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9394 gcc_checking_assert (GET_MODE (op0) == vmode);
9395 gcc_checking_assert (GET_MODE (op1) == vmode);
9396 gcc_checking_assert (GET_MODE (sel) == vmode);
9397 gcc_checking_assert (TARGET_SIMD);
9398
9399 if (one_vector_p)
9400 {
9401 if (vmode == V8QImode)
9402 {
9403 /* Expand the argument to a V16QI mode by duplicating it. */
9404 rtx pair = gen_reg_rtx (V16QImode);
9405 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9406 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9407 }
9408 else
9409 {
9410 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9411 }
9412 }
9413 else
9414 {
9415 rtx pair;
9416
9417 if (vmode == V8QImode)
9418 {
9419 pair = gen_reg_rtx (V16QImode);
9420 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9421 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9422 }
9423 else
9424 {
9425 pair = gen_reg_rtx (OImode);
9426 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9427 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9428 }
9429 }
9430 }
9431
9432 void
9433 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9434 {
9435 machine_mode vmode = GET_MODE (target);
9436 unsigned int nelt = GET_MODE_NUNITS (vmode);
9437 bool one_vector_p = rtx_equal_p (op0, op1);
9438 rtx mask;
9439
9440 /* The TBL instruction does not use a modulo index, so we must take care
9441 of that ourselves. */
9442 mask = aarch64_simd_gen_const_vector_dup (vmode,
9443 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9444 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9445
9446 /* For big-endian, we also need to reverse the index within the vector
9447 (but not which vector). */
9448 if (BYTES_BIG_ENDIAN)
9449 {
9450 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9451 if (!one_vector_p)
9452 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9453 sel = expand_simple_binop (vmode, XOR, sel, mask,
9454 NULL, 0, OPTAB_LIB_WIDEN);
9455 }
9456 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9457 }
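
/* For instance, with two V16QImode inputs a selector element of 35 is
   reduced to 3 by the AND with 2 * nelt - 1 = 31 above, giving the
   modulo behaviour vec_perm requires; TBL would otherwise return zero
   for an out-of-range index rather than wrapping.  */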
9458
9459 /* Recognize patterns suitable for the TRN instructions. */
9460 static bool
9461 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9462 {
9463 unsigned int i, odd, mask, nelt = d->nelt;
9464 rtx out, in0, in1, x;
9465 rtx (*gen) (rtx, rtx, rtx);
9466 machine_mode vmode = d->vmode;
9467
9468 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9469 return false;
9470
9471 /* Note that these are little-endian tests.
9472 We correct for big-endian later. */
9473 if (d->perm[0] == 0)
9474 odd = 0;
9475 else if (d->perm[0] == 1)
9476 odd = 1;
9477 else
9478 return false;
9479 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9480
9481 for (i = 0; i < nelt; i += 2)
9482 {
9483 if (d->perm[i] != i + odd)
9484 return false;
9485 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9486 return false;
9487 }
9488
9489 /* Success! */
9490 if (d->testing_p)
9491 return true;
9492
9493 in0 = d->op0;
9494 in1 = d->op1;
9495 if (BYTES_BIG_ENDIAN)
9496 {
9497 x = in0, in0 = in1, in1 = x;
9498 odd = !odd;
9499 }
9500 out = d->target;
9501
9502 if (odd)
9503 {
9504 switch (vmode)
9505 {
9506 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9507 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9508 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9509 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9510 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9511 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9512 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9513 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9514 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9515 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9516 default:
9517 return false;
9518 }
9519 }
9520 else
9521 {
9522 switch (vmode)
9523 {
9524 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9525 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9526 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9527 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9528 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9529 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9530 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9531 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9532 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9533 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9534 default:
9535 return false;
9536 }
9537 }
9538
9539 emit_insn (gen (out, in0, in1));
9540 return true;
9541 }
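
/* For example, a two-operand V4SImode permutation with indices
   { 0, 4, 2, 6 } (odd == 0) interleaves the even-numbered lanes of the
   two inputs, which is exactly TRN1; { 1, 5, 3, 7 } interleaves the
   odd-numbered lanes and maps to TRN2.  */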
9542
9543 /* Recognize patterns suitable for the UZP instructions. */
9544 static bool
9545 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9546 {
9547 unsigned int i, odd, mask, nelt = d->nelt;
9548 rtx out, in0, in1, x;
9549 rtx (*gen) (rtx, rtx, rtx);
9550 machine_mode vmode = d->vmode;
9551
9552 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9553 return false;
9554
9555 /* Note that these are little-endian tests.
9556 We correct for big-endian later. */
9557 if (d->perm[0] == 0)
9558 odd = 0;
9559 else if (d->perm[0] == 1)
9560 odd = 1;
9561 else
9562 return false;
9563 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9564
9565 for (i = 0; i < nelt; i++)
9566 {
9567 unsigned elt = (i * 2 + odd) & mask;
9568 if (d->perm[i] != elt)
9569 return false;
9570 }
9571
9572 /* Success! */
9573 if (d->testing_p)
9574 return true;
9575
9576 in0 = d->op0;
9577 in1 = d->op1;
9578 if (BYTES_BIG_ENDIAN)
9579 {
9580 x = in0, in0 = in1, in1 = x;
9581 odd = !odd;
9582 }
9583 out = d->target;
9584
9585 if (odd)
9586 {
9587 switch (vmode)
9588 {
9589 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9590 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9591 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9592 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9593 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9594 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9595 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9596 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9597 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9598 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9599 default:
9600 return false;
9601 }
9602 }
9603 else
9604 {
9605 switch (vmode)
9606 {
9607 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9608 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9609 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9610 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9611 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9612 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9613 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9614 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9615 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9616 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9617 default:
9618 return false;
9619 }
9620 }
9621
9622 emit_insn (gen (out, in0, in1));
9623 return true;
9624 }
9625
9626 /* Recognize patterns suitable for the ZIP instructions. */
9627 static bool
9628 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9629 {
9630 unsigned int i, high, mask, nelt = d->nelt;
9631 rtx out, in0, in1, x;
9632 rtx (*gen) (rtx, rtx, rtx);
9633 machine_mode vmode = d->vmode;
9634
9635 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9636 return false;
9637
9638 /* Note that these are little-endian tests.
9639 We correct for big-endian later. */
9640 high = nelt / 2;
9641 if (d->perm[0] == high)
9642 /* Do Nothing. */
9643 ;
9644 else if (d->perm[0] == 0)
9645 high = 0;
9646 else
9647 return false;
9648 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9649
9650 for (i = 0; i < nelt / 2; i++)
9651 {
9652 unsigned elt = (i + high) & mask;
9653 if (d->perm[i * 2] != elt)
9654 return false;
9655 elt = (elt + nelt) & mask;
9656 if (d->perm[i * 2 + 1] != elt)
9657 return false;
9658 }
9659
9660 /* Success! */
9661 if (d->testing_p)
9662 return true;
9663
9664 in0 = d->op0;
9665 in1 = d->op1;
9666 if (BYTES_BIG_ENDIAN)
9667 {
9668 x = in0, in0 = in1, in1 = x;
9669 high = !high;
9670 }
9671 out = d->target;
9672
9673 if (high)
9674 {
9675 switch (vmode)
9676 {
9677 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9678 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9679 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9680 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9681 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9682 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9683 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9684 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9685 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9686 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9687 default:
9688 return false;
9689 }
9690 }
9691 else
9692 {
9693 switch (vmode)
9694 {
9695 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9696 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9697 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9698 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9699 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9700 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9701 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9702 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9703 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9704 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9705 default:
9706 return false;
9707 }
9708 }
9709
9710 emit_insn (gen (out, in0, in1));
9711 return true;
9712 }
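
/* For example, a two-operand V4SImode permutation with indices
   { 0, 4, 1, 5 } interleaves the low halves of the two inputs and is
   matched as ZIP1, while { 2, 6, 3, 7 } interleaves the high halves
   and is matched as ZIP2.  */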
9713
9714 /* Recognize patterns for the EXT insn. */
9715
9716 static bool
9717 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9718 {
9719 unsigned int i, nelt = d->nelt;
9720 rtx (*gen) (rtx, rtx, rtx, rtx);
9721 rtx offset;
9722
9723 unsigned int location = d->perm[0]; /* Always < nelt. */
9724
9725 /* Check if the extracted indices are increasing by one. */
9726 for (i = 1; i < nelt; i++)
9727 {
9728 unsigned int required = location + i;
9729 if (d->one_vector_p)
9730 {
9731 /* We'll pass the same vector in twice, so allow indices to wrap. */
9732 required &= (nelt - 1);
9733 }
9734 if (d->perm[i] != required)
9735 return false;
9736 }
9737
9738 switch (d->vmode)
9739 {
9740 case V16QImode: gen = gen_aarch64_extv16qi; break;
9741 case V8QImode: gen = gen_aarch64_extv8qi; break;
9742 case V4HImode: gen = gen_aarch64_extv4hi; break;
9743 case V8HImode: gen = gen_aarch64_extv8hi; break;
9744 case V2SImode: gen = gen_aarch64_extv2si; break;
9745 case V4SImode: gen = gen_aarch64_extv4si; break;
9746 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9747 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9748 case V2DImode: gen = gen_aarch64_extv2di; break;
9749 case V2DFmode: gen = gen_aarch64_extv2df; break;
9750 default:
9751 return false;
9752 }
9753
9754 /* Success! */
9755 if (d->testing_p)
9756 return true;
9757
9758 /* The case where (location == 0) is a no-op for both big- and little-endian,
9759 and is removed by the mid-end at optimization levels -O1 and higher. */
9760
9761 if (BYTES_BIG_ENDIAN && (location != 0))
9762 {
9763 /* After setup, we want the high elements of the first vector (stored
9764 at the LSB end of the register), and the low elements of the second
9765 vector (stored at the MSB end of the register). So swap. */
9766 rtx temp = d->op0;
9767 d->op0 = d->op1;
9768 d->op1 = temp;
9769 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9770 location = nelt - location;
9771 }
9772
9773 offset = GEN_INT (location);
9774 emit_insn (gen (d->target, d->op0, d->op1, offset));
9775 return true;
9776 }
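
/* As an example, the single-operand V4SImode rotation { 1, 2, 3, 0 }
   has consecutive (wrapping) indices starting at location 1, so it is
   implemented as an EXT of the vector with itself at an offset of one
   32-bit element; on big-endian the operands are swapped and the
   offset becomes nelt - location, as handled above.  */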
9777
9778 /* Recognize patterns for the REV insns. */
9779
9780 static bool
9781 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9782 {
9783 unsigned int i, j, diff, nelt = d->nelt;
9784 rtx (*gen) (rtx, rtx);
9785
9786 if (!d->one_vector_p)
9787 return false;
9788
9789 diff = d->perm[0];
9790 switch (diff)
9791 {
9792 case 7:
9793 switch (d->vmode)
9794 {
9795 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9796 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9797 default:
9798 return false;
9799 }
9800 break;
9801 case 3:
9802 switch (d->vmode)
9803 {
9804 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9805 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9806 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9807 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9808 default:
9809 return false;
9810 }
9811 break;
9812 case 1:
9813 switch (d->vmode)
9814 {
9815 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9816 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9817 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9818 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9819 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9820 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9821 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9822 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9823 default:
9824 return false;
9825 }
9826 break;
9827 default:
9828 return false;
9829 }
9830
9831 for (i = 0; i < nelt ; i += diff + 1)
9832 for (j = 0; j <= diff; j += 1)
9833 {
9834 /* This is guaranteed to be true as the value of diff
9835 is 7, 3, 1 and we should have enough elements in the
9836 queue to generate this. Getting a vector mask with a
9837 value of diff other than these values implies that
9838 something is wrong by the time we get here. */
9839 gcc_assert (i + j < nelt);
9840 if (d->perm[i + j] != i + diff - j)
9841 return false;
9842 }
9843
9844 /* Success! */
9845 if (d->testing_p)
9846 return true;
9847
9848 emit_insn (gen (d->target, d->op0));
9849 return true;
9850 }
9851
9852 static bool
9853 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9854 {
9855 rtx (*gen) (rtx, rtx, rtx);
9856 rtx out = d->target;
9857 rtx in0;
9858 machine_mode vmode = d->vmode;
9859 unsigned int i, elt, nelt = d->nelt;
9860 rtx lane;
9861
9862 elt = d->perm[0];
9863 for (i = 1; i < nelt; i++)
9864 {
9865 if (elt != d->perm[i])
9866 return false;
9867 }
9868
9869 /* The generic preparation in aarch64_expand_vec_perm_const_1
9870 swaps the operand order and the permute indices if it finds
9871 d->perm[0] to be in the second operand. Thus, we can always
9872 use d->op0 and need not do any extra arithmetic to get the
9873 correct lane number. */
9874 in0 = d->op0;
9875 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9876
9877 switch (vmode)
9878 {
9879 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9880 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9881 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9882 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9883 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9884 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9885 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9886 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9887 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9888 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9889 default:
9890 return false;
9891 }
9892
9893 emit_insn (gen (out, in0, lane));
9894 return true;
9895 }
9896
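/* Fall back to a generic TBL-based permute: the selector is materialised
   as a constant vector, forced into a register and handed to
   aarch64_expand_vec_perm_1, which emits the TBL sequence.  Roughly, an
   arbitrary V16QImode shuffle ends up as a tbl instruction indexed by
   that selector register. */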
9897 static bool
9898 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9899 {
9900 rtx rperm[MAX_VECT_LEN], sel;
9901 machine_mode vmode = d->vmode;
9902 unsigned int i, nelt = d->nelt;
9903
9904 if (d->testing_p)
9905 return true;
9906
9907 /* Generic code will try constant permutation twice. Once with the
9908 original mode and again with the elements lowered to QImode.
9909 So wait and don't do the selector expansion ourselves. */
9910 if (vmode != V8QImode && vmode != V16QImode)
9911 return false;
9912
9913 for (i = 0; i < nelt; ++i)
9914 {
9915 int nunits = GET_MODE_NUNITS (vmode);
9916
9917 /* If big-endian and two vectors we end up with a weird mixed-endian
9918 mode on NEON. Reverse the index within each word but not the word
9919 itself. */
9920 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9921 : d->perm[i]);
9922 }
9923 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9924 sel = force_reg (vmode, sel);
9925
9926 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9927 return true;
9928 }
9929
9930 static bool
9931 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9932 {
9933 /* The pattern matching functions above are written to look for a small
9934 number to begin the sequence (0, 1, N/2). If we begin with an index
9935 from the second operand, we can swap the operands. */
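/* For instance, with nelt == 4 a selector such as { 5, 1, 6, 2 } (whose
   first index refers to the second operand) is rewritten below as
   { 1, 5, 2, 6 } applied to the exchanged operands. */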
9936 if (d->perm[0] >= d->nelt)
9937 {
9938 unsigned i, nelt = d->nelt;
9939 rtx x;
9940
9941 gcc_assert (nelt == (nelt & -nelt));
9942 for (i = 0; i < nelt; ++i)
9943 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
9944
9945 x = d->op0;
9946 d->op0 = d->op1;
9947 d->op1 = x;
9948 }
9949
9950 if (TARGET_SIMD)
9951 {
9952 if (aarch64_evpc_rev (d))
9953 return true;
9954 else if (aarch64_evpc_ext (d))
9955 return true;
9956 else if (aarch64_evpc_dup (d))
9957 return true;
9958 else if (aarch64_evpc_zip (d))
9959 return true;
9960 else if (aarch64_evpc_uzp (d))
9961 return true;
9962 else if (aarch64_evpc_trn (d))
9963 return true;
9964 return aarch64_evpc_tbl (d);
9965 }
9966 return false;
9967 }
9968
9969 /* Expand a vec_perm_const pattern. */
9970
9971 bool
9972 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9973 {
9974 struct expand_vec_perm_d d;
9975 int i, nelt, which;
9976
9977 d.target = target;
9978 d.op0 = op0;
9979 d.op1 = op1;
9980
9981 d.vmode = GET_MODE (target);
9982 gcc_assert (VECTOR_MODE_P (d.vmode));
9983 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9984 d.testing_p = false;
9985
9986 for (i = which = 0; i < nelt; ++i)
9987 {
9988 rtx e = XVECEXP (sel, 0, i);
9989 int ei = INTVAL (e) & (2 * nelt - 1);
9990 which |= (ei < nelt ? 1 : 2);
9991 d.perm[i] = ei;
9992 }
9993
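/* WHICH now has bit 0 set if any index selects from OP0 and bit 1 set
   if any index selects from OP1, hence the three cases below. */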
9994 switch (which)
9995 {
9996 default:
9997 gcc_unreachable ();
9998
9999 case 3:
10000 d.one_vector_p = false;
10001 if (!rtx_equal_p (op0, op1))
10002 break;
10003
10004 /* The elements of PERM do not suggest that only the first operand
10005 is used, but both operands are identical. Allow easier matching
10006 of the permutation by folding the permutation into the single
10007 input vector. */
10008 /* Fall Through. */
10009 case 2:
10010 for (i = 0; i < nelt; ++i)
10011 d.perm[i] &= nelt - 1;
10012 d.op0 = op1;
10013 d.one_vector_p = true;
10014 break;
10015
10016 case 1:
10017 d.op1 = op0;
10018 d.one_vector_p = true;
10019 break;
10020 }
10021
10022 return aarch64_expand_vec_perm_const_1 (&d);
10023 }
10024
10025 static bool
10026 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10027 const unsigned char *sel)
10028 {
10029 struct expand_vec_perm_d d;
10030 unsigned int i, nelt, which;
10031 bool ret;
10032
10033 d.vmode = vmode;
10034 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10035 d.testing_p = true;
10036 memcpy (d.perm, sel, nelt);
10037
10038 /* Calculate whether all elements are in one vector. */
10039 for (i = which = 0; i < nelt; ++i)
10040 {
10041 unsigned char e = d.perm[i];
10042 gcc_assert (e < 2 * nelt);
10043 which |= (e < nelt ? 1 : 2);
10044 }
10045
10046 /* If all elements are from the second vector, reindex as if from the
10047 first vector. */
10048 if (which == 2)
10049 for (i = 0; i < nelt; ++i)
10050 d.perm[i] -= nelt;
10051
10052 /* Check whether the mask can be applied to a single vector. */
10053 d.one_vector_p = (which != 3);
10054
10055 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10056 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10057 if (!d.one_vector_p)
10058 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10059
10060 start_sequence ();
10061 ret = aarch64_expand_vec_perm_const_1 (&d);
10062 end_sequence ();
10063
10064 return ret;
10065 }
10066
10067 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
10068 bool
10069 aarch64_cannot_change_mode_class (machine_mode from,
10070 machine_mode to,
10071 enum reg_class rclass)
10072 {
10073 /* Full-reg subregs are allowed on general regs or any class if they are
10074 the same size. */
10075 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
10076 || !reg_classes_intersect_p (FP_REGS, rclass))
10077 return false;
10078
10079 /* Limited combinations of subregs are safe on FPREGs. In particular,
10080 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
10081 2. Scalar to Scalar for integer modes or same size float modes.
10082 3. Vector to Vector modes.
10083 4. On little-endian only, Vector-Structure to Vector modes. */
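/* For instance, taking a DImode subreg of a V2DImode value falls under
   case 1 above and is allowed, since exactly one 64-bit unit of the
   vector is accessed. */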
10084 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
10085 {
10086 if (aarch64_vector_mode_supported_p (from)
10087 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
10088 return false;
10089
10090 if (GET_MODE_NUNITS (from) == 1
10091 && GET_MODE_NUNITS (to) == 1
10092 && (GET_MODE_CLASS (from) == MODE_INT
10093 || from == to))
10094 return false;
10095
10096 if (aarch64_vector_mode_supported_p (from)
10097 && aarch64_vector_mode_supported_p (to))
10098 return false;
10099
10100 /* Within a vector structure straddling multiple vector registers
10101 we are in a mixed-endian representation. As such, we can't
10102 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
10103 switch between vectors and vector structures cheaply. */
10104 if (!BYTES_BIG_ENDIAN)
10105 if ((aarch64_vector_mode_supported_p (from)
10106 && aarch64_vect_struct_mode_p (to))
10107 || (aarch64_vector_mode_supported_p (to)
10108 && aarch64_vect_struct_mode_p (from)))
10109 return false;
10110 }
10111
10112 return true;
10113 }
10114
10115 /* Implement MODES_TIEABLE_P. */
10116
10117 bool
10118 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10119 {
10120 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10121 return true;
10122
10123 /* We specifically want to allow elements of "structure" modes to
10124 be tieable to the structure. This more general condition allows
10125 other rarer situations too. */
10126 if (TARGET_SIMD
10127 && aarch64_vector_mode_p (mode1)
10128 && aarch64_vector_mode_p (mode2))
10129 return true;
10130
10131 return false;
10132 }
10133
10134 /* Return a new RTX holding the result of moving POINTER forward by
10135 AMOUNT bytes. */
10136
10137 static rtx
10138 aarch64_move_pointer (rtx pointer, int amount)
10139 {
10140 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10141
10142 return adjust_automodify_address (pointer, GET_MODE (pointer),
10143 next, amount);
10144 }
10145
10146 /* Return a new RTX holding the result of moving POINTER forward by the
10147 size of the mode it points to. */
10148
10149 static rtx
10150 aarch64_progress_pointer (rtx pointer)
10151 {
10152 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10153
10154 return aarch64_move_pointer (pointer, amount);
10155 }
10156
10157 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10158 MODE bytes. */
10159
10160 static void
10161 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10162 machine_mode mode)
10163 {
10164 rtx reg = gen_reg_rtx (mode);
10165
10166 /* "Cast" the pointers to the correct mode. */
10167 *src = adjust_address (*src, mode, 0);
10168 *dst = adjust_address (*dst, mode, 0);
10169 /* Emit the memcpy. */
10170 emit_move_insn (reg, *src);
10171 emit_move_insn (*dst, reg);
10172 /* Move the pointers forward. */
10173 *src = aarch64_progress_pointer (*src);
10174 *dst = aarch64_progress_pointer (*dst);
10175 }
10176
10177 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10178 we succeed, otherwise return false. */
10179
10180 bool
10181 aarch64_expand_movmem (rtx *operands)
10182 {
10183 unsigned int n;
10184 rtx dst = operands[0];
10185 rtx src = operands[1];
10186 rtx base;
10187 bool speed_p = !optimize_function_for_size_p (cfun);
10188
10189 /* When optimizing for size, give a better estimate of the length of a
10190 memcpy call, but use the default otherwise. */
10191 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10192
10193 /* We can't do anything smart if the amount to copy is not constant. */
10194 if (!CONST_INT_P (operands[2]))
10195 return false;
10196
10197 n = UINTVAL (operands[2]);
10198
10199 /* Try to keep the number of instructions low. For cases below 16 bytes we
10200 need to make at most two moves. For cases above 16 bytes it will be one
10201 move for each 16 byte chunk, then at most two additional moves. */
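/* For instance, a 15-byte copy is emitted as an 8-byte chunk followed by
   an overlapping 8-byte chunk, and a 35-byte copy as two 16-byte chunks
   followed by an overlapping 4-byte chunk. */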
10202 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10203 return false;
10204
10205 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10206 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10207
10208 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10209 src = adjust_automodify_address (src, VOIDmode, base, 0);
10210
10211 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10212 1-byte chunk. */
10213 if (n < 4)
10214 {
10215 if (n >= 2)
10216 {
10217 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10218 n -= 2;
10219 }
10220
10221 if (n == 1)
10222 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10223
10224 return true;
10225 }
10226
10227 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10228 4-byte chunk, partially overlapping with the previously copied chunk. */
10229 if (n < 8)
10230 {
10231 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10232 n -= 4;
10233 if (n > 0)
10234 {
10235 int move = n - 4;
10236
10237 src = aarch64_move_pointer (src, move);
10238 dst = aarch64_move_pointer (dst, move);
10239 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10240 }
10241 return true;
10242 }
10243
10244 /* Copy 8 bytes or more. Copy chunks of 16 bytes until we run out of
10245 them, then (if applicable) an 8-byte chunk. */
10246 while (n >= 8)
10247 {
10248 if (n / 16)
10249 {
10250 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10251 n -= 16;
10252 }
10253 else
10254 {
10255 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10256 n -= 8;
10257 }
10258 }
10259
10260 /* Finish the final bytes of the copy. We can always do this in one
10261 instruction. We either copy the exact amount we need, or partially
10262 overlap with the previous chunk we copied and copy 8 bytes. */
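/* E.g. if three bytes remain, the pointers are moved back one byte and a
   single 4-byte copy rewrites one already-copied byte together with the
   remaining three. */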
10263 if (n == 0)
10264 return true;
10265 else if (n == 1)
10266 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10267 else if (n == 2)
10268 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10269 else if (n == 4)
10270 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10271 else
10272 {
10273 if (n == 3)
10274 {
10275 src = aarch64_move_pointer (src, -1);
10276 dst = aarch64_move_pointer (dst, -1);
10277 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10278 }
10279 else
10280 {
10281 int move = n - 8;
10282
10283 src = aarch64_move_pointer (src, move);
10284 dst = aarch64_move_pointer (dst, move);
10285 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10286 }
10287 }
10288
10289 return true;
10290 }
10291
10292 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10293
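/* AddressSanitizer forms shadow addresses as
   (address >> ASAN_SHADOW_SHIFT) + this offset; 1 << 36 is a large offset
   so that the shadow region stays clear of typical AArch64 application
   mappings. */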
10294 static unsigned HOST_WIDE_INT
10295 aarch64_asan_shadow_offset (void)
10296 {
10297 return (HOST_WIDE_INT_1 << 36);
10298 }
10299
10300 static bool
10301 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10302 unsigned int align,
10303 enum by_pieces_operation op,
10304 bool speed_p)
10305 {
10306 /* STORE_BY_PIECES can be used when copying a constant string, but
10307 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10308 For now we always fail this and let the move_by_pieces code copy
10309 the string from read-only memory. */
10310 if (op == STORE_BY_PIECES)
10311 return false;
10312
10313 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10314 }
10315
10316 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10317 instruction fusion of some sort. */
10318
10319 static bool
10320 aarch64_macro_fusion_p (void)
10321 {
10322 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10323 }
10324
10325
10326 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10327 should be kept together during scheduling. */
10328
10329 static bool
10330 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10331 {
10332 rtx set_dest;
10333 rtx prev_set = single_set (prev);
10334 rtx curr_set = single_set (curr);
10335 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
10336 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10337
10338 if (!aarch64_macro_fusion_p ())
10339 return false;
10340
10341 if (simple_sets_p
10342 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10343 {
10344 /* We are trying to match:
10345 prev (mov) == (set (reg r0) (const_int imm16))
10346 curr (movk) == (set (zero_extract (reg r0)
10347 (const_int 16)
10348 (const_int 16))
10349 (const_int imm16_1)) */
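/* In assembly this corresponds roughly to fusing, e.g.,
     mov  x0, #0x1234
     movk x0, #0x5678, lsl #16.  */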
10350
10351 set_dest = SET_DEST (curr_set);
10352
10353 if (GET_CODE (set_dest) == ZERO_EXTRACT
10354 && CONST_INT_P (SET_SRC (curr_set))
10355 && CONST_INT_P (SET_SRC (prev_set))
10356 && CONST_INT_P (XEXP (set_dest, 2))
10357 && INTVAL (XEXP (set_dest, 2)) == 16
10358 && REG_P (XEXP (set_dest, 0))
10359 && REG_P (SET_DEST (prev_set))
10360 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10361 {
10362 return true;
10363 }
10364 }
10365
10366 if (simple_sets_p
10367 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10368 {
10369
10370 /* We're trying to match:
10371 prev (adrp) == (set (reg r1)
10372 (high (symbol_ref ("SYM"))))
10373 curr (add) == (set (reg r0)
10374 (lo_sum (reg r1)
10375 (symbol_ref ("SYM"))))
10376 Note that r0 need not necessarily be the same as r1, especially
10377 during pre-regalloc scheduling. */
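/* In assembly this is roughly the pair
     adrp x1, sym
     add  x0, x1, :lo12:sym.  */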
10378
10379 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10380 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10381 {
10382 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10383 && REG_P (XEXP (SET_SRC (curr_set), 0))
10384 && REGNO (XEXP (SET_SRC (curr_set), 0))
10385 == REGNO (SET_DEST (prev_set))
10386 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10387 XEXP (SET_SRC (curr_set), 1)))
10388 return true;
10389 }
10390 }
10391
10392 if (simple_sets_p
10393 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10394 {
10395
10396 /* We're trying to match:
10397 prev (movk) == (set (zero_extract (reg r0)
10398 (const_int 16)
10399 (const_int 32))
10400 (const_int imm16_1))
10401 curr (movk) == (set (zero_extract (reg r0)
10402 (const_int 16)
10403 (const_int 48))
10404 (const_int imm16_2)) */
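/* In assembly this is roughly the pair
     movk x0, #0xdead, lsl #32
     movk x0, #0xbeef, lsl #48.  */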
10405
10406 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10407 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10408 && REG_P (XEXP (SET_DEST (prev_set), 0))
10409 && REG_P (XEXP (SET_DEST (curr_set), 0))
10410 && REGNO (XEXP (SET_DEST (prev_set), 0))
10411 == REGNO (XEXP (SET_DEST (curr_set), 0))
10412 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10413 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10414 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10415 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10416 && CONST_INT_P (SET_SRC (prev_set))
10417 && CONST_INT_P (SET_SRC (curr_set)))
10418 return true;
10419
10420 }
10421 if (simple_sets_p
10422 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10423 {
10424 /* We're trying to match:
10425 prev (adrp) == (set (reg r0)
10426 (high (symbol_ref ("SYM"))))
10427 curr (ldr) == (set (reg r1)
10428 (mem (lo_sum (reg r0)
10429 (symbol_ref ("SYM")))))
10430 or
10431 curr (ldr) == (set (reg r1)
10432 (zero_extend (mem
10433 (lo_sum (reg r0)
10434 (symbol_ref ("SYM")))))) */
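/* In assembly this is roughly the pair
     adrp x0, sym
     ldr  x1, [x0, :lo12:sym].  */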
10435 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10436 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10437 {
10438 rtx curr_src = SET_SRC (curr_set);
10439
10440 if (GET_CODE (curr_src) == ZERO_EXTEND)
10441 curr_src = XEXP (curr_src, 0);
10442
10443 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10444 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10445 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10446 == REGNO (SET_DEST (prev_set))
10447 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10448 XEXP (SET_SRC (prev_set), 0)))
10449 return true;
10450 }
10451 }
10452
10453 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10454 && any_condjump_p (curr))
10455 {
10456 enum attr_type prev_type = get_attr_type (prev);
10457
10458 /* FIXME: this misses some instructions which are considered simple
10459 arithmetic for ThunderX; simple shifts are among those missed here. */
10460 if (prev_type == TYPE_ALUS_SREG
10461 || prev_type == TYPE_ALUS_IMM
10462 || prev_type == TYPE_LOGICS_REG
10463 || prev_type == TYPE_LOGICS_IMM)
10464 return true;
10465 }
10466
10467 return false;
10468 }
10469
10470 /* If MEM is in the form of [base+offset], extract the two parts
10471 of the address and store them in BASE and OFFSET; otherwise return
10472 false after clearing BASE and OFFSET. */
10473
10474 bool
10475 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10476 {
10477 rtx addr;
10478
10479 gcc_assert (MEM_P (mem));
10480
10481 addr = XEXP (mem, 0);
10482
10483 if (REG_P (addr))
10484 {
10485 *base = addr;
10486 *offset = const0_rtx;
10487 return true;
10488 }
10489
10490 if (GET_CODE (addr) == PLUS
10491 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10492 {
10493 *base = XEXP (addr, 0);
10494 *offset = XEXP (addr, 1);
10495 return true;
10496 }
10497
10498 *base = NULL_RTX;
10499 *offset = NULL_RTX;
10500
10501 return false;
10502 }
10503
10504 /* Types for scheduling fusion. */
10505 enum sched_fusion_type
10506 {
10507 SCHED_FUSION_NONE = 0,
10508 SCHED_FUSION_LD_SIGN_EXTEND,
10509 SCHED_FUSION_LD_ZERO_EXTEND,
10510 SCHED_FUSION_LD,
10511 SCHED_FUSION_ST,
10512 SCHED_FUSION_NUM
10513 };
10514
10515 /* If INSN is a load or store with an address in the form of [base+offset],
10516 extract the two parts and store them in BASE and OFFSET. Return the
10517 scheduling fusion type of this INSN. */
10518
10519 static enum sched_fusion_type
10520 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10521 {
10522 rtx x, dest, src;
10523 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10524
10525 gcc_assert (INSN_P (insn));
10526 x = PATTERN (insn);
10527 if (GET_CODE (x) != SET)
10528 return SCHED_FUSION_NONE;
10529
10530 src = SET_SRC (x);
10531 dest = SET_DEST (x);
10532
10533 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10534 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10535 return SCHED_FUSION_NONE;
10536
10537 if (GET_CODE (src) == SIGN_EXTEND)
10538 {
10539 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10540 src = XEXP (src, 0);
10541 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10542 return SCHED_FUSION_NONE;
10543 }
10544 else if (GET_CODE (src) == ZERO_EXTEND)
10545 {
10546 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10547 src = XEXP (src, 0);
10548 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10549 return SCHED_FUSION_NONE;
10550 }
10551
10552 if (GET_CODE (src) == MEM && REG_P (dest))
10553 extract_base_offset_in_addr (src, base, offset);
10554 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10555 {
10556 fusion = SCHED_FUSION_ST;
10557 extract_base_offset_in_addr (dest, base, offset);
10558 }
10559 else
10560 return SCHED_FUSION_NONE;
10561
10562 if (*base == NULL_RTX || *offset == NULL_RTX)
10563 fusion = SCHED_FUSION_NONE;
10564
10565 return fusion;
10566 }
10567
10568 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10569
10570 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10571 and PRI are only calculated for these instructions. For other instructions,
10572 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10573 types of instruction fusion can be added by returning different priorities.
10574
10575 It's important that irrelevant instructions get the largest FUSION_PRI. */
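/* For instance, two loads ldr w1, [x3, 4] and ldr w2, [x3, 8] get the
   same FUSION_PRI (same fusion type and base register) and PRI values
   that differ only by their offsets, so the access at offset 4 is
   preferred first. */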
10576
10577 static void
10578 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10579 int *fusion_pri, int *pri)
10580 {
10581 int tmp, off_val;
10582 rtx base, offset;
10583 enum sched_fusion_type fusion;
10584
10585 gcc_assert (INSN_P (insn));
10586
10587 tmp = max_pri - 1;
10588 fusion = fusion_load_store (insn, &base, &offset);
10589 if (fusion == SCHED_FUSION_NONE)
10590 {
10591 *pri = tmp;
10592 *fusion_pri = tmp;
10593 return;
10594 }
10595
10596 /* Set FUSION_PRI according to fusion type and base register. */
10597 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10598
10599 /* Calculate PRI. */
10600 tmp /= 2;
10601
10602 /* INSN with smaller offset goes first. */
10603 off_val = (int)(INTVAL (offset));
10604 if (off_val >= 0)
10605 tmp -= (off_val & 0xfffff);
10606 else
10607 tmp += ((- off_val) & 0xfffff);
10608
10609 *pri = tmp;
10610 return;
10611 }
10612
10613 /* Given OPERANDS of consecutive load/store, check if we can merge
10614 them into ldp/stp. LOAD is true if they are load instructions.
10615 MODE is the mode of memory operands. */
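/* For instance, ldr w0, [x2] and ldr w1, [x2, 4] satisfy these checks
   and can be merged by the peepholes into ldp w0, w1, [x2]. */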
10616
10617 bool
10618 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10619 enum machine_mode mode)
10620 {
10621 HOST_WIDE_INT offval_1, offval_2, msize;
10622 enum reg_class rclass_1, rclass_2;
10623 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10624
10625 if (load)
10626 {
10627 mem_1 = operands[1];
10628 mem_2 = operands[3];
10629 reg_1 = operands[0];
10630 reg_2 = operands[2];
10631 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10632 if (REGNO (reg_1) == REGNO (reg_2))
10633 return false;
10634 }
10635 else
10636 {
10637 mem_1 = operands[0];
10638 mem_2 = operands[2];
10639 reg_1 = operands[1];
10640 reg_2 = operands[3];
10641 }
10642
10643 /* The mems cannot be volatile. */
10644 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10645 return false;
10646
10647 /* Check if the addresses are in the form of [base+offset]. */
10648 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10649 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10650 return false;
10651 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10652 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10653 return false;
10654
10655 /* Check if the bases are the same. */
10656 if (!rtx_equal_p (base_1, base_2))
10657 return false;
10658
10659 offval_1 = INTVAL (offset_1);
10660 offval_2 = INTVAL (offset_2);
10661 msize = GET_MODE_SIZE (mode);
10662 /* Check if the offsets are consecutive. */
10663 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
10664 return false;
10665
10666 /* Check if the addresses are clobbered by load. */
10667 if (load)
10668 {
10669 if (reg_mentioned_p (reg_1, mem_1))
10670 return false;
10671
10672 /* In increasing order, the last load can clobber the address. */
10673 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
10674 return false;
10675 }
10676
10677 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10678 rclass_1 = FP_REGS;
10679 else
10680 rclass_1 = GENERAL_REGS;
10681
10682 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10683 rclass_2 = FP_REGS;
10684 else
10685 rclass_2 = GENERAL_REGS;
10686
10687 /* Check if the registers are of the same class. */
10688 if (rclass_1 != rclass_2)
10689 return false;
10690
10691 return true;
10692 }
10693
10694 /* Given OPERANDS of consecutive load/store, check if we can merge
10695 them into ldp/stp by adjusting the offset. LOAD is true if they
10696 are load instructions. MODE is the mode of memory operands.
10697
10698 Given consecutive stores such as:
10699
10700 str w1, [xb, 0x100]
10701 str w1, [xb, 0x104]
10702 str w1, [xb, 0x108]
10703 str w1, [xb, 0x10c]
10704
10705 Though the offsets are out of the range supported by stp, we can
10706 still pair them after adjusting the offset, like:
10707
10708 add scratch, xb, 0x100
10709 stp w1, w1, [scratch]
10710 stp w1, w1, [scratch, 0x8]
10711
10712 The peephole patterns detecting this opportunity should guarantee
10713 the scratch register is available. */
10714
10715 bool
10716 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
10717 enum machine_mode mode)
10718 {
10719 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
10720 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
10721 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
10722 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
10723
10724 if (load)
10725 {
10726 reg_1 = operands[0];
10727 mem_1 = operands[1];
10728 reg_2 = operands[2];
10729 mem_2 = operands[3];
10730 reg_3 = operands[4];
10731 mem_3 = operands[5];
10732 reg_4 = operands[6];
10733 mem_4 = operands[7];
10734 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
10735 && REG_P (reg_3) && REG_P (reg_4));
10736 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
10737 return false;
10738 }
10739 else
10740 {
10741 mem_1 = operands[0];
10742 reg_1 = operands[1];
10743 mem_2 = operands[2];
10744 reg_2 = operands[3];
10745 mem_3 = operands[4];
10746 reg_3 = operands[5];
10747 mem_4 = operands[6];
10748 reg_4 = operands[7];
10749 }
10750 /* Skip if the memory operand is by itself valid for ldp/stp. */
10751 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
10752 return false;
10753
10754 /* The mems cannot be volatile. */
10755 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
10756 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
10757 return false;
10758
10759 /* Check if the addresses are in the form of [base+offset]. */
10760 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10761 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10762 return false;
10763 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10764 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10765 return false;
10766 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
10767 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
10768 return false;
10769 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
10770 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
10771 return false;
10772
10773 /* Check if the bases are the same. */
10774 if (!rtx_equal_p (base_1, base_2)
10775 || !rtx_equal_p (base_2, base_3)
10776 || !rtx_equal_p (base_3, base_4))
10777 return false;
10778
10779 offval_1 = INTVAL (offset_1);
10780 offval_2 = INTVAL (offset_2);
10781 offval_3 = INTVAL (offset_3);
10782 offval_4 = INTVAL (offset_4);
10783 msize = GET_MODE_SIZE (mode);
10784 /* Check if the offsets are consecutive. */
10785 if ((offval_1 != (offval_2 + msize)
10786 || offval_1 != (offval_3 + msize * 2)
10787 || offval_1 != (offval_4 + msize * 3))
10788 && (offval_4 != (offval_3 + msize)
10789 || offval_4 != (offval_2 + msize * 2)
10790 || offval_4 != (offval_1 + msize * 3)))
10791 return false;
10792
10793 /* Check if the addresses are clobbered by load. */
10794 if (load)
10795 {
10796 if (reg_mentioned_p (reg_1, mem_1)
10797 || reg_mentioned_p (reg_2, mem_2)
10798 || reg_mentioned_p (reg_3, mem_3))
10799 return false;
10800
10801 /* In increasing order, the last load can clobber the address. */
10802 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
10803 return false;
10804 }
10805
10806 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10807 rclass_1 = FP_REGS;
10808 else
10809 rclass_1 = GENERAL_REGS;
10810
10811 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10812 rclass_2 = FP_REGS;
10813 else
10814 rclass_2 = GENERAL_REGS;
10815
10816 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
10817 rclass_3 = FP_REGS;
10818 else
10819 rclass_3 = GENERAL_REGS;
10820
10821 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
10822 rclass_4 = FP_REGS;
10823 else
10824 rclass_4 = GENERAL_REGS;
10825
10826 /* Check if the registers are of the same class. */
10827 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
10828 return false;
10829
10830 return true;
10831 }
10832
10833 /* Given OPERANDS of consecutive load/store, this function pairs them
10834 into ldp/stp after adjusting the offset. It depends on the fact
10835 that addresses of load/store instructions are in increasing order.
10836 MODE is the mode of memory operands. CODE is the rtl operator
10837 which should be applied to all memory operands; it is SIGN_EXTEND,
10838 ZERO_EXTEND or UNKNOWN. */
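/* As a worked example (SImode, so MSIZE == 4 and the limit below is
   0x100): stores at BASE + 0x21c .. BASE + 0x228 give ADJ_OFF == 0x200
   and NEW_OFF == 0x1c, so we emit roughly
     add  scratch, base, #0x200
     stp  w1, w2, [scratch, #0x1c]
     stp  w3, w4, [scratch, #0x24]. */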
10839
10840 bool
10841 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
10842 enum machine_mode mode, RTX_CODE code)
10843 {
10844 rtx base, offset, t1, t2;
10845 rtx mem_1, mem_2, mem_3, mem_4;
10846 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
10847
10848 if (load)
10849 {
10850 mem_1 = operands[1];
10851 mem_2 = operands[3];
10852 mem_3 = operands[5];
10853 mem_4 = operands[7];
10854 }
10855 else
10856 {
10857 mem_1 = operands[0];
10858 mem_2 = operands[2];
10859 mem_3 = operands[4];
10860 mem_4 = operands[6];
10861 gcc_assert (code == UNKNOWN);
10862 }
10863
10864 extract_base_offset_in_addr (mem_1, &base, &offset);
10865 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
10866
10867 /* Adjust the offset so it can fit in an ldp/stp instruction. */
10868 msize = GET_MODE_SIZE (mode);
10869 stp_off_limit = msize * 0x40;
10870 off_val = INTVAL (offset);
10871 abs_off = (off_val < 0) ? -off_val : off_val;
10872 new_off = abs_off % stp_off_limit;
10873 adj_off = abs_off - new_off;
10874
10875 /* Further adjust to make sure all offsets are OK. */
10876 if ((new_off + msize * 2) >= stp_off_limit)
10877 {
10878 adj_off += stp_off_limit;
10879 new_off -= stp_off_limit;
10880 }
10881
10882 /* Make sure the adjustment can be done with ADD/SUB instructions. */
10883 if (adj_off >= 0x1000)
10884 return false;
10885
10886 if (off_val < 0)
10887 {
10888 adj_off = -adj_off;
10889 new_off = -new_off;
10890 }
10891
10892 /* Create new memory references. */
10893 mem_1 = change_address (mem_1, VOIDmode,
10894 plus_constant (DImode, operands[8], new_off));
10895
10896 /* Check if the adjusted address is OK for ldp/stp. */
10897 if (!aarch64_mem_pair_operand (mem_1, mode))
10898 return false;
10899
10900 msize = GET_MODE_SIZE (mode);
10901 mem_2 = change_address (mem_2, VOIDmode,
10902 plus_constant (DImode,
10903 operands[8],
10904 new_off + msize));
10905 mem_3 = change_address (mem_3, VOIDmode,
10906 plus_constant (DImode,
10907 operands[8],
10908 new_off + msize * 2));
10909 mem_4 = change_address (mem_4, VOIDmode,
10910 plus_constant (DImode,
10911 operands[8],
10912 new_off + msize * 3));
10913
10914 if (code == ZERO_EXTEND)
10915 {
10916 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
10917 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
10918 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
10919 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
10920 }
10921 else if (code == SIGN_EXTEND)
10922 {
10923 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
10924 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
10925 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
10926 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
10927 }
10928
10929 if (load)
10930 {
10931 operands[1] = mem_1;
10932 operands[3] = mem_2;
10933 operands[5] = mem_3;
10934 operands[7] = mem_4;
10935 }
10936 else
10937 {
10938 operands[0] = mem_1;
10939 operands[2] = mem_2;
10940 operands[4] = mem_3;
10941 operands[6] = mem_4;
10942 }
10943
10944 /* Emit adjusting instruction. */
10945 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
10946 plus_constant (DImode, base, adj_off)));
10947 /* Emit ldp/stp instructions. */
10948 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
10949 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
10950 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
10951 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
10952 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
10953 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
10954 return true;
10955 }
10956
10957 #undef TARGET_ADDRESS_COST
10958 #define TARGET_ADDRESS_COST aarch64_address_cost
10959
10960 /* This hook determines whether unnamed bitfields affect the alignment
10961 of the containing structure. The hook returns true if the structure
10962 should inherit the alignment requirements of an unnamed bitfield's
10963 type. */
10964 #undef TARGET_ALIGN_ANON_BITFIELD
10965 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
10966
10967 #undef TARGET_ASM_ALIGNED_DI_OP
10968 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
10969
10970 #undef TARGET_ASM_ALIGNED_HI_OP
10971 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
10972
10973 #undef TARGET_ASM_ALIGNED_SI_OP
10974 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
10975
10976 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
10977 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
10978 hook_bool_const_tree_hwi_hwi_const_tree_true
10979
10980 #undef TARGET_ASM_FILE_START
10981 #define TARGET_ASM_FILE_START aarch64_start_file
10982
10983 #undef TARGET_ASM_OUTPUT_MI_THUNK
10984 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
10985
10986 #undef TARGET_ASM_SELECT_RTX_SECTION
10987 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
10988
10989 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
10990 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
10991
10992 #undef TARGET_BUILD_BUILTIN_VA_LIST
10993 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
10994
10995 #undef TARGET_CALLEE_COPIES
10996 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
10997
10998 #undef TARGET_CAN_ELIMINATE
10999 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11000
11001 #undef TARGET_CANNOT_FORCE_CONST_MEM
11002 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11003
11004 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11005 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11006
11007 /* Only the least significant bit is used for initialization guard
11008 variables. */
11009 #undef TARGET_CXX_GUARD_MASK_BIT
11010 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11011
11012 #undef TARGET_C_MODE_FOR_SUFFIX
11013 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11014
11015 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11016 #undef TARGET_DEFAULT_TARGET_FLAGS
11017 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11018 #endif
11019
11020 #undef TARGET_CLASS_MAX_NREGS
11021 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11022
11023 #undef TARGET_BUILTIN_DECL
11024 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11025
11026 #undef TARGET_EXPAND_BUILTIN
11027 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11028
11029 #undef TARGET_EXPAND_BUILTIN_VA_START
11030 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11031
11032 #undef TARGET_FOLD_BUILTIN
11033 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11034
11035 #undef TARGET_FUNCTION_ARG
11036 #define TARGET_FUNCTION_ARG aarch64_function_arg
11037
11038 #undef TARGET_FUNCTION_ARG_ADVANCE
11039 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11040
11041 #undef TARGET_FUNCTION_ARG_BOUNDARY
11042 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11043
11044 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11045 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11046
11047 #undef TARGET_FUNCTION_VALUE
11048 #define TARGET_FUNCTION_VALUE aarch64_function_value
11049
11050 #undef TARGET_FUNCTION_VALUE_REGNO_P
11051 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11052
11053 #undef TARGET_FRAME_POINTER_REQUIRED
11054 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11055
11056 #undef TARGET_GIMPLE_FOLD_BUILTIN
11057 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11058
11059 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11060 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11061
11062 #undef TARGET_INIT_BUILTINS
11063 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11064
11065 #undef TARGET_LEGITIMATE_ADDRESS_P
11066 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11067
11068 #undef TARGET_LEGITIMATE_CONSTANT_P
11069 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11070
11071 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11072 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11073
11074 #undef TARGET_LRA_P
11075 #define TARGET_LRA_P aarch64_lra_p
11076
11077 #undef TARGET_MANGLE_TYPE
11078 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11079
11080 #undef TARGET_MEMORY_MOVE_COST
11081 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11082
11083 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11084 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11085
11086 #undef TARGET_MUST_PASS_IN_STACK
11087 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11088
11089 /* This target hook should return true if accesses to volatile bitfields
11090 should use the narrowest mode possible. It should return false if these
11091 accesses should use the bitfield container type. */
11092 #undef TARGET_NARROW_VOLATILE_BITFIELD
11093 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11094
11095 #undef TARGET_OPTION_OVERRIDE
11096 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11097
11098 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11099 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11100 aarch64_override_options_after_change
11101
11102 #undef TARGET_PASS_BY_REFERENCE
11103 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11104
11105 #undef TARGET_PREFERRED_RELOAD_CLASS
11106 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11107
11108 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11109 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11110
11111 #undef TARGET_SECONDARY_RELOAD
11112 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11113
11114 #undef TARGET_SHIFT_TRUNCATION_MASK
11115 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11116
11117 #undef TARGET_SETUP_INCOMING_VARARGS
11118 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11119
11120 #undef TARGET_STRUCT_VALUE_RTX
11121 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11122
11123 #undef TARGET_REGISTER_MOVE_COST
11124 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11125
11126 #undef TARGET_RETURN_IN_MEMORY
11127 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11128
11129 #undef TARGET_RETURN_IN_MSB
11130 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11131
11132 #undef TARGET_RTX_COSTS
11133 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11134
11135 #undef TARGET_SCHED_ISSUE_RATE
11136 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11137
11138 #undef TARGET_TRAMPOLINE_INIT
11139 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11140
11141 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11142 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11143
11144 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11145 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11146
11147 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11148 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11149
11150 #undef TARGET_VECTORIZE_ADD_STMT_COST
11151 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11152
11153 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11154 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11155 aarch64_builtin_vectorization_cost
11156
11157 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11158 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11159
11160 #undef TARGET_VECTORIZE_BUILTINS
11161 #define TARGET_VECTORIZE_BUILTINS
11162
11163 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11164 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11165 aarch64_builtin_vectorized_function
11166
11167 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11168 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11169 aarch64_autovectorize_vector_sizes
11170
11171 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11172 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11173 aarch64_atomic_assign_expand_fenv
11174
11175 /* Section anchor support. */
11176
11177 #undef TARGET_MIN_ANCHOR_OFFSET
11178 #define TARGET_MIN_ANCHOR_OFFSET -256
11179
11180 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11181 byte offset; we can do much more for larger data types, but have no way
11182 to determine the size of the access. We assume accesses are aligned. */
11183 #undef TARGET_MAX_ANCHOR_OFFSET
11184 #define TARGET_MAX_ANCHOR_OFFSET 4095
11185
11186 #undef TARGET_VECTOR_ALIGNMENT
11187 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11188
11189 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11190 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11191 aarch64_simd_vector_alignment_reachable
11192
11193 /* vec_perm support. */
11194
11195 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11196 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11197 aarch64_vectorize_vec_perm_const_ok
11198
11199
11200 #undef TARGET_FIXED_CONDITION_CODE_REGS
11201 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11202
11203 #undef TARGET_FLAGS_REGNUM
11204 #define TARGET_FLAGS_REGNUM CC_REGNUM
11205
11206 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11207 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11208
11209 #undef TARGET_ASAN_SHADOW_OFFSET
11210 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11211
11212 #undef TARGET_LEGITIMIZE_ADDRESS
11213 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11214
11215 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11216 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11217 aarch64_use_by_pieces_infrastructure_p
11218
11219 #undef TARGET_CAN_USE_DOLOOP_P
11220 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11221
11222 #undef TARGET_SCHED_MACRO_FUSION_P
11223 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11224
11225 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11226 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11227
11228 #undef TARGET_SCHED_FUSION_PRIORITY
11229 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11230
11231 struct gcc_target targetm = TARGET_INITIALIZER;
11232
11233 #include "gt-aarch64.h"