1 /* Subroutines used for code generation for RISC-V 'V' Extension for
2 GNU compiler.
3 Copyright (C) 2022-2024 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define IN_TARGET_CODE 1
23
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
25 vector.md. */
26 #define RVV_INSN_OPERANDS_MAX 11
27
28 #include "config.h"
29 #include "system.h"
30 #include "coretypes.h"
31 #include "tm.h"
32 #include "backend.h"
33 #include "rtl.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
36 #include "recog.h"
37 #include "alias.h"
38 #include "tree.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "explow.h"
42 #include "memmodel.h"
43 #include "emit-rtl.h"
44 #include "tm_p.h"
45 #include "target.h"
46 #include "targhooks.h"
47 #include "expr.h"
48 #include "optabs.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
52 #include "predict.h"
53
54 using namespace riscv_vector;
55
56 namespace riscv_vector {
57
58 /* Return true if NUNITS <= 31 so that we can use an immediate AVL in vsetivli. */
59 bool
60 imm_avl_p (machine_mode mode)
61 {
62 poly_uint64 nunits = GET_MODE_NUNITS (mode);
63
64 return nunits.is_constant ()
65 /* The vsetivli can only encode an immediate AVL in the range 0~31. */
66 ? (IN_RANGE (nunits.to_constant (), 0, 31))
67 /* Only allowed in VLS-VLMAX mode. */
68 : false;
69 }
70
71 /* Return true if LEN is equal to NUNITS, which may be outside the range [0, 31]. */
72 static bool
73 is_vlmax_len_p (machine_mode mode, rtx len)
74 {
75 poly_int64 value;
76 return poly_int_rtx_p (len, &value)
77 && known_eq (value, GET_MODE_NUNITS (mode));
78 }
79
80 /* Helper functions for insn_flags and insn_types. */
81
82 /* Return true if the caller needs to pass a mask operand for an insn pattern
83 with INSN_FLAGS. */
84
85 static bool
86 need_mask_operand_p (unsigned insn_flags)
87 {
88 return (insn_flags & HAS_MASK_P)
89 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
90 }
91
92 template <int MAX_OPERANDS> class insn_expander
93 {
94 public:
95 insn_expander () = delete;
96
97 insn_expander (unsigned insn_flags, bool vlmax_p)
98 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
99 m_vl_op (NULL_RTX)
100 {
101 check_insn_flags ();
102 }
103
104 void check_insn_flags () const
105 {
106 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
107 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
108 gcc_assert ((m_insn_flags & HAS_MASK_P));
109
110 if (m_insn_flags & USE_ALL_TRUES_MASK_P)
111 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
112 gcc_assert ((m_insn_flags & HAS_MASK_P));
113
114 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
115 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
116 && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
117
118 if (m_insn_flags & USE_VUNDEF_MERGE_P)
119 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
120 gcc_assert ((m_insn_flags & HAS_MERGE_P));
121
122 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
123 gcc_assert (
124 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
125
126 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
127 gcc_assert (
128 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
129
130 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
131 exclusive. */
132 gcc_assert (
133 !((m_insn_flags & NULLARY_OP_P)
134 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
135 || (m_insn_flags & TERNARY_OP_P))));
136 gcc_assert (
137 !((m_insn_flags & UNARY_OP_P)
138 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
139 || (m_insn_flags & TERNARY_OP_P))));
140 gcc_assert (
141 !((m_insn_flags & BINARY_OP_P)
142 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
143 || (m_insn_flags & TERNARY_OP_P))));
144 gcc_assert (
145 !((m_insn_flags & TERNARY_OP_P)
146 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
147 || (m_insn_flags & BINARY_OP_P))));
148 }
149
150 void set_vl (rtx vl) { m_vl_op = vl; }
151
152 void add_output_operand (rtx x, machine_mode mode)
153 {
154 create_output_operand (&m_ops[m_opno++], x, mode);
155 gcc_assert (m_opno <= MAX_OPERANDS);
156 }
157 void add_input_operand (rtx x, machine_mode mode)
158 {
159 create_input_operand (&m_ops[m_opno++], x, mode);
160 gcc_assert (m_opno <= MAX_OPERANDS);
161 }
162 void add_all_one_mask_operand (machine_mode mask_mode)
163 {
164 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
165 }
166 void add_first_one_true_mask_operand (machine_mode mask_mode)
167 {
168 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
169 }
170 void add_vundef_operand (machine_mode dest_mode)
171 {
172 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
173 }
174 void add_policy_operand ()
175 {
176 if (m_insn_flags & TU_POLICY_P)
177 {
178 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
179 add_input_operand (tail_policy_rtx, Pmode);
180 }
181 else if (m_insn_flags & TDEFAULT_POLICY_P)
182 {
183 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
184 add_input_operand (tail_policy_rtx, Pmode);
185 }
186
187 if (m_insn_flags & MU_POLICY_P)
188 {
189 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
190 add_input_operand (mask_policy_rtx, Pmode);
191 }
192 else if (m_insn_flags & MDEFAULT_POLICY_P)
193 {
194 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
195 add_input_operand (mask_policy_rtx, Pmode);
196 }
197 }
198 void add_avl_type_operand (avl_type type)
199 {
200 add_input_operand (gen_int_mode (type, Pmode), Pmode);
201 }
202
203 void
204 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
205 {
206 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
207 add_input_operand (frm_rtx, Pmode);
208 }
209
210 /* Return the vtype mode based on insn_flags.
211 The vtype mode is the mode that the vsetvl insn sets. */
212 machine_mode
213 get_vtype_mode (rtx *ops)
214 {
215 machine_mode vtype_mode;
216 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
217 vtype_mode = GET_MODE (ops[1]);
218 else
219 vtype_mode = GET_MODE (ops[0]);
220 return vtype_mode;
221 }
222
223 void emit_insn (enum insn_code icode, rtx *ops)
224 {
225 int opno = 0;
226 int num_ops;
227 /* True if any operand is a memory operand. */
228 bool any_mem_p = false;
229
230 machine_mode vtype_mode = get_vtype_mode (ops);
231 machine_mode mask_mode = get_mask_mode (vtype_mode);
232
233 /* Add dest operand. */
234 if (m_insn_flags & HAS_DEST_P)
235 {
236 rtx op = ops[opno++];
237 any_mem_p |= MEM_P (op);
238 add_output_operand (op, GET_MODE (op));
239 }
240
241 /* Add mask operand. */
242 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
243 add_first_one_true_mask_operand (mask_mode);
244 else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
245 add_all_one_mask_operand (mask_mode);
246 else if (m_insn_flags & HAS_MASK_P)
247 {
248 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
249 gcc_assert (mode != VOIDmode);
250 add_input_operand (ops[opno++], mode);
251 }
252
253 /* Add merge operand. */
254 if (m_insn_flags & USE_VUNDEF_MERGE_P)
255 /* Same as dest operand. */
256 add_vundef_operand (GET_MODE (ops[0]));
257 else if (m_insn_flags & HAS_MERGE_P)
258 {
259 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
260 gcc_assert (mode != VOIDmode);
261 add_input_operand (ops[opno++], mode);
262 }
263
264 if (m_insn_flags & NULLARY_OP_P)
265 num_ops = 0;
266 else if (m_insn_flags & UNARY_OP_P)
267 num_ops = 1;
268 else if (m_insn_flags & BINARY_OP_P)
269 num_ops = 2;
270 else if (m_insn_flags & TERNARY_OP_P)
271 num_ops = 3;
272 else
273 gcc_unreachable ();
274
275 /* Add the remaining operands. */
276 for (; num_ops; num_ops--, opno++)
277 {
278 any_mem_p |= MEM_P (ops[opno]);
279 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
280 /* create_input_operand doesn't allow VOIDmode.
281 According to vector.md, some patterns do not specify an explicit
282 machine mode for the operand. Such operands are
283 always Pmode. */
284 if (mode == VOIDmode)
285 mode = Pmode;
286 else
287 /* Early assertion ensures same mode since maybe_legitimize_operand
288 will check this. */
289 gcc_assert (GET_MODE (ops[opno]) == VOIDmode
290 || GET_MODE (ops[opno]) == mode);
291
292 add_input_operand (ops[opno], mode);
293 }
294
295 /* Add vl operand. */
296 rtx len = m_vl_op;
297 bool vls_p = false;
298 if (m_vlmax_p)
299 {
300 if (riscv_v_ext_vls_mode_p (vtype_mode))
301 {
302 /* VLS modes always set VSETVL by
303 "vsetvl zero, rs1/imm". */
304 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
305 len = gen_int_mode (nunits, Pmode);
306 vls_p = true;
307 }
308 else if (can_create_pseudo_p ())
309 {
310 len = gen_reg_rtx (Pmode);
311 emit_vlmax_vsetvl (vtype_mode, len);
312 }
313 }
314
315 gcc_assert (len != NULL_RTX);
316 add_input_operand (len, Pmode);
317
318 /* Add tail and mask policy operands. */
319 add_policy_operand ();
320
321 /* Add avl_type operand. */
322 add_avl_type_operand (
323 vls_p ? avl_type::VLS
324 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
325
326 /* Add rounding mode operand. */
327 if (m_insn_flags & FRM_DYN_P)
328 add_rounding_mode_operand (FRM_DYN);
329 else if (m_insn_flags & FRM_RUP_P)
330 add_rounding_mode_operand (FRM_RUP);
331 else if (m_insn_flags & FRM_RDN_P)
332 add_rounding_mode_operand (FRM_RDN);
333 else if (m_insn_flags & FRM_RMM_P)
334 add_rounding_mode_operand (FRM_RMM);
335 else if (m_insn_flags & FRM_RNE_P)
336 add_rounding_mode_operand (FRM_RNE);
337
338 gcc_assert (insn_data[(int) icode].n_operands == m_opno);
339 expand (icode, any_mem_p);
340 }
341
342 void expand (enum insn_code icode, bool temporary_volatile_p = false)
343 {
344 if (temporary_volatile_p)
345 {
346 temporary_volatile_ok v (true);
347 expand_insn (icode, m_opno, m_ops);
348 }
349 else
350 expand_insn (icode, m_opno, m_ops);
351 }
352
353 private:
354 unsigned m_insn_flags;
355 int m_opno;
356 bool m_vlmax_p;
357 rtx m_vl_op;
358 expand_operand m_ops[MAX_OPERANDS];
359 };
360
361 /* Emit an RVV insn with a vector length that equals the number of units of the
362 vector mode. For VLA modes this corresponds to VLMAX.
363
364 Unless the vector length can be encoded in the vsetivli instruction, this
365 function must only be used as long as we can create pseudo registers. This is
366 because it will set a pseudo register to VLMAX using vsetvl and use this as
367 the definition for the vector length. */
368 void
369 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
370 {
371 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
372 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
373
374 e.emit_insn ((enum insn_code) icode, ops);
375 }
376
377 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
378 registers anymore. This function, however, takes a predefined vector length
379 from the value in VL. */
380 void
381 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
382 {
383 gcc_assert (!can_create_pseudo_p ());
384 machine_mode mode = GET_MODE (ops[0]);
385
386 if (imm_avl_p (mode))
387 {
388 /* Even though VL is a hard register that has already been allocated
389 (we are post-RA here), we still benefit from emitting
390 vsetivli zero, imm instead of vsetvli VL, zero, because it
391 gives us more flexibility in post-RA instruction scheduling. */
392 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
393 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
394 e.emit_insn ((enum insn_code) icode, ops);
395 }
396 else
397 {
398 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
399 e.set_vl (vl);
400 e.emit_insn ((enum insn_code) icode, ops);
401 }
402 }
403
404 /* Emit an RVV insn with a predefined vector length. Contrary to
405 emit_vlmax_insn the instruction's vector length is not deduced from its mode
406 but taken from the value in VL. */
407 void
408 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
409 {
410 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
411 e.set_vl (vl);
412 e.emit_insn ((enum insn_code) icode, ops);
413 }
414
415 class rvv_builder : public rtx_vector_builder
416 {
417 public:
418 rvv_builder () : rtx_vector_builder () {}
419 rvv_builder (machine_mode mode, unsigned int npatterns,
420 unsigned int nelts_per_pattern)
421 : rtx_vector_builder (mode, npatterns, nelts_per_pattern)
422 {
423 m_inner_mode = GET_MODE_INNER (mode);
424 m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
425 m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
426 m_mask_mode = get_mask_mode (mode);
427
428 gcc_assert (
429 int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
430 m_int_mode
431 = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require ();
432 }
433
434 bool can_duplicate_repeating_sequence_p ();
435 rtx get_merged_repeating_sequence ();
436
437 bool repeating_sequence_use_merge_profitable_p ();
438 bool combine_sequence_use_slideup_profitable_p ();
439 bool combine_sequence_use_merge_profitable_p ();
440 rtx get_merge_scalar_mask (unsigned int, machine_mode) const;
441
442 bool single_step_npatterns_p () const;
443 bool npatterns_all_equal_p () const;
444 bool interleaved_stepped_npatterns_p () const;
445 bool npatterns_vid_diff_repeated_p () const;
446
447 machine_mode new_mode () const { return m_new_mode; }
448 scalar_mode inner_mode () const { return m_inner_mode; }
449 scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
450 machine_mode mask_mode () const { return m_mask_mode; }
451 machine_mode int_mode () const { return m_int_mode; }
452 unsigned int inner_bits_size () const { return m_inner_bits_size; }
453 unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
454
455 private:
456 scalar_mode m_inner_mode;
457 scalar_int_mode m_inner_int_mode;
458 machine_mode m_new_mode;
459 scalar_int_mode m_new_inner_mode;
460 machine_mode m_mask_mode;
461 machine_mode m_int_mode;
462 unsigned int m_inner_bits_size;
463 unsigned int m_inner_bytes_size;
464 };
465
466 /* Return true if the vector can be duplicated from a super element which is
467 the fusion of consecutive elements.
468
469 v = { a, b, a, b } super element = ab, v = { ab, ab } */
470 bool
471 rvv_builder::can_duplicate_repeating_sequence_p ()
472 {
473 poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
474 unsigned int new_inner_size = m_inner_bits_size * npatterns ();
475 if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
476 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
477 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
478 return false;
479 if (full_nelts ().is_constant ())
480 return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
481 return nelts_per_pattern () == 1;
482 }
483
484 /* Return true if it is a repeating sequence for which the
485 merge approach generates better code than the default
486 approach (slide1down).
487
488 Sequence A:
489 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
490
491 nelts = 16
492 npatterns = 2
493
494 for merging a we need mask 101010....
495 for merging b we need mask 010101....
496
497 For each element in the pattern, we need to build a mask in a scalar register.
498 Mostly we need 3 instructions (i.e. COST = 3), which consist of 2 scalar
499 instructions and 1 scalar move to the v0 register. Finally we need a vector
500 merge to merge them.
501
502 lui a5, #imm
503 add a5, #imm
504 vmov.s.x v0, a5
505 vmerge.vxm v9, v9, a1, v0
506
507 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
508 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
509 So return true in this case as it is profitable.
510
511 Sequence B:
512 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
513
514 nelts = 16
515 npatterns = 8
516
517 COST of merge approach = (3 + 1) * npatterns = 32
518 COST of slide1down approach = nelts = 16
519 Return false in this case as the merge approach is NOT profitable.
520 */
521 bool
522 rvv_builder::repeating_sequence_use_merge_profitable_p ()
523 {
524 if (inner_bytes_size () > UNITS_PER_WORD)
525 return false;
526
527 unsigned int nelts = full_nelts ().to_constant ();
528
529 if (!repeating_sequence_p (0, nelts, npatterns ()))
530 return false;
531
532 unsigned int merge_cost = 1;
533 unsigned int build_merge_mask_cost = 3;
534 unsigned int slide1down_cost = nelts;
535
536 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
537 }
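
/* A minimal standalone sketch of the cost comparison above, assuming the
   illustrative per-pattern costs from the comment (3 insns to build each
   mask plus 1 vmerge) against one slide1down per element.  The helper name
   is hypothetical and mirrors the heuristic for illustration only.  */
static bool
sketch_merge_profitable_p (unsigned nelts, unsigned npatterns)
{
  const unsigned build_merge_mask_cost = 3; /* lui/addi + vmv.s.x to v0.  */
  const unsigned merge_cost = 1;            /* one vmerge.vxm.  */
  unsigned merge_total = (build_merge_mask_cost + merge_cost) * npatterns;
  unsigned slide1down_total = nelts;        /* one vslide1down per element.  */
  /* Sequence A: (3 + 1) * 2 = 8 < 16; Sequence B: (3 + 1) * 8 = 32 >= 16.  */
  return merge_total < slide1down_total;
}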
538
539 /* Return true if it's worthwhile to use slideup to combine 2 vectors. */
540 bool
541 rvv_builder::combine_sequence_use_slideup_profitable_p ()
542 {
543 int nelts = full_nelts ().to_constant ();
544 int leading_ndups = this->count_dups (0, nelts - 1, 1);
545 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
546
547 /* ??? The current heuristic is to combine 2 vectors
548 by slideup when:
549 1. # of leading identical elements is equal to # of trailing identical elements.
550 2. Both of the above are equal to nelts / 2.
551 Otherwise, it is not profitable. */
552 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
553 }
554
555 /* Return true if it's worthwhile to use merge to combine a vector with a scalar. */
556 bool
557 rvv_builder::combine_sequence_use_merge_profitable_p ()
558 {
559 int nelts = full_nelts ().to_constant ();
560 int leading_ndups = this->count_dups (0, nelts - 1, 1);
561 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
562 int nregs = riscv_get_v_regno_alignment (int_mode ());
563
564 if (leading_ndups + trailing_ndups != nelts)
565 return false;
566
567 /* If the number of leading elements is > 255, which exceeds the maximum
568 value of QImode, we will need to use HImode. */
569 machine_mode mode;
570 if (leading_ndups > 255 || nregs > 2)
571 {
572 if (!get_vector_mode (HImode, nelts).exists (&mode))
573 return false;
574 /* We will need one more AVL/VL toggling vsetvl instruction. */
575 return leading_ndups > 4 && trailing_ndups > 4;
576 }
577
578 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
579 consume 3 slide instructions. */
580 return leading_ndups > 3 && trailing_ndups > 3;
581 }
582
583 /* Merge the repeating sequence into a single element and return the RTX. */
584 rtx
585 rvv_builder::get_merged_repeating_sequence ()
586 {
587 scalar_int_mode mode = Pmode;
588 rtx target = gen_reg_rtx (mode);
589 emit_move_insn (target, const0_rtx);
590 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
591 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
592 for (unsigned int i = 0; i < npatterns (); i++)
593 {
594 unsigned int loc = m_inner_bits_size * i;
595 rtx shift = gen_int_mode (loc, mode);
596 rtx ele = gen_lowpart (mode, elt (i));
597 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
598 OPTAB_DIRECT);
599 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
600 OPTAB_DIRECT);
601 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
602 OPTAB_DIRECT);
603 emit_move_insn (target, tmp3);
604 }
605 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
606 return gen_lowpart (m_new_inner_mode, target);
607 return target;
608 }
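
/* A minimal host-side sketch of the packing performed above, assuming 8-bit
   elements and a hypothetical helper name; it mirrors the AND/shift/IOR loop
   for illustration only.  */
static unsigned long long
sketch_merge_repeating_sequence (const unsigned char *elts, unsigned npatterns)
{
  unsigned long long merged = 0;
  for (unsigned i = 0; i < npatterns; i++)
    /* Element i lands at bit position i * 8, e.g. {a, b} -> (b << 8) | a.  */
    merged |= (unsigned long long) elts[i] << (i * 8);
  return merged;
}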
609
610 /* Get the mask for the merge approach.
611
612 Consider the following case:
613 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
614 To merge "a", the mask should be 1010....
615 To merge "b", the mask should be 0101....
616 */
617 rtx
618 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
619 machine_mode inner_mode) const
620 {
621 unsigned HOST_WIDE_INT mask = 0;
622 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
623 /* Here we construct a mask pattern that will later be broadcast
624 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
625 is determined by the length of a vector element (ELEN) and not by
626 XLEN so make sure we do not exceed it. One example is -march=zve32*
627 which mandates ELEN == 32 but can be combined with -march=rv64
628 with XLEN == 64. */
629 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
630
631 gcc_assert (elen % npatterns () == 0);
632
633 int limit = elen / npatterns ();
634
635 for (int i = 0; i < limit; i++)
636 mask |= base_mask << (i * npatterns ());
637
638 return gen_int_mode (mask, inner_mode);
639 }
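
/* A minimal host-side sketch of the mask construction above, assuming
   ELEN == 64 and a hypothetical helper name.  For NPATTERNS == 2 it yields
   0x5555555555555555 for index 0 and 0xaaaaaaaaaaaaaaaa for index 1.  */
static unsigned long long
sketch_merge_scalar_mask (unsigned index_in_pattern, unsigned npatterns)
{
  const unsigned elen = 64;
  unsigned long long mask = 0;
  unsigned long long base_mask = 1ULL << index_in_pattern;
  /* Repeat the per-pattern bit every NPATTERNS positions, ELEN bits total.  */
  for (unsigned i = 0; i < elen / npatterns; i++)
    mask |= base_mask << (i * npatterns);
  return mask;
}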
640
641 /* Return true if the variable-length vector is single step.
642 Single step means the steps of all patterns in NPATTERNS are equal.
643 Consider the following cases:
644
645 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
646 { 0, 2, 2, 4, 4, 6, ... }
647 First pattern: step1 = 2 - 0 = 2
648 step2 = 4 - 2 = 2
649 Second pattern: step1 = 4 - 2 = 2
650 step2 = 6 - 4 = 2
651 Since all steps of NPATTERNS are equal step = 2.
652 Return true in this case.
653
654 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
655 { 0, 1, 2, 4, 4, 7, ... }
656 First pattern: step1 = 2 - 0 = 2
657 step2 = 4 - 2 = 2
658 Second pattern: step1 = 4 - 1 = 3
659 step2 = 7 - 4 = 3
660 Since not all steps are equal, return false. */
661 bool
662 rvv_builder::single_step_npatterns_p () const
663 {
664 if (nelts_per_pattern () != 3)
665 return false;
666
667 poly_int64 step
668 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
669 for (unsigned int i = 0; i < npatterns (); i++)
670 {
671 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
672 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
673 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
674 poly_int64 diff1 = ele1 - ele0;
675 poly_int64 diff2 = ele2 - ele1;
676 if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
677 return false;
678 }
679 return true;
680 }
681
682 /* Return true if the diff between const vector and vid sequence
683 is repeated. For example, see the cases below.
684 The diff means the const vector minus vid.
685 CASE 1:
686 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
687 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
688 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
689 The diff sequence {3, 1,-1,-3} is repeated within the pattern, so
690 return TRUE for case 1.
691
692 CASE 2:
693 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
694 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
695 DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
696 The diff sequence {-4, 3} is not repeated within the pattern, so
697 return FALSE for case 2. */
698 bool
699 rvv_builder::npatterns_vid_diff_repeated_p () const
700 {
701 if (nelts_per_pattern () != 3)
702 return false;
703 else if (npatterns () == 0)
704 return false;
705
706 for (unsigned i = 0; i < npatterns (); i++)
707 {
708 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
709 poly_int64 diff_1
710 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
711
712 if (maybe_ne (diff_0, diff_1))
713 return false;
714 }
715
716 return true;
717 }
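
/* A minimal scalar sketch of the check above, with a hypothetical helper
   name: the constant vector minus vid must give the same value at position
   i and at position i + NPATTERNS for every i in the first pattern.  */
static bool
sketch_vid_diff_repeated_p (const long *elts, unsigned npatterns)
{
  for (unsigned i = 0; i < npatterns; i++)
    {
      long diff0 = elts[i] - (long) i;
      long diff1 = elts[npatterns + i] - (long) (npatterns + i);
      if (diff0 != diff1)
        return false;
    }
  return true;
}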
718
719 /* Return true if the permutation consists of two
720 interleaved patterns with a constant step each.
721 TODO: We currently only support NPATTERNS = 2. */
722 bool
723 rvv_builder::interleaved_stepped_npatterns_p () const
724 {
725 if (npatterns () != 2 || nelts_per_pattern () != 3)
726 return false;
727 for (unsigned int i = 0; i < npatterns (); i++)
728 {
729 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
730 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
731 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
732 poly_int64 diff1 = ele1 - ele0;
733 poly_int64 diff2 = ele2 - ele1;
734 if (maybe_ne (diff1, diff2))
735 return false;
736 }
737 return true;
738 }
739
740 /* Return true if all elements of NPATTERNS are equal.
741
742 E.g. NPATTERNS = 4:
743 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
744 E.g. NPATTERNS = 8:
745 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
746 We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
747 We don't need to check elements[n] with n >= NPATTERNS since
748 they don't belong to the same pattern.
749 */
750 bool
751 rvv_builder::npatterns_all_equal_p () const
752 {
753 poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
754 for (unsigned int i = 1; i < npatterns (); i++)
755 {
756 poly_int64 ele = rtx_to_poly_int64 (elt (i));
757 if (!known_eq (ele, ele0))
758 return false;
759 }
760 return true;
761 }
762
763 static unsigned
764 get_sew (machine_mode mode)
765 {
766 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
767 ? 8
768 : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
769 return sew;
770 }
771
772 /* Return true if X is a const_vector whose elements are all duplicates of one
773 value that is in the range [MINVAL, MAXVAL]. */
774 bool
775 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
776 HOST_WIDE_INT maxval)
777 {
778 rtx elt;
779 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
780 && IN_RANGE (INTVAL (elt), minval, maxval));
781 }
782
783 /* Return true if VEC is a constant in which every element is in the range
784 [MINVAL, MAXVAL]. The elements do not need to have the same value.
785
786 This function also exists for aarch64; we may unify it in the middle-end in
787 the future. */
788
789 static bool
790 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
791 {
792 if (!CONST_VECTOR_P (vec)
793 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
794 return false;
795
796 int nunits;
797 if (!CONST_VECTOR_STEPPED_P (vec))
798 nunits = const_vector_encoded_nelts (vec);
799 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
800 return false;
801
802 for (int i = 0; i < nunits; i++)
803 {
804 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
805 poly_int64 value;
806 if (!poly_int_rtx_p (vec_elem, &value)
807 || maybe_lt (value, minval)
808 || maybe_gt (value, maxval))
809 return false;
810 }
811 return true;
812 }
813
814 /* Return a const vector of VAL. The VAL can be either const_int or
815 const_poly_int. */
816
817 static rtx
818 gen_const_vector_dup (machine_mode mode, poly_int64 val)
819 {
820 scalar_mode smode = GET_MODE_INNER (mode);
821 rtx c = gen_int_mode (val, smode);
822 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
823 {
824 /* When VAL is a const_poly_int value, we need to explicitly broadcast
825 it into a vector using an RVV broadcast instruction. */
826 return expand_vector_broadcast (mode, c);
827 }
828 return gen_const_vec_duplicate (mode, c);
829 }
830
831 /* Emit a vlmax vsetvl instruction. This should only be used when
832 optimization is disabled or after vsetvl insertion pass. */
833 void
834 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
835 {
836 unsigned int sew = get_sew (vmode);
837 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
838 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
839 const0_rtx));
840 }
841
842 void
843 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
844 {
845 unsigned int sew = get_sew (vmode);
846 enum vlmul_type vlmul = get_vlmul (vmode);
847 unsigned int ratio = calculate_ratio (sew, vlmul);
848
849 if (!optimize)
850 emit_hard_vlmax_vsetvl (vmode, vl);
851 else
852 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
853 }
854
855 /* Calculate SEW/LMUL ratio. */
856 unsigned int
857 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
858 {
859 unsigned int ratio;
860 switch (vlmul)
861 {
862 case LMUL_1:
863 ratio = sew;
864 break;
865 case LMUL_2:
866 ratio = sew / 2;
867 break;
868 case LMUL_4:
869 ratio = sew / 4;
870 break;
871 case LMUL_8:
872 ratio = sew / 8;
873 break;
874 case LMUL_F8:
875 ratio = sew * 8;
876 break;
877 case LMUL_F4:
878 ratio = sew * 4;
879 break;
880 case LMUL_F2:
881 ratio = sew * 2;
882 break;
883 default:
884 gcc_unreachable ();
885 }
886 return ratio;
887 }
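
/* A minimal usage sketch of calculate_ratio, with a hypothetical function
   name: the SEW/LMUL ratio identifies compatible vector configurations,
   e.g. SEW = 32 with LMUL = 2, SEW = 16 with LMUL = 1 and SEW = 8 with
   LMUL = 1/2 all share ratio 16.  Illustration only.  */
static void
sketch_ratio_examples ()
{
  unsigned r1 = calculate_ratio (32, LMUL_2); /* 32 / 2 = 16.  */
  unsigned r2 = calculate_ratio (16, LMUL_1); /* 16 / 1 = 16.  */
  unsigned r3 = calculate_ratio (8, LMUL_F2); /* 8 * 2 = 16.  */
  gcc_assert (r1 == 16 && r2 == 16 && r3 == 16);
}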
888
889 /* SCALABLE means that the vector length is agnostic (run-time invariant and
890 compile-time unknown). FIXED means that the vector length is specific
891 (compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX do
892 auto-vectorization using the VLMAX vsetvl configuration. */
893 static bool
894 autovec_use_vlmax_p (void)
895 {
896 return (riscv_autovec_preference == RVV_SCALABLE
897 || riscv_autovec_preference == RVV_FIXED_VLMAX);
898 }
899
900 /* This function emits a VLMAX vrgather instruction. Emit vrgather.vx/vi when
901 sel is a const duplicate vector. Otherwise, emit vrgather.vv. */
902 static void
903 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
904 {
905 rtx elt;
906 insn_code icode;
907 machine_mode data_mode = GET_MODE (target);
908 machine_mode sel_mode = GET_MODE (sel);
909 if (const_vec_duplicate_p (sel, &elt))
910 {
911 icode = code_for_pred_gather_scalar (data_mode);
912 sel = elt;
913 }
914 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
915 icode = code_for_pred_gatherei16 (data_mode);
916 else
917 icode = code_for_pred_gather (data_mode);
918 rtx ops[] = {target, op, sel};
919 emit_vlmax_insn (icode, BINARY_OP, ops);
920 }
921
922 static void
923 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
924 {
925 rtx elt;
926 insn_code icode;
927 machine_mode data_mode = GET_MODE (target);
928 machine_mode sel_mode = GET_MODE (sel);
929 if (const_vec_duplicate_p (sel, &elt))
930 {
931 icode = code_for_pred_gather_scalar (data_mode);
932 sel = elt;
933 }
934 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
935 icode = code_for_pred_gatherei16 (data_mode);
936 else
937 icode = code_for_pred_gather (data_mode);
938 rtx ops[] = {target, mask, target, op, sel};
939 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
940 }
941
942 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
943 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
944
945 There is no inverse vdecompress provided, as this operation can be readily
946 synthesized using iota and a masked vrgather:
947
948 Desired functionality of 'vdecompress'
949 7 6 5 4 3 2 1 0 # vid
950
951 e d c b a # packed vector of 5 elements
952 1 0 0 1 1 1 0 1 # mask vector of 8 elements
953 p q r s t u v w # destination register before vdecompress
954
955 e q r d c b v a # result of vdecompress
956 # v0 holds mask
957 # v1 holds packed data
958 # v11 holds input expanded vector and result
959 viota.m v10, v0 # Calc iota from mask in v0
960 vrgather.vv v11, v1, v10, v0.t # Expand into destination
961 p q r s t u v w # v11 destination register
962 e d c b a # v1 source vector
963 1 0 0 1 1 1 0 1 # v0 mask vector
964
965 4 4 4 3 2 1 1 0 # v10 result of viota.m
966 e q r d c b v a # v11 destination after vrgather using viota.m under mask
967 */
968 static void
969 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
970 {
971 machine_mode data_mode = GET_MODE (target);
972 machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
973 if (GET_MODE_INNER (data_mode) == QImode)
974 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
975
976 rtx sel = gen_reg_rtx (sel_mode);
977 rtx iota_ops[] = {sel, mask};
978 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
979 emit_vlmax_gather_insn (target, op0, sel);
980 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
981 }
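
/* A minimal scalar model of the decompress sequence above (viota.m followed
   by a masked vrgather), with hypothetical names and plain C types for
   illustration only.  */
static void
sketch_decompress (const int *packed, const bool *mask, int *dest, int n)
{
  int iota = 0; /* Running count of set mask bits, as viota.m computes.  */
  for (int i = 0; i < n; i++)
    {
      if (mask[i])
        {
          /* Masked vrgather: active elements read packed[iota].  */
          dest[i] = packed[iota++];
        }
      /* Inactive elements keep their previous value (mask-undisturbed).  */
    }
}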
982
983 /* Emit merge instruction. */
984
985 static machine_mode
986 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
987 machine_mode mask_bit_mode)
988 {
989 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
990 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
991 ? builder.inner_bits_size () : mask_precision;
992
993 scalar_mode inner_mode;
994 unsigned minimal_bits_size;
995
996 switch (mask_scalar_size)
997 {
998 case 8:
999 inner_mode = QImode;
1000 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */
1001 break;
1002 case 16:
1003 inner_mode = HImode;
1004 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */
1005 break;
1006 case 32:
1007 inner_mode = SImode;
1008 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */
1009 break;
1010 case 64:
1011 inner_mode = DImode;
1012 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */
1013 break;
1014 default:
1015 gcc_unreachable ();
1016 break;
1017 }
1018
1019 gcc_assert (mask_precision % mask_scalar_size == 0);
1020
1021 uint64_t dup_nunit = mask_precision > mask_scalar_size
1022 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
1023
1024 return get_vector_mode (inner_mode, dup_nunit).require ();
1025 }
1026
1027 /* Expand a series const vector. If VID is NULL_RTX, we use the vid.v
1028 instruction to generate the sequence for VID:
1029
1030 VID = { 0, 1, 2, 3, ... }
1031
1032 Otherwise, we use the VID argument directly. */
1033
1034 void
1035 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
1036 {
1037 machine_mode mode = GET_MODE (dest);
1038 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
1039 poly_int64 value;
1040 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
1041
1042 /* VECT_IV = BASE + I * STEP. */
1043
1044 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1045 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
1046 && poly_int_rtx_p (base, &value)
1047 && known_eq (nunits_m1, value);
1048 if (!vid)
1049 {
1050 vid = gen_reg_rtx (mode);
1051 rtx op[] = {vid};
1052 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
1053 }
1054
1055 rtx step_adj;
1056 if (reverse_p)
1057 {
1058 /* Special case:
1059 {nunits - 1, nunits - 2, ... , 0}.
1060 nunits can be either const_int or const_poly_int.
1061
1062 Code sequence:
1063 vid.v v
1064 vrsub nunits - 1, v. */
1065 rtx ops[]
1066 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
1067 insn_code icode = code_for_pred_sub_reverse_scalar (mode);
1068 emit_vlmax_insn (icode, BINARY_OP, ops);
1069 }
1070 else
1071 {
1072 /* Step 2: Generate I * STEP.
1073 - STEP is 1, we don't emit any instructions.
1074 - STEP is power of 2, we use vsll.vi/vsll.vx.
1075 - STEP is non-power of 2, we use vmul.vx. */
1076 if (rtx_equal_p (step, const1_rtx))
1077 step_adj = vid;
1078 else
1079 {
1080 step_adj = gen_reg_rtx (mode);
1081 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
1082 {
1083 /* Emit logical left shift operation. */
1084 int shift = exact_log2 (INTVAL (step));
1085 rtx shift_amount = gen_int_mode (shift, Pmode);
1086 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1087 rtx ops[] = {step_adj, vid, shift_amount};
1088 emit_vlmax_insn (icode, BINARY_OP, ops);
1089 }
1090 else
1091 {
1092 insn_code icode = code_for_pred_scalar (MULT, mode);
1093 rtx ops[] = {step_adj, vid, step};
1094 emit_vlmax_insn (icode, BINARY_OP, ops);
1095 }
1096 }
1097
1098 /* Step 3: Generate BASE + I * STEP.
1099 - BASE is 0, use result of vid.
1100 - BASE is not 0, we use vadd.vx/vadd.vi. */
1101 if (rtx_equal_p (base, const0_rtx))
1102 emit_move_insn (result, step_adj);
1103 else
1104 {
1105 insn_code icode = code_for_pred_scalar (PLUS, mode);
1106 rtx ops[] = {result, step_adj, base};
1107 emit_vlmax_insn (icode, BINARY_OP, ops);
1108 }
1109 }
1110
1111 if (result != dest)
1112 emit_move_insn (dest, result);
1113 }
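
/* A minimal scalar model of the series expansion above, with a hypothetical
   helper name: VECT_IV[i] = BASE + I * STEP, matching the vid.v,
   shift-or-multiply, and add steps.  */
static void
sketch_vec_series (long base, long step, long *out, int n)
{
  for (int i = 0; i < n; i++)
    {
      long vid = i;             /* Step 1: vid.v.  */
      long scaled = vid * step; /* Step 2: vsll for powers of 2, else vmul.vx.  */
      out[i] = base + scaled;   /* Step 3: vadd.vx/vadd.vi, skipped if BASE == 0.  */
    }
}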
1114
1115 static void
1116 expand_const_vector (rtx target, rtx src)
1117 {
1118 machine_mode mode = GET_MODE (target);
1119 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1120 {
1121 rtx elt;
1122 gcc_assert (
1123 const_vec_duplicate_p (src, &elt)
1124 && (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx)));
1125 rtx ops[] = {target, src};
1126 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
1127 return;
1128 }
1129
1130 rtx elt;
1131 if (const_vec_duplicate_p (src, &elt))
1132 {
1133 rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode);
1134 /* For an integer element in the range -16 ~ 15 or a 0.0 floating-point
1135 element, we use the vmv.v.i instruction. */
1136 if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src))
1137 {
1138 rtx ops[] = {tmp, src};
1139 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
1140 }
1141 else
1142 {
1143 /* Emit the vec_duplicate<mode> split pattern before RA so that
1144 we have better optimization opportunities in LICM,
1145 which will hoist vmv.v.x outside the loop, and in fwprop and combine,
1146 which will transform 'vv' into 'vx' instructions.
1147
1148 The reason we don't emit the vec_duplicate<mode> split pattern during
1149 RA is that the split stage after RA is too late to generate an
1150 RVV instruction which needs an additional register (we can't
1151 allocate a new register after RA) for the VL operand of the vsetvl
1152 instruction (vsetvl a5, zero). */
1153 if (lra_in_progress)
1154 {
1155 rtx ops[] = {tmp, elt};
1156 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
1157 }
1158 else
1159 {
1160 struct expand_operand ops[2];
1161 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
1162 gcc_assert (icode != CODE_FOR_nothing);
1163 create_output_operand (&ops[0], tmp, mode);
1164 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
1165 expand_insn (icode, 2, ops);
1166 tmp = ops[0].value;
1167 }
1168 }
1169
1170 if (tmp != target)
1171 emit_move_insn (target, tmp);
1172 return;
1173 }
1174
1175 /* Support scalable const series vector. */
1176 rtx base, step;
1177 if (const_vec_series_p (src, &base, &step))
1178 {
1179 expand_vec_series (target, base, step);
1180 return;
1181 }
1182
1183 /* Handle variable-length vector. */
1184 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
1185 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
1186 rvv_builder builder (mode, npatterns, nelts_per_pattern);
1187 for (unsigned int i = 0; i < nelts_per_pattern; i++)
1188 {
1189 for (unsigned int j = 0; j < npatterns; j++)
1190 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
1191 }
1192 builder.finalize ();
1193
1194 if (CONST_VECTOR_DUPLICATE_P (src))
1195 {
1196 /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
1197 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1198 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1199 The elements within NPATTERNS are not necessarily regular. */
1200 if (builder.can_duplicate_repeating_sequence_p ())
1201 {
1202 /* We handle the case where we can find a vector container to hold
1203 element bitsize = NPATTERNS * ele_bitsize.
1204
1205 NPATTERNS = 8, element width = 8
1206 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1207 In this case, we can combine NPATTERNS elements into a larger
1208 element. Use element width = 64 and broadcast a vector with
1209 all elements equal to 0x0706050403020100. */
1210 rtx ele = builder.get_merged_repeating_sequence ();
1211 rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
1212 emit_move_insn (target, gen_lowpart (mode, dup));
1213 }
1214 else
1215 {
1216 /* We handle the case where we can't find a vector container to hold
1217 element bitsize = NPATTERNS * ele_bitsize.
1218
1219 NPATTERNS = 8, element width = 16
1220 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1221 Since NPATTERNS * element width = 128, we can't find a container
1222 to hold it.
1223
1224 In this case, we use NPATTERNS merge operations to generate such a
1225 vector. */
1226 unsigned int nbits = npatterns - 1;
1227
1228 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1229 rtx vid = gen_reg_rtx (builder.int_mode ());
1230 rtx op[] = {vid};
1231 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
1232 NULLARY_OP, op);
1233
1234 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1235 rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
1236 rtx and_ops[] = {vid_repeat, vid,
1237 gen_int_mode (nbits, builder.inner_int_mode ())};
1238 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
1239 BINARY_OP, and_ops);
1240
1241 rtx tmp = gen_reg_rtx (builder.mode ());
1242 rtx dup_ops[] = {tmp, builder.elt (0)};
1243 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
1244 dup_ops);
1245 for (unsigned int i = 1; i < builder.npatterns (); i++)
1246 {
1247 /* Generate mask according to i. */
1248 rtx mask = gen_reg_rtx (builder.mask_mode ());
1249 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
1250 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
1251
1252 /* Merge scalar to each i. */
1253 rtx tmp2 = gen_reg_rtx (builder.mode ());
1254 rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask};
1255 insn_code icode = code_for_pred_merge_scalar (builder.mode ());
1256 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
1257 tmp = tmp2;
1258 }
1259 emit_move_insn (target, tmp);
1260 }
1261 }
1262 else if (CONST_VECTOR_STEPPED_P (src))
1263 {
1264 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
1265 if (builder.single_step_npatterns_p ())
1266 {
1267 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1268 insn_code icode;
1269
1270 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1271 rtx vid = gen_reg_rtx (builder.mode ());
1272 rtx vid_ops[] = {vid};
1273 icode = code_for_pred_series (builder.mode ());
1274 emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
1275
1276 if (builder.npatterns_all_equal_p ())
1277 {
1278 /* Generate the variable-length vector following this rule:
1279 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1280 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1281 /* We want to create a pattern where value[ix] = floor (ix /
1282 NPATTERNS). As NPATTERNS is always a power of two we can
1283 rewrite this as = ix & -NPATTERNS. */
1284 /* Step 2: VID AND -NPATTERNS:
1285 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
1286 */
1287 rtx imm
1288 = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
1289 rtx tmp = gen_reg_rtx (builder.mode ());
1290 rtx and_ops[] = {tmp, vid, imm};
1291 icode = code_for_pred_scalar (AND, builder.mode ());
1292 emit_vlmax_insn (icode, BINARY_OP, and_ops);
1293 HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
1294 if (init_val == 0)
1295 emit_move_insn (target, tmp);
1296 else
1297 {
1298 rtx dup = gen_const_vector_dup (builder.mode (), init_val);
1299 rtx add_ops[] = {target, tmp, dup};
1300 icode = code_for_pred (PLUS, builder.mode ());
1301 emit_vlmax_insn (icode, BINARY_OP, add_ops);
1302 }
1303 }
1304 else
1305 {
1306 /* Generate the variable-length vector following this rule:
1307 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1308
1309 if (builder.npatterns_vid_diff_repeated_p ())
1310 {
1311 /* Case 1: For example as below:
1312 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1313 We have 3 - 0 = 3 equal to 7 - 4 = 3, so the sequence
1314 repeats as below after subtracting vid:
1315 {3, 1, -1, -3, 3, 1, -1, -3...}
1316 Then we can simplify the diff code gen to at most
1317 npatterns(). */
1318 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1319
1320 /* Step 1: Generate diff = TARGET - VID. */
1321 for (unsigned int i = 0; i < v.npatterns (); ++i)
1322 {
1323 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
1324 v.quick_push (gen_int_mode (diff, v.inner_mode ()));
1325 }
1326
1327 /* Step 2: Generate result = VID + diff. */
1328 rtx vec = v.build ();
1329 rtx add_ops[] = {target, vid, vec};
1330 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1331 BINARY_OP, add_ops);
1332 }
1333 else
1334 {
1335 /* Case 2: For example as below:
1336 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
1337 */
1338 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1339
1340 /* Step 1: Generate { a, b, a, b, ... } */
1341 for (unsigned int i = 0; i < v.npatterns (); ++i)
1342 v.quick_push (builder.elt (i));
1343 rtx new_base = v.build ();
1344
1345 /* Step 2: Generate tmp = VID >> LOG2 (NPATTERNS).  */
1346 rtx shift_count
1347 = gen_int_mode (exact_log2 (builder.npatterns ()),
1348 builder.inner_mode ());
1349 rtx tmp = expand_simple_binop (builder.mode (), LSHIFTRT,
1350 vid, shift_count, NULL_RTX,
1351 false, OPTAB_DIRECT);
1352
1353 /* Step 3: Generate tmp2 = tmp * step.  */
1354 rtx tmp2 = gen_reg_rtx (builder.mode ());
1355 rtx step
1356 = simplify_binary_operation (MINUS, builder.inner_mode (),
1357 builder.elt (v.npatterns()),
1358 builder.elt (0));
1359 expand_vec_series (tmp2, const0_rtx, step, tmp);
1360
1361 /* Step 4: Generate target = tmp2 + new_base.  */
1362 rtx add_ops[] = {target, tmp2, new_base};
1363 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1364 BINARY_OP, add_ops);
1365 }
1366 }
1367 }
1368 else if (builder.interleaved_stepped_npatterns_p ())
1369 {
1370 rtx base1 = builder.elt (0);
1371 rtx base2 = builder.elt (1);
1372 poly_int64 step1
1373 = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
1374 - rtx_to_poly_int64 (base1);
1375 poly_int64 step2
1376 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
1377 - rtx_to_poly_int64 (base2);
1378
1379 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use a larger EEW
1380 integer vector mode to generate such a vector efficiently.
1381
1382 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1383
1384 can be reinterpreted as:
1385
1386 EEW = 32, { 2, 4, ... } */
1387 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
1388 scalar_int_mode new_smode;
1389 machine_mode new_mode;
1390 poly_uint64 new_nunits
1391 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
1392 if (int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
1393 && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
1394 {
1395 rtx tmp = gen_reg_rtx (new_mode);
1396 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
1397 expand_vec_series (tmp, base1, gen_int_mode (step1, new_smode));
1398
1399 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
1400 /* { 1, 0, 2, 0, ... }. */
1401 emit_move_insn (target, gen_lowpart (mode, tmp));
1402 else if (known_eq (step2, 0))
1403 {
1404 /* { 1, 1, 2, 1, ... }. */
1405 rtx scalar = expand_simple_binop (
1406 new_smode, ASHIFT,
1407 gen_int_mode (rtx_to_poly_int64 (base2), new_smode),
1408 gen_int_mode (builder.inner_bits_size (), new_smode),
1409 NULL_RTX, false, OPTAB_DIRECT);
1410 rtx tmp2 = gen_reg_rtx (new_mode);
1411 rtx ior_ops[] = {tmp2, tmp, scalar};
1412 emit_vlmax_insn (code_for_pred_scalar (IOR, new_mode),
1413 BINARY_OP, ior_ops);
1414 emit_move_insn (target, gen_lowpart (mode, tmp2));
1415 }
1416 else
1417 {
1418 /* { 1, 3, 2, 6, ... }. */
1419 rtx tmp2 = gen_reg_rtx (new_mode);
1420 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
1421 expand_vec_series (tmp2, base2,
1422 gen_int_mode (step2, new_smode));
1423 rtx shifted_tmp2 = expand_simple_binop (
1424 new_mode, ASHIFT, tmp2,
1425 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
1426 false, OPTAB_DIRECT);
1427 rtx tmp3 = gen_reg_rtx (new_mode);
1428 rtx ior_ops[] = {tmp3, tmp, shifted_tmp2};
1429 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
1430 ior_ops);
1431 emit_move_insn (target, gen_lowpart (mode, tmp3));
1432 }
1433 }
1434 else
1435 {
1436 rtx vid = gen_reg_rtx (mode);
1437 expand_vec_series (vid, const0_rtx, const1_rtx);
1438 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1439 rtx shifted_vid
1440 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
1441 NULL_RTX, false, OPTAB_DIRECT);
1442 rtx tmp1 = gen_reg_rtx (mode);
1443 rtx tmp2 = gen_reg_rtx (mode);
1444 expand_vec_series (tmp1, base1,
1445 gen_int_mode (step1, builder.inner_mode ()),
1446 shifted_vid);
1447 expand_vec_series (tmp2, base2,
1448 gen_int_mode (step2, builder.inner_mode ()),
1449 shifted_vid);
1450
1451 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1452 rtx and_vid = gen_reg_rtx (mode);
1453 rtx and_ops[] = {and_vid, vid, const1_rtx};
1454 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
1455 and_ops);
1456 rtx mask = gen_reg_rtx (builder.mask_mode ());
1457 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
1458
1459 rtx ops[] = {target, tmp1, tmp2, mask};
1460 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
1461 }
1462 }
1463 else if (npatterns == 1 && nelts_per_pattern == 3)
1464 {
1465 /* Generate the following CONST_VECTOR:
1466 { base0, base1, base1 + step, base1 + step * 2, ... } */
1467 rtx base0 = builder.elt (0);
1468 rtx base1 = builder.elt (1);
1469 rtx base2 = builder.elt (2);
1470
1471 rtx step = simplify_binary_operation (MINUS, builder.inner_mode (),
1472 base2, base1);
1473
1474 /* Step 1 - { base1, base1 + step, base1 + step * 2, ... } */
1475 rtx tmp = gen_reg_rtx (mode);
1476 expand_vec_series (tmp, base1, step);
1477 /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... } */
1478 if (!rtx_equal_p (base0, const0_rtx))
1479 base0 = force_reg (builder.inner_mode (), base0);
1480
1481 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
1482 gcc_assert (icode != CODE_FOR_nothing);
1483 emit_insn (GEN_FCN (icode) (target, tmp, base0));
1484 }
1485 else
1486 /* TODO: We will enable more variable-length vectors in the future. */
1487 gcc_unreachable ();
1488 }
1489 else
1490 gcc_unreachable ();
1491 }
1492
1493 /* Get the frm mode for the given CONST_INT rtx; the default mode is
1494 FRM_DYN. */
1495 enum floating_point_rounding_mode
1496 get_frm_mode (rtx operand)
1497 {
1498 gcc_assert (CONST_INT_P (operand));
1499
1500 switch (INTVAL (operand))
1501 {
1502 case FRM_RNE:
1503 return FRM_RNE;
1504 case FRM_RTZ:
1505 return FRM_RTZ;
1506 case FRM_RDN:
1507 return FRM_RDN;
1508 case FRM_RUP:
1509 return FRM_RUP;
1510 case FRM_RMM:
1511 return FRM_RMM;
1512 case FRM_DYN:
1513 return FRM_DYN;
1514 default:
1515 gcc_unreachable ();
1516 }
1517
1518 gcc_unreachable ();
1519 }
1520
1521 /* Expand a pre-RA RVV data move from SRC to DEST.
1522 It expands moves for RVV fractional vector modes.
1523 Return true if the move has already been emitted. */
1524 bool
1525 legitimize_move (rtx dest, rtx *srcp)
1526 {
1527 rtx src = *srcp;
1528 machine_mode mode = GET_MODE (dest);
1529 if (CONST_VECTOR_P (src))
1530 {
1531 expand_const_vector (dest, src);
1532 return true;
1533 }
1534
1535 if (riscv_v_ext_vls_mode_p (mode))
1536 {
1537 if (GET_MODE_NUNITS (mode).to_constant () <= 31)
1538 {
1539 /* For NUNITS <= 31 VLS modes, we don't need extra
1540 scalar registers, so we apply the naive (set (op0) (op1)) pattern. */
1541 if (can_create_pseudo_p ())
1542 {
1543 /* Need to force register if mem <- !reg. */
1544 if (MEM_P (dest) && !REG_P (src))
1545 *srcp = force_reg (mode, src);
1546
1547 return false;
1548 }
1549 }
1550 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
1551 {
1552 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1553 return true;
1554 }
1555 }
1556 else
1557 {
1558 /* In order to decrease the memory traffic, we don't use whole register
1559 * load/store for LMUL less than 1 and mask modes, so those cases will
1560 * require one extra general purpose register, but that's not allowed during
1561 * the LRA process, so we have a special move pattern used for LRA, which will
1562 * defer the expansion until after LRA. */
1563 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1564 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1565 && lra_in_progress)
1566 {
1567 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1568 return true;
1569 }
1570
1571 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1572 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
1573 {
1574 /* Need to force register if mem <- !reg. */
1575 if (MEM_P (dest) && !REG_P (src))
1576 *srcp = force_reg (mode, src);
1577
1578 return false;
1579 }
1580 }
1581
1582 if (register_operand (src, mode) && register_operand (dest, mode))
1583 {
1584 emit_insn (gen_rtx_SET (dest, src));
1585 return true;
1586 }
1587
1588 unsigned insn_flags
1589 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
1590 if (!register_operand (src, mode) && !register_operand (dest, mode))
1591 {
1592 rtx tmp = gen_reg_rtx (mode);
1593 if (MEM_P (src))
1594 {
1595 rtx ops[] = {tmp, src};
1596 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1597 }
1598 else
1599 emit_move_insn (tmp, src);
1600 src = tmp;
1601 }
1602
1603 if (satisfies_constraint_vu (src))
1604 return false;
1605
1606 rtx ops[] = {dest, src};
1607 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1608 return true;
1609 }
1610
1611 /* VTYPE information for machine_mode. */
1612 struct mode_vtype_group
1613 {
1614 enum vlmul_type vlmul[NUM_MACHINE_MODES];
1615 uint8_t ratio[NUM_MACHINE_MODES];
1616 machine_mode subpart_mode[NUM_MACHINE_MODES];
1617 uint8_t nf[NUM_MACHINE_MODES];
1618 mode_vtype_group ()
1619 {
1620 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1621 vlmul[MODE##mode] = VLMUL; \
1622 ratio[MODE##mode] = RATIO;
1623 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1624 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1625 nf[MODE##mode] = NF; \
1626 vlmul[MODE##mode] = VLMUL; \
1627 ratio[MODE##mode] = RATIO;
1628 #include "riscv-vector-switch.def"
1629 #undef ENTRY
1630 #undef TUPLE_ENTRY
1631 }
1632 };
1633
1634 static mode_vtype_group mode_vtype_infos;
1635
1636 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1637 enum vlmul_type
1638 get_vlmul (machine_mode mode)
1639 {
1640 /* For VLS modes, the vlmul should be dynamically
1641 calculated since we need to adjust VLMUL according
1642 to TARGET_MIN_VLEN. */
1643 if (riscv_v_ext_vls_mode_p (mode))
1644 {
1645 int size = GET_MODE_BITSIZE (mode).to_constant ();
1646 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
1647 if (size < TARGET_MIN_VLEN)
1648 {
1649 int factor = TARGET_MIN_VLEN / size;
1650 if (inner_size == 8)
1651 factor = MIN (factor, 8);
1652 else if (inner_size == 16)
1653 factor = MIN (factor, 4);
1654 else if (inner_size == 32)
1655 factor = MIN (factor, 2);
1656 else if (inner_size == 64)
1657 factor = MIN (factor, 1);
1658 else
1659 gcc_unreachable ();
1660
1661 switch (factor)
1662 {
1663 case 1:
1664 return LMUL_1;
1665 case 2:
1666 return LMUL_F2;
1667 case 4:
1668 return LMUL_F4;
1669 case 8:
1670 return LMUL_F8;
1671
1672 default:
1673 gcc_unreachable ();
1674 }
1675 }
1676 else
1677 {
1678 int factor = size / TARGET_MIN_VLEN;
1679 switch (factor)
1680 {
1681 case 1:
1682 return LMUL_1;
1683 case 2:
1684 return LMUL_2;
1685 case 4:
1686 return LMUL_4;
1687 case 8:
1688 return LMUL_8;
1689
1690 default:
1691 gcc_unreachable ();
1692 }
1693 }
1694 }
1695 return mode_vtype_infos.vlmul[mode];
1696 }
1697
1698 /* Return the VLMAX rtx of vector mode MODE. */
1699 rtx
1700 get_vlmax_rtx (machine_mode mode)
1701 {
1702 gcc_assert (riscv_v_ext_vector_mode_p (mode));
1703 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
1704 }
1705
1706 /* Return the NF value of the corresponding mode. */
1707 unsigned int
1708 get_nf (machine_mode mode)
1709 {
1710 /* We don't allow non-tuple modes to go through this function. */
1711 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1712 return mode_vtype_infos.nf[mode];
1713 }
1714
1715 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1716 the subpart mode is RVVM2SImode. This will help to build
1717 array/struct type in builtins. */
1718 machine_mode
1719 get_subpart_mode (machine_mode mode)
1720 {
1721 /* We don't allow non-tuple modes to go through this function. */
1722 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1723 return mode_vtype_infos.subpart_mode[mode];
1724 }
1725
1726 /* Get ratio according to machine mode. */
1727 unsigned int
1728 get_ratio (machine_mode mode)
1729 {
1730 if (riscv_v_ext_vls_mode_p (mode))
1731 {
1732 unsigned int sew = get_sew (mode);
1733 vlmul_type vlmul = get_vlmul (mode);
1734 switch (vlmul)
1735 {
1736 case LMUL_1:
1737 return sew;
1738 case LMUL_2:
1739 return sew / 2;
1740 case LMUL_4:
1741 return sew / 4;
1742 case LMUL_8:
1743 return sew / 8;
1744 case LMUL_F8:
1745 return sew * 8;
1746 case LMUL_F4:
1747 return sew * 4;
1748 case LMUL_F2:
1749 return sew * 2;
1750
1751 default:
1752 gcc_unreachable ();
1753 }
1754 }
1755 return mode_vtype_infos.ratio[mode];
1756 }
1757
1758 /* Get ta according to operand[tail_op_idx]. */
1759 int
1760 get_ta (rtx ta)
1761 {
1762 if (INTVAL (ta) == TAIL_ANY)
1763 return INVALID_ATTRIBUTE;
1764 return INTVAL (ta);
1765 }
1766
1767 /* Get ma according to operand[mask_op_idx]. */
1768 int
1769 get_ma (rtx ma)
1770 {
1771 if (INTVAL (ma) == MASK_ANY)
1772 return INVALID_ATTRIBUTE;
1773 return INTVAL (ma);
1774 }
1775
1776 /* Get prefer tail policy. */
1777 enum tail_policy
1778 get_prefer_tail_policy ()
1779 {
1780 /* TODO: By default, we choose TAIL_ANY, which allows the
1781 compiler to pick either agnostic or undisturbed.  Maybe we
1782 will add a compile option like -mprefer=agnostic to set
1783 this value in the future. */
1784 return TAIL_ANY;
1785 }
1786
1787 /* Get prefer mask policy. */
1788 enum mask_policy
1789 get_prefer_mask_policy ()
1790 {
1791 /* TODO: By default, we choose MASK_ANY, which allows the
1792 compiler to pick either agnostic or undisturbed.  Maybe we
1793 will add a compile option like -mprefer=agnostic to set
1794 this value in the future. */
1795 return MASK_ANY;
1796 }
1797
1798 /* Get avl_type rtx. */
1799 rtx
1800 get_avl_type_rtx (enum avl_type type)
1801 {
1802 return gen_int_mode (type, Pmode);
1803 }
1804
1805 /* Return the appropriate mask mode for MODE. */
1806
1807 machine_mode
1808 get_mask_mode (machine_mode mode)
1809 {
1810 poly_int64 nunits = GET_MODE_NUNITS (mode);
1811 if (riscv_v_ext_tuple_mode_p (mode))
1812 {
1813 unsigned int nf = get_nf (mode);
1814 nunits = exact_div (nunits, nf);
1815 }
1816 return get_vector_mode (BImode, nunits).require ();
1817 }
1818
1819 /* Return the appropriate M1 mode for MODE. */
1820
1821 static opt_machine_mode
1822 get_m1_mode (machine_mode mode)
1823 {
1824 scalar_mode smode = GET_MODE_INNER (mode);
1825 unsigned int bytes = GET_MODE_SIZE (smode);
1826 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
1827 return get_vector_mode (smode, m1_nunits);
1828 }
1829
1830 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1831 This function is not only used by builtins, but also will be used by
1832 auto-vectorization in the future. */
1833 opt_machine_mode
1834 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
1835 {
1836 enum mode_class mclass;
1837 if (inner_mode == E_BImode)
1838 mclass = MODE_VECTOR_BOOL;
1839 else if (FLOAT_MODE_P (inner_mode))
1840 mclass = MODE_VECTOR_FLOAT;
1841 else
1842 mclass = MODE_VECTOR_INT;
1843 machine_mode mode;
1844 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1845 if (inner_mode == GET_MODE_INNER (mode)
1846 && known_eq (nunits, GET_MODE_NUNITS (mode))
1847 && (riscv_v_ext_vector_mode_p (mode)
1848 || riscv_v_ext_vls_mode_p (mode)))
1849 return mode;
1850 return opt_machine_mode ();
1851 }
1852
1853 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1854 corresponding subpart mode and NF. */
1855 opt_machine_mode
1856 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
1857 {
1858 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
1859 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
1860 enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
1861 machine_mode mode;
1862 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1863 if (inner_mode == GET_MODE_INNER (mode)
1864 && known_eq (nunits, GET_MODE_NUNITS (mode))
1865 && riscv_v_ext_tuple_mode_p (mode)
1866 && get_subpart_mode (mode) == subpart_mode)
1867 return mode;
1868 return opt_machine_mode ();
1869 }
1870
1871 bool
1872 simm5_p (rtx x)
1873 {
1874 if (!CONST_INT_P (x))
1875 return false;
1876 return IN_RANGE (INTVAL (x), -16, 15);
1877 }
1878
1879 bool
1880 neg_simm5_p (rtx x)
1881 {
1882 if (!CONST_INT_P (x))
1883 return false;
1884 return IN_RANGE (INTVAL (x), -15, 16);
1885 }
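
/* neg_simm5_p accepts X when its negation fits in a signed 5-bit
   immediate: if -X must lie in [-16, 15], then X must lie in [-15, 16].
   has_vi_variant_p below relies on this for codes such as MINUS, whose
   .vi form is obtained from an adjusted (e.g. negated) constant.  */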
1886
1887 bool
1888 has_vi_variant_p (rtx_code code, rtx x)
1889 {
1890 switch (code)
1891 {
1892 case PLUS:
1893 case AND:
1894 case IOR:
1895 case XOR:
1896 case SS_PLUS:
1897 case US_PLUS:
1898 case EQ:
1899 case NE:
1900 case LE:
1901 case LEU:
1902 case GT:
1903 case GTU:
1904 return simm5_p (x);
1905
1906 case LT:
1907 case LTU:
1908 case GE:
1909 case GEU:
1910 case MINUS:
1911 case SS_MINUS:
1912 return neg_simm5_p (x);
1913
1914 default:
1915 return false;
1916 }
1917 }
1918
1919 bool
1920 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
1921 machine_mode vector_mode, bool has_vi_variant_p,
1922 void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
1923 {
1924 machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
1925 if (has_vi_variant_p)
1926 {
1927 *scalar_op = force_reg (scalar_mode, *scalar_op);
1928 return false;
1929 }
1930
1931 if (TARGET_64BIT)
1932 {
1933 if (!rtx_equal_p (*scalar_op, const0_rtx))
1934 *scalar_op = force_reg (scalar_mode, *scalar_op);
1935 return false;
1936 }
1937
1938 if (immediate_operand (*scalar_op, Pmode))
1939 {
1940 if (!rtx_equal_p (*scalar_op, const0_rtx))
1941 *scalar_op = force_reg (Pmode, *scalar_op);
1942
1943 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
1944 return false;
1945 }
1946
1947 if (CONST_INT_P (*scalar_op))
1948 {
1949 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
1950 *scalar_op = force_const_mem (scalar_mode, *scalar_op);
1951 else
1952 *scalar_op = force_reg (scalar_mode, *scalar_op);
1953 }
1954
1955 rtx tmp = gen_reg_rtx (vector_mode);
1956 rtx ops[] = {tmp, *scalar_op};
1957 if (type == VLMAX)
1958 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
1959 else
1960 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
1961 vl);
1962 emit_vector_func (operands, tmp);
1963
1964 return true;
1965 }
1966
1967 /* Get the scalar-move mask { 1, 0, 0, ..., 0 }, i.e. only the first element set. */
1968 rtx
1969 gen_scalar_move_mask (machine_mode mode)
1970 {
1971 rtx_vector_builder builder (mode, 1, 2);
1972 builder.quick_push (const1_rtx);
1973 builder.quick_push (const0_rtx);
1974 return builder.build ();
1975 }
1976
1977 static unsigned
1978 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
1979 {
1980 // Original equation:
1981 // VLMAX = (VectorBits / EltSize) * LMUL
1982 // where LMUL = MinSize / TARGET_MIN_VLEN
1983 // The following equations have been reordered to prevent loss of precision
1984 // when calculating fractional LMUL.
1985 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
1986 }
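
/* As a purely illustrative example of the reordering above: with
   vector_bits = 256, elt_size = 32, min_size = 64 and TARGET_MIN_VLEN = 128
   we get ((256 / 32) * 64) / 128 = 4, whereas evaluating
   LMUL = 64 / 128 first would truncate to 0 in integer arithmetic and
   lose the fractional LMUL entirely.  */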
1987
1988 static unsigned
1989 get_unknown_min_value (machine_mode mode)
1990 {
1991 enum vlmul_type vlmul = get_vlmul (mode);
1992 switch (vlmul)
1993 {
1994 case LMUL_1:
1995 return TARGET_MIN_VLEN;
1996 case LMUL_2:
1997 return TARGET_MIN_VLEN * 2;
1998 case LMUL_4:
1999 return TARGET_MIN_VLEN * 4;
2000 case LMUL_8:
2001 return TARGET_MIN_VLEN * 8;
2002 default:
2003 gcc_unreachable ();
2004 }
2005 }
2006
2007 static rtx
2008 force_vector_length_operand (rtx vl)
2009 {
2010 if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
2011 return force_reg (Pmode, vl);
2012 return vl;
2013 }
2014
2015 rtx
2016 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
2017 {
2018 unsigned int sew = get_sew (vmode);
2019 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
2020 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
2021 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
2022 gen_int_mode (get_vlmul (vmode), Pmode),
2023 tail_policy, mask_policy);
2024 }
2025
2026 /* Get the VL * 2 rtx. */
2027 static rtx
2028 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
2029 {
2030 rtx i32vl = NULL_RTX;
2031 if (CONST_INT_P (avl))
2032 {
2033 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
2034 unsigned min_size = get_unknown_min_value (mode);
2035 unsigned vlen_max = RVV_65536;
2036 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
2037 unsigned vlen_min = TARGET_MIN_VLEN;
2038 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
2039
2040 unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
2041 if (avl_int <= vlmax_min)
2042 i32vl = gen_int_mode (2 * avl_int, Pmode);
2043 else if (avl_int >= 2 * vlmax_max)
2044 {
2045 // Just set i32vl to VLMAX in this situation
2046 i32vl = gen_reg_rtx (Pmode);
2047 emit_insn (
2048 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
2049 }
2050 else
2051 {
2052 // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2053 // is related to the hardware implementation.
2054 // So let the following code handle it.
2055 }
2056 }
2057 if (!i32vl)
2058 {
2059 // Use a vsetvli instruction to get the actually used length, which depends
2060 // on the hardware implementation.
2061 rtx i64vl = gen_reg_rtx (Pmode);
2062 emit_insn (
2063 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
2064 // Multiply by 2 to get the VL for 32-bit elements.
2065 i32vl = gen_reg_rtx (Pmode);
2066 emit_insn (
2067 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
2068 }
2069
2070 return force_vector_length_operand (i32vl);
2071 }
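
/* An illustration of the constant-AVL cases above (the numbers are
   assumptions): with TARGET_MIN_VLEN = 128, a SEW = 64, LMUL = 1 mode has
   vlmax_min = 2 and, with the architectural maximum VLEN of 65536,
   vlmax_max = 1024.  An AVL of 2 therefore yields i32vl = 4 directly, an
   AVL of 2048 or more uses the demoted-mode VLMAX, and anything in
   between falls back to the vsetvli + shift sequence.  */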
2072
2073 bool
2074 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
2075 machine_mode demote_mask_mode, rtx *ops)
2076 {
2077 rtx scalar_op = ops[4];
2078 rtx avl = ops[5];
2079 machine_mode scalar_mode = GET_MODE_INNER (mode);
2080 if (rtx_equal_p (scalar_op, const0_rtx))
2081 {
2082 ops[5] = force_vector_length_operand (ops[5]);
2083 return false;
2084 }
2085
2086 if (TARGET_64BIT)
2087 {
2088 ops[4] = force_reg (scalar_mode, scalar_op);
2089 ops[5] = force_vector_length_operand (ops[5]);
2090 return false;
2091 }
2092
2093 if (immediate_operand (scalar_op, Pmode))
2094 {
2095 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
2096 ops[5] = force_vector_length_operand (ops[5]);
2097 return false;
2098 }
2099
2100 if (CONST_INT_P (scalar_op))
2101 scalar_op = force_reg (scalar_mode, scalar_op);
2102
2103 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
2104
2105 rtx demote_scalar_op1, demote_scalar_op2;
2106 if (unspec == UNSPEC_VSLIDE1UP)
2107 {
2108 demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
2109 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
2110 }
2111 else
2112 {
2113 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
2114 demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
2115 }
2116
2117 rtx temp = gen_reg_rtx (demote_mode);
2118 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
2119 rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
2120 rtx merge = RVV_VUNDEF (demote_mode);
2121 /* Handle vslide1<ud>_tu. */
2122 if (register_operand (ops[2], mode)
2123 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
2124 {
2125 merge = gen_lowpart (demote_mode, ops[2]);
2126 ta = ops[6];
2127 ma = ops[7];
2128 }
2129
2130 emit_insn (gen_pred_slide (unspec, demote_mode, temp,
2131 CONSTM1_RTX (demote_mask_mode), merge,
2132 gen_lowpart (demote_mode, ops[3]),
2133 demote_scalar_op1, vl_x2, ta, ma, ops[8]));
2134 emit_insn (gen_pred_slide (unspec, demote_mode,
2135 gen_lowpart (demote_mode, ops[0]),
2136 CONSTM1_RTX (demote_mask_mode), merge, temp,
2137 demote_scalar_op2, vl_x2, ta, ma, ops[8]));
2138
2139 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
2140 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
2141 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
2142 force_vector_length_operand (ops[5]), ops[6],
2143 ops[8]));
2144 return true;
2145 }
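
/* The two gen_pred_slide calls above implement a 64-bit vslide1up/down on
   RV32 by working in the demoted 32-bit element mode: the first slide
   inserts one half of the scalar, the second inserts the other half, and
   the VL is doubled so the same number of 64-bit elements is covered.
   Roughly, a vslide1up.vx with a 64-bit scalar becomes:

     vsetvli ...                      # 2 * VL, SEW = 32
     vslide1up.vx vtmp, vsrc, hi32
     vslide1up.vx vdest, vtmp, lo32

   where hi32/lo32 denote the two GPR halves of the scalar.  */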
2146
2147 rtx
2148 gen_avl_for_scalar_move (rtx avl)
2149 {
2150 /* AVL for a scalar move behaves differently for 0 and for values larger than 0. */
2151 if (CONST_INT_P (avl))
2152 {
2153 /* So we could just set AVL to 1 for any constant other than 0. */
2154 if (rtx_equal_p (avl, const0_rtx))
2155 return const0_rtx;
2156 else
2157 return const1_rtx;
2158 }
2159 else
2160 {
2161 /* For a non-constant value, we set any non-zero value to 1 by
2162 `sgtu new_avl,input_avl,zero` + `vsetvli`. */
2163 rtx tmp = gen_reg_rtx (Pmode);
2164 emit_insn (
2165 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
2166 return tmp;
2167 }
2168 }
2169
2170 /* Expand data movement for tuple modes. */
2171 void
2172 expand_tuple_move (rtx *ops)
2173 {
2174 unsigned int i;
2175 machine_mode tuple_mode = GET_MODE (ops[0]);
2176 machine_mode subpart_mode = get_subpart_mode (tuple_mode);
2177 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
2178 unsigned int nf = get_nf (tuple_mode);
2179 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
2180
2181 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
2182 {
2183 rtx val;
2184 gcc_assert (can_create_pseudo_p ()
2185 && const_vec_duplicate_p (ops[1], &val));
2186 for (i = 0; i < nf; ++i)
2187 {
2188 poly_int64 offset = i * subpart_size;
2189 rtx subreg
2190 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2191 rtx dup = gen_const_vec_duplicate (subpart_mode, val);
2192 emit_move_insn (subreg, dup);
2193 }
2194 }
2195 else if (REG_P (ops[0]) && REG_P (ops[1]))
2196 {
2197 for (i = 0; i < nf; ++i)
2198 {
2199 int index = i;
2200
2201 /* Take NF = 2 and LMUL = 1 for example:
2202
2203 - move v8 to v9:
2204 vmv1r v10,v9
2205 vmv1r v9,v8
2206
2207 - move v8 to v7:
2208 vmv1r v7,v8
2209 vmv1r v8,v9 */
2210 if (REGNO (ops[0]) > REGNO (ops[1]))
2211 index = nf - 1 - i;
2212 poly_int64 offset = index * subpart_size;
2213 rtx dst_subreg
2214 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2215 rtx src_subreg
2216 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
2217 emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
2218 }
2219 }
2220 else
2221 {
2222 /* Expand tuple memory data movement. */
2223 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
2224 rtx offset = gen_int_mode (subpart_size, Pmode);
2225 if (!subpart_size.is_constant ())
2226 {
2227 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
2228 if (fractional_p)
2229 {
2230 unsigned int factor
2231 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
2232 .to_constant ();
2233 rtx pat
2234 = gen_rtx_ASHIFTRT (Pmode, ops[2],
2235 gen_int_mode (exact_log2 (factor), Pmode));
2236 emit_insn (gen_rtx_SET (ops[2], pat));
2237 }
2238
2239 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
2240 {
2241 unsigned int factor
2242 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
2243 .to_constant ();
2244 rtx pat
2245 = gen_rtx_ASHIFT (Pmode, ops[2],
2246 gen_int_mode (exact_log2 (factor), Pmode));
2247 emit_insn (gen_rtx_SET (ops[2], pat));
2248 }
2249 offset = ops[2];
2250 }
2251
2252 /* Non-fractional LMUL has whole register moves that don't require a
2253 vsetvl for VLMAX. */
2254 if (fractional_p)
2255 emit_vlmax_vsetvl (subpart_mode, ops[4]);
2256 if (MEM_P (ops[1]))
2257 {
2258 /* Load operations. */
2259 emit_move_insn (ops[3], XEXP (ops[1], 0));
2260 for (i = 0; i < nf; i++)
2261 {
2262 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
2263 tuple_mode, i * subpart_size);
2264 if (i != 0)
2265 {
2266 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2267 emit_insn (gen_rtx_SET (ops[3], new_addr));
2268 }
2269 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2270
2271 if (fractional_p)
2272 {
2273 rtx operands[] = {subreg, mem};
2274 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2275 UNARY_OP, operands, ops[4]);
2276 }
2277 else
2278 emit_move_insn (subreg, mem);
2279 }
2280 }
2281 else
2282 {
2283 /* Store operations. */
2284 emit_move_insn (ops[3], XEXP (ops[0], 0));
2285 for (i = 0; i < nf; i++)
2286 {
2287 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
2288 tuple_mode, i * subpart_size);
2289 if (i != 0)
2290 {
2291 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2292 emit_insn (gen_rtx_SET (ops[3], new_addr));
2293 }
2294 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2295
2296 if (fractional_p)
2297 {
2298 rtx operands[] = {mem, subreg};
2299 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2300 UNARY_OP, operands, ops[4]);
2301 }
2302 else
2303 emit_move_insn (mem, subreg);
2304 }
2305 }
2306 }
2307 }
2308
2309 /* Return the vectorization machine mode for RVV according to LMUL. */
2310 machine_mode
2311 preferred_simd_mode (scalar_mode mode)
2312 {
2313 if (autovec_use_vlmax_p ())
2314 {
2315 /* We use LMUL = 1 as the base byte size, which is BYTES_PER_RISCV_VECTOR,
2316 and riscv_autovec_lmul as the multiply factor to calculate the NUNITS of
2317 the auto-vectorization mode. */
2318 poly_uint64 nunits;
2319 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2320 poly_uint64 scalar_size = GET_MODE_SIZE (mode);
2321 /* Disable vectorization when we can't find a RVV mode for it.
2322 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
2323 a double (DFmode) type. */
2324 if (!multiple_p (vector_size, scalar_size, &nunits))
2325 return word_mode;
2326 machine_mode rvv_mode;
2327 if (get_vector_mode (mode, nunits).exists (&rvv_mode))
2328 return rvv_mode;
2329 }
2330 return word_mode;
2331 }
2332
2333 /* Subroutine of riscv_vector_expand_vector_init.
2334 Works as follows:
2335 (a) Initialize TARGET by broadcasting element 0 of BUILDER.
2336 (b) Skip the leading elements of BUILDER that duplicate element 0.
2337 (c) Insert the remaining elements into TARGET in order using
2338 vslide1down. */
2339
2340 static void
2341 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
2342 int nelts_reqd)
2343 {
2344 machine_mode mode = GET_MODE (target);
2345 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2346 emit_move_insn (target, dup);
2347 int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
2348 for (int i = ndups; i < nelts_reqd; i++)
2349 {
2350 unsigned int unspec
2351 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
2352 insn_code icode = code_for_pred_slide (unspec, mode);
2353 rtx ops[] = {target, target, builder.elt (i)};
2354 emit_vlmax_insn (icode, BINARY_OP, ops);
2355 }
2356 }
2357
2358 /* Use merge approach to initialize the vector with repeating sequence.
2359 v = {a, b, a, b, a, b, a, b}.
2360
2361 v = broadcast (a).
2362 mask = 0b01010101....
2363 v = merge (v, b, mask)
2364 */
2365 static void
2366 expand_vector_init_merge_repeating_sequence (rtx target,
2367 const rvv_builder &builder)
2368 {
2369 /* We can't use the BIT mode (BI) directly to generate the mask = 0b01010...
2370 since we don't have such an instruction in RVV.
2371 Instead, we use an INT mode (QI/HI/SI/DI) with an integer move
2372 instruction to generate the mask data we want. */
2373 machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
2374 machine_mode mask_int_mode
2375 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
2376 uint64_t full_nelts = builder.full_nelts ().to_constant ();
2377
2378 /* Step 1: Broadcast the first pattern. */
2379 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
2380 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
2381 UNARY_OP, ops);
2382 /* Step 2: Merge the rest iteration of pattern. */
2383 for (unsigned int i = 1; i < builder.npatterns (); i++)
2384 {
2385 /* Step 2-1: Generate mask register v0 for each merge. */
2386 rtx merge_mask
2387 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
2388 rtx mask = gen_reg_rtx (mask_bit_mode);
2389 rtx dup = gen_reg_rtx (mask_int_mode);
2390
2391 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
2392 {
2393 rtx ops[] = {dup, merge_mask};
2394 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
2395 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
2396 }
2397 else /* vmv.v.x. */
2398 {
2399 rtx ops[] = {dup,
2400 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
2401 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
2402 Pmode);
2403 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
2404 ops, vl);
2405 }
2406
2407 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
2408
2409 /* Step 2-2: Merge pattern according to the mask. */
2410 rtx ops[] = {target, target, builder.elt (i), mask};
2411 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
2412 MERGE_OP, ops);
2413 }
2414 }
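
/* A concrete illustration of the merge approach above (sizes are
   assumptions): for v = { a, b, a, b, a, b, a, b } with 8 elements the
   loop runs once, for pattern i = 1, building a scalar merge mask with one
   bit per element, here 0b10101010 (set where the element should come
   from b).  That scalar is broadcast into an integer vector, reinterpreted
   as the mask { 0, 1, 0, 1, 0, 1, 0, 1 }, and the final merge then
   replaces the odd elements of broadcast (a) with b.  */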
2415
2416 /* Use slideup approach to combine the vectors.
2417 v = {a, a, a, a, b, b, b, b}
2418
2419 First:
2420 v1 = {a, a, a, a, a, a, a, a}
2421 v2 = {b, b, b, b, b, b, b, b}
2422 v = slideup (v1, v2, nelt / 2)
2423 */
2424 static void
2425 expand_vector_init_slideup_combine_sequence (rtx target,
2426 const rvv_builder &builder)
2427 {
2428 machine_mode mode = GET_MODE (target);
2429 int nelts = builder.full_nelts ().to_constant ();
2430 rtx first_elt = builder.elt (0);
2431 rtx last_elt = builder.elt (nelts - 1);
2432 rtx low = expand_vector_broadcast (mode, first_elt);
2433 rtx high = expand_vector_broadcast (mode, last_elt);
2434 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
2435 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
2436 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
2437 }
2438
2439 /* Use merge approach to merge a scalar into a vector.
2440 v = {a, a, a, a, a, a, b, b}
2441
2442 v1 = {a, a, a, a, a, a, a, a}
2443 scalar = b
2444 mask = {0, 0, 0, 0, 0, 0, 1, 1}
2445 */
2446 static void
2447 expand_vector_init_merge_combine_sequence (rtx target,
2448 const rvv_builder &builder)
2449 {
2450 machine_mode mode = GET_MODE (target);
2451 machine_mode imode = builder.int_mode ();
2452 machine_mode mmode = builder.mask_mode ();
2453 int nelts = builder.full_nelts ().to_constant ();
2454 int leading_ndups = builder.count_dups (0, nelts - 1, 1);
2455 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
2456 || riscv_get_v_regno_alignment (imode) > 1)
2457 imode = get_vector_mode (HImode, nelts).require ();
2458
2459 /* Generate vid = { 0, 1, 2, ..., n }. */
2460 rtx vid = gen_reg_rtx (imode);
2461 expand_vec_series (vid, const0_rtx, const1_rtx);
2462
2463 /* Generate mask. */
2464 rtx mask = gen_reg_rtx (mmode);
2465 insn_code icode = code_for_pred_cmp_scalar (imode);
2466 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
2467 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
2468 /* vmsgtu.vi/vmsgtu.vx. */
2469 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
2470 rtx sel = builder.elt (nelts - 1);
2471 rtx mask_ops[] = {mask, cmp, vid, index};
2472 emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
2473
2474 /* Duplicate the first elements. */
2475 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2476 /* Merge scalar into vector according to mask. */
2477 rtx merge_ops[] = {target, dup, sel, mask};
2478 icode = code_for_pred_merge_scalar (mode);
2479 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
2480 }
2481
2482 /* Subroutine of expand_vec_init to handle case
2483 when all trailing elements of builder are same.
2484 This works as follows:
2485 (a) Use expand_insn interface to broadcast last vector element in TARGET.
2486 (b) Insert the remaining elements into TARGET using vslide1up.
2487
2488 ??? The heuristic used is to do the above if the number of identical
2489 trailing elements is greater than LEADING_NDUPS, loosely based on the
2490 heuristic from mostly_zeros_p.  May need fine-tuning. */
2491
2492 static bool
2493 expand_vector_init_trailing_same_elem (rtx target,
2494 const rtx_vector_builder &builder,
2495 int nelts_reqd)
2496 {
2497 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
2498 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
2499 machine_mode mode = GET_MODE (target);
2500
2501 if (trailing_ndups > leading_ndups)
2502 {
2503 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
2504 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
2505 {
2506 unsigned int unspec
2507 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
2508 insn_code icode = code_for_pred_slide (unspec, mode);
2509 rtx tmp = gen_reg_rtx (mode);
2510 rtx ops[] = {tmp, dup, builder.elt (i)};
2511 emit_vlmax_insn (icode, BINARY_OP, ops);
2512 /* slide1up needs source and dest to be different REGs. */
2513 dup = tmp;
2514 }
2515
2516 emit_move_insn (target, dup);
2517 return true;
2518 }
2519
2520 return false;
2521 }
2522
2523 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
2524
2525 void
2526 expand_vec_init (rtx target, rtx vals)
2527 {
2528 machine_mode mode = GET_MODE (target);
2529 int nelts = XVECLEN (vals, 0);
2530
2531 rvv_builder v (mode, nelts, 1);
2532 for (int i = 0; i < nelts; i++)
2533 v.quick_push (XVECEXP (vals, 0, i));
2534 v.finalize ();
2535
2536 if (nelts > 3)
2537 {
2538 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
2539 if (v.can_duplicate_repeating_sequence_p ())
2540 {
2541 rtx ele = v.get_merged_repeating_sequence ();
2542 rtx dup = expand_vector_broadcast (v.new_mode (), ele);
2543 emit_move_insn (target, gen_lowpart (mode, dup));
2544 return;
2545 }
2546
2547 /* Case 2: Optimize repeating sequence cases that Case 1 cannot
2548 handle, when it is profitable.  For example:
2549 ELEMENT BITSIZE = 64.
2550 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
2551 We can't find a vector mode for "ab" which will be combined into
2552 128-bit element to duplicate. */
2553 if (v.repeating_sequence_use_merge_profitable_p ())
2554 {
2555 expand_vector_init_merge_repeating_sequence (target, v);
2556 return;
2557 }
2558
2559 /* Case 3: Optimize combine sequence.
2560 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
2561 We can combine:
2562 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2563 and
2564 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
2565 by slideup. */
2566 if (v.combine_sequence_use_slideup_profitable_p ())
2567 {
2568 expand_vector_init_slideup_combine_sequence (target, v);
2569 return;
2570 }
2571
2572 /* Case 4: Optimize combine sequence.
2573 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
2574
2575 Generate vector:
2576 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2577
2578 Generate mask:
2579 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
2580
2581 Merge b into v by mask:
2582 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */
2583 if (v.combine_sequence_use_merge_profitable_p ())
2584 {
2585 expand_vector_init_merge_combine_sequence (target, v);
2586 return;
2587 }
2588 }
2589
2590 /* Optimize trailing same elements sequence:
2591 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
2592 if (!expand_vector_init_trailing_same_elem (target, v, nelts))
2593 /* Handle common situation by vslide1down. This function can handle any
2594 situation of vec_init<mode>. Only the cases that are not optimized above
2595 will fall through here. */
2596 expand_vector_init_insert_elems (target, v, nelts);
2597 }
2598
2599 /* Get insn code for corresponding comparison. */
2600
2601 static insn_code
2602 get_cmp_insn_code (rtx_code code, machine_mode mode)
2603 {
2604 insn_code icode;
2605 switch (code)
2606 {
2607 case EQ:
2608 case NE:
2609 case LE:
2610 case LEU:
2611 case GT:
2612 case GTU:
2613 case LTGT:
2614 icode = code_for_pred_cmp (mode);
2615 break;
2616 case LT:
2617 case LTU:
2618 case GE:
2619 case GEU:
2620 if (FLOAT_MODE_P (mode))
2621 icode = code_for_pred_cmp (mode);
2622 else
2623 icode = code_for_pred_ltge (mode);
2624 break;
2625 default:
2626 gcc_unreachable ();
2627 }
2628 return icode;
2629 }
2630
2631 /* This hook gives the vectorizer more vector mode options. We want it to not
2632 only try modes with the maximum number of units a full vector can hold but
2633 for example also half the number of units for a smaller element size.
2634 Such vectors can be promoted to a full vector of widened elements
2635 (still with the same number of elements, essentially vectorizing at a
2636 fixed number of units rather than a fixed number of bytes). */
2637 unsigned int
2638 autovectorize_vector_modes (vector_modes *modes, bool)
2639 {
2640 if (autovec_use_vlmax_p ())
2641 {
2642 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2643
2644 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
2645 fit a whole vector.
2646 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
2647 is guided by the extensions we have available (vf2, vf4 and vf8).
2648
2649 - full_size: Try using full vectors for all element types.
2650 - full_size / 2:
2651 Try using 16-bit containers for 8-bit elements and full vectors
2652 for wider elements.
2653 - full_size / 4:
2654 Try using 32-bit containers for 8-bit and 16-bit elements and
2655 full vectors for wider elements.
2656 - full_size / 8:
2657 Try using 64-bit containers for all element types. */
2658 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
2659 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
2660 {
2661 poly_uint64 units;
2662 machine_mode mode;
2663 if (can_div_trunc_p (full_size, rvv_factors[i], &units)
2664 && get_vector_mode (QImode, units).exists (&mode))
2665 modes->safe_push (mode);
2666 }
2667 }
2668 /* Push all VLSmodes according to TARGET_MIN_VLEN. */
2669 unsigned int i = 0;
2670 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
2671 unsigned int size = base_size;
2672 machine_mode mode;
2673 while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
2674 {
2675 if (vls_mode_valid_p (mode))
2676 modes->safe_push (mode);
2677
2678 i++;
2679 size = base_size / (1U << i);
2680 }
2681 /* Enable LOOP_VINFO comparison in COST model. */
2682 return VECT_COMPARE_COSTS;
2683 }
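
/* As an illustration of the VLS loop above (TARGET_MIN_VLEN = 128 and
   TARGET_MAX_LMUL = 8 are assumed values): base_size = 128 * 8 / 8 = 128,
   so the loop tries QImode VLS modes of 128, 64, 32, ..., 1 bytes and
   pushes each one that vls_mode_valid_p accepts.  */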
2684
2685 /* Return true if we can find the related MODE according to default LMUL. */
2686 static bool
2687 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
2688 poly_uint64 *nunits)
2689 {
2690 if (!autovec_use_vlmax_p ())
2691 return false;
2692 if (riscv_v_ext_vector_mode_p (vector_mode)
2693 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
2694 GET_MODE_SIZE (element_mode), nunits))
2695 return true;
2696 if (riscv_v_ext_vls_mode_p (vector_mode)
2697 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
2698 GET_MODE_SIZE (element_mode), nunits))
2699 return true;
2700 return false;
2701 }
2702
2703 /* If the given VECTOR_MODE is an RVV mode, first get the largest number
2704 of units that fit into a full vector at the given ELEMENT_MODE.
2705 We will have the vectorizer call us with a successively decreasing
2706 number of units (as specified in autovectorize_vector_modes).
2707 The starting mode is always the one specified by preferred_simd_mode. */
2708 opt_machine_mode
2709 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
2710 poly_uint64 nunits)
2711 {
2712 /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
2713 poly_uint64 min_units;
2714 if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
2715 {
2716 machine_mode rvv_mode;
2717 if (maybe_ne (nunits, 0U))
2718 {
2719 /* If we were given a number of units NUNITS, try to find an
2720 RVV vector mode of inner mode ELEMENT_MODE with the same
2721 number of units. */
2722 if (multiple_p (min_units, nunits)
2723 && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
2724 return rvv_mode;
2725 }
2726 else
2727 {
2728 /* Look for a vector mode with the same number of units as the
2729 VECTOR_MODE we were given. We keep track of the minimum
2730 number of units so far which determines the smallest necessary
2731 but largest possible, suitable mode for vectorization. */
2732 min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
2733 if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
2734 return rvv_mode;
2735 }
2736 }
2737
2738 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2739 }
2740
2741 /* Expand an RVV comparison. */
2742
2743 void
2744 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1)
2745 {
2746 machine_mode mask_mode = GET_MODE (target);
2747 machine_mode data_mode = GET_MODE (op0);
2748 insn_code icode = get_cmp_insn_code (code, data_mode);
2749
2750 if (code == LTGT)
2751 {
2752 rtx lt = gen_reg_rtx (mask_mode);
2753 rtx gt = gen_reg_rtx (mask_mode);
2754 expand_vec_cmp (lt, LT, op0, op1);
2755 expand_vec_cmp (gt, GT, op0, op1);
2756 icode = code_for_pred (IOR, mask_mode);
2757 rtx ops[] = {target, lt, gt};
2758 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2759 return;
2760 }
2761
2762 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2763 rtx ops[] = {target, cmp, op0, op1};
2764 emit_vlmax_insn (icode, COMPARE_OP, ops);
2765 }
2766
2767 void
2768 expand_vec_cmp (rtx target, rtx_code code, rtx mask, rtx maskoff, rtx op0,
2769 rtx op1)
2770 {
2771 machine_mode mask_mode = GET_MODE (target);
2772 machine_mode data_mode = GET_MODE (op0);
2773 insn_code icode = get_cmp_insn_code (code, data_mode);
2774
2775 if (code == LTGT)
2776 {
2777 rtx lt = gen_reg_rtx (mask_mode);
2778 rtx gt = gen_reg_rtx (mask_mode);
2779 expand_vec_cmp (lt, LT, mask, maskoff, op0, op1);
2780 expand_vec_cmp (gt, GT, mask, maskoff, op0, op1);
2781 icode = code_for_pred (IOR, mask_mode);
2782 rtx ops[] = {target, lt, gt};
2783 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2784 return;
2785 }
2786
2787 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2788 rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
2789 emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
2790 }
2791
2792 /* Expand an RVV floating-point comparison:
2793
2794 If CAN_INVERT_P is true, the caller can also handle inverted results;
2795 return true if the result is in fact inverted. */
2796
2797 bool
2798 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
2799 bool can_invert_p)
2800 {
2801 machine_mode mask_mode = GET_MODE (target);
2802 machine_mode data_mode = GET_MODE (op0);
2803
2804 /* If can_invert_p = true:
2805 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
2806
2807 vmfeq.vv v0, va, va
2808 vmfeq.vv v1, vb, vb
2809 vmand.mm v0, v0, v1
2810 vmflt.vv v0, va, vb, v0.t
2811 vmnot.m v0, v0
2812
2813 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
2814 second vmfeq.vv:
2815
2816 vmfeq.vv v0, va, va
2817 vmfeq.vv v0, vb, vb, v0.t
2818 vmflt.vv v0, va, vb, v0.t
2819 vmnot.m v0, v0
2820
2821 If can_invert_p = false:
2822
2823 # Example of implementing isgreater()
2824 vmfeq.vv v0, va, va # Only set where A is not NaN.
2825 vmfeq.vv v1, vb, vb # Only set where B is not NaN.
2826 vmand.mm v0, v0, v1 # Only set where A and B are ordered,
2827 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
2828 */
2829
2830 rtx eq0 = gen_reg_rtx (mask_mode);
2831 rtx eq1 = gen_reg_rtx (mask_mode);
2832 switch (code)
2833 {
2834 case EQ:
2835 case NE:
2836 case LT:
2837 case LE:
2838 case GT:
2839 case GE:
2840 case LTGT:
2841 /* There is native support for the comparison. */
2842 expand_vec_cmp (target, code, op0, op1);
2843 return false;
2844 case UNEQ:
2845 case ORDERED:
2846 case UNORDERED:
2847 case UNLT:
2848 case UNLE:
2849 case UNGT:
2850 case UNGE:
2851 /* vmfeq.vv v0, va, va */
2852 expand_vec_cmp (eq0, EQ, op0, op0);
2853 if (HONOR_SNANS (data_mode))
2854 {
2855 /*
2856 vmfeq.vv v1, vb, vb
2857 vmand.mm v0, v0, v1
2858 */
2859 expand_vec_cmp (eq1, EQ, op1, op1);
2860 insn_code icode = code_for_pred (AND, mask_mode);
2861 rtx ops[] = {eq0, eq0, eq1};
2862 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2863 }
2864 else
2865 {
2866 /* vmfeq.vv v0, vb, vb, v0.t */
2867 expand_vec_cmp (eq0, EQ, eq0, eq0, op1, op1);
2868 }
2869 break;
2870 default:
2871 gcc_unreachable ();
2872 }
2873
2874 if (code == ORDERED)
2875 {
2876 emit_move_insn (target, eq0);
2877 return false;
2878 }
2879
2880 /* There is native support for the inverse comparison. */
2881 code = reverse_condition_maybe_unordered (code);
2882 if (code == ORDERED)
2883 emit_move_insn (target, eq0);
2884 else
2885 expand_vec_cmp (eq0, code, eq0, eq0, op0, op1);
2886
2887 if (can_invert_p)
2888 {
2889 emit_move_insn (target, eq0);
2890 return true;
2891 }
2892
2893 /* We use one_cmpl<mode>2 so that the combine pass can combine mask instructions
2894 into vmand.mm/vmnand.mm/vmnor.mm/vmxnor.mm. */
2895 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
2896 return false;
2897 }
2898
2899 /* Modulo all SEL indices to ensure they are all in the range [0, MAX_SEL].
2900 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
2901 2 * nunits - 1. */
2902 static rtx
2903 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
2904 {
2905 rtx sel_mod;
2906 machine_mode sel_mode = GET_MODE (sel);
2907 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2908 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
2909 /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
2910 Likewise, if SEL is constant-length with all indices within [0, MAX_SEL],
2911 there is no need to modulo the indices. */
2912 if (CONST_VECTOR_P (sel)
2913 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
2914 sel_mod = sel;
2915 else
2916 {
2917 rtx mod = gen_const_vector_dup (sel_mode, max_sel);
2918 sel_mod
2919 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
2920 }
2921 return sel_mod;
2922 }
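
/* The AND above is a cheap modulo because MAX_SEL + 1 (either nunits or
   2 * nunits) is a power of two for RVV modes.  E.g. with nunits = 4 and
   op0 != op1, MAX_SEL = 7 and a selector { 9, 2, 12, 5 } becomes
   { 1, 2, 4, 5 }, which matches the wrapping semantics vec_perm
   requires.  */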
2923
2924 /* Implement vec_perm<mode>. */
2925
2926 void
2927 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
2928 {
2929 machine_mode data_mode = GET_MODE (target);
2930 machine_mode sel_mode = GET_MODE (sel);
2931 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2932
2933 /* Check if SEL only references the first values vector: if each select
2934 index is in the range [0, nunits - 1], a single vrgather instruction is
2935 enough.  Since we will use vrgatherei16.vv for variable-length vectors,
2936 it is never out of range and we don't need to modulo the index. */
2937 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
2938 {
2939 emit_vlmax_gather_insn (target, op0, sel);
2940 return;
2941 }
2942
2943 /* Check if all the indices are the same. */
2944 rtx elt;
2945 if (const_vec_duplicate_p (sel, &elt))
2946 {
2947 poly_uint64 value = rtx_to_poly_int64 (elt);
2948 rtx op = op0;
2949 if (maybe_gt (value, nunits - 1))
2950 {
2951 sel = gen_const_vector_dup (sel_mode, value - nunits);
2952 op = op1;
2953 }
2954 emit_vlmax_gather_insn (target, op, sel);
2955 }
2956
2957 /* Note: vec_perm indices are supposed to wrap when they go beyond the
2958 size of the two value vectors, i.e. the upper bits of the indices
2959 are effectively ignored. RVV vrgather instead produces 0 for any
2960 out-of-range indices, so we need to modulo all the vec_perm indices
2961 to ensure they are all in range of [0, nunits - 1] when op0 == op1
2962 or all in range of [0, 2 * nunits - 1] when op0 != op1. */
2963 rtx sel_mod = modulo_sel_indices (op0, op1, sel);
2964
2965 /* Check if the two values vectors are the same. */
2966 if (rtx_equal_p (op0, op1))
2967 {
2968 emit_vlmax_gather_insn (target, op0, sel_mod);
2969 return;
2970 }
2971
2972 /* The following sequence handles the case of
2973 __builtin_shufflevector (vec1, vec2, index...), where each index can be
2974 any value in the range [0, 2 * nunits - 1]. */
2975 machine_mode mask_mode;
2976 mask_mode = get_mask_mode (data_mode);
2977 rtx mask = gen_reg_rtx (mask_mode);
2978 rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
2979
2980 /* Step 1: Generate a mask that selects everything >= nunits into the
2981 mask. */
2982 expand_vec_cmp (mask, GEU, sel_mod, max_sel);
2983
2984 /* Step 2: Gather every op0 value indexed by sel into target;
2985 we don't need to care about the result of the element
2986 whose index >= nunits. */
2987 emit_vlmax_gather_insn (target, op0, sel_mod);
2988
2989 /* Step 3: Shift the range from (nunits, max_of_mode] to
2990 [0, max_of_mode - nunits]. */
2991 rtx tmp = gen_reg_rtx (sel_mode);
2992 rtx ops[] = {tmp, sel_mod, max_sel};
2993 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
2994
2995 /* Step 4: Gather those into the previously masked-out elements
2996 of target. */
2997 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
2998 }
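
/* Putting the steps above together for an assumed nunits = 4 and
   sel = { 0, 5, 2, 7 }:

     mask              = { 0, 1, 0, 1 }   (sel_mod >= 4)
     gather from op0   = { op0[0], ?, op0[2], ? }
     sel_mod - 4       = { ?, 1, ?, 3 }   (only the masked lanes matter)
     masked gather mu  = { op0[0], op1[1], op0[2], op1[3] }  */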
2999
3000 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3001
3002 /* vec_perm support. */
3003
3004 struct expand_vec_perm_d
3005 {
3006 rtx target, op0, op1;
3007 vec_perm_indices perm;
3008 machine_mode vmode;
3009 machine_mode op_mode;
3010 bool one_vector_p;
3011 bool testing_p;
3012 };
3013
3014 /* Return the appropriate index mode for gather instructions. */
3015 opt_machine_mode
3016 get_gather_index_mode (struct expand_vec_perm_d *d)
3017 {
3018 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3019 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
3020
3021 if (GET_MODE_INNER (d->vmode) == QImode)
3022 {
3023 if (nunits.is_constant ())
3024 {
3025 /* If the indices form an LMUL8 CONST_VECTOR and any element value
3026 exceeds the range 0 ~ 255, forbid such a permutation
3027 since we would need a vector HImode to hold such indices and
3028 we don't have it. */
3029 if (!d->perm.all_in_range_p (0, 255)
3030 && !get_vector_mode (HImode, nunits).exists (&sel_mode))
3031 return opt_machine_mode ();
3032 }
3033 else
3034 {
3035 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3036 Otherwise, it could overflow the index range. */
3037 if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
3038 return opt_machine_mode ();
3039 }
3040 }
3041 else if (riscv_get_v_regno_alignment (sel_mode) > 1
3042 && GET_MODE_INNER (sel_mode) != HImode)
3043 sel_mode = get_vector_mode (HImode, nunits).require ();
3044 return sel_mode;
3045 }
3046
3047 /* Recognize the patterns where we can use a merge operation to shuffle the
3048 vectors.  The value of each element (index i) in the selector can only be
3049 either i or nunits + i.  We will check that the pattern is actually monotonic.
3050
3051 E.g.
3052 v = VEC_PERM_EXPR (v0, v1, selector),
3053 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3054
3055 We can transform such pattern into:
3056
3057 v = vcond_mask (v0, v1, mask),
3058 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3059
3060 static bool
3061 shuffle_merge_patterns (struct expand_vec_perm_d *d)
3062 {
3063 machine_mode vmode = d->vmode;
3064 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3065 int n_patterns = d->perm.encoding ().npatterns ();
3066 poly_int64 vec_len = d->perm.length ();
3067
3068 for (int i = 0; i < n_patterns; ++i)
3069 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
3070 return false;
3071
3072 /* Check the pattern is monotonic here, otherwise, return false. */
3073 for (int i = n_patterns; i < n_patterns * 2; i++)
3074 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
3075 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
3076 return false;
3077
3078 /* We need to use a precomputed mask for such a situation, and such a mask
3079 can only be computed for modes whose size is known at compile time. */
3080 bool indices_fit_selector_p
3081 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
3082 if (!indices_fit_selector_p && !vec_len.is_constant ())
3083 return false;
3084
3085 if (d->testing_p)
3086 return true;
3087
3088 machine_mode mask_mode = get_mask_mode (vmode);
3089 rtx mask = gen_reg_rtx (mask_mode);
3090
3091 if (indices_fit_selector_p)
3092 {
3093 /* MASK = SELECTOR < NUNITS ? 1 : 0. */
3094 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3095 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
3096 insn_code icode = code_for_pred_cmp_scalar (sel_mode);
3097 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
3098 rtx ops[] = {mask, cmp, sel, x};
3099 emit_vlmax_insn (icode, COMPARE_OP, ops);
3100 }
3101 else
3102 {
3103 /* For EEW8, NUNITS may be larger than 255, so we can't use vmsltu
3104 directly to generate the selector mask; instead, we can only use a
3105 precomputed mask.
3106
3107 E.g. for selector = <0, 257, 2, 259> on an EEW8 vector with NUNITS = 256,
3108 we don't have a QImode scalar register to hold a value larger than 255.
3109 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3110 since there is no larger HImode vector, we cannot create a larger
3111 selector.
3112
3113 As the mask is a simple {0, 1, ...} pattern and the length is known we
3114 can store it in a scalar register and broadcast it to a mask register.
3115 */
3116 gcc_assert (vec_len.is_constant ());
3117 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
3118 machine_mode mode = get_vector_mode (QImode, size).require ();
3119 rtx tmp = gen_reg_rtx (mode);
3120 rvv_builder v (mode, 1, size);
3121 for (int i = 0; i < vec_len.to_constant () / 8; i++)
3122 {
3123 uint8_t value = 0;
3124 for (int j = 0; j < 8; j++)
3125 {
3126 int index = i * 8 + j;
3127 if (known_lt (d->perm[index], 256))
3128 value |= 1 << j;
3129 }
3130 v.quick_push (gen_int_mode (value, QImode));
3131 }
3132 emit_move_insn (tmp, v.build ());
3133 emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
3134 }
3135
3136 /* TARGET = MASK ? OP0 : OP1. */
3137 /* Swap op0 and op1 since the operand order is opposite to pred_merge. */
3138 rtx ops2[] = {d->target, d->op1, d->op0, mask};
3139 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
3140 return true;
3141 }
3142
3143 /* Recognize the consecutive index patterns where we can use a single
3144 vrgather.v[x|i] to shuffle the vectors.
3145
3146 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3147 Use SEW = 32, index = 1 vrgather.vi to get the result. */
3148 static bool
3149 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
3150 {
3151 machine_mode vmode = d->vmode;
3152 scalar_mode smode = GET_MODE_INNER (vmode);
3153 poly_int64 vec_len = d->perm.length ();
3154 HOST_WIDE_INT elt;
3155
3156 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
3157 return false;
3158 int vlen = vec_len.to_constant ();
3159
3160 /* Compute the last element index of consecutive pattern from the leading
3161 consecutive elements. */
3162 int last_consecutive_idx = -1;
3163 int consecutive_num = -1;
3164 for (int i = 1; i < vlen; i++)
3165 {
3166 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3167 break;
3168 last_consecutive_idx = i;
3169 consecutive_num = last_consecutive_idx + 1;
3170 }
3171
3172 int new_vlen = vlen / consecutive_num;
3173 if (last_consecutive_idx < 0 || consecutive_num == vlen
3174 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
3175 return false;
3176 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3177 All of the elements index, index + 1, ... index + consecutive_num - 1 should
3178 be located in the same vector. */
3179 if (maybe_ge (d->perm[0], vec_len)
3180 != maybe_ge (d->perm[last_consecutive_idx], vec_len))
3181 return false;
3182 /* If a vector has 8 elements, we allow optimizations on consecutive
3183 patterns, e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3184 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3185 to optimize. */
3186 if (d->perm[0].to_constant () % consecutive_num != 0)
3187 return false;
3188 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
3189 if (container_bits > 64)
3190 return false;
3191 else if (container_bits == 64)
3192 {
3193 if (!TARGET_VECTOR_ELEN_64)
3194 return false;
3195 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
3196 return false;
3197 }
3198
3199 /* Check that the rest of the elements follow the same consecutive pattern. */
3200 for (int i = consecutive_num; i < vlen; i++)
3201 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
3202 return false;
3203
3204 if (FLOAT_MODE_P (smode))
3205 smode = float_mode_for_size (container_bits).require ();
3206 else
3207 smode = int_mode_for_size (container_bits, 0).require ();
3208 if (!get_vector_mode (smode, new_vlen).exists (&vmode))
3209 return false;
3210 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3211
3212 /* Success! */
3213 if (d->testing_p)
3214 return true;
3215
3216 int index = elt / consecutive_num;
3217 if (index >= new_vlen)
3218 index = index - new_vlen;
3219 rtx sel = gen_const_vector_dup (sel_mode, index);
3220 rtx op = elt >= vlen ? d->op0 : d->op1;
3221 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
3222 gen_lowpart (vmode, op), sel);
3223 return true;
3224 }
3225
3226 /* Recognize the patterns where we can use a compress operation to shuffle the
3227 vectors.  The perm selector of a compress pattern is divided into 2 parts:
3228 The first part is arbitrary index numbers < NUNITS.
3229 The second part is the consecutive last N index numbers >= NUNITS.
3230
3231 E.g.
3232 v = VEC_PERM_EXPR (v0, v1, selector),
3233 selector = { 0, 2, 6, 7 }
3234
3235 We can transform such pattern into:
3236
3237 op1 = vcompress (op0, mask)
3238 mask = { 1, 0, 1, 0 }
3239 v = op1. */
3240
3241 static bool
3242 shuffle_compress_patterns (struct expand_vec_perm_d *d)
3243 {
3244 machine_mode vmode = d->vmode;
3245 poly_int64 vec_len = d->perm.length ();
3246
3247 if (!vec_len.is_constant ())
3248 return false;
3249
3250 int vlen = vec_len.to_constant ();
3251
3252 /* The compress pattern is not worthwhile when it has fewer than 4 elements,
3253 and we can't modulo indices for the compress pattern. */
3254 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
3255 return false;
3256
3257 /* Compress pattern doesn't work for one vector. */
3258 if (d->one_vector_p)
3259 return false;
3260
3261 /* The compress point is the point at which all selector values with index
3262 i >= compress point form a consecutive increasing series, with
3263 each selector value >= NUNITS.  In this case, we could compress all elements
3264 with i < compress point into op1. */
3265 int compress_point = -1;
3266 for (int i = 0; i < vlen; i++)
3267 {
3268 if (compress_point < 0 && known_ge (d->perm[i], vec_len))
3269 {
3270 compress_point = i;
3271 break;
3272 }
3273 }
3274
3275 /* We don't apply compress approach if we can't find the compress point. */
3276 if (compress_point < 0)
3277 return false;
3278
3279 /* We can only apply compress approach when all index values from 0 to
3280 compress point are increasing. */
3281 for (int i = 1; i < compress_point; i++)
3282 if (maybe_le (d->perm[i], d->perm[i - 1]))
3283 return false;
3284
3285 /* It must be a consecutively increasing series from the compress point. */
3286 for (int i = 1 + compress_point; i < vlen; i++)
3287 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3288 return false;
3289
3290 /* Success! */
3291 if (d->testing_p)
3292 return true;
3293
3294 /* Check whether we need to slide op1 up to apply the compress approach.
3295
3296 E.g. for index = { 0, 2, 6, 7 }, the last index d->perm[vlen - 1] = 7,
3297 which is 2 * NUNITS - 1, so we don't need to slide up.
3298
3299 For index = { 0, 2, 5, 6 }, we need to slide op1 up before
3300 we apply the compress approach. */
3301 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
3302 && !const_vec_duplicate_p (d->op1);
3303
3304 /* If we leave it to be handled directly by the general gather,
3305 the code sequence will be:
3306 VECTOR LOAD selector
3307 GEU mask, selector, NUNITS
3308 GATHER dest, op0, selector
3309 SUB selector, selector, NUNITS
3310 GATHER dest, op1, selector, mask
3311 Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
3312 as COST = 4. So, we consider the general gather handling COST = 9.
3313 TODO: This cost is not accurate, we can adjust it by tune info. */
3314 int general_cost = 9;
3315
3316 /* If we can use the compress approach, the code sequence will be:
3317 MASK LOAD mask
3318 COMPRESS op1, op0, mask
3319 If it needs slide up, it will be:
3320 MASK LOAD mask
3321 SLIDEUP op1
3322 COMPRESS op1, op0, mask
3323 By default, mask load COST = 2.
3324 TODO: This cost is not accurate, we can adjust it by tune info. */
3325 int compress_cost = 4;
3326
3327 if (general_cost <= compress_cost)
3328 return false;
3329
3330 /* Build a mask that is true for each op0 element selected before the compress point. */
3331 machine_mode mask_mode = get_mask_mode (vmode);
3332 rvv_builder builder (mask_mode, vlen, 1);
3333 for (int i = 0; i < vlen; i++)
3334 {
3335 bool is_compress_index = false;
3336 for (int j = 0; j < compress_point; j++)
3337 {
3338 if (known_eq (d->perm[j], i))
3339 {
3340 is_compress_index = true;
3341 break;
3342 }
3343 }
3344 if (is_compress_index)
3345 builder.quick_push (CONST1_RTX (BImode));
3346 else
3347 builder.quick_push (CONST0_RTX (BImode));
3348 }
3349 rtx mask = force_reg (mask_mode, builder.build ());
3350
3351 rtx merge = d->op1;
3352 if (need_slideup_p)
3353 {
3354 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
3355 merge = gen_reg_rtx (vmode);
3356 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
3357 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3358 emit_vlmax_insn (icode, BINARY_OP, ops);
3359 }
3360
3361 insn_code icode = code_for_pred_compress (vmode);
3362 rtx ops[] = {d->target, merge, d->op0, mask};
3363 emit_vlmax_insn (icode, COMPRESS_OP_MERGE, ops);
3364 return true;
3365 }
3366
3367 /* Recognize decompress patterns:
3368
3369 1. VEC_PERM_EXPR op0 and op1
3370 with isel = { 0, nunits, 1, nunits + 1, ... }.
3371 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3372
3373 2. VEC_PERM_EXPR op0 and op1
3374 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3375 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3376 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3377 */
3378 static bool
3379 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
3380 {
3381 poly_uint64 nelt = d->perm.length ();
3382 machine_mode mask_mode = get_mask_mode (d->vmode);
3383
3384 /* For constant-size indices, we don't need to handle them here.
3385 Just leave it to vec_perm<mode>. */
3386 if (d->perm.length ().is_constant ())
3387 return false;
3388
3389 poly_uint64 first = d->perm[0];
3390 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
3391 || !d->perm.series_p (0, 2, first, 1)
3392 || !d->perm.series_p (1, 2, first + nelt, 1))
3393 return false;
3394
3395 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3396 Otherwise, it could overflow the index range. */
3397 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3398 if (GET_MODE_INNER (d->vmode) == QImode
3399 && !get_vector_mode (HImode, nelt).exists (&sel_mode))
3400 return false;
3401
3402 /* Success! */
3403 if (d->testing_p)
3404 return true;
3405
3406 rtx op0, op1;
3407 if (known_eq (first, 0U))
3408 {
3409 op0 = d->op0;
3410 op1 = d->op1;
3411 }
3412 else
3413 {
3414 op0 = gen_reg_rtx (d->vmode);
3415 op1 = gen_reg_rtx (d->vmode);
3416 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3417 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
3418 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
3419 emit_vlmax_insn (icode, BINARY_OP, ops0);
3420 emit_vlmax_insn (icode, BINARY_OP, ops1);
3421 }
3422 /* Generate the { 0, 1, 0, 1, ... } mask. */
3423 rtx vid = gen_reg_rtx (sel_mode);
3424 rtx vid_repeat = gen_reg_rtx (sel_mode);
3425 expand_vec_series (vid, const0_rtx, const1_rtx);
3426 rtx and_ops[] = {vid_repeat, vid, const1_rtx};
3427 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
3428 rtx const_vec = gen_const_vector_dup (sel_mode, 1);
3429 rtx mask = gen_reg_rtx (mask_mode);
3430 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
3431 emit_vlmax_decompress_insn (d->target, op0, op1, mask);
3432 return true;
3433 }
3434
3435 static bool
3436 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
3437 {
3438 HOST_WIDE_INT diff;
3439 unsigned i, size, step;
3440
3441 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
3442 return false;
3443
3444 step = diff + 1;
3445 size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
3446
3447 switch (size)
3448 {
3449 case 16:
3450 break;
3451 case 32:
3452 case 64:
3453 /* We will have a VEC_PERM_EXPR after RTL expansion when invoking
3454 __builtin_bswap.  It will generate about 9 instructions in
3455 a loop as below, no matter whether it is bswap16, bswap32 or bswap64.
3456 .L2:
3457 1 vle16.v v4,0(a0)
3458 2 vmv.v.x v2,a7
3459 3 vand.vv v2,v6,v2
3460 4 slli a2,a5,1
3461 5 vrgatherei16.vv v1,v4,v2
3462 6 sub a4,a4,a5
3463 7 vse16.v v1,0(a3)
3464 8 add a0,a0,a2
3465 9 add a3,a3,a2
3466 bne a4,zero,.L2
3467
3468 But for bswap16 we may have an even simpler code sequence, which
3469 has only 7 instructions in the loop as below.
3470 .L5
3471 1 vle8.v v2,0(a5)
3472 2 addi a5,a5,32
3473 3 vsrl.vi v4,v2,8
3474 4 vsll.vi v2,v2,8
3475 5 vor.vv v4,v4,v2
3476 6 vse8.v v4,0(a4)
3477 7 addi a4,a4,32
3478 bne a5,a6,.L5
3479
3480 Unfortunately, the instructions in the loop will grow to 13 and 24
3481 for bswap32 and bswap64.  Thus, we will leverage vrgather (9 insns)
3482 for both bswap64 and bswap32, but take shift and or (7 insns)
3483 for bswap16.
3484 */
3485 default:
3486 return false;
3487 }
3488
3489 for (i = 0; i < step; i++)
3490 if (!d->perm.series_p (i, step, diff - i, step))
3491 return false;
3492
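  /* Illustrative example (not from the original source): for bswap16 on a
     QImode vector we have diff = 1, step = 2 and the selector
	{ 1, 0, 3, 2, 5, 4, ... },
     i.e. each pair of adjacent bytes is swapped.  */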
3493 /* Disable this when nunits < 4 since the generic approach tried later
3494 is more profitable for BSWAP. */
3495 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
3496 return false;
3497
3498 if (d->testing_p)
3499 return true;
3500
3501 machine_mode vhi_mode;
3502 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
3503
3504 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
3505 return false;
3506
3507 /* Step-1: Move op0 to src with VHI mode. */
3508 rtx src = gen_reg_rtx (vhi_mode);
3509 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
3510
3511 /* Step-2: Shift right 8 bits to dest. */
3512 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
3513 NULL_RTX, 0, OPTAB_DIRECT);
3514
3515 /* Step-3: Shift left 8 bits to src. */
3516 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
3517 NULL_RTX, 0, OPTAB_DIRECT);
3518
3519 /* Step-4: Logic Or dest and src to dest. */
3520 dest = expand_binop (vhi_mode, ior_optab, dest, src,
3521 NULL_RTX, 0, OPTAB_DIRECT);
3522
3523 /* Step-5: Move dest back to the target with VQI mode. */
3524 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
3525
3526 return true;
3527 }
3528
3529 /* Recognize the patterns that can be shuffled by the vec_extract and
3530 slide1up approach. */
3531
3532 static bool
3533 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)
3534 {
3535 poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
3536
3537 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3538 if (!d->perm.series_p (0, 2, nunits - 1, 2)
3539 || !d->perm.series_p (1, 2, nunits, 2))
3540 return false;
3541
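  /* Illustrative example (values chosen for illustration only): with
     nunits = 4 the selector is { 3, 4, 5, 6 }, i.e. the last element of
     op0 followed by the first three elements of op1; it is expanded below
     as a vec_extract of op0[3] plus a slide1up of op1 with that scalar.  */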
3542 /* Disable this when nunits < 4 since the generic approach tried later
3543 is more profitable for indices = { nunits - 1, nunits }. */
3544 if (!known_gt (nunits, 2))
3545 return false;
3546
3547 /* Success! */
3548 if (d->testing_p)
3549 return true;
3550
3551 /* Extract the last element of the first vector. */
3552 scalar_mode smode = GET_MODE_INNER (d->vmode);
3553 rtx tmp = gen_reg_rtx (smode);
3554 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
3555
3556 /* Insert the scalar into element 0. */
3557 unsigned int unspec
3558 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
3559 insn_code icode = code_for_pred_slide (unspec, d->vmode);
3560 rtx ops[] = {d->target, d->op1, tmp};
3561 emit_vlmax_insn (icode, BINARY_OP, ops);
3562 return true;
3563 }
3564
3565 static bool
3566 shuffle_series_patterns (struct expand_vec_perm_d *d)
3567 {
3568 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
3569 return false;
3570
3571 poly_int64 el1 = d->perm[0];
3572 poly_int64 el2 = d->perm[1];
3573 poly_int64 el3 = d->perm[2];
3574
3575 poly_int64 step1 = el2 - el1;
3576 poly_int64 step2 = el3 - el2;
3577
3578 bool need_insert = false;
3579 bool have_series = false;
3580
3581 /* Check for a full series. */
3582 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
3583 have_series = true;
3584
3585 /* Check for a series starting at the second element. */
3586 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
3587 {
3588 have_series = true;
3589 need_insert = true;
3590 }
3591
3592 if (!have_series)
3593 return false;
3594
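  /* Illustrative example (values chosen for illustration only): the selector
     { 0, 1, 3, 5 } is not a single series (step1 = 1, step2 = 2), but the
     sub-series starting at the second element is, so we build the series
     { 1, 3, 5, 7 }, slide1up-insert the leading 0 and then gather with the
     resulting indices.  */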
3595 /* Disable shuffle if we can't find an appropriate integer index mode for
3596 gather. */
3597 machine_mode sel_mode;
3598 if (!get_gather_index_mode (d).exists (&sel_mode))
3599 return false;
3600
3601 /* Success! */
3602 if (d->testing_p)
3603 return true;
3604
3605 /* Create the series. */
3606 machine_mode eltmode = Pmode;
3607 rtx series = gen_reg_rtx (sel_mode);
3608 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
3609 gen_int_mode (need_insert ? step2 : step1, eltmode));
3610
3611 /* Insert the remaining element if necessary. */
3612 if (need_insert)
3613 {
3614 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
3615 rtx ops[]
3616 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
3617 emit_vlmax_insn (icode, BINARY_OP, ops);
3618 }
3619
3620 emit_vlmax_gather_insn (d->target, d->op0, series);
3621
3622 return true;
3623 }
3624
3625 /* Recognize the patterns that can be shuffled by the generic approach. */
3626
3627 static bool
3628 shuffle_generic_patterns (struct expand_vec_perm_d *d)
3629 {
3630 machine_mode sel_mode;
3631
3632 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3633 if (!pow2p_hwi (d->perm.encoding().npatterns ()))
3634 return false;
3635
3636 /* Disable shuffle if we can't find an appropriate integer index mode for
3637 gather. */
3638 if (!get_gather_index_mode (d).exists (&sel_mode))
3639 return false;
3640
3641 /* Success! */
3642 if (d->testing_p)
3643 return true;
3644
3645 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3646 /* Some FIXED-VLMAX/VLS vector permutation situations call the target hook
3647 instead of expanding vec_perm<mode>, so we handle them directly. */
3648 expand_vec_perm (d->target, d->op0, d->op1, sel);
3649 return true;
3650 }
3651
3652 /* This function recognizes and supports different permutation patterns
3653 and enables VLA SLP auto-vectorization. */
3654 static bool
3655 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
3656 {
3657 gcc_assert (d->op_mode != E_VOIDmode);
3658
3659 /* The pattern matching functions above are written to look for a small
3660 number to begin the sequence (0, 1, N/2). If we begin with an index
3661 from the second operand, we can swap the operands. */
3662 poly_int64 nelt = d->perm.length ();
3663 if (known_ge (d->perm[0], nelt))
3664 {
3665 d->perm.rotate_inputs (1);
3666 std::swap (d->op0, d->op1);
3667 }
3668
3669 if (known_gt (nelt, 1))
3670 {
3671 if (d->vmode == d->op_mode)
3672 {
3673 if (shuffle_merge_patterns (d))
3674 return true;
3675 if (shuffle_consecutive_patterns (d))
3676 return true;
3677 if (shuffle_compress_patterns (d))
3678 return true;
3679 if (shuffle_decompress_patterns (d))
3680 return true;
3681 if (shuffle_bswap_pattern (d))
3682 return true;
3683 if (shuffle_extract_and_slide1up_patterns (d))
3684 return true;
3685 if (shuffle_series_patterns (d))
3686 return true;
3687 if (shuffle_generic_patterns (d))
3688 return true;
3689 return false;
3690 }
3691 else
3692 return false;
3693 }
3694 return false;
3695 }
3696
3697 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
3698 instructions. */
3699 bool
3700 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
3701 rtx op0, rtx op1, const vec_perm_indices &sel)
3702 {
3703 /* RVV doesn't have mask-type pack/unpack instructions and we don't use
3704 a mask to do the iteration loop control. Just disable it directly. */
3705 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
3706 return false;
3707 /* FIXME: Explicitly disable VLA interleaved SLP vectorization since we
3708 may encounter an ICE for poly size (1, 1) vectors in the loop vectorizer.
3709 Ideally, the middle-end loop vectorizer should be able to disable it
3710 itself; we can remove the code here once the middle-end is able
3711 to disable VLA SLP vectorization for a poly size (1, 1) VF. */
3712 if (!BYTES_PER_RISCV_VECTOR.is_constant ()
3713 && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
3714 poly_int64 (16, 16)))
3715 return false;
3716
3717 struct expand_vec_perm_d d;
3718
3719 /* Check whether the mask can be applied to a single vector. */
3720 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
3721 d.one_vector_p = true;
3722 else if (sel.all_from_input_p (0))
3723 {
3724 d.one_vector_p = true;
3725 op1 = op0;
3726 }
3727 else if (sel.all_from_input_p (1))
3728 {
3729 d.one_vector_p = true;
3730 op0 = op1;
3731 }
3732 else
3733 d.one_vector_p = false;
3734
3735 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
3736 sel.nelts_per_input ());
3737 d.vmode = vmode;
3738 d.op_mode = op_mode;
3739 d.target = target;
3740 d.op0 = op0;
3741 if (op0 == op1)
3742 d.op1 = d.op0;
3743 else
3744 d.op1 = op1;
3745 d.testing_p = !target;
3746
3747 if (!d.testing_p)
3748 return expand_vec_perm_const_1 (&d);
3749
3750 rtx_insn *last = get_last_insn ();
3751 bool ret = expand_vec_perm_const_1 (&d);
3752 gcc_assert (last == get_last_insn ());
3753
3754 return ret;
3755 }
3756
3757 /* Generate a vsetvl with no side effects to get the vector length. */
3758 void
3759 expand_select_vl (rtx *ops)
3760 {
3761 poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
3762 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
3763 {
3764 /* If the length is known to be <= VF, we just use the length directly
3765 instead of using vsetvli.
3766
3767 E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
3768 We move 3 into _255 instead of using an explicit vsetvl. */
3769 emit_move_insn (ops[0], ops[1]);
3770 return;
3771 }
3772 /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode,
3773 since vsetvl only demands the SEW/LMUL ratio. We let the VSETVL pass optimize it. */
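  /* For instance (illustrative only; the final SEW/LMUL is whatever the
     VSETVL pass settles on, assuming a 128-bit minimum VLEN): with ops[1]
     in a1 and a VF of POLY_INT_CST [16, 16] this emits something along the
     lines of
	vsetvli a0, a1, e8, m1, ta, ma
     where only the VL result and the SEW/LMUL ratio actually matter.  */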
3774 scalar_int_mode mode = QImode;
3775 machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
3776 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
3777 }
3778
3779 /* Expand MASK_LEN_{LOAD,STORE}. */
3780 void
3781 expand_load_store (rtx *ops, bool is_load)
3782 {
3783 poly_int64 value;
3784 rtx mask = ops[2];
3785 rtx len = ops[3];
3786 machine_mode mode = GET_MODE (ops[0]);
3787
3788 if (is_vlmax_len_p (mode, len))
3789 {
3790 /* If the length operand is equal to VF, it is VLMAX load/store. */
3791 if (is_load)
3792 {
3793 rtx m_ops[] = {ops[0], mask, ops[1]};
3794 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
3795 }
3796 else
3797 {
3798 len = gen_reg_rtx (Pmode);
3799 emit_vlmax_vsetvl (mode, len);
3800 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3801 get_avl_type_rtx (VLMAX)));
3802 }
3803 }
3804 else
3805 {
3806 if (!satisfies_constraint_K (len))
3807 len = force_reg (Pmode, len);
3808 if (is_load)
3809 {
3810 rtx m_ops[] = {ops[0], mask, ops[1]};
3811 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
3812 len);
3813 }
3814 else
3815 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3816 get_avl_type_rtx (NONVLMAX)));
3817 }
3818 }
3819
3820
3821 /* Return true if the operation is a floating-point operation that needs FRM. */
3822 static bool
3823 needs_fp_rounding (unsigned icode, machine_mode mode)
3824 {
3825 if (!FLOAT_MODE_P (mode))
3826 return false;
3827
3828 return icode != maybe_code_for_pred (SMIN, mode)
3829 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
3830 && icode != maybe_code_for_pred (SMAX, mode)
3831 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
3832 && icode != maybe_code_for_pred (NEG, mode)
3833 && icode != maybe_code_for_pred (ABS, mode)
3834 /* narrower-FP -> FP */
3835 && icode != maybe_code_for_pred_extend (mode)
3836 /* narrower-INT -> FP */
3837 && icode != maybe_code_for_pred_widen (FLOAT, mode)
3838 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
3839 /* vfsgnj */
3840 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
3841 && icode != maybe_code_for_pred_mov (mode);
3842 }
3843
3844 /* Subroutine to expand COND_LEN_* patterns. */
3845 static void
3846 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
3847 {
3848 rtx dest = ops[0];
3849 rtx mask = ops[1];
3850 machine_mode mode = GET_MODE (dest);
3851 machine_mode mask_mode = GET_MODE (mask);
3852 poly_int64 value;
3853 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
3854 bool is_vlmax_len = is_vlmax_len_p (mode, len);
3855
3856 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
3857 /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
3858 dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such
3859 simplification in the RISC-V backend and may do it in the middle-end in the
3860 future. */
3861 if (is_dummy_mask && is_vlmax_len)
3862 insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
3863 else if (is_dummy_mask)
3864 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
3865 else if (is_vlmax_len)
3866 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
3867 else
3868 insn_flags |= TU_POLICY_P | MU_POLICY_P;
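  /* For example (summary of the chain above): a dummy all-ones mask together
     with a VLMAX length uses the default tail/mask policies, whereas a real
     mask combined with a partial length uses tail-undisturbed and
     mask-undisturbed, so elements beyond LEN and inactive elements are taken
     from the merge operand.  */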
3869
3870 if (needs_fp_rounding (icode, mode))
3871 insn_flags |= FRM_DYN_P;
3872
3873 if (is_vlmax_len)
3874 emit_vlmax_insn (icode, insn_flags, ops);
3875 else
3876 emit_nonvlmax_insn (icode, insn_flags, ops, len);
3877 }
3878
3879 /* Return RVV_VUNDEF if the ELSE value is a scratch rtx. */
3880 static rtx
3881 get_else_operand (rtx op)
3882 {
3883 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
3884 }
3885
3886 /* Expand unary ops COND_LEN_*. */
3887 void
3888 expand_cond_len_unop (unsigned icode, rtx *ops)
3889 {
3890 rtx dest = ops[0];
3891 rtx mask = ops[1];
3892 rtx src = ops[2];
3893 rtx merge = get_else_operand (ops[3]);
3894 rtx len = ops[4];
3895
3896 rtx cond_ops[] = {dest, mask, merge, src};
3897 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3898 }
3899
3900 /* Expand unary ops COND_*. */
3901 void
3902 expand_cond_unop (unsigned icode, rtx *ops)
3903 {
3904 rtx dest = ops[0];
3905 rtx mask = ops[1];
3906 rtx src = ops[2];
3907 rtx merge = get_else_operand (ops[3]);
3908 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3909
3910 rtx cond_ops[] = {dest, mask, merge, src};
3911 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3912 }
3913
3914 /* Expand binary ops COND_LEN_*. */
3915 void
3916 expand_cond_len_binop (unsigned icode, rtx *ops)
3917 {
3918 rtx dest = ops[0];
3919 rtx mask = ops[1];
3920 rtx src1 = ops[2];
3921 rtx src2 = ops[3];
3922 rtx merge = get_else_operand (ops[4]);
3923 rtx len = ops[5];
3924
3925 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3926 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3927 }
3928
3929 /* Expand binary ops COND_*. */
3930 void
3931 expand_cond_binop (unsigned icode, rtx *ops)
3932 {
3933 rtx dest = ops[0];
3934 rtx mask = ops[1];
3935 rtx src1 = ops[2];
3936 rtx src2 = ops[3];
3937 rtx merge = get_else_operand (ops[4]);
3938 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3939
3940 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3941 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3942 }
3943
3944 /* Prepare insn_code for gather_load/scatter_store according to
3945 the vector mode and index mode. */
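/* Illustrative example (instruction names are the usual RVV mnemonics, given
   here only as an informal sketch): an indexed load of 64-bit elements with
   32-bit offsets has dst_eew / src_eew = 2 and therefore picks the
   "x2 greater EEW" pattern (roughly a vluxei32.v with SEW = 64), while equal
   EEWs pick the "same EEW" pattern (vluxei<eew>.v).  */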
3946 static insn_code
3947 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
3948 bool is_load)
3949 {
3950 if (!is_load)
3951 return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
3952 else
3953 {
3954 unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
3955 unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
3956 if (dst_eew_bitsize == src_eew_bitsize)
3957 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
3958 else if (dst_eew_bitsize > src_eew_bitsize)
3959 {
3960 unsigned factor = dst_eew_bitsize / src_eew_bitsize;
3961 switch (factor)
3962 {
3963 case 2:
3964 return code_for_pred_indexed_load_x2_greater_eew (
3965 UNSPEC_UNORDERED, vec_mode);
3966 case 4:
3967 return code_for_pred_indexed_load_x4_greater_eew (
3968 UNSPEC_UNORDERED, vec_mode);
3969 case 8:
3970 return code_for_pred_indexed_load_x8_greater_eew (
3971 UNSPEC_UNORDERED, vec_mode);
3972 default:
3973 gcc_unreachable ();
3974 }
3975 }
3976 else
3977 {
3978 unsigned factor = src_eew_bitsize / dst_eew_bitsize;
3979 switch (factor)
3980 {
3981 case 2:
3982 return code_for_pred_indexed_load_x2_smaller_eew (
3983 UNSPEC_UNORDERED, vec_mode);
3984 case 4:
3985 return code_for_pred_indexed_load_x4_smaller_eew (
3986 UNSPEC_UNORDERED, vec_mode);
3987 case 8:
3988 return code_for_pred_indexed_load_x8_smaller_eew (
3989 UNSPEC_UNORDERED, vec_mode);
3990 default:
3991 gcc_unreachable ();
3992 }
3993 }
3994 }
3995 }
3996
3997 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
3998 void
3999 expand_gather_scatter (rtx *ops, bool is_load)
4000 {
4001 rtx ptr, vec_offset, vec_reg;
4002 bool zero_extend_p;
4003 int scale_log2;
4004 rtx mask = ops[5];
4005 rtx len = ops[6];
4006 if (is_load)
4007 {
4008 vec_reg = ops[0];
4009 ptr = ops[1];
4010 vec_offset = ops[2];
4011 zero_extend_p = INTVAL (ops[3]);
4012 scale_log2 = exact_log2 (INTVAL (ops[4]));
4013 }
4014 else
4015 {
4016 vec_reg = ops[4];
4017 ptr = ops[0];
4018 vec_offset = ops[1];
4019 zero_extend_p = INTVAL (ops[2]);
4020 scale_log2 = exact_log2 (INTVAL (ops[3]));
4021 }
4022
4023 machine_mode vec_mode = GET_MODE (vec_reg);
4024 machine_mode idx_mode = GET_MODE (vec_offset);
4025 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
4026 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
4027 poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
4028 poly_int64 value;
4029 bool is_vlmax = is_vlmax_len_p (vec_mode, len);
4030
4031 /* Extend the offset element to address width. */
4032 if (inner_offsize < BITS_PER_WORD)
4033 {
4034 /* 7.2. Vector Load/Store Addressing Modes.
4035 If the vector offset elements are narrower than XLEN, they are
4036 zero-extended to XLEN before adding to the ptr effective address. If
4037 the vector offset elements are wider than XLEN, the least-significant
4038 XLEN bits are used in the address calculation. An implementation must
4039 raise an illegal instruction exception if the EEW is not supported for
4040 offset elements.
4041
4042 The RVV spec only covers the scale_log2 == 0 case. */
4043 if (!zero_extend_p || scale_log2 != 0)
4044 {
4045 if (zero_extend_p)
4046 inner_idx_mode
4047 = int_mode_for_size (inner_offsize * 2, 0).require ();
4048 else
4049 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
4050 machine_mode new_idx_mode
4051 = get_vector_mode (inner_idx_mode, nunits).require ();
4052 rtx tmp = gen_reg_rtx (new_idx_mode);
4053 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
4054 zero_extend_p ? true : false));
4055 vec_offset = tmp;
4056 idx_mode = new_idx_mode;
4057 }
4058 }
4059
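  /* Illustrative example (RV64 assumed): a vector of 32-bit offsets with a
     scale of 8 is first zero- or sign-extended (per ZERO_EXTEND_P) to 64-bit
     elements above, and then shifted left by scale_log2 = 3 below.  */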
4060 if (scale_log2 != 0)
4061 {
4062 rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
4063 gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
4064 OPTAB_DIRECT);
4065 vec_offset = tmp;
4066 }
4067
4068 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
4069 if (is_vlmax)
4070 {
4071 if (is_load)
4072 {
4073 rtx load_ops[]
4074 = {vec_reg, mask, ptr, vec_offset};
4075 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
4076 }
4077 else
4078 {
4079 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4080 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
4081 }
4082 }
4083 else
4084 {
4085 if (is_load)
4086 {
4087 rtx load_ops[]
4088 = {vec_reg, mask, ptr, vec_offset};
4089 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
4090 }
4091 else
4092 {
4093 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4094 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
4095 }
4096 }
4097 }
4098
4099 /* Expand COND_LEN_*. */
4100 void
4101 expand_cond_len_ternop (unsigned icode, rtx *ops)
4102 {
4103 rtx dest = ops[0];
4104 rtx mask = ops[1];
4105 rtx src1 = ops[2];
4106 rtx src2 = ops[3];
4107 rtx src3 = ops[4];
4108 rtx merge = get_else_operand (ops[5]);
4109 rtx len = ops[6];
4110
4111 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4112 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4113 }
4114
4115 /* Expand COND_*. */
4116 void
4117 expand_cond_ternop (unsigned icode, rtx *ops)
4118 {
4119 rtx dest = ops[0];
4120 rtx mask = ops[1];
4121 rtx src1 = ops[2];
4122 rtx src2 = ops[3];
4123 rtx src3 = ops[4];
4124 rtx merge = get_else_operand (ops[5]);
4125 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4126
4127 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4128 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4129 }
4130
4131 /* Expand reduction operations.
4132 Case 1: ops = {scalar_dest, vector_src}
4133 Case 2: ops = {scalar_dest, vector_src, mask, vl}
4134 */
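/* Illustrative sketch (instruction names given informally): for an integer
   PLUS reduction this broadcasts INIT into an LMUL-1 register (a vmv.s.x-like
   scalar move), performs the vredsum.vs-style reduction into another LMUL-1
   register, and finally reads element 0 back with vmv.x.s (or vfmv.f.s for
   floating point).  */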
4135 void
4136 expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init)
4137 {
4138 rtx scalar_dest = ops[0];
4139 rtx vector_src = ops[1];
4140 machine_mode vmode = GET_MODE (vector_src);
4141 machine_mode vel_mode = GET_MODE (scalar_dest);
4142 machine_mode m1_mode = get_m1_mode (vel_mode).require ();
4143
4144 rtx m1_tmp = gen_reg_rtx (m1_mode);
4145 rtx scalar_move_ops[] = {m1_tmp, init};
4146 emit_nonvlmax_insn (code_for_pred_broadcast (m1_mode), SCALAR_MOVE_OP,
4147 scalar_move_ops,
4148 need_mask_operand_p (insn_flags) ? ops[3]
4149 : CONST1_RTX (Pmode));
4150 rtx m1_tmp2 = gen_reg_rtx (m1_mode);
4151 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
4152 insn_code icode = code_for_pred (unspec, vmode);
4153
4154 if (need_mask_operand_p (insn_flags))
4155 {
4156 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
4157 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, ops[3]);
4158 }
4159 else
4160 emit_vlmax_insn (icode, insn_flags, reduc_ops);
4161
4162 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
4163 }
4164
4165 /* Prepare ops for ternary operations.
4166 It can be called before or after RA. */
4167 void
4168 prepare_ternary_operands (rtx *ops)
4169 {
4170 machine_mode mode = GET_MODE (ops[0]);
4171
4172 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4173 && (VECTOR_MODE_P (GET_MODE (ops[2]))
4174 && !rtx_equal_p (ops[2], ops[5]))
4175 && !rtx_equal_p (ops[3], ops[5])
4176 && !rtx_equal_p (ops[4], ops[5]))
4177 {
4178 /* RA would fail to find a vector REG and report an ICE, so we pre-merge
4179 the ops for LMUL = 8. */
4180 if (satisfies_constraint_Wc1 (ops[1]))
4181 {
4182 emit_move_insn (ops[0], ops[5]);
4183 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
4184 ops[7], ops[8], ops[9]));
4185 }
4186 else
4187 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
4188 ops[4], ops[1], ops[6], ops[7], ops[9]));
4189 ops[5] = ops[4] = ops[0];
4190 }
4191 else
4192 {
4193 /* Swap the multiplication ops if the fallback value is the
4194 second of the two. */
4195 if (rtx_equal_p (ops[3], ops[5]))
4196 std::swap (ops[2], ops[3]);
4197
4198 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4199 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4200 }
4201 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4202 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
4203 }
4204
4205 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4206 void
4207 expand_lanes_load_store (rtx *ops, bool is_load)
4208 {
4209 poly_int64 value;
4210 rtx mask = ops[2];
4211 rtx len = ops[3];
4212 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
4213 rtx reg = is_load ? ops[0] : ops[1];
4214 machine_mode mode = GET_MODE (ops[0]);
4215
4216 if (is_vlmax_len_p (mode, len))
4217 {
4218 /* If the length operand is equal to VF, it is VLMAX load/store. */
4219 if (is_load)
4220 {
4221 rtx m_ops[] = {reg, mask, addr};
4222 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
4223 m_ops);
4224 }
4225 else
4226 {
4227 len = gen_reg_rtx (Pmode);
4228 emit_vlmax_vsetvl (mode, len);
4229 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4230 get_avl_type_rtx (VLMAX)));
4231 }
4232 }
4233 else
4234 {
4235 if (!satisfies_constraint_K (len))
4236 len = force_reg (Pmode, len);
4237 if (is_load)
4238 {
4239 rtx m_ops[] = {reg, mask, addr};
4240 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
4241 UNARY_OP_TAMA, m_ops, len);
4242 }
4243 else
4244 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4245 get_avl_type_rtx (NONVLMAX)));
4246 }
4247 }
4248
4249 /* Expand LEN_FOLD_EXTRACT_LAST. */
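/* Illustrative walk-through (values chosen for illustration only): with
   vect = { a, b, c, d } and mask = { 1, 0, 1, 0 } the popcount is 2, so
   index = 1; vcompress packs the active elements to { a, c, ... }, the
   slide-down by 1 moves c to element 0, and the final extract-first returns
   c, the last active element.  If the popcount is 0, the default value is
   returned instead.  */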
4250 void
4251 expand_fold_extract_last (rtx *ops)
4252 {
4253 rtx dst = ops[0];
4254 rtx default_value = ops[1];
4255 rtx mask = ops[2];
4256 rtx anchor = gen_reg_rtx (Pmode);
4257 rtx index = gen_reg_rtx (Pmode);
4258 rtx vect = ops[3];
4259 rtx else_label = gen_label_rtx ();
4260 rtx end_label = gen_label_rtx ();
4261 rtx len = ops[4];
4262 poly_int64 value;
4263 machine_mode mode = GET_MODE (vect);
4264 machine_mode mask_mode = GET_MODE (mask);
4265 rtx compress_vect = gen_reg_rtx (mode);
4266 rtx slide_vect = gen_reg_rtx (mode);
4267 insn_code icode;
4268
4269 if (is_vlmax_len_p (mode, len))
4270 len = NULL_RTX;
4271
4272 /* Calculate the number of 1 bits in the mask. */
4273 rtx cpop_ops[] = {anchor, mask};
4274 if (len)
4275 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4276 cpop_ops, len);
4277 else
4278 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4279 cpop_ops);
4280
4281 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
4282 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
4283 /* Compress the vector. */
4284 icode = code_for_pred_compress (mode);
4285 rtx compress_ops[] = {compress_vect, vect, mask};
4286 if (len)
4287 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
4288 else
4289 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
4290 /* Slide down by INDEX so the last active element lands in element 0 of a new vector. */
4291 rtx slide_ops[] = {slide_vect, compress_vect, index};
4292 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
4293 if (len)
4294 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
4295 else
4296 emit_vlmax_insn (icode, BINARY_OP, slide_ops);
4297 /* Emit v(f)mv.[xf].s. */
4298 emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
4299
4300 emit_jump_insn (gen_jump (end_label));
4301 emit_barrier ();
4302 emit_label (else_label);
4303 emit_move_insn (dst, default_value);
4304 emit_label (end_label);
4305 }
4306
4307 /* Return true if the LMUL of the comparison mode is less than or equal to one. */
4308 bool
4309 cmp_lmul_le_one (machine_mode mode)
4310 {
4311 if (riscv_v_ext_vector_mode_p (mode))
4312 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4313 else if (riscv_v_ext_vls_mode_p (mode))
4314 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4315 return false;
4316 }
4317
4318 /* Return true if the LMUL of the comparison mode is greater than one. */
4319 bool
4320 cmp_lmul_gt_one (machine_mode mode)
4321 {
4322 if (riscv_v_ext_vector_mode_p (mode))
4323 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4324 else if (riscv_v_ext_vls_mode_p (mode))
4325 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4326 return false;
4327 }
4328
4329 /* Return true if the VLS mode is legal. There are 2 cases here.
4330
4331 1. Enable VLS modes for VLA vectorization since the fixed-length VLMAX mode
4332 is the highest-priority choice and should not conflict with VLS modes.
4333 2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize of
4334 the VLS mode is smaller than the minimal VLA mode.
4335
4336 Take vlen = 2048 as example for case 2.
4337
4338 Note: Below table based on vlen = 2048.
4339 +----------------------------------------------------+----------------------+
4340 | VLS mode | VLA mode |
4341 +----------------------------------------------------+----------------------+
4342 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4343 +------------+-----------+-----------------+---------+-----------+----------+
4344 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4345 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4346 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4347 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4348 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4349 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4350 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4351 | ... | ... | ... | ... | RVVMF64BI | 32 |
4352 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4353 +------------+-----------+-----------------+---------+-----------+----------+
4354 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4355 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4356 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4357 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4358 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4359 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4360 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4361 | ... | ... | .. | ... | RVVMF8QI | 256 |
4362 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4363 +------------+-----------+-----------------+---------+-----------+----------+
4364 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4365 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4366 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4367 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4368 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4369 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4370 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4371 | ... | ... | .. | ... | RVVMF4HI | 512 |
4372 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4373 +------------+-----------+-----------------+---------+-----------+----------+
4374 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4375 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4376 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4377 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4378 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4379 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4380 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4381 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4382 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4383 +------------+-----------+-----------------+---------+-----------+----------+
4384 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4385 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4386 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4387 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4388 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4389 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4390 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4391 | ... | ... | .. | ... | RVVM1DI | 2048 |
4392 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4393 +------------+-----------+-----------------+---------+-----------+----------+
4394
4395 Then the condition for a VLS mode in fixed-vlmax is:
4396 PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)). */
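/* For example (restating the condition above), with vlen = 2048 and a QImode
   inner mode (precision 8) the bound is 2048 / (64 / 8) = 256, so V16QI
   (128 bits) is enabled while V32QI (256 bits) is not, matching the table
   above.  */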
4397 bool
4398 vls_mode_valid_p (machine_mode vls_mode)
4399 {
4400 if (!TARGET_VECTOR)
4401 return false;
4402
4403 if (riscv_autovec_preference == RVV_SCALABLE)
4404 {
4405 if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
4406 && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
4407 GET_MODE_PRECISION (vls_mode)))
4408 /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
4409 BITS_PER_RISCV_VECTOR.
4410
4411 E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
4412 we enable VLS modes that have a fixed size <= 128 bits. Since ordered_p is
4413 false between VLA modes with size = (128, 128) bits and a VLS mode
4414 with size = 128 bits, we would end up with multiple ICEs in
4415 middle-end generic code. */
4416 return false;
4417 return true;
4418 }
4419
4420 if (riscv_autovec_preference == RVV_FIXED_VLMAX)
4421 {
4422 machine_mode inner_mode = GET_MODE_INNER (vls_mode);
4423 int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
4424 int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
4425
4426 return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
4427 }
4428
4429 return false;
4430 }
4431
4432 /* We don't have to convert a floating-point value to integer when its
4433 fractional part is zero. Thus, there is a limit for the half, single and
4434 double precision floating-point types: a value greater than or equal to
4435 the limit has no fractional part.
4436
4437 1. Half floating point.
4438 +-----------+---------------+
4439 | float | binary layout |
4440 +-----------+---------------+
4441 | 1023.5 | 0x63ff |
4442 +-----------+---------------+
4443 | 1024.0 | 0x6400 |
4444 +-----------+---------------+
4445 | 1025.0 | 0x6401 |
4446 +-----------+---------------+
4447 | ... | ... |
4448
4449 All half-precision floating-point values are unchanged by ceil if they are
4450 greater than or equal to 1024.
4451
4452 2. Single floating point.
4453 +-----------+---------------+
4454 | float | binary layout |
4455 +-----------+---------------+
4456 | 8388607.5 | 0x4affffff |
4457 +-----------+---------------+
4458 | 8388608.0 | 0x4b000000 |
4459 +-----------+---------------+
4460 | 8388609.0 | 0x4b000001 |
4461 +-----------+---------------+
4462 | ... | ... |
4463
4464 All single-precision floating-point values are unchanged by ceil if they are
4465 greater than or equal to 8388608.
4466
4467 3. Double floating point.
4468 +--------------------+--------------------+
4469 | float | binary layout |
4470 +--------------------+--------------------+
4471 | 4503599627370495.5 | 0X432fffffffffffff |
4472 +--------------------+--------------------+
4473 | 4503599627370496.0 | 0X4330000000000000 |
4474 +--------------------+--------------------+
4475 | 4503599627370497.0 | 0X4330000000000001 |
4476 +--------------------+--------------------+
4477 | ... | ... |
4478
4479 All double-precision floating-point values are unchanged by ceil if they are
4480 greater than or equal to 4503599627370496.
4481 */
4482 static rtx
4483 get_fp_rounding_coefficient (machine_mode inner_mode)
4484 {
4485 REAL_VALUE_TYPE real;
4486
4487 if (inner_mode == E_HFmode)
4488 real_from_integer (&real, inner_mode, 1024, SIGNED);
4489 else if (inner_mode == E_SFmode)
4490 real_from_integer (&real, inner_mode, 8388608, SIGNED);
4491 else if (inner_mode == E_DFmode)
4492 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
4493 else
4494 gcc_unreachable ();
4495
4496 return const_double_from_real_value (real, inner_mode);
4497 }
4498
4499 static rtx
4500 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
4501 machine_mode vec_fp_mode)
4502 {
4503 /* Step-1: Prepare the scalar float compare register. */
4504 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
4505 emit_insn (gen_move_insn (fp_reg, fp_scalar));
4506
4507 /* Step-2: Generate the mask. */
4508 machine_mode mask_mode = get_mask_mode (vec_fp_mode);
4509 rtx mask = gen_reg_rtx (mask_mode);
4510 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
4511 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
4512 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
4513 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
4514
4515 return mask;
4516 }
4517
4518 static void
4519 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
4520 machine_mode vec_mode)
4521 {
4522 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
4523 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
4524
4525 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
4526 }
4527
4528 static void
4529 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
4530 {
4531 rtx abs_ops[] = {op_dest, op_src};
4532 insn_code icode = code_for_pred (ABS, vec_mode);
4533
4534 emit_vlmax_insn (icode, UNARY_OP, abs_ops);
4535 }
4536
4537 static void
4538 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
4539 insn_type type, machine_mode vec_mode)
4540 {
4541 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4542
4543 if (type & USE_VUNDEF_MERGE_P)
4544 {
4545 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4546 emit_vlmax_insn (icode, type, cvt_x_ops);
4547 }
4548 else
4549 {
4550 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4551 emit_vlmax_insn (icode, type, cvt_x_ops);
4552 }
4553 }
4554
4555 static void
4556 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4557 machine_mode vec_mode)
4558 {
4559 rtx ops[] = {op_dest, op_src};
4560 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4561
4562 emit_vlmax_insn (icode, type, ops);
4563 }
4564
4565 static void
4566 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4567 machine_mode vec_mode)
4568 {
4569 rtx ops[] = {op_dest, op_src};
4570 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4571
4572 emit_vlmax_insn (icode, type, ops);
4573 }
4574
4575 static void
4576 emit_vec_widden_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4577 machine_mode vec_mode)
4578 {
4579 rtx ops[] = {op_dest, op_src};
4580 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4581
4582 emit_vlmax_insn (icode, type, ops);
4583 }
4584
4585 static void
4586 emit_vec_widden_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
4587 machine_mode vec_mode)
4588 {
4589 rtx ops[] = {op_dest, op_src};
4590 insn_code icode = code_for_pred_extend (vec_mode);
4591
4592 emit_vlmax_insn (icode, type, ops);
4593 }
4594
4595 static void
4596 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
4597 insn_type type, machine_mode vec_mode)
4598 {
4599 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
4600 insn_code icode = code_for_pred (FLOAT, vec_mode);
4601
4602 emit_vlmax_insn (icode, type, cvt_fp_ops);
4603 }
4604
4605 static void
4606 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
4607 insn_type type, machine_mode vec_mode)
4608 {
4609 insn_code icode = code_for_pred (FIX, vec_mode);
4610
4611 if (type & USE_VUNDEF_MERGE_P)
4612 {
4613 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4614 emit_vlmax_insn (icode, type, cvt_x_ops);
4615 }
4616 else
4617 {
4618 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4619 emit_vlmax_insn (icode, type, cvt_x_ops);
4620 }
4621 }
4622
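/* Expand a vector ceil (round towards +inf).  Illustrative SFmode
   walk-through (values chosen for illustration only):
     op_1              = { 1.5, -0.25, 8388608.0 }
     Step-1 abs        = { 1.5,  0.25, 8388608.0 }
     Step-2 mask       = { 1, 1, 0 }      (abs value < 8388608.0)
     Step-3 cvt x <- f = { 2, 0, - }      (masked, FRM = RUP)
     Step-4 cvt f <- x = { 2.0, 0.0, 8388608.0 }  (inactive lane kept)
     Step-5 copysign   = { 2.0, -0.0, 8388608.0 }
   which is ceil of the input, including the -0.0 case.  */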
4623 void
4624 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4625 machine_mode vec_int_mode)
4626 {
4627 /* Step-1: Get the abs float value for mask generation. */
4628 emit_vec_abs (op_0, op_1, vec_fp_mode);
4629
4630 /* Step-2: Generate the mask on const fp. */
4631 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4632 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4633
4634 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
4635 rtx tmp = gen_reg_rtx (vec_int_mode);
4636 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
4637
4638 /* Step-4: Convert to floating-point on mask for the final result.
4639 To avoid unnecessary frm register access, we use RUP here as well; it will
4640 never actually round up because the tmp rtx already holds integral values
4641 from the float-to-int conversion. */
4642 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
4643
4644 /* Step-5: Retrieve the sign bit for -0.0. */
4645 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4646 }
4647
4648 void
4649 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4650 machine_mode vec_int_mode)
4651 {
4652 /* Step-1: Get the abs float value for mask generation. */
4653 emit_vec_abs (op_0, op_1, vec_fp_mode);
4654
4655 /* Step-2: Generate the mask on const fp. */
4656 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4657 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4658
4659 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
4660 rtx tmp = gen_reg_rtx (vec_int_mode);
4661 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
4662
4663 /* Step-4: Convert to floating-point on mask for the floor result. */
4664 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
4665
4666 /* Step-5: Retrieve the sign bit for -0.0. */
4667 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4668 }
4669
4670 void
4671 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4672 machine_mode vec_int_mode)
4673 {
4674 /* Step-1: Get the abs float value for mask generation. */
4675 emit_vec_abs (op_0, op_1, vec_fp_mode);
4676
4677 /* Step-2: Generate the mask on const fp. */
4678 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4679 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4680
4681 /* Step-3: Back up the FP exception flags; nearbyint never raises exceptions. */
4682 rtx fflags = gen_reg_rtx (SImode);
4683 emit_insn (gen_riscv_frflags (fflags));
4684
4685 /* Step-4: Convert to integer on mask, using the dynamic rounding mode (aka nearbyint). */
4686 rtx tmp = gen_reg_rtx (vec_int_mode);
4687 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4688
4689 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
4690 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4691
4692 /* Step-6: Restore FP exception flags. */
4693 emit_insn (gen_riscv_fsflags (fflags));
4694
4695 /* Step-7: Retrieve the sign bit for -0.0. */
4696 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4697 }
4698
4699 void
4700 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4701 machine_mode vec_int_mode)
4702 {
4703 /* Step-1: Get the abs float value for mask generation. */
4704 emit_vec_abs (op_0, op_1, vec_fp_mode);
4705
4706 /* Step-2: Generate the mask on const fp. */
4707 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4708 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4709
4710 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
4711 rtx tmp = gen_reg_rtx (vec_int_mode);
4712 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4713
4714 /* Step-4: Convert to floating-point on mask for the rint result. */
4715 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4716
4717 /* Step-5: Retrieve the sign bit for -0.0. */
4718 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4719 }
4720
4721 void
4722 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4723 machine_mode vec_int_mode)
4724 {
4725 /* Step-1: Get the abs float value for mask generation. */
4726 emit_vec_abs (op_0, op_1, vec_fp_mode);
4727
4728 /* Step-2: Generate the mask on const fp. */
4729 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4730 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4731
4732 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */
4733 rtx tmp = gen_reg_rtx (vec_int_mode);
4734 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
4735
4736 /* Step-4: Convert to floating-point on mask for the round result. */
4737 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
4738
4739 /* Step-5: Retrieve the sign bit for -0.0. */
4740 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4741 }
4742
4743 void
4744 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4745 machine_mode vec_int_mode)
4746 {
4747 /* Step-1: Get the abs float value for mask generation. */
4748 emit_vec_abs (op_0, op_1, vec_fp_mode);
4749
4750 /* Step-2: Generate the mask on const fp. */
4751 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4752 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4753
4754 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
4755 rtx tmp = gen_reg_rtx (vec_int_mode);
4756 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
4757
4758 /* Step-4: Convert to floating-point on mask for the trunc result. */
4759 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4760
4761 /* Step-5: Retrieve the sign bit for -0.0. */
4762 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4763 }
4764
4765 void
4766 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4767 machine_mode vec_int_mode)
4768 {
4769 /* Step-1: Get the abs float value for mask generation. */
4770 emit_vec_abs (op_0, op_1, vec_fp_mode);
4771
4772 /* Step-2: Generate the mask on const fp. */
4773 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4774 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4775
4776 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
4777 rtx tmp = gen_reg_rtx (vec_int_mode);
4778 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
4779
4780 /* Step-4: Convert to floating-point on mask for the roundeven result. */
4781 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
4782
4783 /* Step-5: Retrieve the sign bit for -0.0. */
4784 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4785 }
4786
4787 /* Handling the rounding from floating-point to int/long/long long. */
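/* Illustrative examples for the size cases below (instruction names given
   informally): a DFmode -> DImode conversion is the same-size case
   (vfcvt.x.f.v), DFmode -> SImode is the narrowing case (vfncvt.x.f.w),
   SFmode -> DImode is the widening case (vfwcvt.x.f.v), and HFmode -> DImode
   first widens HF to SF and then uses the widening convert.  */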
4788 static void
4789 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
4790 machine_mode vec_fp_mode,
4791 machine_mode vec_int_mode,
4792 machine_mode vec_bridge_mode = E_VOIDmode)
4793 {
4794 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
4795 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
4796
4797 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */
4798 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4799 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */
4800 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4801 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */
4802 emit_vec_widden_cvt_x_f (op_0, op_1, type, vec_int_mode);
4803 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. */
4804 {
4805 gcc_assert (vec_bridge_mode != E_VOIDmode);
4806
4807 rtx op_sf = gen_reg_rtx (vec_bridge_mode);
4808
4809 /* Step-1: HF => SF, no rounding here. */
4810 emit_vec_widden_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
4811 /* Step-2: SF => DI. */
4812 emit_vec_widden_cvt_x_f (op_0, op_sf, type, vec_int_mode);
4813 }
4814 else
4815 gcc_unreachable ();
4816 }
4817
4818 void
4819 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4820 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4821 {
4822 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
4823 vec_int_mode, vec_bridge_mode);
4824 }
4825
4826 void
4827 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4828 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4829 {
4830 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
4831 vec_int_mode, vec_bridge_mode);
4832 }
4833
4834 void
4835 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4836 machine_mode vec_int_mode)
4837 {
4838 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
4839 vec_int_mode);
4840 }
4841
4842 void
4843 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4844 machine_mode vec_int_mode)
4845 {
4846 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
4847 vec_int_mode);
4848 }
4849
4850 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
4851 well. */
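/* Worked example on a single 16-bit element (illustrative only), src = 0x00ff:
     x1 = 0x00ff - ((0x00ff >> 1) & 0x5555) = 0x00ff - 0x0055 = 0x00aa
     x2 = (0x00aa & 0x3333) + ((0x00aa >> 2) & 0x3333) = 0x0022 + 0x0022 = 0x0044
     x3 = (0x0044 + (0x0044 >> 4)) & 0x0f0f = 0x0048 & 0x0f0f = 0x0008
     x4 = (0x0008 * 0x0101) >> (16 - 8) = 0x0808 >> 8 = 0x0008
   giving popcount (0x00ff) = 8.  */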
4852 void
4853 expand_popcount (rtx *ops)
4854 {
4855 rtx dst = ops[0];
4856 rtx src = ops[1];
4857 machine_mode mode = GET_MODE (dst);
4858 scalar_mode imode = GET_MODE_INNER (mode);
4859 static const uint64_t m5 = 0x5555555555555555ULL;
4860 static const uint64_t m3 = 0x3333333333333333ULL;
4861 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
4862 static const uint64_t m1 = 0x0101010101010101ULL;
4863
4864 rtx x1 = gen_reg_rtx (mode);
4865 rtx x2 = gen_reg_rtx (mode);
4866 rtx x3 = gen_reg_rtx (mode);
4867 rtx x4 = gen_reg_rtx (mode);
4868
4869 /* x1 = src - ((src >> 1) & 0x5555...); */
4870 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
4871 OPTAB_DIRECT);
4872
4873 rtx and1 = gen_reg_rtx (mode);
4874 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
4875 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4876 ops1);
4877
4878 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
4879
4880 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
4881 */
4882 rtx and2 = gen_reg_rtx (mode);
4883 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
4884 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4885 ops2);
4886
4887 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
4888 OPTAB_DIRECT);
4889
4890 rtx and22 = gen_reg_rtx (mode);
4891 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
4892 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4893 ops22);
4894
4895 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
4896
4897 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
4898 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
4899 OPTAB_DIRECT);
4900
4901 rtx plus3
4902 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
4903
4904 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
4905 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4906 ops3);
4907
4908 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
4909 rtx mul4 = gen_reg_rtx (mode);
4910 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
4911 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
4912 ops4);
4913
4914 x4 = expand_binop (mode, lshr_optab, mul4,
4915 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
4916 OPTAB_DIRECT);
4917
4918 emit_move_insn (dst, x4);
4919 }
4920
4921 /* Return true if it is VLMAX AVL TYPE. */
4922 bool
4923 vlmax_avl_type_p (rtx_insn *rinsn)
4924 {
4925 extract_insn_cached (rinsn);
4926 int index = get_attr_avl_type_idx (rinsn);
4927 if (index == INVALID_ATTRIBUTE)
4928 return false;
4929 rtx avl_type = recog_data.operand[index];
4930 return INTVAL (avl_type) == VLMAX;
4931 }
4932
4933 /* Return true if it is an RVV instruction that depends on the global VL
4934 status register. */
4935 bool
4936 has_vl_op (rtx_insn *rinsn)
4937 {
4938 return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
4939 }
4940
4941 /* Get default tail policy. */
4942 static bool
4943 get_default_ta ()
4944 {
4945 /* For instructions that don't require TA, we still need a default value
4946 to emit vsetvl. We pick the default value according to the preferred policy. */
4947 return (bool) (get_prefer_tail_policy () & 0x1
4948 || (get_prefer_tail_policy () >> 1 & 0x1));
4949 }
4950
4951 /* Helper function to get TA operand. */
4952 bool
4953 tail_agnostic_p (rtx_insn *rinsn)
4954 {
4955 /* If it doesn't have TA, we return agnostic by default. */
4956 extract_insn_cached (rinsn);
4957 int ta = get_attr_ta (rinsn);
4958 return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
4959 }
4960
4961 /* Change the insn and assert that the change succeeds. */
4962 void
4963 validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
4964 {
4965 bool change_p = validate_change (object, loc, new_rtx, in_group);
4966 gcc_assert (change_p);
4967 }
4968
4969 /* Return true if it is NONVLMAX AVL TYPE. */
4970 bool
4971 nonvlmax_avl_type_p (rtx_insn *rinsn)
4972 {
4973 extract_insn_cached (rinsn);
4974 int index = get_attr_avl_type_idx (rinsn);
4975 if (index == INVALID_ATTRIBUTE)
4976 return false;
4977 rtx avl_type = recog_data.operand[index];
4978 return INTVAL (avl_type) == NONVLMAX;
4979 }
4980
4981 /* Return true if RTX is RVV VLMAX AVL. */
4982 bool
4983 vlmax_avl_p (rtx x)
4984 {
4985 return x && rtx_equal_p (x, RVV_VLMAX);
4986 }
4987
4988 /* Helper function to get SEW operand. We always have SEW value for
4989 all RVV instructions that have VTYPE OP. */
4990 uint8_t
4991 get_sew (rtx_insn *rinsn)
4992 {
4993 return get_attr_sew (rinsn);
4994 }
4995
4996 /* Helper function to get VLMUL operand. We always have VLMUL value for
4997 all RVV instructions that have VTYPE OP. */
4998 enum vlmul_type
4999 get_vlmul (rtx_insn *rinsn)
5000 {
5001 return (enum vlmul_type) get_attr_vlmul (rinsn);
5002 }
5003
5004 /* Count the number of occurrences of REGNO in RINSN. */
5005 int
5006 count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
5007 {
5008 int count = 0;
5009 extract_insn (rinsn);
5010 for (int i = 0; i < recog_data.n_operands; i++)
5011 if (refers_to_regno_p (regno, recog_data.operand[i]))
5012 count++;
5013 return count;
5014 }
5015
5016 /* Return true if OP can be broadcast directly. */
5017 bool
5018 can_be_broadcasted_p (rtx op)
5019 {
5020 machine_mode mode = GET_MODE (op);
5021 /* We don't allow RA (register allocation) reloads to generate
5022 (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
5023 (vec_duplicate:DI mem) on an RV32 system. */
5024 if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
5025 && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
5026 && !satisfies_constraint_Wdm (op))
5027 return false;
5028
5029 if (satisfies_constraint_K (op) || register_operand (op, mode)
5030 || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
5031 return true;
5032
5033 return can_create_pseudo_p () && nonmemory_operand (op, mode);
5034 }
5035
5036 void
5037 emit_vec_extract (rtx target, rtx src, rtx index)
5038 {
5039 machine_mode vmode = GET_MODE (src);
5040 machine_mode smode = GET_MODE (target);
5041 class expand_operand ops[3];
5042 enum insn_code icode
5043 = convert_optab_handler (vec_extract_optab, vmode, smode);
5044 gcc_assert (icode != CODE_FOR_nothing);
5045 create_output_operand (&ops[0], target, smode);
5046 ops[0].target = 1;
5047 create_input_operand (&ops[1], src, vmode);
5048
5049 poly_int64 val;
5050 if (poly_int_rtx_p (index, &val))
5051 create_integer_operand (&ops[2], val);
5052 else
5053 create_input_operand (&ops[2], index, Pmode);
5054
5055 expand_insn (icode, 3, ops);
5056 if (ops[0].value != target)
5057 emit_move_insn (target, ops[0].value);
5058 }
5059
5060 /* Return true if the offset mode is a valid mode that we can use for
5061 gather/scatter autovectorization. */
5062 bool
5063 gather_scatter_valid_offset_p (machine_mode mode)
5064 {
5065 /* If the element size of offset mode is already >= Pmode size,
5066 we don't need any extensions. */
5067 if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
5068 return true;
5069
5070 /* Since we will very likely have to extend the offset mode into a vector
5071 Pmode, disable gather/scatter autovectorization if we can't extend the
5072 offset mode into vector Pmode. */
5073 if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
5074 return false;
5075 return true;
5076 }
5077
5078 /* Implement TARGET_ESTIMATED_POLY_VALUE.
5079 Look into the tuning structure for an estimate.
5080 KIND specifies the type of requested estimate: min, max or likely.
5081 For cores with a known VLA width all three estimates are the same.
5082 For generic VLA tuning we want to distinguish the maximum estimate from
5083 the minimum and likely ones.
5084 The likely estimate is the same as the minimum in that case to give a
5085 conservative behavior of auto-vectorizing with VLA when it is a win
5086 even for VLA vectorization.
5087 When VLA width information is available VAL.coeffs[1] is multiplied by
5088 the number of VLA chunks over the initial VLS bits. */
5089 HOST_WIDE_INT
5090 estimated_poly_value (poly_int64 val, unsigned int kind)
5091 {
5092 unsigned int width_source
5093 = BITS_PER_RISCV_VECTOR.is_constant ()
5094 ? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
5095 : (unsigned int) RVV_SCALABLE;
5096
5097 /* If there is no core-specific information then the minimum and likely
5098 values are based on TARGET_MIN_VLEN vectors and the maximum is based on
5099 the architectural maximum of 65536 bits. */
5100 unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
5101 if (width_source == RVV_SCALABLE)
5102 switch (kind)
5103 {
5104 case POLY_VALUE_MIN:
5105 case POLY_VALUE_LIKELY:
5106 return val.coeffs[0];
5107
5108 case POLY_VALUE_MAX:
5109 return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
5110 }
5111
5112 /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
5113 lowest as likely. This could be made more general if future -mtune
5114 options need it to be. */
5115 if (kind == POLY_VALUE_MAX)
5116 width_source = 1 << floor_log2 (width_source);
5117 else
5118 width_source = least_bit_hwi (width_source);
5119
5120 /* If the core provides width information, use that. */
5121 HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
5122 return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
5123 }
5124
5125 } // namespace riscv_vector