1 /* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
2 Copyright (C) 2023-2023 Free Software Foundation, Inc.
3 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
26 #include "coretypes.h"
31 #include "basic-block.h"
34 #include "targhooks.h"
36 #include "fold-const.h"
38 #include "tree-vectorizer.h"
39 #include "gimple-iterator.h"
43 #include "tree-data-ref.h"
44 #include "tree-ssa-loop-niter.h"
46 /* This file should be included last. */
47 #include "riscv-vector-costs.h"
49 namespace riscv_vector
{
51 /* Dynamic LMUL philosophy - Local linear-scan SSA live range based analysis
54 - Collect all vectorize STMTs locally for each loop block.
55 - Build program point based graph, ignore non-vectorize STMTs:
57 vectorize STMT 0 - point 0
58 scalar STMT 0 - ignore.
59 vectorize STMT 1 - point 1
61 - Compute the number of live V_REGs live at each program point
62 - Determine LMUL in VECTOR COST model according to the program point
63 which has maximum live V_REGs.
67 - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
68 It's important for mixed size auto-vectorization (Conversions, ... etc).
69 E.g. For a loop that is vectorizing conversion of INT32 -> INT64.
70 The biggest mode is DImode and LMUL = 8, LMUL = 4 for SImode.
71 We compute the number live V_REGs at each program point according to
73 - We only compute program points and live ranges locally (within a block)
74 since we just need to compute the number of live V_REGs at each program
75 point and we are not really allocating the registers for each SSA.
76 We can make the variable has another local live range in another block
77 if it live out/live in to another block. Such approach doesn't affect
78 our accurate live range analysis.
79 - Current analysis didn't consider any instruction scheduling which
80 may improve the register pressure. So we are conservatively doing the
81 analysis which may end up with smaller LMUL.
82 TODO: Maybe we could support a reasonable live range shrink algorithm
83 which take advantage of instruction scheduling.
84 - We may have these following possible autovec modes analysis:
86 1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8
87 2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2)
88 3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1)
91 /* Collect all STMTs that are vectorized and compute their program points.
92 Note that we don't care about the STMTs that are not vectorized and
93 we only build the local graph (within a block) of program points.
97 STMT 1 (be vectorized) -- point 0
98 STMT 2 (not be vectorized) -- ignored
99 STMT 3 (be vectorized) -- point 1
100 STMT 4 (be vectorized) -- point 2
101 STMT 5 (be vectorized) -- point 3
104 STMT 1 (be vectorized) -- point 0
105 STMT 2 (be vectorized) -- point 1
106 STMT 3 (not be vectorized) -- ignored
107 STMT 4 (not be vectorized) -- ignored
108 STMT 5 (be vectorized) -- point 2
112 compute_local_program_points (
114 hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
)
116 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
118 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
119 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
120 unsigned int nbbs
= loop
->num_nodes
;
121 gimple_stmt_iterator si
;
123 /* Collect the stmts that is vectorized and mark their program point. */
124 for (i
= 0; i
< nbbs
; i
++)
127 basic_block bb
= bbs
[i
];
128 vec
<stmt_point
> program_points
= vNULL
;
129 if (dump_enabled_p ())
130 dump_printf_loc (MSG_NOTE
, vect_location
,
131 "Compute local program points for bb %d:\n",
133 for (si
= gsi_start_bb (bbs
[i
]); !gsi_end_p (si
); gsi_next (&si
))
135 if (!(is_gimple_assign (gsi_stmt (si
))
136 || is_gimple_call (gsi_stmt (si
))))
138 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
139 enum stmt_vec_info_type type
140 = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
));
141 if (type
!= undef_vec_info_type
)
143 stmt_point info
= {point
, gsi_stmt (si
)};
144 program_points
.safe_push (info
);
146 if (dump_enabled_p ())
147 dump_printf_loc (MSG_NOTE
, vect_location
,
148 "program point %d: %G", info
.point
,
152 program_points_per_bb
.put (bb
, program_points
);
158 get_biggest_mode (machine_mode mode1
, machine_mode mode2
)
160 unsigned int mode1_size
= GET_MODE_BITSIZE (mode1
).to_constant ();
161 unsigned int mode2_size
= GET_MODE_BITSIZE (mode2
).to_constant ();
162 return mode1_size
>= mode2_size
? mode1
: mode2
;
165 /* Compute local live ranges of each vectorized variable.
166 Note that we only compute local live ranges (within a block) since
167 local live ranges information is accurate enough for us to determine
168 the LMUL/vectorization factor of the loop.
173 STMT 2 (def SSA 1) -- point 1
174 STMT 3 (use SSA 1) -- point 2
180 STMT 4 (use SSA 2) -- point 3
182 The live range of SSA 1 is [1, 3] in bb 2.
183 The live range of SSA 2 is [0, 4] in bb 3. */
185 compute_local_live_ranges (
186 const hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
,
187 hash_map
<basic_block
, hash_map
<tree
, pair
>> &live_ranges_per_bb
)
189 machine_mode biggest_mode
= QImode
;
190 if (!program_points_per_bb
.is_empty ())
192 auto_vec
<tree
> visited_vars
;
194 for (hash_map
<basic_block
, vec
<stmt_point
>>::iterator iter
195 = program_points_per_bb
.begin ();
196 iter
!= program_points_per_bb
.end (); ++iter
)
198 basic_block bb
= (*iter
).first
;
199 vec
<stmt_point
> program_points
= (*iter
).second
;
200 bool existed_p
= false;
201 hash_map
<tree
, pair
> *live_ranges
202 = &live_ranges_per_bb
.get_or_insert (bb
, &existed_p
);
203 gcc_assert (!existed_p
);
204 if (dump_enabled_p ())
205 dump_printf_loc (MSG_NOTE
, vect_location
,
206 "Compute local live ranges for bb %d:\n",
208 for (const auto program_point
: program_points
)
210 unsigned int point
= program_point
.point
;
211 gimple
*stmt
= program_point
.stmt
;
212 tree lhs
= gimple_get_lhs (stmt
);
213 if (lhs
!= NULL_TREE
&& is_gimple_reg (lhs
)
214 && !POINTER_TYPE_P (TREE_TYPE (lhs
)))
216 biggest_mode
= get_biggest_mode (biggest_mode
,
217 TYPE_MODE (TREE_TYPE (lhs
)));
218 bool existed_p
= false;
220 = live_ranges
->get_or_insert (lhs
, &existed_p
);
221 gcc_assert (!existed_p
);
222 live_range
= pair (point
, point
);
224 for (i
= 0; i
< gimple_num_args (stmt
); i
++)
226 tree var
= gimple_arg (stmt
, i
);
227 /* Both IMM and REG are included since a VECTOR_CST may be
228 potentially held in a vector register. However, it's not
229 accurate, since a PLUS_EXPR can be vectorized into vadd.vi
232 TODO: We may elide the cases that the unnecessary IMM in
234 if (poly_int_tree_p (var
)
235 || (is_gimple_val (var
)
236 && !POINTER_TYPE_P (TREE_TYPE (var
))))
239 = get_biggest_mode (biggest_mode
,
240 TYPE_MODE (TREE_TYPE (var
)));
241 bool existed_p
= false;
243 = live_ranges
->get_or_insert (var
, &existed_p
);
245 /* We will grow the live range for each use. */
246 live_range
= pair (live_range
.first
, point
);
248 /* We assume the variable is live from the start of
250 live_range
= pair (0, point
);
254 if (dump_enabled_p ())
255 for (hash_map
<tree
, pair
>::iterator iter
= live_ranges
->begin ();
256 iter
!= live_ranges
->end (); ++iter
)
257 dump_printf_loc (MSG_NOTE
, vect_location
,
258 "%T: type = %T, start = %d, end = %d\n",
259 (*iter
).first
, TREE_TYPE ((*iter
).first
),
260 (*iter
).second
.first
, (*iter
).second
.second
);
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE
, vect_location
, "Biggest mode = %s\n",
265 GET_MODE_NAME (biggest_mode
));
269 /* Compute the mode for MODE, BIGGEST_MODE and LMUL.
271 E.g. If mode = SImode, biggest_mode = DImode, LMUL = M4.
272 Then return RVVM4SImode (LMUL = 4, element mode = SImode). */
274 compute_nregs_for_mode (machine_mode mode
, machine_mode biggest_mode
, int lmul
)
276 unsigned int mode_size
= GET_MODE_SIZE (mode
).to_constant ();
277 unsigned int biggest_size
= GET_MODE_SIZE (biggest_mode
).to_constant ();
278 gcc_assert (biggest_size
>= mode_size
);
279 unsigned int ratio
= biggest_size
/ mode_size
;
283 /* This function helps to determine whether current LMUL will cause
284 potential vector register (V_REG) spillings according to live range
287 - First, compute how many variable are alive of each program point
288 in each bb of the loop.
289 - Second, compute how many V_REGs are alive of each program point
290 in each bb of the loop according the BIGGEST_MODE and the variable
292 - Third, Return the maximum V_REGs are alive of the loop. */
294 max_number_of_live_regs (const basic_block bb
,
295 const hash_map
<tree
, pair
> &live_ranges
,
296 unsigned int max_point
, machine_mode biggest_mode
,
299 unsigned int max_nregs
= 0;
301 unsigned int live_point
= 0;
302 auto_vec
<unsigned int> live_vars_vec
;
303 live_vars_vec
.safe_grow_cleared (max_point
, true);
304 for (hash_map
<tree
, pair
>::iterator iter
= live_ranges
.begin ();
305 iter
!= live_ranges
.end (); ++iter
)
307 tree var
= (*iter
).first
;
308 pair live_range
= (*iter
).second
;
309 for (i
= live_range
.first
+ 1; i
<= live_range
.second
; i
++)
311 machine_mode mode
= TYPE_MODE (TREE_TYPE (var
));
313 = compute_nregs_for_mode (mode
, biggest_mode
, lmul
);
314 live_vars_vec
[i
] += nregs
;
315 if (live_vars_vec
[i
] > max_nregs
)
316 max_nregs
= live_vars_vec
[i
];
320 /* Collect user explicit RVV type. */
321 auto_vec
<basic_block
> all_preds
322 = get_all_dominated_blocks (CDI_POST_DOMINATORS
, bb
);
324 FOR_EACH_SSA_NAME (i
, t
, cfun
)
326 machine_mode mode
= TYPE_MODE (TREE_TYPE (t
));
327 if (!lookup_vector_type_attribute (TREE_TYPE (t
))
328 && !riscv_v_ext_vls_mode_p (mode
))
331 gimple
*def
= SSA_NAME_DEF_STMT (t
);
332 if (gimple_bb (def
) && !all_preds
.contains (gimple_bb (def
)))
335 imm_use_iterator iterator
;
337 FOR_EACH_IMM_USE_FAST (use_p
, iterator
, t
)
339 if (!USE_STMT (use_p
) || is_gimple_debug (USE_STMT (use_p
))
340 || !dominated_by_p (CDI_POST_DOMINATORS
, bb
,
341 gimple_bb (USE_STMT (use_p
))))
344 int regno_alignment
= riscv_get_v_regno_alignment (mode
);
345 max_nregs
+= regno_alignment
;
346 if (dump_enabled_p ())
348 MSG_NOTE
, vect_location
,
349 "Explicit used SSA %T, vectype = %T, mode = %s, cause %d "
350 "V_REG live in bb %d at program point %d\n",
351 t
, TREE_TYPE (t
), GET_MODE_NAME (mode
), regno_alignment
,
352 bb
->index
, live_point
);
357 if (dump_enabled_p ())
359 MSG_NOTE
, vect_location
,
360 "Maximum lmul = %d, At most %d number of live V_REG at program "
361 "point %d for bb %d\n",
362 lmul
, max_nregs
, live_point
, bb
->index
);
366 /* Get STORE value. */
368 get_store_value (gimple
*stmt
)
370 if (is_gimple_call (stmt
) && gimple_call_internal_p (stmt
))
372 if (gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
373 return gimple_call_arg (stmt
, 3);
378 return gimple_assign_rhs1 (stmt
);
381 /* Return true if it is non-contiguous load/store. */
383 non_contiguous_memory_access_p (stmt_vec_info stmt_info
)
385 enum stmt_vec_info_type type
386 = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
));
387 return ((type
== load_vec_info_type
|| type
== store_vec_info_type
)
388 && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info
)));
391 /* Return the LMUL of the current analysis. */
393 compute_estimated_lmul (loop_vec_info loop_vinfo
, machine_mode mode
)
395 gcc_assert (GET_MODE_BITSIZE (mode
).is_constant ());
396 int regno_alignment
= riscv_get_v_regno_alignment (loop_vinfo
->vector_mode
);
397 if (riscv_v_ext_vls_mode_p (loop_vinfo
->vector_mode
))
398 return regno_alignment
;
399 else if (known_eq (LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
), 1U)
400 || LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
).is_constant ())
402 int estimated_vf
= vect_vf_for_cost (loop_vinfo
);
403 return estimated_vf
* GET_MODE_BITSIZE (mode
).to_constant ()
408 /* Estimate the VLA SLP LMUL. */
409 if (regno_alignment
> RVV_M1
)
410 return regno_alignment
;
411 else if (mode
!= QImode
)
414 if (can_div_trunc_p (BYTES_PER_RISCV_VECTOR
,
415 GET_MODE_SIZE (loop_vinfo
->vector_mode
), &ratio
))
427 /* Update the live ranges according PHI.
432 STMT 2 (def SSA 1) -- point 1
433 STMT 3 (use SSA 1) -- point 2
439 STMT 3 (use SSA 2) -- point 2
442 Before this function, the SSA 1 live range is [2, 3] in bb 2
443 and SSA 2 is [0, 3] in bb 3.
445 Then, after this function, we update SSA 1 live range in bb 2
446 into [2, 4] since SSA 1 is live out into bb 3. */
448 update_local_live_ranges (
450 hash_map
<basic_block
, vec
<stmt_point
>> &program_points_per_bb
,
451 hash_map
<basic_block
, hash_map
<tree
, pair
>> &live_ranges_per_bb
,
452 machine_mode
*biggest_mode
)
454 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
458 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
459 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
460 unsigned int nbbs
= loop
->num_nodes
;
463 gimple_stmt_iterator si
;
464 for (i
= 0; i
< nbbs
; i
++)
466 basic_block bb
= bbs
[i
];
467 if (dump_enabled_p ())
468 dump_printf_loc (MSG_NOTE
, vect_location
,
469 "Update local program points for bb %d:\n",
471 for (psi
= gsi_start_phis (bb
); !gsi_end_p (psi
); gsi_next (&psi
))
473 gphi
*phi
= psi
.phi ();
474 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (phi
);
475 if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
))
476 == undef_vec_info_type
)
479 for (j
= 0; j
< gimple_phi_num_args (phi
); j
++)
481 edge e
= gimple_phi_arg_edge (phi
, j
);
482 tree def
= gimple_phi_arg_def (phi
, j
);
483 auto *live_ranges
= live_ranges_per_bb
.get (bb
);
484 auto *live_range
= live_ranges
->get (def
);
485 if (poly_int_tree_p (def
))
487 /* Insert live range of INTEGER_CST or POLY_CST since we will
488 need to allocate a vector register for it.
490 E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
491 into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>
493 The live range for such value is short which only lives
494 from program point 0 to 1. */
497 unsigned int start
= (*live_range
).first
;
498 (*live_range
).first
= 0;
499 if (dump_enabled_p ())
501 MSG_NOTE
, vect_location
,
502 "Update %T start point from %d to 0:\n", def
, start
);
506 live_ranges
->put (def
, pair (0, 1));
507 auto &program_points
= (*program_points_per_bb
.get (bb
));
508 if (program_points
.is_empty ())
510 stmt_point info
= {1, phi
};
511 program_points
.safe_push (info
);
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE
, vect_location
,
515 "Add %T start point from 0 to 1:\n",
520 if (live_range
&& flow_bb_inside_loop_p (loop
, e
->src
))
522 unsigned int start
= (*live_range
).first
;
523 (*live_range
).first
= 0;
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE
, vect_location
,
526 "Update %T start point from %d to %d:\n",
527 def
, start
, (*live_range
).first
);
529 live_ranges
= live_ranges_per_bb
.get (e
->src
);
530 if (!program_points_per_bb
.get (e
->src
))
532 unsigned int max_point
533 = (*program_points_per_bb
.get (e
->src
)).length ();
534 live_range
= live_ranges
->get (def
);
538 unsigned int end
= (*live_range
).second
;
539 (*live_range
).second
= max_point
;
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE
, vect_location
,
542 "Update %T end point from %d to %d:\n", def
,
543 end
, (*live_range
).second
);
546 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
548 if (!(is_gimple_assign (gsi_stmt (si
))
549 || is_gimple_call (gsi_stmt (si
))))
551 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
552 enum stmt_vec_info_type type
553 = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info
));
554 if (non_contiguous_memory_access_p (stmt_info
)
555 /* LOAD_LANES/STORE_LANES doesn't need a perm indice. */
556 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info
)
557 != VMAT_LOAD_STORE_LANES
)
559 /* For non-adjacent load/store STMT, we will potentially
562 1. MASK_LEN_GATHER_LOAD (..., perm indice).
563 2. Continguous load/store + VEC_PERM (..., perm indice)
565 We will be likely using one more vector variable. */
566 unsigned int max_point
567 = (*program_points_per_bb
.get (bb
)).length () - 1;
568 auto *live_ranges
= live_ranges_per_bb
.get (bb
);
569 bool existed_p
= false;
570 tree var
= type
== load_vec_info_type
571 ? gimple_get_lhs (gsi_stmt (si
))
572 : get_store_value (gsi_stmt (si
));
573 tree sel_type
= build_nonstandard_integer_type (
574 TYPE_PRECISION (TREE_TYPE (var
)), 1);
576 = get_biggest_mode (*biggest_mode
, TYPE_MODE (sel_type
));
577 tree sel
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
578 get_identifier ("vect_perm"), sel_type
);
579 pair
&live_range
= live_ranges
->get_or_insert (sel
, &existed_p
);
580 gcc_assert (!existed_p
);
581 live_range
= pair (0, max_point
);
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE
, vect_location
,
584 "Add perm indice %T, start = 0, end = %d\n",
591 /* Compute the maximum live V_REGS. */
593 has_unexpected_spills_p (loop_vec_info loop_vinfo
)
595 /* Compute local program points.
596 It's a fast and effective computation. */
597 hash_map
<basic_block
, vec
<stmt_point
>> program_points_per_bb
;
598 compute_local_program_points (loop_vinfo
, program_points_per_bb
);
600 /* Compute local live ranges. */
601 hash_map
<basic_block
, hash_map
<tree
, pair
>> live_ranges_per_bb
;
602 machine_mode biggest_mode
603 = compute_local_live_ranges (program_points_per_bb
, live_ranges_per_bb
);
605 /* Update live ranges according to PHI. */
606 update_local_live_ranges (loop_vinfo
, program_points_per_bb
,
607 live_ranges_per_bb
, &biggest_mode
);
609 int lmul
= compute_estimated_lmul (loop_vinfo
, biggest_mode
);
610 /* TODO: We calculate the maximum live vars base on current STMTS
611 sequence. We can support live range shrink if it can give us
612 big improvement in the future. */
615 if (!live_ranges_per_bb
.is_empty ())
617 unsigned int max_nregs
= 0;
618 for (hash_map
<basic_block
, hash_map
<tree
, pair
>>::iterator iter
619 = live_ranges_per_bb
.begin ();
620 iter
!= live_ranges_per_bb
.end (); ++iter
)
622 basic_block bb
= (*iter
).first
;
623 unsigned int max_point
624 = (*program_points_per_bb
.get (bb
)).length () + 1;
625 if ((*iter
).second
.is_empty ())
627 /* We prefer larger LMUL unless it causes register spillings. */
629 = max_number_of_live_regs (bb
, (*iter
).second
, max_point
,
631 if (nregs
> max_nregs
)
634 live_ranges_per_bb
.empty ();
635 if (max_nregs
> V_REG_NUM
)
639 if (!program_points_per_bb
.is_empty ())
641 for (hash_map
<basic_block
, vec
<stmt_point
>>::iterator iter
642 = program_points_per_bb
.begin ();
643 iter
!= program_points_per_bb
.end (); ++iter
)
645 vec
<stmt_point
> program_points
= (*iter
).second
;
646 if (!program_points
.is_empty ())
647 program_points
.release ();
649 program_points_per_bb
.empty ();
654 costs::costs (vec_info
*vinfo
, bool costing_for_scalar
)
655 : vector_costs (vinfo
, costing_for_scalar
)
657 if (costing_for_scalar
)
658 m_cost_type
= SCALAR_COST
;
659 else if (riscv_v_ext_vector_mode_p (vinfo
->vector_mode
))
660 m_cost_type
= VLA_VECTOR_COST
;
662 m_cost_type
= VLS_VECTOR_COST
;
665 /* Do one-time initialization of the costs given that we're
666 costing the loop vectorization described by LOOP_VINFO. */
668 costs::analyze_loop_vinfo (loop_vec_info loop_vinfo
)
670 /* Record the number of times that the vector loop would execute,
672 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
673 auto scalar_niters
= max_stmt_executions_int (loop
);
674 if (scalar_niters
>= 0)
676 unsigned int vf
= vect_vf_for_cost (loop_vinfo
);
677 if (LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
678 m_num_vector_iterations
= scalar_niters
/ vf
;
680 m_num_vector_iterations
= CEIL (scalar_niters
, vf
);
683 /* Detect whether we're vectorizing for VLA and should apply the unrolling
684 heuristic described above m_unrolled_vls_niters. */
685 record_potential_vls_unrolling (loop_vinfo
);
687 /* Detect whether the LOOP has unexpected spills. */
688 record_potential_unexpected_spills (loop_vinfo
);
691 /* Analyze the vectorized program stataments and use dynamic LMUL
692 heuristic to detect whether the loop has unexpected spills. */
694 costs::record_potential_unexpected_spills (loop_vec_info loop_vinfo
)
696 /* We only want to apply the heuristic if LOOP_VINFO is being
697 vectorized for VLA and known NITERS VLS loop. */
698 if (riscv_autovec_lmul
== RVV_DYNAMIC
699 && (m_cost_type
== VLA_VECTOR_COST
700 || (m_cost_type
== VLS_VECTOR_COST
701 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))))
703 bool post_dom_available_p
= dom_info_available_p (CDI_POST_DOMINATORS
);
704 if (!post_dom_available_p
)
705 calculate_dominance_info (CDI_POST_DOMINATORS
);
706 m_has_unexpected_spills_p
= has_unexpected_spills_p (loop_vinfo
);
707 if (!post_dom_available_p
)
708 free_dominance_info (CDI_POST_DOMINATORS
);
712 /* Decide whether to use the unrolling heuristic described above
713 m_unrolled_vls_niters, updating that field if so. LOOP_VINFO
714 describes the loop that we're vectorizing. */
716 costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo
)
718 /* We only want to apply the heuristic if LOOP_VINFO is being
719 vectorized for VLA. */
720 if (m_cost_type
!= VLA_VECTOR_COST
)
723 /* We don't want to apply the heuristic to outer loops, since it's
724 harder to track two levels of unrolling. */
725 if (LOOP_VINFO_LOOP (loop_vinfo
)->inner
)
728 /* Only handle cases in which the number of VLS iterations
729 would be known at compile time but the number of SVE iterations
731 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
732 || BYTES_PER_RISCV_VECTOR
.is_constant ())
735 /* Guess how many times the VLS loop would iterate and make
736 sure that it is within the complete unrolling limit. Even if the
737 number of iterations is small enough, the number of statements might
738 not be, which is why we need to estimate the number of statements too. */
739 unsigned int vls_vf
= vect_vf_for_cost (loop_vinfo
);
740 unsigned HOST_WIDE_INT unrolled_vls_niters
741 = LOOP_VINFO_INT_NITERS (loop_vinfo
) / vls_vf
;
742 if (unrolled_vls_niters
> (unsigned int) param_max_completely_peel_times
)
745 /* Record that we're applying the heuristic and should try to estimate
746 the number of statements in the VLS loop. */
747 m_unrolled_vls_niters
= unrolled_vls_niters
;
750 /* Return true if (a) we're applying the VLS vs. VLA unrolling
751 heuristic described above m_unrolled_vls_niters and (b) the heuristic
752 says that we should prefer the VLS loop. */
754 costs::prefer_unrolled_loop () const
756 if (!m_unrolled_vls_stmts
)
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE
, vect_location
,
762 " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED
"\n",
763 m_unrolled_vls_stmts
);
765 /* The balance here is tricky. On the one hand, we can't be sure whether
766 the code is vectorizable with VLS or not. However, even if
767 it isn't vectorizable with VLS, there's a possibility that
768 the scalar code could also be unrolled. Some of the code might then
769 benefit from SLP, or from using LDP and STP. We therefore apply
770 the heuristic regardless of can_use_vls_p. */
771 return (m_unrolled_vls_stmts
772 && (m_unrolled_vls_stmts
773 <= (unsigned int) param_max_completely_peeled_insns
));
777 costs::better_main_loop_than_p (const vector_costs
*uncast_other
) const
779 auto other
= static_cast<const costs
*> (uncast_other
);
780 auto this_loop_vinfo
= as_a
<loop_vec_info
> (this->m_vinfo
);
781 auto other_loop_vinfo
= as_a
<loop_vec_info
> (other
->m_vinfo
);
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_NOTE
, vect_location
,
785 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
786 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
787 vect_vf_for_cost (this_loop_vinfo
),
788 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
789 vect_vf_for_cost (other_loop_vinfo
));
791 /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */
792 if (bool (m_unrolled_vls_stmts
) != bool (other
->m_unrolled_vls_stmts
))
794 bool this_prefer_unrolled
= this->prefer_unrolled_loop ();
795 bool other_prefer_unrolled
= other
->prefer_unrolled_loop ();
796 if (this_prefer_unrolled
!= other_prefer_unrolled
)
798 if (dump_enabled_p ())
799 dump_printf_loc (MSG_NOTE
, vect_location
,
800 "Preferring VLS loop because"
801 " it can be unrolled\n");
802 return other_prefer_unrolled
;
805 else if (riscv_autovec_lmul
== RVV_DYNAMIC
806 && !LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo
))
808 if (other
->m_has_unexpected_spills_p
)
810 if (dump_enabled_p ())
811 dump_printf_loc (MSG_NOTE
, vect_location
,
812 "Preferring smaller LMUL loop because"
813 " it has unexpected spills\n");
820 return vector_costs::better_main_loop_than_p (other
);
824 costs::add_stmt_cost (int count
, vect_cost_for_stmt kind
,
825 stmt_vec_info stmt_info
, slp_tree
, tree vectype
,
826 int misalign
, vect_cost_model_location where
)
829 = targetm
.vectorize
.builtin_vectorization_cost (kind
, vectype
, misalign
);
831 /* Do one-time initialization based on the vinfo. */
832 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
833 if (!m_analyzed_vinfo
)
836 analyze_loop_vinfo (loop_vinfo
);
838 m_analyzed_vinfo
= true;
843 /* If we're applying the VLA vs. VLS unrolling heuristic,
844 estimate the number of statements in the unrolled VLS
845 loop. For simplicitly, we assume that one iteration of the
846 VLS loop would need the same number of statements
847 as one iteration of the VLA loop. */
848 if (where
== vect_body
&& m_unrolled_vls_niters
)
849 m_unrolled_vls_stmts
+= count
* m_unrolled_vls_niters
;
852 return record_stmt_cost (stmt_info
, where
, count
* stmt_cost
);
856 costs::finish_cost (const vector_costs
*scalar_costs
)
858 vector_costs::finish_cost (scalar_costs
);
861 } // namespace riscv_vector