#define IN_TARGET_CODE 1
+/* RVV instruction patterns have a maximum of 11 operands according to
+   vector.md.  */
+#define RVV_INSN_OPERANDS_MAX 11
+
#include "config.h"
#include "system.h"
#include "coretypes.h"
: rtx_vector_builder (mode, npatterns, nelts_per_pattern)
{
m_inner_mode = GET_MODE_INNER (mode);
- m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant ();
+ m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
+ m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
+
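+  /* The merge masks are materialized in an integer mode of the same
+     width as the inner mode, so such a mode must exist.  */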
+ gcc_assert (
+ int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
}
bool can_duplicate_repeating_sequence_p ();
rtx get_merged_repeating_sequence ();
+ bool repeating_sequence_use_merge_profitable_p ();
+ rtx get_merge_scalar_mask (unsigned int) const;
+
machine_mode new_mode () const { return m_new_mode; }
+ scalar_mode inner_mode () const { return m_inner_mode; }
+ scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
+ unsigned int inner_bits_size () const { return m_inner_bits_size; }
+ unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
private:
- machine_mode m_inner_mode;
+ scalar_mode m_inner_mode;
+ scalar_int_mode m_inner_int_mode;
machine_mode m_new_mode;
scalar_int_mode m_new_inner_mode;
- unsigned int m_inner_size;
+ unsigned int m_inner_bits_size;
+ unsigned int m_inner_bytes_size;
};
/* Return true if the vector duplicated by a super element which is the fusion
rvv_builder::can_duplicate_repeating_sequence_p ()
{
poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
- unsigned int new_inner_size = m_inner_size * npatterns ();
+ unsigned int new_inner_size = m_inner_bits_size * npatterns ();
if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
|| GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
|| !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
    return false;
  return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
}
+/* Return true if it is a repeating sequence for which the merge
+   approach generates better code than the default approach
+   (slide1down).
+
+ Sequence A:
+ {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
+
+ nelts = 16
+ npatterns = 2
+
+ for merging a we need mask 101010....
+ for merging b we need mask 010101....
+
+   For each element in the pattern, we need to build a mask in a scalar
+   register.  Most often we need 3 instructions (aka COST = 3), which
+   consist of 2 scalar instructions and 1 scalar move to the v0 register.
+   Finally we need a vector merge to merge them.
+
+ lui a5, #imm
+ add a5, #imm
+ vmov.s.x v0, a5
+ vmerge.vxm v9, v9, a1, v0
+
+   So the overall (rough) COST of Sequence A = (3 + 1) * npatterns = 8.
+ If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
+ So return true in this case as it is profitable.
+
+ Sequence B:
+ {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
+
+ nelts = 16
+ npatterns = 8
+
+ COST of merge approach = (3 + 1) * npatterns = 24
+ COST of slide1down approach = nelts = 16
+   Return false in this case as the merge approach is NOT profitable.  */
+bool
+rvv_builder::repeating_sequence_use_merge_profitable_p ()
+{
+ if (inner_bytes_size () > UNITS_PER_WORD)
+ return false;
+
+ unsigned int nelts = full_nelts ().to_constant ();
+
+ if (!repeating_sequence_p (0, nelts, npatterns ()))
+ return false;
+
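+  /* Costs follow the analysis in the function comment: building each
+     merge mask takes roughly 3 instructions plus 1 vmerge, while
+     slide1down needs 1 instruction per element.  */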
+ unsigned int merge_cost = 1;
+ unsigned int build_merge_mask_cost = 3;
+ unsigned int slide1down_cost = nelts;
+
+ return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
+}
+
/* Merge the repeating sequence into a single element and return the RTX. */
rtx
rvv_builder::get_merged_repeating_sequence ()
scalar_int_mode mode = Pmode;
rtx target = gen_reg_rtx (mode);
emit_move_insn (target, const0_rtx);
- rtx imm = gen_int_mode ((1ULL << m_inner_size) - 1, mode);
+ rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
/* { a, b, a, b }: Generate duplicate element = b << bits | a. */
for (unsigned int i = 0; i < npatterns (); i++)
{
- unsigned int loc = m_inner_size * i;
+ unsigned int loc = m_inner_bits_size * i;
rtx shift = gen_int_mode (loc, mode);
rtx ele = gen_lowpart (mode, elt (i));
rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
return target;
}
+/* Get the scalar mask for the merge approach.
+
+   Consider the following case:
+   {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
+   To merge "a", the mask should be 1010....
+   To merge "b", the mask should be 0101....  */
+rtx
+rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const
+{
+ unsigned HOST_WIDE_INT mask = 0;
+ unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
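+  /* E.g. npatterns = 2, index_in_pattern = 1, BITS_PER_WORD = 64:
+     base_mask = 0b10 is OR'ed in at bit offsets 0, 2, 4, ..., giving
+     mask = 0b...101010, which selects every "b" position.  */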
+
+ gcc_assert (BITS_PER_WORD % npatterns () == 0);
+
+ int limit = BITS_PER_WORD / npatterns ();
+
+ for (int i = 0; i < limit; i++)
+ mask |= base_mask << (i * npatterns ());
+
+ return gen_int_mode (mask, inner_int_mode ());
+}
+
/* Subroutine of riscv_vector_expand_vector_init.
Works as follows:
(a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
}
}
+/* Emit vmv.s.x instruction. */
+
+static void
+emit_scalar_move_insn (unsigned icode, rtx *ops)
+{
+ machine_mode data_mode = GET_MODE (ops[0]);
+ machine_mode mask_mode = get_mask_mode (data_mode).require ();
+ insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP,
+ /* HAS_DEST_P */ true,
+ /* FULLY_UNMASKED_P */ false,
+ /* USE_REAL_MERGE_P */ true,
+ /* HAS_AVL_P */ true,
+ /* VLMAX_P */ false,
+ data_mode, mask_mode);
+ e.set_policy (TAIL_ANY);
+ e.set_policy (MASK_ANY);
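+  /* The scalar move writes only element 0, so a VL of 1 is enough.  */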
+ e.set_vl (CONST1_RTX (Pmode));
+ e.emit_insn ((enum insn_code) icode, ops);
+}
+
+/* Emit vmv.v.x instruction with vlmax. */
+
+static void
+emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl)
+{
+ emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl);
+}
+
+/* Emit vmv.v.x instruction with nonvlmax. */
+
+static void
+emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl)
+{
+ emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl);
+}
+
+/* Return the vector mode used to broadcast the merge mask.  The element
+   count is capped so that the mode fits in a single vector register.  */
+
+static machine_mode
+get_repeating_sequence_dup_machine_mode (const rvv_builder &builder)
+{
+ poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ());
+
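+  /* If the builder's mode spans more than one vector register, cap the
+     broadcast at a single register's worth of elements.  */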
+ if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR))
+ {
+ dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR,
+ builder.inner_bytes_size ());
+ }
+
+ return get_vector_mode (builder.inner_int_mode (), dup_nunits).require ();
+}
+
+/* Use the merge approach to initialize a vector with a repeating sequence.
+   v = {a, b, a, b, a, b, a, b}.
+
+   v = broadcast (a).
+   mask = 0b01010101....
+   v = merge (v, b, mask)  */
+static void
+expand_vector_init_merge_repeating_sequence (rtx target,
+ const rvv_builder &builder)
+{
+ machine_mode dup_mode = get_repeating_sequence_dup_machine_mode (builder);
+ machine_mode dup_mask_mode = get_mask_mode (dup_mode).require ();
+ machine_mode mask_mode = get_mask_mode (builder.mode ()).require ();
+ uint64_t full_nelts = builder.full_nelts ().to_constant ();
+
+  /* Step 1: Broadcast the first pattern element.  */
+ rtx ops[] = {target, force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))};
+ emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()),
+ ops, NULL_RTX);
+
+  /* Step 2: Merge in the remaining patterns, one at a time.  */
+ for (unsigned int i = 1; i < builder.npatterns (); i++)
+ {
+      /* Step 2-1: Generate the mask (goes into v0) for this merge.  */
+ rtx merge_mask = builder.get_merge_scalar_mask (i);
+ rtx mask = gen_reg_rtx (mask_mode);
+ rtx dup = gen_reg_rtx (dup_mode);
+
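+      /* If the whole mask fits in one machine word, a single vmv.s.x
+         sets all of its bits at once; otherwise broadcast the repeating
+         mask pattern with vmv.v.x using a small VL instead of VLMAX.  */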
+ if (full_nelts <= BITS_PER_WORD) /* vmv.s.x. */
+ {
+ rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode),
+ RVV_VUNDEF (dup_mode), merge_mask};
+ emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)),
+ ops);
+ }
+ else /* vmv.v.x. */
+ {
+ rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)};
+ rtx vl = gen_int_mode (CEIL (full_nelts, BITS_PER_WORD), Pmode);
+ emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode),
+ ops, vl);
+ }
+
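+      /* Reinterpret the broadcast integer vector as a mask vector.  */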
+ emit_move_insn (mask, gen_lowpart (mask_mode, dup));
+
+      /* Step 2-2: Merge this pattern element according to the mask.  */
+ rtx ops[] = {target, target, builder.elt (i), mask};
+ emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)),
+ riscv_vector::RVV_MERGE_OP, ops);
+ }
+}
+
/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
void
emit_move_insn (target, gen_lowpart (mode, dup));
return;
}
+
+  /* Case 2: Optimize repeating sequences that Case 1 cannot handle,
+     when doing so is profitable.  For example:
+     ELEMENT BITSIZE = 64.
+     v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
+     We can't find a vector mode for "ab", whose elements would have to
+     be combined into a 128-bit element to duplicate.  */
+ if (v.repeating_sequence_use_merge_profitable_p ())
+ {
+ expand_vector_init_merge_repeating_sequence (target, v);
+ return;
+ }
+
/* TODO: We will support more Initialization of vector in the future. */
}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d" } */
+
+#include <stdint-gcc.h>
+
+typedef int64_t vnx16di __attribute__ ((vector_size (1024)));
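+/* 1024 bytes / 8 = 128 int64_t elements, i.e. {a, b} repeated 64 times.  */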
+
+__attribute__ ((noipa)) void
+f_vnx16di (int64_t a, int64_t b, int64_t *out)
+{
+ vnx16di v = {
+ a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+ a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+ a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+ a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+ };
+ *(vnx16di *) out = v;
+}
+
+/* { dg-final { scan-assembler-times {vmv\.v\.x\s+v[0-9]+,\s*[a-x0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */