This patch adds the vec_mask_len_{load_lanes,store_lanes} autovectorization patterns.
We want to support autovectorization of the following loop, whose accesses to b[] form a two-element interleaved group and are therefore vectorized with load-lanes:
void
foo (int8_t *__restrict a,
     int8_t *__restrict b,
     int8_t *__restrict cond,
     int n)
{
  for (intptr_t i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i] = b[i * 2] + b[i * 2 + 1];
    }
}
ARM SVE IR:
https://godbolt.org/z/cro1Eqc6a
# loop_mask_60 = PHI <next_mask_82(4), max_mask_81(3)>
...
mask__39.12_63 = vect__3.11_61 != { 0, ... };
vec_mask_and_66 = loop_mask_60 & mask__39.12_63;
...
vect_array.15 = .MASK_LOAD_LANES (_57, 8B, vec_mask_and_66);
...
For RVV, the loop is controlled by a length computed by SELECT_VL rather than by a loop mask, so the loop mask and the vec_mask_and disappear and only the conditional mask remains. We would like to see the following IR:
loop_len = SELECT_VL;
...
mask__39.12_63 = vect__3.11_61 != { 0, ... };
...
vect_array.15 = .MASK_LEN_LOAD_LANES (_57, 8B, mask__39.12_63, loop_len, bias);
...
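For reference, here is a minimal scalar sketch of the semantics of the two new internal functions. The helper names below are purely illustrative and not part of the patch; the loops mirror the md.texi pseudocode added in this patch:

#include <stdint.h>

/* Illustrative reference model of .MASK_LEN_LOAD_LANES: de-interleave
   SRC into the C destination vectors DEST[0..C-1], but only for element
   indices J below LEN + BIAS whose mask bit is set.  All other
   destination elements are undefined (left untouched here).  */
static void
ref_mask_len_load_lanes (int8_t *dest[], const int8_t *src,
                         const uint8_t *mask, int c, int len, int bias)
{
  for (int j = 0; j < len + bias; j++)
    if (mask[j])
      for (int i = 0; i < c; i++)
        dest[i][j] = src[j * c + i];
}

/* Illustrative reference model of .MASK_LEN_STORE_LANES: the inverse
   direction; only active elements below LEN + BIAS are interleaved and
   written back to DEST.  */
static void
ref_mask_len_store_lanes (int8_t *dest, int8_t *const src[],
                          const uint8_t *mask, int c, int len, int bias)
{
  for (int j = 0; j < len + bias; j++)
    if (mask[j])
      for (int i = 0; i < c; i++)
        dest[j * c + i] = src[i][j];
}

In the example loop above, c is 2 (the two-element group b[i * 2] and b[i * 2 + 1]) and the length comes from SELECT_VL.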
Bootstrapped and regression tested on X86; all tests passed.
OK for trunk?
gcc/ChangeLog:

        * doc/md.texi: Add vec_mask_len_{load_lanes,store_lanes} patterns.
        * internal-fn.cc (expand_partial_load_optab_fn): Ditto.
        (expand_partial_store_optab_fn): Ditto.
        * internal-fn.def (MASK_LEN_LOAD_LANES): Ditto.
        (MASK_LEN_STORE_LANES): Ditto.
        * optabs.def (OPTAB_CD): Ditto.
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{vec_mask_len_load_lanes@var{m}@var{n}} instruction pattern
+@item @samp{vec_mask_len_load_lanes@var{m}@var{n}}
+Like @samp{vec_load_lanes@var{m}@var{n}}, but takes an additional
+mask operand (operand 2), length operand (operand 3) and bias operand
+(operand 4) that together specify which elements of the destination
+vectors should be loaded.  Other elements of the destination vectors
+are undefined.  The operation is equivalent to:
+
+@smallexample
+int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n});
+for (j = 0; j < operand3 + operand4; j++)
+  if (operand2[j])
+    for (i = 0; i < c; i++)
+      operand0[i][j] = operand1[j * c + i];
+@end smallexample
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{vec_store_lanes@var{m}@var{n}} instruction pattern
 @item @samp{vec_store_lanes@var{m}@var{n}}
 Equivalent to @samp{vec_load_lanes@var{m}@var{n}}, with the memory
 and register operands reversed.
 ...
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{vec_mask_len_store_lanes@var{m}@var{n}} instruction pattern
+@item @samp{vec_mask_len_store_lanes@var{m}@var{n}}
+Like @samp{vec_store_lanes@var{m}@var{n}}, but takes an additional
+mask operand (operand 2), length operand (operand 3) and bias operand
+(operand 4) that together specify which elements of the source vectors
+should be stored.  The operation is equivalent to:
+
+@smallexample
+int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n});
+for (j = 0; j < operand3 + operand4; j++)
+  if (operand2[j])
+    for (i = 0; i < c; i++)
+      operand0[j * c + i] = operand1[i][j];
+@end smallexample
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{gather_load@var{m}@var{n}} instruction pattern
 @item @samp{gather_load@var{m}@var{n}}
 Load several separate memory locations into a vector of mode @var{m}.
   type = TREE_TYPE (lhs);
   rhs = expand_call_mem_ref (type, stmt, 0);
-  if (optab == vec_mask_load_lanes_optab)
+  if (optab == vec_mask_load_lanes_optab
+      || optab == vec_mask_len_load_lanes_optab)
     icode = get_multi_vector_move (type, optab);
   else if (optab == len_load_optab)
     icode = direct_optab_handler (optab, TYPE_MODE (type));
   type = TREE_TYPE (rhs);
   lhs = expand_call_mem_ref (type, stmt, 0);
-  if (optab == vec_mask_store_lanes_optab)
+  if (optab == vec_mask_store_lanes_optab
+      || optab == vec_mask_len_store_lanes_optab)
     icode = get_multi_vector_move (type, optab);
   else if (optab == len_store_optab)
     icode = direct_optab_handler (optab, TYPE_MODE (type));
    - mask_load: currently just maskload
    - load_lanes: currently just vec_load_lanes
    - mask_load_lanes: currently just vec_mask_load_lanes
+   - mask_len_load_lanes: currently just vec_mask_len_load_lanes
    - gather_load: used for {mask_,mask_len_,}gather_load
    - len_load: currently just len_load
    - mask_len_load: currently just mask_len_load
 
    - mask_store: currently just maskstore
    - store_lanes: currently just vec_store_lanes
    - mask_store_lanes: currently just vec_mask_store_lanes
+   - mask_len_store_lanes: currently just vec_mask_len_store_lanes
    - scatter_store: used for {mask_,mask_len_,}scatter_store
    - len_store: currently just len_store
    - mask_len_store: currently just mask_len_store
 DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes)
 DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
                        vec_mask_load_lanes, mask_load_lanes)
+DEF_INTERNAL_OPTAB_FN (MASK_LEN_LOAD_LANES, ECF_PURE,
+                       vec_mask_len_load_lanes, mask_load_lanes)
 DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load)
 DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
 DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
 DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
                        vec_mask_store_lanes, mask_store_lanes)
+DEF_INTERNAL_OPTAB_FN (MASK_LEN_STORE_LANES, 0,
+                       vec_mask_len_store_lanes, mask_store_lanes)
 DEF_INTERNAL_OPTAB_FN (VCOND, ECF_CONST | ECF_NOTHROW, vcond, vec_cond)
 DEF_INTERNAL_OPTAB_FN (VCONDU, ECF_CONST | ECF_NOTHROW, vcondu, vec_cond)
 OPTAB_CD(vec_store_lanes_optab, "vec_store_lanes$a$b")
 OPTAB_CD(vec_mask_load_lanes_optab, "vec_mask_load_lanes$a$b")
 OPTAB_CD(vec_mask_store_lanes_optab, "vec_mask_store_lanes$a$b")
+OPTAB_CD(vec_mask_len_load_lanes_optab, "vec_mask_len_load_lanes$a$b")
+OPTAB_CD(vec_mask_len_store_lanes_optab, "vec_mask_len_store_lanes$a$b")
 OPTAB_CD(vcond_optab, "vcond$a$b")
 OPTAB_CD(vcondu_optab, "vcondu$a$b")
 OPTAB_CD(vcondeq_optab, "vcondeq$a$b")