This patch support combining cond extend and reduce_sum to cond widen reduce_sum
like combine the following three insns:
(set (reg:RVVM2HI 149)
(if_then_else:RVVM2HI
(unspec:RVVMF8BI [
(const_vector:RVVMF8BI repeat [
(const_int 1 [0x1])
])
(reg:DI 146)
(const_int 2 [0x2]) repeated x2
(const_int 1 [0x1])
(reg:SI 66 vl)
(reg:SI 67 vtype)
] UNSPEC_VPREDICATE)
(const_vector:RVVM2HI repeat [
(const_int 0 [0])
])
(unspec:RVVM2HI [
(reg:SI 0 zero)
] UNSPEC_VUNDEF)))
(set (reg:RVVM2HI 138)
(if_then_else:RVVM2HI
(reg:RVVMF8BI 135)
(reg:RVVM2HI 148)
(reg:RVVM2HI 149)))
(set (reg:HI 150)
(unspec:HI [
(reg:RVVM2HI 138)
] UNSPEC_REDUC_SUM))
into one insn:
(set (reg:SI 147)
(unspec:SI [
(if_then_else:RVVM2SI
(reg:RVVMF16BI 135)
(sign_extend:RVVM2SI (reg:RVVM1HI 136))
(if_then_else:RVVM2HI
(unspec:RVVMF8BI [
(const_vector:RVVMF8BI repeat [
(const_int 1 [0x1])
])
(reg:DI 146)
(const_int 2 [0x2]) repeated x2
(const_int 1 [0x1])
(reg:SI 66 vl)
(reg:SI 67 vtype)
] UNSPEC_VPREDICATE)
(const_vector:RVVM2HI repeat [
(const_int 0 [0])
])
(unspec:RVVM2HI [
(reg:SI 0 zero)
] UNSPEC_VUNDEF)))
] UNSPEC_REDUC_SUM))
Consider the following C code:
int16_t foo (int8_t *restrict a, int8_t *restrict pred)
{
int16_t sum = 0;
for (int i = 0; i < 16; i += 1)
if (pred[i])
sum += a[i];
return sum;
}
assembly before this patch:
foo:
vsetivli zero,16,e16,m2,ta,ma
li a5,0
vmv.v.i v2,0
vsetvli zero,zero,e8,m1,ta,ma
vl1re8.v v0,0(a1)
vmsne.vi v0,v0,0
vsetvli zero,zero,e16,m2,ta,mu
vle8.v v4,0(a0),v0.t
vmv.s.x v1,a5
vsext.vf2 v2,v4,v0.t
vredsum.vs v2,v2,v1
vmv.x.s a0,v2
slliw a0,a0,16
sraiw a0,a0,16
ret
assembly after this patch:
foo:
li a5,0
vsetivli zero,16,e16,m1,ta,ma
vmv.s.x v3,a5
vsetivli zero,16,e8,m1,ta,ma
vl1re8.v v0,0(a1)
vmsne.vi v0,v0,0
vle8.v v2,0(a0),v0.t
vwredsum.vs v1,v2,v3,v0.t
vsetivli zero,0,e16,m1,ta,ma
vmv.x.s a0,v1
slliw a0,a0,16
sraiw a0,a0,16
ret
gcc/ChangeLog:
* config/riscv/autovec-opt.md (*cond_widen_reduc_plus_scal_<mode>):
New combine patterns.
* config/riscv/riscv-protos.h (enum insn_type): New insn_type.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc-1.c: New test.
* gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc-2.c: New test.
* gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc_run-1.c: New test.
* gcc.target/riscv/rvv/autovec/cond/cond_widen_reduc_run-2.c: New test.
}
[(set_attr "type" "vfwmuladd")])
+;; Combine mask_extend + vredsum to mask_vwredsum[u]
+;; where the mrege of mask_extend is vector const 0
+(define_insn_and_split "*cond_widen_reduc_plus_scal_<mode>"
+ [(set (match_operand:<V_DOUBLE_EXTEND_VEL> 0 "register_operand")
+ (unspec:<V_DOUBLE_EXTEND_VEL> [
+ (if_then_else:<V_DOUBLE_EXTEND>
+ (match_operand:<VM> 1 "register_operand")
+ (any_extend:<V_DOUBLE_EXTEND>
+ (match_operand:VI_QHS_NO_M8 2 "register_operand"))
+ (if_then_else:<V_DOUBLE_EXTEND>
+ (unspec:<VM> [
+ (match_operand:<VM> 3 "vector_all_trues_mask_operand")
+ (match_operand 6 "vector_length_operand")
+ (match_operand 7 "const_int_operand")
+ (match_operand 8 "const_int_operand")
+ (match_operand 9 "const_1_or_2_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)
+ ] UNSPEC_VPREDICATE)
+ (match_operand:<V_DOUBLE_EXTEND> 5 "vector_const_0_operand")
+ (match_operand:<V_DOUBLE_EXTEND> 4 "vector_merge_operand")))
+ ] UNSPEC_REDUC_SUM))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx ops[] = {operands[0], operands[2], operands[1],
+ gen_int_mode (GET_MODE_NUNITS (<MODE>mode), Pmode)};
+ riscv_vector::expand_reduction (<WREDUC_UNSPEC>,
+ riscv_vector::REDUCE_OP_M,
+ ops, CONST0_RTX (<V_DOUBLE_EXTEND_VEL>mode));
+ DONE;
+}
+[(set_attr "type" "vector")])
+
+;; Combine mask_extend + vfredsum to mask_vfwredusum
+;; where the mrege of mask_extend is vector const 0
+(define_insn_and_split "*cond_widen_reduc_plus_scal_<mode>"
+ [(set (match_operand:<V_DOUBLE_EXTEND_VEL> 0 "register_operand")
+ (unspec:<V_DOUBLE_EXTEND_VEL> [
+ (if_then_else:<V_DOUBLE_EXTEND>
+ (match_operand:<VM> 1 "register_operand")
+ (float_extend:<V_DOUBLE_EXTEND>
+ (match_operand:VF_HS_NO_M8 2 "register_operand"))
+ (if_then_else:<V_DOUBLE_EXTEND>
+ (unspec:<VM> [
+ (match_operand:<VM> 3 "vector_all_trues_mask_operand")
+ (match_operand 6 "vector_length_operand")
+ (match_operand 7 "const_int_operand")
+ (match_operand 8 "const_int_operand")
+ (match_operand 9 "const_1_or_2_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)
+ ] UNSPEC_VPREDICATE)
+ (match_operand:<V_DOUBLE_EXTEND> 5 "vector_const_0_operand")
+ (match_operand:<V_DOUBLE_EXTEND> 4 "vector_merge_operand")))
+ ] UNSPEC_REDUC_SUM_UNORDERED))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx ops[] = {operands[0], operands[2], operands[1],
+ gen_int_mode (GET_MODE_NUNITS (<MODE>mode), Pmode)};
+ riscv_vector::expand_reduction (UNSPEC_WREDUC_SUM_UNORDERED,
+ riscv_vector::REDUCE_OP_M_FRM_DYN,
+ ops, CONST0_RTX (<V_DOUBLE_EXTEND_VEL>mode));
+ DONE;
+}
+[(set_attr "type" "vector")])
+
;; =============================================================================
;; Misc combine patterns
;; =============================================================================
/* For vreduce, no mask policy operand. */
REDUCE_OP = __NORMAL_OP_TA | BINARY_OP_P | VTYPE_MODE_FROM_OP1_P,
+ REDUCE_OP_M = __MASK_OP_TA | BINARY_OP_P | VTYPE_MODE_FROM_OP1_P,
REDUCE_OP_FRM_DYN = REDUCE_OP | FRM_DYN_P | VTYPE_MODE_FROM_OP1_P,
REDUCE_OP_M_FRM_DYN
= __MASK_OP_TA | BINARY_OP_P | FRM_DYN_P | VTYPE_MODE_FROM_OP1_P,
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvfh_zvl128b -mabi=lp64d --param riscv-autovec-preference=fixed-vlmax --param riscv-autovec-lmul=m2 -fno-vect-cost-model -ffast-math" } */
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE1, TYPE2, N) \
+ __attribute__ ((noipa)) \
+ TYPE1 reduc_##TYPE1##_##TYPE2 (TYPE2 *restrict a, TYPE2 *restrict pred) \
+ { \
+ TYPE1 sum = 0; \
+ for (int i = 0; i < N; i += 1) \
+ if (pred[i]) \
+ sum += a[i]; \
+ return sum; \
+ }
+
+#define TEST_ALL(TEST) \
+ TEST (int16_t, int8_t, 16) \
+ TEST (int32_t, int16_t, 8) \
+ TEST (int64_t, int32_t, 4) \
+ TEST (uint16_t, uint8_t, 16) \
+ TEST (uint32_t, uint16_t, 8) \
+ TEST (uint64_t, uint32_t, 4) \
+ TEST (float, _Float16, 8) \
+ TEST (double, float, 4)
+
+TEST_ALL (TEST_TYPE)
+
+/* { dg-final { scan-assembler-times {\tvfwredusum\.vs\tv[0-9]+,v[0-9]+,v[0-9]+,v0\.t} 2 } } */
+/* { dg-final { scan-assembler-times {\tvwredsum\.vs\tv[0-9]+,v[0-9]+,v[0-9]+,v0\.t} 3 } } */
+/* { dg-final { scan-assembler-times {\tvwredsumu\.vs\tv[0-9]+,v[0-9]+,v[0-9]+,v0\.t} 3 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvfh_zvl128b -mabi=lp64d --param riscv-autovec-preference=scalable --param riscv-autovec-lmul=m2 -fno-vect-cost-model -ffast-math" } */
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE1, TYPE2, N) \
+ __attribute__ ((noipa)) \
+ TYPE1 reduc_##TYPE1##_##TYPE2 (TYPE2 *restrict a, TYPE2 *restrict pred) \
+ { \
+ TYPE1 sum = 0; \
+ for (int i = 0; i < N; i += 1) \
+ if (pred[i]) \
+ sum += a[i]; \
+ return sum; \
+ }
+
+#define TEST_ALL(TEST) \
+ TEST (int16_t, int8_t, 16) \
+ TEST (int32_t, int16_t, 8) \
+ TEST (int64_t, int32_t, 4) \
+ TEST (uint16_t, uint8_t, 16) \
+ TEST (uint32_t, uint16_t, 8) \
+ TEST (uint64_t, uint32_t, 4) \
+ TEST (float, _Float16, 8) \
+ TEST (double, float, 4)
+
+TEST_ALL (TEST_TYPE)
+
+/* { dg-final { scan-assembler-times {\tvfwredusum\.vs\tv[0-9]+,v[0-9]+,v[0-9]+,v0\.t} 2 } } */
+/* { dg-final { scan-assembler-times {\tvwredsum\.vs\tv[0-9]+,v[0-9]+,v[0-9]+,v0\.t} 3 } } */
+/* { dg-final { scan-assembler-times {\tvwredsumu\.vs\tv[0-9]+,v[0-9]+,v[0-9]+,v0\.t} 3 } } */
--- /dev/null
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=fixed-vlmax --param riscv-autovec-lmul=m2 -fno-vect-cost-model -ffast-math" } */
+
+#include "cond_widen_reduc-1.c"
+
+#define RUN(TYPE1, TYPE2, N) \
+ { \
+ TYPE2 a[N]; \
+ TYPE2 pred[N]; \
+ TYPE1 r = 0; \
+ for (int i = 0; i < N; i++) \
+ { \
+ a[i] = (i * 0.1) * (i & 1 ? 1 : -1); \
+ pred[i] = i % 3; \
+ if (pred[i]) \
+ r += a[i]; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ if (r != reduc_##TYPE1##_##TYPE2 (a, pred)) \
+ __builtin_abort (); \
+ }
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+ TEST_ALL (RUN)
+ return 0;
+}
--- /dev/null
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=scalable --param riscv-autovec-lmul=m2 -fno-vect-cost-model -ffast-math" } */
+
+#include "cond_widen_reduc-2.c"
+
+#define RUN(TYPE1, TYPE2, N) \
+ { \
+ TYPE2 a[N]; \
+ TYPE2 pred[N]; \
+ TYPE1 r = 0; \
+ for (int i = 0; i < N; i++) \
+ { \
+ a[i] = (i * 0.1) * (i & 1 ? 1 : -1); \
+ pred[i] = i % 3; \
+ if (pred[i]) \
+ r += a[i]; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ if (r != reduc_##TYPE1##_##TYPE2 (a, pred)) \
+ __builtin_abort (); \
+ }
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+ TEST_ALL (RUN)
+ return 0;
+}