[(set_attr "move_type" "fmove")
(set_attr "mode" "<UNITMODE>")])
+;; Extract one 128-bit half of 256-bit LASX vector operand 1 into LSX-sized
+;; result operand 0.  Operand 2 is a compile-time constant selecting the
+;; half: 0 for the low half, 1 for the high half.
+(define_expand "vec_extract<mode><lasxhalf>"
+ [(match_operand:<VHMODE256_ALL> 0 "register_operand")
+ (match_operand:LASX 1 "register_operand")
+ (match_operand 2 "const_0_or_1_operand")]
+ "ISA_HAS_LASX"
+{
+ /* Replace the 0/1 selector with the PARALLEL of element indices the
+    vec_select patterns below expect, then emit the matching insn.  */
+ if (INTVAL (operands[2]))
+ {
+ operands[2] = loongarch_lsx_vec_parallel_const_half (<MODE>mode, true);
+ emit_insn (gen_vec_extract_hi_<mode> (operands[0], operands[1],
+ operands[2]));
+ }
+ else
+ {
+ operands[2] = loongarch_lsx_vec_parallel_const_half (<MODE>mode, false);
+ emit_insn (gen_vec_extract_lo_<mode> (operands[0], operands[1],
+ operands[2]));
+ }
+ DONE;
+})
+
+;; Low-half extraction needs no instruction: the low 128 bits of an LASX
+;; register are addressable as the overlapping LSX register, so after
+;; reload this splits into a plain lowpart register copy.
+(define_insn_and_split "vec_extract_lo_<mode>"
+ [(set (match_operand:<VHMODE256_ALL> 0 "register_operand" "=f")
+ (vec_select:<VHMODE256_ALL>
+ (match_operand:LASX 1 "register_operand" "f")
+ (match_operand:LASX 2 "vect_par_cnst_low_half")))]
+ "ISA_HAS_LASX"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (match_dup 1))]
+ "operands[1] = gen_lowpart (<VHMODE256_ALL>mode, operands[1]);")
+
+;; High-half extraction: xvpermi.d with immediate 0xe places the source's
+;; high two 64-bit lanes into the destination's low two lanes, so the
+;; result's low 128 bits hold the selected half.
+;; NOTE(review): move_type "fmove" is copied from neighbouring patterns —
+;; confirm it is the right type attribute for a cross-lane permute insn.
+(define_insn "vec_extract_hi_<mode>"
+ [(set (match_operand:<VHMODE256_ALL> 0 "register_operand" "=f")
+ (vec_select:<VHMODE256_ALL>
+ (match_operand:LASX 1 "register_operand" "f")
+ (match_operand:LASX 2 "vect_par_cnst_high_half")))]
+ "ISA_HAS_LASX"
+ "xvpermi.d\t%u0,%u1,0xe"
+ [(set_attr "move_type" "fmove")
+ (set_attr "mode" "<MODE>")])
+
(define_expand "vec_perm<mode>"
[(match_operand:LASX 0 "register_operand")
(match_operand:LASX 1 "register_operand")
extern bool loongarch_const_vector_shuffle_set_p (rtx, machine_mode);
extern bool loongarch_const_vector_bitimm_set_p (rtx, machine_mode);
extern bool loongarch_const_vector_bitimm_clr_p (rtx, machine_mode);
+extern bool loongarch_check_vect_par_cnst_half (rtx, machine_mode, bool);
extern rtx loongarch_const_vector_vrepli (rtx, machine_mode);
extern rtx loongarch_lsx_vec_parallel_const_half (machine_mode, bool);
extern rtx loongarch_gen_const_int_vector (machine_mode, HOST_WIDE_INT);
return true;
}
+/* Return true if OP is a PARALLEL of CONST_INTs that selects, in order,
+   the HIGH (HIGH_P == true) or LOW (HIGH_P == false) half of the elements
+   of a vector of mode MODE; i.e. the ascending run BASE, BASE + 1, ...,
+   BASE + n/2 - 1, where BASE is n/2 for the high half and 0 otherwise.  */
+
+bool
+loongarch_check_vect_par_cnst_half (rtx op, machine_mode mode, bool high_p)
+{
+  int count = XVECLEN (op, 0);
+  int nelts = GET_MODE_NUNITS (mode);
+
+  /* OP must select exactly half of MODE's elements.  */
+  if (!known_eq (nelts, count * 2))
+    return false;
+
+  /* Every entry must be a CONST_INT continuing the ascending run that
+     starts at BASE.  */
+  int base = high_p ? nelts / 2 : 0;
+  for (int i = 0; i < count; i++)
+    {
+      rtx elt = XVECEXP (op, 0, i);
+      if (!CONST_INT_P (elt) || INTVAL (elt) != base + i)
+	return false;
+    }
+
+  return true;
+}
+
rtx
loongarch_const_vector_vrepli (rtx x, machine_mode mode)
{
}
}
+/* Implement TARGET_VECTORIZE_SPLIT_REDUCTION.  Cross-lane (cross-128-bit)
+   operations are best avoided on all CPUs, so reductions on vectors wider
+   than an LSX register are performed by repeatedly folding the upper half
+   onto the lower half until the vector fits in 128 bits.  */
+
+machine_mode
+loongarch_split_reduction (machine_mode mode)
+{
+  /* Already no wider than an LSX register: nothing left to split.  */
+  if (LSX_SUPPORTED_MODE_P (mode))
+    return mode;
+
+  /* Step down to a vector of the same element type with half as many
+     elements.  */
+  scalar_mode inner = as_a <scalar_mode> (GET_MODE_INNER (mode));
+  return mode_for_vector (inner, GET_MODE_NUNITS (mode) / 2).require ();
+}
+
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
loongarch_autovectorize_vector_modes
+/* Split reductions at the 128-bit (LSX) boundary to avoid cross-lane
+   operations; see loongarch_split_reduction.  */
+#undef TARGET_VECTORIZE_SPLIT_REDUCTION
+#define TARGET_VECTORIZE_SPLIT_REDUCTION \
+  loongarch_split_reduction
+
#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P loongarch_optab_supported_p
return true;
})
+
+;; PARALLEL for a vec_select that selects the low half
+;; elements (indices 0 .. n/2-1, in ascending order) of a vector of MODE.
+(define_special_predicate "vect_par_cnst_low_half"
+ (match_code "parallel")
+{
+  return loongarch_check_vect_par_cnst_half (op, mode, false);
+})
+
+;; PARALLEL for a vec_select that selects the high half
+;; elements (indices n/2 .. n-1, in ascending order) of a vector of MODE.
+(define_special_predicate "vect_par_cnst_high_half"
+ (match_code "parallel")
+{
+  return loongarch_check_vect_par_cnst_half (op, mode, true);
+})
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -funsafe-math-optimizations -mlasx -fno-unroll-loops -fdump-tree-optimized" } */
+/* Each sum loop below should be vectorized with a single REDUC_PLUS
+   reduction epilogue — one per function, four in total.  */
+/* { dg-final { scan-tree-dump-times "\.REDUC_PLUS" 4 "optimized" } } */
+
+/* Define T FUNC_NAME (const T *): sum of SIZE elements of a
+   64-byte-aligned array.  */
+#define DEFINE_SUM_FUNCTION(T, FUNC_NAME, SIZE) \
+T FUNC_NAME(const T arr[]) { \
+  arr = __builtin_assume_aligned(arr, 64); \
+  T sum = 0; \
+  for (int i = 0; i < SIZE; i++) \
+    sum += arr[i]; \
+  return sum; \
+}
+
+/* Element counts 1028/1026 are presumably chosen to not be multiples of
+   the LASX lane counts (TODO confirm intent); the function names now
+   match the actual counts instead of the stale "1040" suffix.  */
+DEFINE_SUM_FUNCTION (int, sum_int_1028, 1028)
+DEFINE_SUM_FUNCTION (float, sum_float_1028, 1028)
+DEFINE_SUM_FUNCTION (long, sum_long_1026, 1026)
+DEFINE_SUM_FUNCTION (double, sum_double_1026, 1026)