From d626debcb3717f18bf2ee88f4281b109b13e1181 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Tue, 15 Jul 2025 03:01:12 +0800 Subject: [PATCH] LoongArch: Fix wrong code generated by TARGET_VECTORIZE_VEC_PERM_CONST [PR121064] When TARGET_VECTORIZE_VEC_PERM_CONST is called, target may be the same pseudo as op0 and/or op1. Loading the selector into target would clobber the input, producing wrong code like vld $vr0, $t0 vshuf.w $vr0, $vr0, $vr1 So don't load the selector into d->target, use a new pseudo to hold the selector instead. The reload pass will load the pseudo for selector and the pseudo for target into the same hard register (following our constraint '0' on the shuf instructions) anyway. gcc/ChangeLog: PR target/121064 * config/loongarch/lsx.md (lsx_vshuf_<lsxfmt_f>): Add '@' to generate a mode-aware helper. Use <VIMODE> as the mode of the operand 1 (selector). * config/loongarch/lasx.md (lasx_xvshuf_<lasxfmt_f>): Likewise. * config/loongarch/loongarch.cc (loongarch_try_expand_lsx_vshuf_const): Create a new pseudo for the selector. Use the mode-aware helper to simplify the code. (loongarch_expand_vec_perm_const): Likewise. gcc/testsuite/ChangeLog: PR target/121064 * gcc.target/loongarch/pr121064.c: New test. 
--- gcc/config/loongarch/lasx.md | 4 +- gcc/config/loongarch/loongarch.cc | 126 +++++------------- gcc/config/loongarch/lsx.md | 4 +- gcc/testsuite/gcc.target/loongarch/pr121064.c | 38 ++++++ 4 files changed, 73 insertions(+), 99 deletions(-) create mode 100644 gcc/testsuite/gcc.target/loongarch/pr121064.c diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md index 43e3ab0026a..3d71f30a54b 100644 --- a/gcc/config/loongarch/lasx.md +++ b/gcc/config/loongarch/lasx.md @@ -2060,9 +2060,9 @@ [(set_attr "type" "simd_int_arith") (set_attr "mode" "<MODE>")]) -(define_insn "lasx_xvshuf_<lasxfmt_f>" +(define_insn "@lasx_xvshuf_<lasxfmt_f>" [(set (match_operand:LASX_DWH 0 "register_operand" "=f") - (unspec:LASX_DWH [(match_operand:LASX_DWH 1 "register_operand" "0") + (unspec:LASX_DWH [(match_operand:<VIMODE> 1 "register_operand" "0") (match_operand:LASX_DWH 2 "register_operand" "f") (match_operand:LASX_DWH 3 "register_operand" "f")] UNSPEC_LASX_XVSHUF))] diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index f62e4163c71..b00fcc71a20 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -8380,7 +8380,7 @@ static bool loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d) { int i; - rtx target, op0, op1, sel, tmp; + rtx target, op0, op1; rtx rperm[MAX_VECT_LEN]; if (GET_MODE_SIZE (d->vmode) == 16) @@ -8399,47 +8399,23 @@ loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d) for (i = 0; i < d->nelt; i += 1) rperm[i] = GEN_INT (d->perm[i]); - if (d->vmode == E_V2DFmode) - { - sel = gen_rtx_CONST_VECTOR (E_V2DImode, gen_rtvec_v (d->nelt, rperm)); - tmp = simplify_gen_subreg (E_V2DImode, d->target, d->vmode, 0); - emit_move_insn (tmp, sel); - } - else if (d->vmode == E_V4SFmode) - { - sel = gen_rtx_CONST_VECTOR (E_V4SImode, gen_rtvec_v (d->nelt, rperm)); - tmp = simplify_gen_subreg (E_V4SImode, d->target, d->vmode, 0); - emit_move_insn (tmp, sel); - } + machine_mode sel_mode = related_int_vector_mode 
(d->vmode) + .require (); + rtvec sel_v = gen_rtvec_v (d->nelt, rperm); + + /* Despite vshuf.* (except vshuf.b) needs sel == target, we cannot + load sel into target right now: here we are dealing with + pseudo regs, and target may be the same pseudo as one of op0 + or op1. Then we'd clobber the input. Instead, we use a new + pseudo reg here. The reload pass will look at the constraint + of vshuf.* and move sel into target first if needed. */ + rtx sel = force_reg (sel_mode, + gen_rtx_CONST_VECTOR (sel_mode, sel_v)); + + if (d->vmode == E_V16QImode) + emit_insn (gen_lsx_vshuf_b (target, op1, op0, sel)); else - { - sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm)); - emit_move_insn (d->target, sel); - } - - switch (d->vmode) - { - case E_V2DFmode: - emit_insn (gen_lsx_vshuf_d_f (target, target, op1, op0)); - break; - case E_V2DImode: - emit_insn (gen_lsx_vshuf_d (target, target, op1, op0)); - break; - case E_V4SFmode: - emit_insn (gen_lsx_vshuf_w_f (target, target, op1, op0)); - break; - case E_V4SImode: - emit_insn (gen_lsx_vshuf_w (target, target, op1, op0)); - break; - case E_V8HImode: - emit_insn (gen_lsx_vshuf_h (target, target, op1, op0)); - break; - case E_V16QImode: - emit_insn (gen_lsx_vshuf_b (target, op1, op0, target)); - break; - default: - break; - } + emit_insn (gen_lsx_vshuf (d->vmode, target, sel, op1, op0)); return true; } @@ -9435,7 +9411,7 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d) bool flag = false; unsigned int i; unsigned char idx; - rtx target, op0, op1, sel, tmp; + rtx target, op0, op1; rtx rperm[MAX_VECT_LEN]; unsigned int remapped[MAX_VECT_LEN]; unsigned char perm2[MAX_VECT_LEN]; @@ -9615,63 +9591,23 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d) expand_perm_const_end: if (flag) { - /* Copy selector vector from memory to vector register for later insn - gen function. 
- If vector's element in floating point value, we cannot fit - selector argument into insn gen function directly, because of the - insn template definition. As a solution, generate a integral mode - subreg of target, then copy selector vector (that is in integral - mode) to this subreg. */ - switch (d->vmode) - { - case E_V4DFmode: - sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (d->nelt, - rperm)); - tmp = simplify_gen_subreg (E_V4DImode, d->target, d->vmode, 0); - emit_move_insn (tmp, sel); - break; - case E_V8SFmode: - sel = gen_rtx_CONST_VECTOR (E_V8SImode, gen_rtvec_v (d->nelt, - rperm)); - tmp = simplify_gen_subreg (E_V8SImode, d->target, d->vmode, 0); - emit_move_insn (tmp, sel); - break; - default: - sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, - rperm)); - emit_move_insn (d->target, sel); - break; - } - target = d->target; op0 = d->op0; op1 = d->one_vector_p ? d->op0 : d->op1; - /* We FINALLY can generate xvshuf.* insn. */ - switch (d->vmode) - { - case E_V4DFmode: - emit_insn (gen_lasx_xvshuf_d_f (target, target, op1, op0)); - break; - case E_V4DImode: - emit_insn (gen_lasx_xvshuf_d (target, target, op1, op0)); - break; - case E_V8SFmode: - emit_insn (gen_lasx_xvshuf_w_f (target, target, op1, op0)); - break; - case E_V8SImode: - emit_insn (gen_lasx_xvshuf_w (target, target, op1, op0)); - break; - case E_V16HImode: - emit_insn (gen_lasx_xvshuf_h (target, target, op1, op0)); - break; - case E_V32QImode: - emit_insn (gen_lasx_xvshuf_b (target, op1, op0, target)); - break; - default: - gcc_unreachable (); - break; - } + machine_mode sel_mode = related_int_vector_mode (d->vmode) + .require (); + rtvec sel_v = gen_rtvec_v (d->nelt, rperm); + + /* See the comment in loongarch_expand_lsx_shuffle for why + we don't simply use a SUBREG to pun target. 
*/ + rtx sel = force_reg (sel_mode, + gen_rtx_CONST_VECTOR (sel_mode, sel_v)); + + if (d->vmode == E_V32QImode) + emit_insn (gen_lasx_xvshuf_b (target, op1, op0, sel)); + else + emit_insn (gen_lasx_xvshuf (d->vmode, target, sel, op1, op0)); return true; } diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md index 407c86870df..fb0236ba0f1 100644 --- a/gcc/config/loongarch/lsx.md +++ b/gcc/config/loongarch/lsx.md @@ -535,9 +535,9 @@ DONE; }) -(define_insn "lsx_vshuf_<lsxfmt_f>" +(define_insn "@lsx_vshuf_<lsxfmt_f>" [(set (match_operand:LSX_DWH 0 "register_operand" "=f") - (unspec:LSX_DWH [(match_operand:LSX_DWH 1 "register_operand" "0") + (unspec:LSX_DWH [(match_operand:<VIMODE> 1 "register_operand" "0") (match_operand:LSX_DWH 2 "register_operand" "f") (match_operand:LSX_DWH 3 "register_operand" "f")] UNSPEC_LSX_VSHUF))] diff --git a/gcc/testsuite/gcc.target/loongarch/pr121064.c b/gcc/testsuite/gcc.target/loongarch/pr121064.c new file mode 100644 index 00000000000..a466c7abc70 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/pr121064.c @@ -0,0 +1,38 @@ +/* { dg-require-effective-target loongarch_sx_hw } */ +/* { dg-do run } */ +/* { dg-options "-march=loongarch64 -mfpu=64 -mlsx -O3" } */ + +typedef __INT32_TYPE__ int32_t; +typedef unsigned __INT32_TYPE__ uint32_t; + +__attribute__ ((noipa)) static int32_t +long_filter_ehigh_3830_1 (int32_t *buffer, int length) +{ + int i, j; + int32_t dotprod = 0; + int32_t delay[4] = { 0 }; + uint32_t coeffs[4] = { 0 }; + + for (i = 0; i < length; i++) + { + dotprod = 0; + for (j = 3; j >= 0; j--) + { + dotprod += delay[j] * coeffs[j]; + coeffs[j] += ((delay[j] >> 31) | 1); + } + for (j = 3; j > 0; j--) + delay[j] = delay[j - 1]; + delay[0] = buffer[i]; + } + + return dotprod; +} + +int +main () +{ + int32_t buffer[] = { -1, 1 }; + if (long_filter_ehigh_3830_1 (buffer, 2) != -1) + __builtin_trap (); +} -- 2.47.3