]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
RISC-V: Refine unsigned avg_floor/avg_ceil
authorJuzhe-Zhong <juzhe.zhong@rivai.ai>
Wed, 10 Jan 2024 05:05:38 +0000 (13:05 +0800)
committerPan Li <pan2.li@intel.com>
Wed, 10 Jan 2024 07:19:30 +0000 (15:19 +0800)
This patch is inspired by LLVM patches:
https://github.com/llvm/llvm-project/pull/76550
https://github.com/llvm/llvm-project/pull/77473

Use vaaddu for AVG vectorization.

Before this patch:

        vsetivli        zero,8,e8,mf2,ta,ma
        vle8.v  v3,0(a1)
        vle8.v  v2,0(a2)
        vwaddu.vv        v1,v3,v2
        vsetvli zero,zero,e16,m1,ta,ma
        vadd.vi v1,v1,1
        vsetvli zero,zero,e8,mf2,ta,ma
        vnsrl.wi        v1,v1,1
        vse8.v  v1,0(a0)
        ret

After this patch:

vsetivli zero,8,e8,mf2,ta,ma
csrwi vxrm,0
vle8.v v1,0(a1)
vle8.v v2,0(a2)
vaaddu.vv v1,v1,v2
vse8.v v1,0(a0)
ret

Note on signed averaging addition

Based on the rvv spec, there is also a variant for signed averaging addition called vaadd.
But AFAIU, no matter in which rounding mode, we cannot achieve the semantic of signed averaging addition through vaadd.
Thus this patch only introduces vaaddu.

More details in:
https://github.com/riscv/riscv-v-spec/issues/935
https://github.com/riscv/riscv-v-spec/issues/934

Tested on both RV32 and RV64 no regression.

Ok for trunk ?

gcc/ChangeLog:

* config/riscv/autovec.md (<u>avg<v_double_trunc>3_floor): Remove.
(avg<v_double_trunc>3_floor): New pattern.
(<u>avg<v_double_trunc>3_ceil): Remove.
(avg<v_double_trunc>3_ceil): New pattern.
(uavg<mode>3_floor): Ditto.
(uavg<mode>3_ceil): Ditto.
* config/riscv/riscv-protos.h (enum insn_flags): Add for average addition.
(enum insn_type): Ditto.
* config/riscv/riscv-v.cc: Ditto.
* config/riscv/vector-iterators.md (ashiftrt): Remove.
(ASHIFTRT): Ditto.
* config/riscv/vector.md: Add VLS modes.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/avg-1.c: Adapt test.
* gcc.target/riscv/rvv/autovec/vls/avg-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/avg-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/avg-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/avg-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/avg-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/vec-avg-rv32gcv.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/vec-avg-rv64gcv.c: Ditto.

13 files changed:
gcc/config/riscv/autovec.md
gcc/config/riscv/riscv-protos.h
gcc/config/riscv/riscv-v.cc
gcc/config/riscv/vector-iterators.md
gcc/config/riscv/vector.md
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/avg-1.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/avg-2.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/avg-3.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/avg-4.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/avg-5.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/avg-6.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/vec-avg-rv32gcv.c
gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/vec-avg-rv64gcv.c

index 775eaa825b0cb8d471e0a46797f94046f4585d38..706cd9717cb1e554a8362e9ee364ba1ae9cb7e24 100644 (file)
 ;;  op[0] = (narrow) ((wide) op[1] + (wide) op[2] + 1)) >> 1;
 ;; -------------------------------------------------------------------------
 
-(define_expand "<u>avg<v_double_trunc>3_floor"
+(define_expand "avg<v_double_trunc>3_floor"
  [(set (match_operand:<V_DOUBLE_TRUNC> 0 "register_operand")
    (truncate:<V_DOUBLE_TRUNC>
-    (<ext_to_rshift>:VWEXTI
+    (ashiftrt:VWEXTI
      (plus:VWEXTI
-      (any_extend:VWEXTI
+      (sign_extend:VWEXTI
        (match_operand:<V_DOUBLE_TRUNC> 1 "register_operand"))
-      (any_extend:VWEXTI
+      (sign_extend:VWEXTI
        (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand"))))))]
   "TARGET_VECTOR"
 {
   /* First emit a widening addition.  */
   rtx tmp1 = gen_reg_rtx (<MODE>mode);
   rtx ops1[] = {tmp1, operands[1], operands[2]};
-  insn_code icode = code_for_pred_dual_widen (PLUS, <CODE>, <MODE>mode);
+  insn_code icode = code_for_pred_dual_widen (PLUS, SIGN_EXTEND, <MODE>mode);
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
 
   /* Then a narrowing shift.  */
   rtx ops2[] = {operands[0], tmp1, const1_rtx};
-  icode = code_for_pred_narrow_scalar (<EXT_TO_RSHIFT>, <MODE>mode);
+  icode = code_for_pred_narrow_scalar (ASHIFTRT, <MODE>mode);
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops2);
   DONE;
 })
 
-(define_expand "<u>avg<v_double_trunc>3_ceil"
+(define_expand "avg<v_double_trunc>3_ceil"
  [(set (match_operand:<V_DOUBLE_TRUNC> 0 "register_operand")
    (truncate:<V_DOUBLE_TRUNC>
-    (<ext_to_rshift>:VWEXTI
+    (ashiftrt:VWEXTI
      (plus:VWEXTI
       (plus:VWEXTI
-       (any_extend:VWEXTI
+       (sign_extend:VWEXTI
        (match_operand:<V_DOUBLE_TRUNC> 1 "register_operand"))
-       (any_extend:VWEXTI
+       (sign_extend:VWEXTI
        (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")))
       (const_int 1)))))]
   "TARGET_VECTOR"
   /* First emit a widening addition.  */
   rtx tmp1 = gen_reg_rtx (<MODE>mode);
   rtx ops1[] = {tmp1, operands[1], operands[2]};
-  insn_code icode = code_for_pred_dual_widen (PLUS, <CODE>, <MODE>mode);
+  insn_code icode = code_for_pred_dual_widen (PLUS, SIGN_EXTEND, <MODE>mode);
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1);
 
   /* Then add 1.  */
 
   /* Finally, a narrowing shift.  */
   rtx ops3[] = {operands[0], tmp2, const1_rtx};
-  icode = code_for_pred_narrow_scalar (<EXT_TO_RSHIFT>, <MODE>mode);
+  icode = code_for_pred_narrow_scalar (ASHIFTRT, <MODE>mode);
   riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops3);
   DONE;
 })
 
+;; csrwi vxrm, 2
+;; vaaddu.vv vd, vs2, vs1
+(define_expand "uavg<mode>3_floor"
+ [(match_operand:V_VLSI 0 "register_operand")
+  (match_operand:V_VLSI 1 "register_operand")
+  (match_operand:V_VLSI 2 "register_operand")]
+  "TARGET_VECTOR"
+{
+  insn_code icode = code_for_pred (UNSPEC_VAADDU, <MODE>mode);
+  riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP_VXRM_RDN, operands);
+  DONE;
+})
+
+;; csrwi vxrm, 0
+;; vaaddu.vv vd, vs2, vs1
+(define_expand "uavg<mode>3_ceil"
+ [(match_operand:V_VLSI 0 "register_operand")
+  (match_operand:V_VLSI 1 "register_operand")
+  (match_operand:V_VLSI 2 "register_operand")]
+  "TARGET_VECTOR"
+{
+  insn_code icode = code_for_pred (UNSPEC_VAADDU, <MODE>mode);
+  riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP_VXRM_RNU, operands);
+  DONE;
+})
+
 ;; -------------------------------------------------------------------------
 ;; ---- [FP] Rounding.
 ;; -------------------------------------------------------------------------
index 00a5b645abe03c8799b9a322d24d7897773f633f..fc0097acde3900a8cffca8a35c09c8f1ae27bf8f 100644 (file)
@@ -366,6 +366,12 @@ enum insn_flags : unsigned int
 
   /* Means INSN has FRM operand and the value is FRM_RNE.  */
   FRM_RNE_P = 1 << 19,
+
+  /* Means INSN has VXRM operand and the value is VXRM_RNU.  */
+  VXRM_RNU_P = 1 << 20,
+
+  /* Means INSN has VXRM operand and the value is VXRM_RDN.  */
+  VXRM_RDN_P = 1 << 21,
 };
 
 enum insn_type : unsigned int
@@ -426,6 +432,8 @@ enum insn_type : unsigned int
   BINARY_OP_TAMU = __MASK_OP_TAMU | BINARY_OP_P,
   BINARY_OP_TUMA = __MASK_OP_TUMA | BINARY_OP_P,
   BINARY_OP_FRM_DYN = BINARY_OP | FRM_DYN_P,
+  BINARY_OP_VXRM_RNU = BINARY_OP | VXRM_RNU_P,
+  BINARY_OP_VXRM_RDN = BINARY_OP | VXRM_RDN_P,
 
   /* Ternary operator. Always have real merge operand.  */
   TERNARY_OP = HAS_DEST_P | HAS_MASK_P | USE_ALL_TRUES_MASK_P | HAS_MERGE_P
index 2491522191a12f20e58ce0f4b4f91e9501922879..7ae579ba890142ed4d890e66cbac21ff4a430750 100644 (file)
@@ -207,6 +207,13 @@ public:
     add_input_operand (frm_rtx, Pmode);
   }
 
+  void
+  add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode)
+  {
+    rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
+    add_input_operand (frm_rtx, Pmode);
+  }
+
   /* Return the vtype mode based on insn_flags.
      vtype mode mean the mode vsetvl insn set. */
   machine_mode
@@ -334,6 +341,10 @@ public:
       add_rounding_mode_operand (FRM_RMM);
     else if (m_insn_flags & FRM_RNE_P)
       add_rounding_mode_operand (FRM_RNE);
+    else if (m_insn_flags & VXRM_RNU_P)
+      add_rounding_mode_operand (VXRM_RNU);
+    else if (m_insn_flags & VXRM_RDN_P)
+      add_rounding_mode_operand (VXRM_RDN);
 
     gcc_assert (insn_data[(int) icode].n_operands == m_opno);
     expand (icode, any_mem_p);
index b4a276dc2c89c4edb0d46beb8d4da63bc5665ae9..c2ea7e8b10ac8d6b9de36a7909e34ca13a4cedf8 100644 (file)
 (define_code_attr nmsub_nmadd [(plus "nmsub") (minus "nmadd")])
 (define_code_attr nmsac_nmacc [(plus "nmsac") (minus "nmacc")])
 
-(define_code_attr ext_to_rshift [(sign_extend "ashiftrt")
-                                 (zero_extend "lshiftrt")])
-(define_code_attr EXT_TO_RSHIFT [(sign_extend "ASHIFTRT")
-                                 (zero_extend "LSHIFTRT")])
-
 (define_code_iterator and_ior [and ior])
 
 (define_code_iterator any_float_binop [plus mult minus div])
index 24b7b4394bee223b9b44429642ffe53d72150c87..c1a282a27b307ff85118790f3e86586bda9ddb49 100644 (file)
    (set_attr "mode" "<MODE>")])
 
 (define_insn "@pred_<sat_op><mode>"
-  [(set (match_operand:VI 0 "register_operand"           "=vd, vd, vr, vr")
-       (if_then_else:VI
+  [(set (match_operand:V_VLSI 0 "register_operand"       "=vd, vd, vr, vr")
+       (if_then_else:V_VLSI
          (unspec:<VM>
            [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
             (match_operand 5 "vector_length_operand"    " rK, rK, rK, rK")
             (reg:SI VL_REGNUM)
             (reg:SI VTYPE_REGNUM)
             (reg:SI VXRM_REGNUM)] UNSPEC_VPREDICATE)
-         (unspec:VI
-           [(match_operand:VI 3 "register_operand"      " vr, vr, vr, vr")
-            (match_operand:VI 4 "register_operand"      " vr, vr, vr, vr")] VSAT_OP)
-         (match_operand:VI 2 "vector_merge_operand"     " vu,  0, vu,  0")))]
+         (unspec:V_VLSI
+           [(match_operand:V_VLSI 3 "register_operand"   " vr, vr, vr, vr")
+            (match_operand:V_VLSI 4 "register_operand"   " vr, vr, vr, vr")] VSAT_OP)
+         (match_operand:V_VLSI 2 "vector_merge_operand"  " vu,  0, vu,  0")))]
   "TARGET_VECTOR"
   "v<sat_op>.vv\t%0,%3,%4%p1"
   [(set_attr "type" "<sat_insn_type>")
index d53bd3a386ad8397a8676539475a8625e2df3b26..2327a3d018e151f5eee5828ba24f81ba0f6f8cc8 100644 (file)
@@ -26,9 +26,9 @@ DEF_AVG_FLOOR (uint8_t, uint16_t, 1024)
 DEF_AVG_FLOOR (uint8_t, uint16_t, 2048)
 
 /* { dg-final { scan-assembler-times {vwadd\.vv} 10 } } */
-/* { dg-final { scan-assembler-times {vwaddu\.vv} 10 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*2} 10 } } */
 /* { dg-final { scan-assembler-times {vnsra\.wi} 10 } } */
-/* { dg-final { scan-assembler-times {vnsrl\.wi} 10 } } */
+/* { dg-final { scan-assembler-times {vaaddu\.vv} 10 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
index 68d1df73a5fb742bc0194371f6a0bcd5f2b0894f..8030810fdbd136fa2dffcbc01ca0428af56d19b5 100644 (file)
@@ -24,9 +24,9 @@ DEF_AVG_FLOOR (uint16_t, uint32_t, 512)
 DEF_AVG_FLOOR (uint16_t, uint32_t, 1024)
 
 /* { dg-final { scan-assembler-times {vwadd\.vv} 9 } } */
-/* { dg-final { scan-assembler-times {vwaddu\.vv} 9 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*2} 9 } } */
 /* { dg-final { scan-assembler-times {vnsra\.wi} 9 } } */
-/* { dg-final { scan-assembler-times {vnsrl\.wi} 9 } } */
+/* { dg-final { scan-assembler-times {vaaddu\.vv} 9 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
index 07ffab61f678bcabdf45db27fe3aab80d9c36eab..dce0ffa346e6c395a3e84e1e19ba8d51afbd2bbf 100644 (file)
@@ -22,9 +22,9 @@ DEF_AVG_FLOOR (uint32_t, uint64_t, 256)
 DEF_AVG_FLOOR (uint32_t, uint64_t, 512)
 
 /* { dg-final { scan-assembler-times {vwadd\.vv} 8 } } */
-/* { dg-final { scan-assembler-times {vwaddu\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*2} 8 } } */
 /* { dg-final { scan-assembler-times {vnsra\.wi} 8 } } */
-/* { dg-final { scan-assembler-times {vnsrl\.wi} 8 } } */
+/* { dg-final { scan-assembler-times {vaaddu\.vv} 8 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
index 83e219ca09a980dc79dac139d01048bceab6541e..65912fb39f2f88738579f545258c51963454b77c 100644 (file)
@@ -26,10 +26,10 @@ DEF_AVG_CEIL (uint8_t, uint16_t, 1024)
 DEF_AVG_CEIL (uint8_t, uint16_t, 2048)
 
 /* { dg-final { scan-assembler-times {vwadd\.vv} 10 } } */
-/* { dg-final { scan-assembler-times {vwaddu\.vv} 10 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*0} 10 } } */
 /* { dg-final { scan-assembler-times {vnsra\.wi} 10 } } */
-/* { dg-final { scan-assembler-times {vnsrl\.wi} 10 } } */
-/* { dg-final { scan-assembler-times {vadd\.vi} 20 } } */
+/* { dg-final { scan-assembler-times {vaaddu\.vv} 10 } } */
+/* { dg-final { scan-assembler-times {vadd\.vi} 10 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
index 325faeaa93039d3d69239eaea0c99eafa7572441..a197b24c234ba02abe1cd65875904a29cdd1bbc7 100644 (file)
@@ -24,10 +24,10 @@ DEF_AVG_CEIL (uint16_t, uint32_t, 512)
 DEF_AVG_CEIL (uint16_t, uint32_t, 1024)
 
 /* { dg-final { scan-assembler-times {vwadd\.vv} 9 } } */
-/* { dg-final { scan-assembler-times {vwaddu\.vv} 9 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*0} 9 } } */
 /* { dg-final { scan-assembler-times {vnsra\.wi} 9 } } */
-/* { dg-final { scan-assembler-times {vnsrl\.wi} 9 } } */
-/* { dg-final { scan-assembler-times {vadd\.vi} 18 } } */
+/* { dg-final { scan-assembler-times {vaaddu\.vv} 9 } } */
+/* { dg-final { scan-assembler-times {vadd\.vi} 9 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
index d836428c7f46a0e9270ac315248dfa0db47e79d3..a53de71a01b09f8e10e25e7b6cd3fd86dc2164a8 100644 (file)
@@ -22,10 +22,10 @@ DEF_AVG_CEIL (uint16_t, uint32_t, 256)
 DEF_AVG_CEIL (uint16_t, uint32_t, 512)
 
 /* { dg-final { scan-assembler-times {vwadd\.vv} 8 } } */
-/* { dg-final { scan-assembler-times {vwaddu\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*0} 8 } } */
 /* { dg-final { scan-assembler-times {vnsra\.wi} 8 } } */
-/* { dg-final { scan-assembler-times {vnsrl\.wi} 8 } } */
-/* { dg-final { scan-assembler-times {vadd\.vi} 16 } } */
+/* { dg-final { scan-assembler-times {vaaddu\.vv} 8 } } */
+/* { dg-final { scan-assembler-times {vadd\.vi} 8 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
index e2754339d942c0d6c56378abbd3944a2624c4bf7..6874a3dab1b6beb5d695ec3178816ad2f05d273a 100644 (file)
@@ -4,7 +4,8 @@
 #include "vec-avg-template.h"
 
 /* { dg-final { scan-assembler-times {\tvwadd\.vv} 6 } } */
-/* { dg-final { scan-assembler-times {\tvwaddu\.vv} 6 } } */
-/* { dg-final { scan-assembler-times {\tvadd\.vi} 6 } } */
-/* { dg-final { scan-assembler-times {\tvnsrl.wi} 6 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*0} 3 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*2} 3 } } */
+/* { dg-final { scan-assembler-times {\tvadd\.vi} 3 } } */
 /* { dg-final { scan-assembler-times {\tvnsra.wi} 6 } } */
+/* { dg-final { scan-assembler-times {vaaddu\.vv} 6 } } */
index 1f0ef29566dd9bde2589ec0fc8a4127abd1a2bd0..06f35e1481237e8cbbfa696e3075c62405756d0a 100644 (file)
@@ -4,7 +4,8 @@
 #include "vec-avg-template.h"
 
 /* { dg-final { scan-assembler-times {\tvwadd\.vv} 6 } } */
-/* { dg-final { scan-assembler-times {\tvwaddu\.vv} 6 } } */
-/* { dg-final { scan-assembler-times {\tvadd\.vi} 6 } } */
-/* { dg-final { scan-assembler-times {\tvnsrl\.wi} 6 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*0} 3 } } */
+/* { dg-final { scan-assembler-times {csrwi\s*vxrm,\s*2} 3 } } */
+/* { dg-final { scan-assembler-times {\tvadd\.vi} 3 } } */
 /* { dg-final { scan-assembler-times {\tvnsra\.wi} 6 } } */
+/* { dg-final { scan-assembler-times {vaaddu\.vv} 6 } } */