RISC-V: Refine bswap16 auto vectorization code gen

author Pan Li <pan2.li@intel.com>

Mon, 9 Oct 2023 08:12:15 +0000 (16:12 +0800)

committer Pan Li <pan2.li@intel.com>

Mon, 9 Oct 2023 13:12:59 +0000 (21:12 +0800)
author Pan Li <pan2.li@intel.com>
Mon, 9 Oct 2023 08:12:15 +0000 (16:12 +0800)
committer Pan Li <pan2.li@intel.com>
Mon, 9 Oct 2023 13:12:59 +0000 (21:12 +0800)
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc

index 23633a2a74d64d11e983cc138dc7c4ba4a02e072..c72e411f1258c98be7f0b9093a3de1e6baa479e7 100644 (file)
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3030,6 +3030,95 @@ shuffle_decompress_patterns (struct expand_vec_perm_d *d)
    return true;
  }
  
+static bool
+shuffle_bswap_pattern (struct expand_vec_perm_d *d)
+{
+  HOST_WIDE_INT diff;
+  unsigned i, size, step;
+
+  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
+    return false;
+
+  step = diff + 1;
+  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
+
+  switch (size)
+    {
+    case 16:
+      break;
+    case 32:
+    case 64:
+      /* We will have VEC_PERM_EXPR after rtl expand when invoking
+        __builtin_bswap. It will generate about 9 instructions in
+        loop as below, no matter it is bswap16, bswap32 or bswap64.
+          .L2:
+        1 vle16.v v4,0(a0)
+        2 vmv.v.x v2,a7
+        3 vand.vv v2,v6,v2
+        4 slli    a2,a5,1
+        5 vrgatherei16.vv v1,v4,v2
+        6 sub     a4,a4,a5
+        7 vse16.v v1,0(a3)
+        8 add     a0,a0,a2
+        9 add     a3,a3,a2
+          bne     a4,zero,.L2
+
+        But for bswap16 we may have a even simple code gen, which
+        has only 7 instructions in loop as below.
+          .L5
+        1 vle8.v  v2,0(a5)
+        2 addi    a5,a5,32
+        3 vsrl.vi v4,v2,8
+        4 vsll.vi v2,v2,8
+        5 vor.vv  v4,v4,v2
+        6 vse8.v  v4,0(a4)
+        7 addi    a4,a4,32
+          bne     a5,a6,.L5
+
+        Unfortunately, the instructions in loop will grow to 13 and 24
+        for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
+        for both the bswap64 and bswap32, but take shift and or (7 insn)
+        for bswap16.
+       */
+    default:
+      return false;
+    }
+
+  for (i = 0; i < step; i++)
+    if (!d->perm.series_p (i, step, diff - i, step))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  machine_mode vhi_mode;
+  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
+
+  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
+    return false;
+
+  /* Step-1: Move op0 to src with VHI mode.  */
+  rtx src = gen_reg_rtx (vhi_mode);
+  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
+
+  /* Step-2: Shift right 8 bits to dest.  */
+  rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
+                          NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-3: Shift left 8 bits to src.  */
+  src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
+                     NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-4: Logic Or dest and src to dest.  */
+  dest = expand_binop (vhi_mode, ior_optab, dest, src,
+                      NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-5: Move src to target with VQI mode.  */
+  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+  return true;
+}
+
  /* Recognize the pattern that can be shuffled by generic approach.  */
  
  static bool
@@ -3089,6 +3178,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
             return true;
           if (shuffle_decompress_patterns (d))
             return true;
+         if (shuffle_bswap_pattern (d))
+           return true;
           if (shuffle_generic_patterns (d))
             return true;
           return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c

new file mode 100644 (file)

index 0000000..10d235a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+/*
+** test_uint16_t___builtin_bswap16:
+**   ...
+**   vsetvli\s+[atx][0-9]+,\s*zero,\s*e16,\s*m1,\s*ta,\s*ma
+**   vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8
+**   vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8
+**   vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+**   ...
+*/
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c

new file mode 100644 (file)

index 0000000..8d45ceb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+#define ARRAY_SIZE 128
+
+uint16_t in[ARRAY_SIZE];
+uint16_t out[ARRAY_SIZE];
+uint16_t ref[ARRAY_SIZE];
+
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
+TEST_ASSERT (uint16_t)
+
+/* TEST_INIT Arguments:
+         +-------+-------+---------------------------+---------+
+         | type  | input | reference                 | test id |
+         +-------+-------+---------------------------+---------+
+*/
+TEST_INIT (uint16_t, 0x1234u, __builtin_bswap16 (0x1234u), 1)
+TEST_INIT (uint16_t, 0x1122u, __builtin_bswap16 (0x1122u), 2)
+TEST_INIT (uint16_t, 0xa55au, __builtin_bswap16 (0xa55au), 3)
+TEST_INIT (uint16_t, 0x0000u, __builtin_bswap16 (0x0000u), 4)
+TEST_INIT (uint16_t, 0xffffu, __builtin_bswap16 (0xffffu), 5)
+TEST_INIT (uint16_t, 0x4321u, __builtin_bswap16 (0x4321u), 6)
+
+int
+main ()
+{
+  /* Run test ids 1 to 6, one RUN_TEST per TEST_INIT line above, all for
+     uint16_t and __builtin_bswap16 and all sharing the global in/out/ref
+     arrays of ARRAY_SIZE elements.
+     NOTE(review): RUN_TEST comes from test-math.h, which is not visible
+     here; presumably it fills in/ref for the given test id, calls the
+     vectorized function and compares out against ref -- confirm there.
+
+     RUN_TEST Arguments:
+          +------+---------+-------------+----+-----+-----+------------+
+          | type | test id | fun to test | in | out | ref | array size |
+          +------+---------+-------------+----+-----+-----+------------+
+  */
+  RUN_TEST (uint16_t, 1, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 2, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 3, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 4, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 5, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 6, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c

new file mode 100644 (file)

index 0000000..11880ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (bswap16, 1, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 4, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 8, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 16, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 32, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 64, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 128, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 256, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 512, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 1024, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2048, uint16_t, __builtin_bswap16)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
+/* { dg-final { scan-assembler-times {vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c

index 4d6862cf1c04ab8a8e76acc234400b3a4c50b31e..d2d49388a39bee5fd1a82214e5604aac2ad0e213 100644 (file)
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,7 +3,7 @@
  
  #include "../vls-vlmax/perm-4.c"
  
-/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 18 } } */
  /* { dg-final { scan-assembler-times {vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
-/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
+/* { dg-final { scan-assembler-times {vrsub\.vi} 23 } } */
  /* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
author	Pan Li <pan2.li@intel.com>
	Mon, 9 Oct 2023 08:12:15 +0000 (16:12 +0800)
committer	Pan Li <pan2.li@intel.com>
	Mon, 9 Oct 2023 13:12:59 +0000 (21:12 +0800)
gcc/config/riscv/riscv-v.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c		patch \| blob \| blame \| history