return true;
}
+static bool
+shuffle_bswap_pattern (struct expand_vec_perm_d *d)
+{
+ HOST_WIDE_INT diff;
+ unsigned i, size, step;
+
+ if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
+ return false;
+
+ step = diff + 1;
+ size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
+
+ switch (size)
+ {
+ case 16:
+ break;
+ case 32:
+ case 64:
+ /* We will have VEC_PERM_EXPR after rtl expand when invoking
+ __builtin_bswap. It will generate about 9 instructions in
+ loop as below, no matter it is bswap16, bswap32 or bswap64.
+ .L2:
+ 1 vle16.v v4,0(a0)
+ 2 vmv.v.x v2,a7
+ 3 vand.vv v2,v6,v2
+ 4 slli a2,a5,1
+ 5 vrgatherei16.vv v1,v4,v2
+ 6 sub a4,a4,a5
+ 7 vse16.v v1,0(a3)
+ 8 add a0,a0,a2
+ 9 add a3,a3,a2
+ bne a4,zero,.L2
+
+ But for bswap16 we may have a even simple code gen, which
+ has only 7 instructions in loop as below.
+ .L5
+ 1 vle8.v v2,0(a5)
+ 2 addi a5,a5,32
+ 3 vsrl.vi v4,v2,8
+ 4 vsll.vi v2,v2,8
+ 5 vor.vv v4,v4,v2
+ 6 vse8.v v4,0(a4)
+ 7 addi a4,a4,32
+ bne a5,a6,.L5
+
+ Unfortunately, the instructions in loop will grow to 13 and 24
+ for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
+ for both the bswap64 and bswap32, but take shift and or (7 insn)
+ for bswap16.
+ */
+ default:
+ return false;
+ }
+
+ for (i = 0; i < step; i++)
+ if (!d->perm.series_p (i, step, diff - i, step))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ machine_mode vhi_mode;
+ poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
+
+ if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
+ return false;
+
+ /* Step-1: Move op0 to src with VHI mode. */
+ rtx src = gen_reg_rtx (vhi_mode);
+ emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
+
+ /* Step-2: Shift right 8 bits to dest. */
+ rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* Step-3: Shift left 8 bits to src. */
+ src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* Step-4: Logic Or dest and src to dest. */
+ dest = expand_binop (vhi_mode, ior_optab, dest, src,
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* Step-5: Move src to target with VQI mode. */
+ emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+ return true;
+}
+
/* Recognize the pattern that can be shuffled by generic approach. */
static bool
return true;
if (shuffle_decompress_patterns (d))
return true;
+ if (shuffle_bswap_pattern (d))
+ return true;
if (shuffle_generic_patterns (d))
return true;
return false;
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+/*
+** test_uint16_t___builtin_bswap16:
+** ...
+** vsetvli\s+[atx][0-9]+,\s*zero,\s*e16,\s*m1,\s*ta,\s*ma
+** vsrl\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+** vsll\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+** vor\.vv\s+v[0-9]+,\s*v[0-9],\s*v[0-9]+
+** ...
+*/
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
--- /dev/null
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+#define ARRAY_SIZE 128
+
+uint16_t in[ARRAY_SIZE];
+uint16_t out[ARRAY_SIZE];
+uint16_t ref[ARRAY_SIZE];
+
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
+TEST_ASSERT (uint16_t)
+
+/* TEST_INIT Arguments:
+ +-------+-------+---------------------------+---------+
+ | type | input | reference | test id |
+ +-------+-------+---------------------------+---------+
+*/
+TEST_INIT (uint16_t, 0x1234u, __builtin_bswap16 (0x1234u), 1)
+TEST_INIT (uint16_t, 0x1122u, __builtin_bswap16 (0x1122u), 2)
+TEST_INIT (uint16_t, 0xa55au, __builtin_bswap16 (0xa55au), 3)
+TEST_INIT (uint16_t, 0x0000u, __builtin_bswap16 (0x0000u), 4)
+TEST_INIT (uint16_t, 0xffffu, __builtin_bswap16 (0xffffu), 5)
+TEST_INIT (uint16_t, 0x4321u, __builtin_bswap16 (0x4321u), 6)
+
+int
+main ()
+{
+ /* RUN_TEST Arguments:
+ +------+---------+-------------+----+-----+-----+------------+
+ | type | test id | fun to test | in | out | ref | array size |
+ +------+---------+-------------+----+-----+-----+------------+
+ */
+ RUN_TEST (uint16_t, 1, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+ RUN_TEST (uint16_t, 2, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+ RUN_TEST (uint16_t, 3, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+ RUN_TEST (uint16_t, 4, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+ RUN_TEST (uint16_t, 5, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+ RUN_TEST (uint16_t, 6, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (bswap16, 1, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 4, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 8, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 16, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 32, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 64, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 128, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 256, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 512, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 1024, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2048, uint16_t, __builtin_bswap16)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
+/* { dg-final { scan-assembler-times {vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 11 } } */
#include "../vls-vlmax/perm-4.c"
-/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 18 } } */
/* { dg-final { scan-assembler-times {vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
-/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
+/* { dg-final { scan-assembler-times {vrsub\.vi} 23 } } */
/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */