So this is the cmpmem patch from Sergei, updated for the trunk.
Updates include adjusting the existing cmpmemsi expander to
conditionally try expansion via vector, plus a minor testsuite
adjustment to turn off vector expansion in one test that is primarily
focused on vsetvl optimization and ensuring we don't emit extra vsetvls.
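For reference, a minimal sketch (not part of the patch) of the kind of
call this lets us expand inline with -march=rv64gcv; the exact codegen
depends on VLEN and -mrvv-max-lmul (see the new cmpmem-*.c tests):

  int
  cmp16 (const void *a, const void *b)
  {
    /* Constant length that fits a single vector operation: should now
       expand to a vector load/compare/vfirst sequence rather than a
       memcmp libcall.  */
    return __builtin_memcmp (a, b, 16);
  }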
I've spun this in my tester successfully and just want to see a clean
run through precommit CI before moving forward.
Jeff
gcc/ChangeLog:
* config/riscv/riscv-protos.h (riscv_vector::expand_vec_cmpmem): New
function declaration.
* config/riscv/riscv-string.cc (riscv_vector::expand_vec_cmpmem): New
function.
* config/riscv/riscv.md (cmpmemsi): Try riscv_vector::expand_vec_cmpmem
for constant lengths.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/base/cmpmem-1.c: New codegen tests.
* gcc.target/riscv/rvv/base/cmpmem-2.c: New execution tests.
* gcc.target/riscv/rvv/base/cmpmem-3.c: New codegen tests.
* gcc.target/riscv/rvv/base/cmpmem-4.c: New codegen tests.
* gcc.target/riscv/rvv/autovec/vls/misalign-1.c: Turn off vector mem* and
str* handling.
bool expand_strcmp (rtx, rtx, rtx, rtx, unsigned HOST_WIDE_INT, bool);
void emit_vec_extract (rtx, rtx, rtx);
bool expand_vec_setmem (rtx, rtx, rtx);
+bool expand_vec_cmpmem (rtx, rtx, rtx, rtx);
/* Rounding mode bitfield for fixed point VXRM. */
enum fixed_point_rounding_mode
return true;
}
+/* Used by cmpmemsi in riscv.md. */
+
+bool
+expand_vec_cmpmem (rtx result_out, rtx blk_a_in, rtx blk_b_in, rtx length_in)
+{
+ HOST_WIDE_INT lmul;
+ /* Check we are able and allowed to vectorise this operation;
+ bail if not. */
+ if (!check_vectorise_memory_operation (length_in, lmul))
+ return false;
+
+  /* Strategy:
+     load entire blocks at a and b into vector regs
+     generate mask of bytes that differ
+     find offset of first set bit in mask, use 0 if none set
+     result is ((unsigned char*)a)[offset] - ((unsigned char*)b)[offset]
+  */
+
+ machine_mode vmode
+ = riscv_vector::get_vector_mode (QImode, BYTES_PER_RISCV_VECTOR * lmul)
+ .require ();
+ rtx blk_a_addr = copy_addr_to_reg (XEXP (blk_a_in, 0));
+ rtx blk_a = change_address (blk_a_in, vmode, blk_a_addr);
+ rtx blk_b_addr = copy_addr_to_reg (XEXP (blk_b_in, 0));
+ rtx blk_b = change_address (blk_b_in, vmode, blk_b_addr);
+
+ rtx vec_a = gen_reg_rtx (vmode);
+ rtx vec_b = gen_reg_rtx (vmode);
+
+ machine_mode mask_mode = get_mask_mode (vmode);
+ rtx mask = gen_reg_rtx (mask_mode);
+ rtx mismatch_ofs = gen_reg_rtx (Pmode);
+
+  /* Operands for the mask-generating compare (mask = vec_a != vec_b)
+     and for vfirst (mismatch_ofs = index of first set mask bit,
+     or -1 if none are set).  */
+  rtx ne = gen_rtx_NE (mask_mode, vec_a, vec_b);
+  rtx vmsops[] = { mask, ne, vec_a, vec_b };
+  rtx vfops[] = { mismatch_ofs, mask };
+
+  /* If the length is exactly vlmax for the selected mode, do that.
+     Otherwise, use predicated loads and a predicated compare with
+     the exact length. */
+
+ if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+ {
+ emit_move_insn (vec_a, blk_a);
+ emit_move_insn (vec_b, blk_b);
+ emit_vlmax_insn (code_for_pred_cmp (vmode), riscv_vector::COMPARE_OP,
+ vmsops);
+
+ emit_vlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
+ riscv_vector::CPOP_OP, vfops);
+ }
+ else
+ {
+ if (!satisfies_constraint_K (length_in))
+ length_in = force_reg (Pmode, length_in);
+
+ rtx memmask = CONSTM1_RTX (mask_mode);
+
+ rtx m_ops_a[] = { vec_a, memmask, blk_a };
+ rtx m_ops_b[] = { vec_b, memmask, blk_b };
+
+ emit_nonvlmax_insn (code_for_pred_mov (vmode),
+ riscv_vector::UNARY_OP_TAMA, m_ops_a, length_in);
+ emit_nonvlmax_insn (code_for_pred_mov (vmode),
+ riscv_vector::UNARY_OP_TAMA, m_ops_b, length_in);
+
+ emit_nonvlmax_insn (code_for_pred_cmp (vmode), riscv_vector::COMPARE_OP,
+ vmsops, length_in);
+
+ emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
+ riscv_vector::CPOP_OP, vfops, length_in);
+ }
+
+ /* Mismatch_ofs is -1 if blocks match, or the offset of
+ the first mismatch otherwise. */
+ rtx ltz = gen_reg_rtx (Xmode);
+ emit_insn (gen_slt_3 (LT, Xmode, Xmode, ltz, mismatch_ofs, const0_rtx));
+ /* mismatch_ofs += (mismatch_ofs < 0) ? 1 : 0. */
+ emit_insn (
+ gen_rtx_SET (mismatch_ofs, gen_rtx_PLUS (Pmode, mismatch_ofs, ltz)));
+
+ /* Unconditionally load the bytes at mismatch_ofs and subtract them
+ to get our result. */
+ emit_insn (gen_rtx_SET (blk_a_addr,
+ gen_rtx_PLUS (Pmode, mismatch_ofs, blk_a_addr)));
+ emit_insn (gen_rtx_SET (blk_b_addr,
+ gen_rtx_PLUS (Pmode, mismatch_ofs, blk_b_addr)));
+
+ blk_a = change_address (blk_a, QImode, blk_a_addr);
+ blk_b = change_address (blk_b, QImode, blk_b_addr);
+
+ rtx byte_a = gen_reg_rtx (SImode);
+ rtx byte_b = gen_reg_rtx (SImode);
+ do_zero_extendqi2 (byte_a, blk_a);
+ do_zero_extendqi2 (byte_b, blk_b);
+
+ emit_insn (gen_rtx_SET (result_out, gen_rtx_MINUS (SImode, byte_a, byte_b)));
+
+ return true;
+}
}
(use (match_operand:SI 4))])]
"!optimize_size"
{
+ /* If TARGET_VECTOR is false, this routine will return false and we will
+ try scalar expansion. */
+ if (riscv_vector::expand_vec_cmpmem (operands[0], operands[1],
+ operands[2], operands[3]))
+ DONE;
+
if (riscv_expand_block_compare (operands[0], operands[1], operands[2],
operands[3]))
DONE;
FAIL;
})
-
;; Expand in-line code to clear the instruction cache between operand[0] and
;; operand[1].
(define_expand "clear_cache"
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m4 -fno-tree-loop-distribute-patterns -mno-vector-strict-align" } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m4 -fno-tree-loop-distribute-patterns -mno-vector-strict-align -mstringop-strategy=libcall" } */
#include <stdlib.h>
--- /dev/null
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=dynamic" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Trivial memcmp should use inline scalar ops.
+** f1:
+** lbu\s+a\d+,0\(a0\)
+** lbu\s+a\d+,0\(a1\)
+** subw?\s+a0,a\d+,a\d+
+** ret
+*/
+int
+f1 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, 1);
+}
+
+/* Tiny __builtin_memcmp should use libc.
+** f2:
+** li\s+a\d,\d+
+** tail\s+memcmp
+*/
+int
+f2 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES - 1);
+}
+
+/* Vectorise+inline minimum vector register width with LMUL=1
+** f3:
+** (
+** vsetivli\s+zero,\d+,e8,m1,ta,ma
+** |
+** li\s+a\d+,\d+
+** vsetvli\s+zero,a\d+,e8,m1,ta,ma
+** )
+** ...
+** ret
+*/
+int
+f3 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Vectorised code should use smallest lmul known to fit length
+** f4:
+** (
+** vsetivli\s+zero,\d+,e8,m2,ta,ma
+** |
+** li\s+a\d+,\d+
+** vsetvli\s+zero,a\d+,e8,m2,ta,ma
+** )
+** ...
+** ret
+*/
+int
+f4 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES + 1);
+}
+
+/* Vectorise+inline up to LMUL=8
+** f5:
+** li\s+a\d+,\d+
+** vsetvli\s+zero,a\d+,e8,m8,ta,ma
+** ...
+** ret
+*/
+int
+f5 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES * 8);
+}
+
+/* Don't inline if the length is too large for one operation.
+** f6:
+** li\s+a2,\d+
+** tail\s+memcmp
+*/
+int
+f6 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES * 8 + 1);
+}
--- /dev/null
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-add-options riscv_v } */
+/* { dg-options "-O2 -mrvv-max-lmul=dynamic" } */
+
+#include <stdlib.h>
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+static inline __attribute__ ((always_inline)) void
+do_one_test (int const size, int const diff_offset, int const diff_dir)
+{
+ unsigned char A[size];
+ unsigned char B[size];
+ unsigned char const fill_value = 0x55;
+ __builtin_memset (A, fill_value, size);
+ __builtin_memset (B, fill_value, size);
+
+ if (diff_dir != 0)
+ {
+ if (diff_dir < 0)
+ {
+ A[diff_offset] = fill_value - 1;
+ }
+ else
+ {
+ A[diff_offset] = fill_value + 1;
+ }
+ }
+
+ if (__builtin_memcmp (A, B, size) != diff_dir)
+ {
+ abort ();
+ }
+}
+
+int
+main ()
+{
+ do_one_test (0, 0, 0);
+
+ do_one_test (1, 0, -1);
+ do_one_test (1, 0, 0);
+ do_one_test (1, 0, 1);
+
+ do_one_test (MIN_VECTOR_BYTES - 1, 0, -1);
+ do_one_test (MIN_VECTOR_BYTES - 1, 0, 0);
+ do_one_test (MIN_VECTOR_BYTES - 1, 0, 1);
+ do_one_test (MIN_VECTOR_BYTES - 1, 1, -1);
+ do_one_test (MIN_VECTOR_BYTES - 1, 1, 0);
+ do_one_test (MIN_VECTOR_BYTES - 1, 1, 1);
+
+ do_one_test (MIN_VECTOR_BYTES, 0, -1);
+ do_one_test (MIN_VECTOR_BYTES, 0, 0);
+ do_one_test (MIN_VECTOR_BYTES, 0, 1);
+ do_one_test (MIN_VECTOR_BYTES, MIN_VECTOR_BYTES - 1, -1);
+ do_one_test (MIN_VECTOR_BYTES, MIN_VECTOR_BYTES - 1, 0);
+ do_one_test (MIN_VECTOR_BYTES, MIN_VECTOR_BYTES - 1, 1);
+
+ do_one_test (MIN_VECTOR_BYTES + 1, 0, -1);
+ do_one_test (MIN_VECTOR_BYTES + 1, 0, 0);
+ do_one_test (MIN_VECTOR_BYTES + 1, 0, 1);
+ do_one_test (MIN_VECTOR_BYTES + 1, MIN_VECTOR_BYTES, -1);
+ do_one_test (MIN_VECTOR_BYTES + 1, MIN_VECTOR_BYTES, 0);
+ do_one_test (MIN_VECTOR_BYTES + 1, MIN_VECTOR_BYTES, 1);
+
+ do_one_test (MIN_VECTOR_BYTES * 8, 0, -1);
+ do_one_test (MIN_VECTOR_BYTES * 8, 0, 0);
+ do_one_test (MIN_VECTOR_BYTES * 8, 0, 1);
+ do_one_test (MIN_VECTOR_BYTES * 8, MIN_VECTOR_BYTES * 8 - 1, -1);
+ do_one_test (MIN_VECTOR_BYTES * 8, MIN_VECTOR_BYTES * 8 - 1, 0);
+ do_one_test (MIN_VECTOR_BYTES * 8, MIN_VECTOR_BYTES * 8 - 1, 1);
+
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Tiny __builtin_memcmp should use libc.
+** f1:
+** li\s+a\d,\d+
+** tail\s+memcmp
+*/
+int
+f1 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES - 1);
+}
+
+/* Vectorise+inline minimum vector register width with LMUL=1
+** f2:
+** (
+** vsetivli\s+zero,\d+,e8,m1,ta,ma
+** |
+** li\s+a\d+,\d+
+** vsetvli\s+zero,a\d+,e8,m1,ta,ma
+** )
+** ...
+** ret
+*/
+int
+f2 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Don't inline if the length is too large for one operation.
+** f3:
+** li\s+a2,\d+
+** tail\s+memcmp
+*/
+int
+f3 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES + 1);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Tiny __builtin_memcmp should use libc.
+** f1:
+** li\s+a\d,\d+
+** tail\s+memcmp
+*/
+int
+f1 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES - 1);
+}
+
+/* Vectorise+inline minimum vector register width with LMUL=8 as requested
+** f2:
+** (
+** vsetivli\s+zero,\d+,e8,m8,ta,ma
+** |
+** li\s+a\d+,\d+
+** vsetvli\s+zero,a\d+,e8,m8,ta,ma
+** )
+** ...
+** ret
+*/
+int
+f2 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Vectorise+inline anything that fits
+** f3:
+** (
+** vsetivli\s+zero,\d+,e8,m8,ta,ma
+** |
+** li\s+a\d+,\d+
+** vsetvli\s+zero,a\d+,e8,m8,ta,ma
+** )
+** ...
+** ret
+*/
+int
+f3 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES * 8);
+}
+
+/* Don't inline if the length is too large for one operation.
+** f4:
+** li\s+a2,\d+
+** tail\s+memcmp
+*/
+int
+f4 (void *a, void *b)
+{
+ return __builtin_memcmp (a, b, MIN_VECTOR_BYTES * 8 + 1);
+}