DONE;
}
)
+
+;; Implement rawmemchr[qi|si|hi].
+(define_expand "rawmemchr<ANYI:mode>"
+ [(match_operand 0 "register_operand")
+ (match_operand 1 "memory_operand")
+ (match_operand:ANYI 2 "const_int_operand")]
+ "TARGET_VECTOR"
+ {
+    riscv_vector::expand_rawmemchr (<MODE>mode, operands[0], operands[1],
+				    operands[2]);
+ DONE;
+ }
+)
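The expander above is the target half of the optimization: the loop distribution pass (-ftree-loop-distribution) recognizes search loops of the shape below as IFN_RAWMEMCHR calls, which then reach this define_expand.  A minimal example (the function name find_needle is hypothetical; the shape mirrors the new testcase further down):

    #include <stdint.h>

    uint8_t *
    find_needle (uint8_t *s)
    {
      /* The needle is known to be present, so no bound check is needed.  */
      while (*s != 0xde)
	++s;
      return s;
    }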
bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
bool, void (*)(rtx *, rtx));
rtx gen_scalar_move_mask (machine_mode);
+rtx gen_no_side_effects_vsetvl_rtx (machine_mode, rtx, rtx);
/* RVV vector register sizes.
TODO: Currently, we only add RVV_32/RVV_64/RVV_128, we may need to
void expand_cond_binop (unsigned, rtx *);
void expand_cond_ternop (unsigned, rtx *);
void expand_popcount (rtx *);
+void expand_rawmemchr (machine_mode, rtx, rtx, rtx);
/* Rounding mode bitfield for fixed point VXRM. */
enum fixed_point_rounding_mode
#include "target.h"
#include "predict.h"
#include "optabs.h"
+#include "riscv-protos.h"
+#include "recog.h"
+#include "tm-constrs.h"
/* Emit proper instruction depending on mode of dest. */
}
return false;
}
+
+/* --- Vector expanders --- */
+
+namespace riscv_vector {
+
+/* Used by cpymemsi in riscv.md.  */
+
+bool
+expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
+{
+ /*
+ memcpy:
+ mv a3, a0 # Copy destination
+ loop:
+ vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b
+ vle8.v v0, (a1) # Load bytes
+ add a1, a1, t0 # Bump pointer
+ sub a2, a2, t0 # Decrement count
+ vse8.v v0, (a3) # Store bytes
+ add a3, a3, t0 # Bump pointer
+ bnez a2, loop # Any more?
+ ret # Return
+ */
+ if (!TARGET_VECTOR)
+ return false;
+ HOST_WIDE_INT potential_ew
+ = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
+ / BITS_PER_UNIT);
+ machine_mode vmode = VOIDmode;
+ bool need_loop = true;
+ bool size_p = optimize_function_for_size_p (cfun);
+ rtx src, dst;
+ rtx end = gen_reg_rtx (Pmode);
+ rtx vec;
+ rtx length_rtx = length_in;
+
+ if (CONST_INT_P (length_in))
+ {
+ HOST_WIDE_INT length = INTVAL (length_in);
+
+ /* By using LMUL=8, we can copy as many bytes in one go as there
+ are bits in a vector register. If the entire block thus fits,
+ we don't need a loop. */
+ if (length <= TARGET_MIN_VLEN)
+ {
+ need_loop = false;
+
+ /* If a single scalar load / store pair can do the job, leave it
+ to the scalar code to do that. */
+ /* ??? If fast unaligned access is supported, the scalar code could
+	     use suitably sized scalars irrespective of alignment.  If that
+ gets fixed, we have to adjust the test here. */
+
+ if (pow2p_hwi (length) && length <= potential_ew)
+ return false;
+ }
+
+      /* Find the vector mode to use.  Using the largest possible element
+	 size is likely to give smaller constants, and thus potentially
+	 reduce code size.  However, if we need a loop, we need to update
+	 the pointers, and that is more complicated with a larger element
+	 size, unless we use an immediate, which prevents us from dynamically
+	 using the largest transfer size the hart supports.  And then,
+	 unless we know the *exact* vector size of the hart, we'd need
+	 multiple vsetvli / branch statements, so it's not even a size win.
+	 If, in the future, we find an RVV implementation that is slower
+	 for small element widths, we might allow larger element widths for
+	 loops too.  */
+ if (need_loop)
+ potential_ew = 1;
+ for (; potential_ew; potential_ew >>= 1)
+ {
+ scalar_int_mode elem_mode;
+ unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
+ unsigned HOST_WIDE_INT per_iter;
+ HOST_WIDE_INT nunits;
+
+ if (need_loop)
+ per_iter = TARGET_MIN_VLEN;
+ else
+ per_iter = length;
+ nunits = per_iter / potential_ew;
+
+ /* Unless we get an implementation that's slow for small element
+ size / non-word-aligned accesses, we assume that the hardware
+ handles this well, and we don't want to complicate the code
+ with shifting word contents around or handling extra bytes at
+ the start and/or end. So we want the total transfer size and
+ alignment to fit with the element size. */
+ if (length % potential_ew != 0
+ || !int_mode_for_size (bits, 0).exists (&elem_mode))
+ continue;
+ /* Find the mode to use for the copy inside the loop - or the
+ sole copy, if there is no loop. */
+ if (!need_loop)
+ {
+ /* Try if we have an exact mode for the copy. */
+ if (riscv_vector::get_vector_mode (elem_mode,
+ nunits).exists (&vmode))
+ break;
+	  /* Since we don't have a mode that exactly matches the transfer
+ size, we'll need to use pred_store, which is not available
+ for all vector modes, but only iE_RVV_M* modes, hence trying
+ to find a vector mode for a merely rounded-up size is
+ pointless.
+ Still, by choosing a lower LMUL factor that still allows
+ an entire transfer, we can reduce register pressure. */
+ for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
+ if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
+	      /* Avoid losing the option of using vsetivli.  */
+ && (nunits <= 31 * lmul || nunits > 31 * 8)
+ && (riscv_vector::get_vector_mode
+ (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul,
+ potential_ew)).exists (&vmode)))
+ break;
+ }
+
+ /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
+	 wide.  BYTES_PER_RISCV_VECTOR can't be evenly divided by
+ the sizes of larger element types; the LMUL factor of 8 can at
+ the moment be divided by the SEW, with SEW of up to 8 bytes,
+ but there are reserved encodings so there might be larger
+ SEW in the future. */
+ if (riscv_vector::get_vector_mode
+ (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * 8,
+ potential_ew)).exists (&vmode))
+ break;
+
+ /* We may get here if we tried an element size that's larger than
+ the hardware supports, but we should at least find a suitable
+ byte vector mode. */
+ gcc_assert (potential_ew > 1);
+ }
+ if (potential_ew > 1)
+ length_rtx = GEN_INT (length / potential_ew);
+ }
+ else
+ {
+ vmode = E_RVVM8QImode;
+ }
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call.  Since the RVV loop variant takes about 7
+     instructions, a libcall may be preferable when optimizing for size.  */
+ if (size_p && need_loop)
+ return false;
+
+ /* length_rtx holds the (remaining) length of the required copy.
+ cnt holds the length we copy with the current load/store pair. */
+ rtx cnt = length_rtx;
+ rtx label = NULL_RTX;
+ rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+ rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
+
+ if (need_loop)
+ {
+ length_rtx = copy_to_mode_reg (Pmode, length_rtx);
+ cnt = gen_reg_rtx (Pmode);
+ label = gen_label_rtx ();
+
+ emit_label (label);
+ emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (vmode, cnt,
+ length_rtx));
+ }
+
+ vec = gen_reg_rtx (vmode);
+ src = change_address (src_in, vmode, src_addr);
+ dst = change_address (dst_in, vmode, dst_addr);
+
+ /* If we don't need a loop and have a suitable mode to describe the size,
+ just do a load / store pair and leave it up to the later lazy code
+ motion pass to insert the appropriate vsetvli. */
+ if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+ {
+ emit_move_insn (vec, src);
+ emit_move_insn (dst, vec);
+ }
+ else
+ {
+ machine_mode mask_mode = riscv_vector::get_vector_mode
+ (BImode, GET_MODE_NUNITS (vmode)).require ();
+ rtx mask = CONSTM1_RTX (mask_mode);
+ if (!satisfies_constraint_K (cnt))
+	cnt = force_reg (Pmode, cnt);
+ rtx m_ops[] = {vec, mask, src};
+ emit_nonvlmax_insn (code_for_pred_mov (vmode),
+ riscv_vector::UNARY_OP_TAMA, m_ops, cnt);
+ emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
+ get_avl_type_rtx (riscv_vector::NONVLMAX)));
+ }
+
+ if (need_loop)
+ {
+ emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
+ emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+ emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt)));
+
+ /* Emit the loop condition. */
+ rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx);
+ emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label));
+ emit_insn (gen_nop ());
+ }
+
+ return true;
+}
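+
+/* For example, with VLEN = 128 a constant 16-byte, byte-aligned copy takes
+   the no-loop path above and is expected to expand to a single predicated
+   load/store pair, roughly (a sketch; the final vsetvli placement is left
+   to the later vsetvl pass):
+
+	vsetivli zero,16,e8,m1,ta,ma
+	vle8.v	v1,0(a1)
+	vse8.v	v1,0(a0)  */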
+
+
+/* Implement rawmemchr<mode> using vector instructions.
+   The needle is assumed to be present in the haystack; if it is not,
+   the behavior is undefined.  */
+
+void
+expand_rawmemchr (machine_mode mode, rtx dst, rtx src, rtx pat)
+{
+  /*
+     rawmemchr:
+     loop:
+	vsetvli a1, zero, e[8,16,32,64], m1, ta, ma
+	vle[8,16,32,64]ff.v v8, (a0)  # Load.
+	csrr a1, vl		      # Get number of elements read.
+	vmseq.vx v0, v8, pat	      # v0 = (v8 == {pat, pat, ...})
+	vfirst.m a2, v0		      # Find first hit.
+	slli a3, a1, [0,1,2,3]	      # Scale element count to bytes.
+	add a0, a0, a3		      # Bump pointer.
+	bltz a2, loop		      # Not found?
+
+	sub a0, a0, a3		      # Go back by a3.
+	slli a2, a2, [0,1,2,3]	      # Shift to get byte offset.
+	add a0, a0, a2		      # Add the offset.
+
+	ret
+  */
+ gcc_assert (TARGET_VECTOR);
+
+ unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
+ int lmul = TARGET_MAX_LMUL;
+ poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);
+
+ machine_mode vmode;
+ if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode),
+ nunits).exists (&vmode))
+ gcc_unreachable ();
+
+ machine_mode mask_mode = riscv_vector::get_mask_mode (vmode);
+
+ rtx cnt = gen_reg_rtx (Pmode);
+ rtx end = gen_reg_rtx (Pmode);
+ rtx vec = gen_reg_rtx (vmode);
+ rtx mask = gen_reg_rtx (mask_mode);
+
+  /* After finding the first vector element matching the needle, we need
+     to scale the element index by the element size in bytes in order to
+     return a pointer to the matching byte.  */
+  unsigned int shift = exact_log2 (isize);
+
+ rtx src_addr = copy_addr_to_reg (XEXP (src, 0));
+
+ rtx loop = gen_label_rtx ();
+ emit_label (loop);
+
+ rtx vsrc = change_address (src, vmode, src_addr);
+
+  /* Emit a fault-only-first load.  */
+ rtx vlops[] = {vec, vsrc};
+ emit_vlmax_insn (code_for_pred_fault_load (vmode),
+ riscv_vector::UNARY_OP, vlops);
+
+  /* Read back how many elements we actually read.  */
+ if (Pmode == SImode)
+ emit_insn (gen_read_vlsi (cnt));
+ else
+ emit_insn (gen_read_vldi_zero_extend (cnt));
+
+  /* Compare needle with haystack and store the result in a mask.  */
+ rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, pat), vec);
+ rtx vmsops[] = {mask, eq, vec, pat};
+ emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode),
+ riscv_vector::COMPARE_OP, vmsops, cnt);
+
+ /* Find the first bit in the mask. */
+ rtx vfops[] = {end, mask};
+ emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
+ riscv_vector::CPOP_OP, vfops, cnt);
+
+  /* Bump the pointer by the number of bytes read (elements << shift).  */
+  rtx step = gen_reg_rtx (Pmode);
+  emit_insn (gen_rtx_SET (step, gen_rtx_ASHIFT (Pmode, cnt, GEN_INT (shift))));
+  emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, step)));
+
+ /* Emit the loop condition. */
+ rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx);
+ emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop));
+
+  /* We overran by STEP bytes, subtract them.  */
+  emit_insn (gen_rtx_SET (src_addr, gen_rtx_MINUS (Pmode, src_addr, step)));
+
+ /* We found something at SRC + END * [1,2,4,8]. */
+ emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end, GEN_INT (shift))));
+ emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
+}
+
+}
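For the HImode variant the expansion is expected to come out roughly as below, with the element count scaled to a byte offset before each pointer adjustment (an illustrative sketch only; register allocation and scheduling will differ, and the needle is assumed to live in a2):

	rawmemchr_hi:
	.Lloop:
		vsetvli	a1,zero,e16,m1,ta,ma
		vle16ff.v v8,(a0)	# Fault-only-first load.
		csrr	a1,vl		# Elements read.
		vmseq.vx v0,v8,a2	# Compare against needle.
		vfirst.m a3,v0		# Index of first hit, or -1.
		slli	a4,a1,1		# Elements -> bytes.
		add	a0,a0,a4	# Bump pointer.
		bltz	a3,.Lloop	# No hit yet?
		sub	a0,a0,a4	# Undo the last bump.
		slli	a3,a3,1		# Element index -> byte offset.
		add	a0,a0,a3
		ret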
return vl;
}
-static rtx
+rtx
gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
{
unsigned int sew = get_sew (vmode);
}
}
-/* Used by cpymemsi in riscv.md . */
-
-bool
-expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
-{
- /*
- memcpy:
- mv a3, a0 # Copy destination
- loop:
- vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b
- vle8.v v0, (a1) # Load bytes
- add a1, a1, t0 # Bump pointer
- sub a2, a2, t0 # Decrement count
- vse8.v v0, (a3) # Store bytes
- add a3, a3, t0 # Bump pointer
- bnez a2, loop # Any more?
- ret # Return
- */
- if (!TARGET_VECTOR)
- return false;
- HOST_WIDE_INT potential_ew
- = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
- / BITS_PER_UNIT);
- machine_mode vmode = VOIDmode;
- bool need_loop = true;
- bool size_p = optimize_function_for_size_p (cfun);
- rtx src, dst;
- rtx end = gen_reg_rtx (Pmode);
- rtx vec;
- rtx length_rtx = length_in;
-
- if (CONST_INT_P (length_in))
- {
- HOST_WIDE_INT length = INTVAL (length_in);
-
- /* By using LMUL=8, we can copy as many bytes in one go as there
- are bits in a vector register. If the entire block thus fits,
- we don't need a loop. */
- if (length <= TARGET_MIN_VLEN)
- {
- need_loop = false;
-
- /* If a single scalar load / store pair can do the job, leave it
- to the scalar code to do that. */
- /* ??? If fast unaligned access is supported, the scalar code could
- use suitably sized scalars irrespective of alignemnt. If that
- gets fixed, we have to adjust the test here. */
-
- if (pow2p_hwi (length) && length <= potential_ew)
- return false;
- }
-
- /* Find the vector mode to use. Using the largest possible element
- size is likely to give smaller constants, and thus potentially
- reducing code size. However, if we need a loop, we need to update
- the pointers, and that is more complicated with a larger element
- size, unless we use an immediate, which prevents us from dynamically
- using the targets transfer size that the hart supports. And then,
- unless we know the *exact* vector size of the hart, we'd need
- multiple vsetvli / branch statements, so it's not even a size win.
- If, in the future, we find an RISCV-V implementation that is slower
- for small element widths, we might allow larger element widths for
- loops too. */
- if (need_loop)
- potential_ew = 1;
- for (; potential_ew; potential_ew >>= 1)
- {
- scalar_int_mode elem_mode;
- unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
- unsigned HOST_WIDE_INT per_iter;
- HOST_WIDE_INT nunits;
-
- if (need_loop)
- per_iter = TARGET_MIN_VLEN;
- else
- per_iter = length;
- nunits = per_iter / potential_ew;
-
- /* Unless we get an implementation that's slow for small element
- size / non-word-aligned accesses, we assume that the hardware
- handles this well, and we don't want to complicate the code
- with shifting word contents around or handling extra bytes at
- the start and/or end. So we want the total transfer size and
- alignment to fit with the element size. */
- if (length % potential_ew != 0
- || !int_mode_for_size (bits, 0).exists (&elem_mode))
- continue;
- /* Find the mode to use for the copy inside the loop - or the
- sole copy, if there is no loop. */
- if (!need_loop)
- {
- /* Try if we have an exact mode for the copy. */
- if (get_vector_mode (elem_mode, nunits).exists (&vmode))
- break;
- /* Since we don't have a mode that exactlty matches the transfer
- size, we'll need to use pred_store, which is not available
- for all vector modes, but only iE_RVV_M* modes, hence trying
- to find a vector mode for a merely rounded-up size is
- pointless.
- Still, by choosing a lower LMUL factor that still allows
- an entire transfer, we can reduce register pressure. */
- for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
- if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
- /* Avoid loosing the option of using vsetivli . */
- && (nunits <= 31 * lmul || nunits > 31 * 8)
- && (get_vector_mode
- (elem_mode,
- exact_div (BYTES_PER_RISCV_VECTOR * lmul,
- potential_ew)
- ).exists (&vmode)))
- break;
- }
-
- /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
- wide. BYTES_PER_RISCV_VECTOR can't be eavenly divided by
- the sizes of larger element types; the LMUL factor of 8 can at
- the moment be divided by the SEW, with SEW of up to 8 bytes,
- but there are reserved encodings so there might be larger
- SEW in the future. */
- if (get_vector_mode (elem_mode,
- exact_div (BYTES_PER_RISCV_VECTOR * 8,
- potential_ew)).exists (&vmode))
- break;
-
- /* We may get here if we tried an element size that's larger than
- the hardware supports, but we should at least find a suitable
- byte vector mode. */
- gcc_assert (potential_ew > 1);
- }
- if (potential_ew > 1)
- length_rtx = GEN_INT (length / potential_ew);
- }
- else
- {
- vmode = E_RVVM8QImode;
- }
-
- /* A memcpy libcall in the worst case takes 3 instructions to prepare the
- arguments + 1 for the call. When RVV should take 7 instructions and
- we're optimizing for size a libcall may be preferable. */
- if (size_p && need_loop)
- return false;
-
- /* length_rtx holds the (remaining) length of the required copy.
- cnt holds the length we copy with the current load/store pair. */
- rtx cnt = length_rtx;
- rtx label = NULL_RTX;
- rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
- rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
-
- if (need_loop)
- {
- length_rtx = copy_to_mode_reg (Pmode, length_rtx);
- cnt = gen_reg_rtx (Pmode);
- label = gen_label_rtx ();
-
- emit_label (label);
- emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx));
- }
-
- vec = gen_reg_rtx (vmode);
- src = change_address (src_in, vmode, src_addr);
- dst = change_address (dst_in, vmode, dst_addr);
-
- /* If we don't need a loop and have a suitable mode to describe the size,
- just do a load / store pair and leave it up to the later lazy code
- motion pass to insert the appropriate vsetvli. */
- if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
- {
- emit_move_insn (vec, src);
- emit_move_insn (dst, vec);
- }
- else
- {
- machine_mode mask_mode = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require ();
- rtx mask = CONSTM1_RTX (mask_mode);
- if (!satisfies_constraint_K (cnt))
- cnt= force_reg (Pmode, cnt);
- rtx m_ops[] = {vec, mask, src};
- emit_nonvlmax_insn (code_for_pred_mov (vmode), UNARY_OP_TAMA,
- m_ops, cnt);
- emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
- get_avl_type_rtx (NONVLMAX)));
- }
-
- if (need_loop)
- {
- emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
- emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
- emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt)));
-
- /* Emit the loop condition. */
- rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx);
- emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label));
- emit_insn (gen_nop ());
- }
-
- return true;
-}
-
/* Return the vectorization machine mode for RVV according to LMUL. */
machine_mode
preferred_simd_mode (scalar_mode mode)
gcc_unreachable ();
}
-/* Expand IFN_RAWMEMCHAR internal function. */
+/* Expand IFN_RAWMEMCHR internal function. */
void
expand_RAWMEMCHR (internal_fn, gcall *stmt)
-/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops -fpeel-loops -fdump-tree-ch2-details-blocks" } */
+/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops -fpeel-loops -fdump-tree-ch2-details-blocks -fno-tree-loop-distribute-patterns" } */
int a[100];
int n = 1000000;
int zeroc;
-/* { dg-do run { target s390x-*-* } } */
+/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
/* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
/* Rawmemchr pattern: reduction stmt and no store */
-/* { dg-do run { target s390x-*-* } } */
+/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
/* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
/* Rawmemchr pattern: reduction stmt and store */
--- /dev/null
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=gnu99 -O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */
+
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
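+
+/* The expander uses fault-only-first loads, which may read past the
+   needle within a vector.  The checks below therefore place the needle
+   at various offsets relative to a 4096-byte block boundary and to the
+   individual vector loads, exercising misalignment and overruns.  */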
+
+#define rawmemchrT(T, pattern) \
+__attribute__((noinline,noclone)) \
+T* rawmemchr_##T (T *s) \
+{ \
+ while (*s != pattern) \
+ ++s; \
+ return s; \
+}
+
+rawmemchrT(int8_t, (int8_t)0xde)
+rawmemchrT(uint8_t, 0xde)
+rawmemchrT(int16_t, (int16_t)0xdead)
+rawmemchrT(uint16_t, 0xdead)
+rawmemchrT(int32_t, (int32_t)0xdeadbeef)
+rawmemchrT(uint32_t, 0xdeadbeef)
+
+#define runT(T, pattern) \
+void run_##T () \
+{ \
+ T *buf = malloc (4096 * 2 * sizeof(T)); \
+ assert (buf != NULL); \
+ memset (buf, 0xa, 4096 * 2 * sizeof(T)); \
+ /* ensure q is 4096-byte aligned */ \
+ T *q = (T*)((unsigned char *)buf \
+ + (4096 - ((uintptr_t)buf & 4095))); \
+ T *p; \
+ /* unaligned + block boundary + 1st load */ \
+ p = (T *) ((uintptr_t)q - 8); \
+ p[2] = pattern; \
+ assert ((rawmemchr_##T (&p[0]) == &p[2])); \
+ p[2] = (T) 0xaaaaaaaa; \
+ /* unaligned + block boundary + 2nd load */ \
+ p = (T *) ((uintptr_t)q - 8); \
+ p[6] = pattern; \
+ assert ((rawmemchr_##T (&p[0]) == &p[6])); \
+ p[6] = (T) 0xaaaaaaaa; \
+ /* unaligned + 1st load */ \
+ q[5] = pattern; \
+ assert ((rawmemchr_##T (&q[2]) == &q[5])); \
+ q[5] = (T) 0xaaaaaaaa; \
+ /* unaligned + 2nd load */ \
+ q[14] = pattern; \
+ assert ((rawmemchr_##T (&q[2]) == &q[14])); \
+ q[14] = (T) 0xaaaaaaaa; \
+ /* unaligned + 3rd load */ \
+ q[19] = pattern; \
+ assert ((rawmemchr_##T (&q[2]) == &q[19])); \
+ q[19] = (T) 0xaaaaaaaa; \
+ /* unaligned + 4th load */ \
+ q[25] = pattern; \
+ assert ((rawmemchr_##T (&q[2]) == &q[25])); \
+ q[25] = (T) 0xaaaaaaaa; \
+ /* aligned + 1st load */ \
+ q[5] = pattern; \
+ assert ((rawmemchr_##T (&q[0]) == &q[5])); \
+ q[5] = (T) 0xaaaaaaaa; \
+ /* aligned + 2nd load */ \
+ q[14] = pattern; \
+ assert ((rawmemchr_##T (&q[0]) == &q[14])); \
+ q[14] = (T) 0xaaaaaaaa; \
+ /* aligned + 3rd load */ \
+ q[19] = pattern; \
+ assert ((rawmemchr_##T (&q[0]) == &q[19])); \
+ q[19] = (T) 0xaaaaaaaa; \
+ /* aligned + 4th load */ \
+ q[25] = pattern; \
+ assert ((rawmemchr_##T (&q[0]) == &q[25])); \
+ q[25] = (T) 0xaaaaaaaa; \
+ free (buf); \
+}
+
+runT(int8_t, (int8_t)0xde)
+runT(uint8_t, 0xde)
+runT(int16_t, (int16_t)0xdead)
+runT(uint16_t, 0xdead)
+runT(int32_t, (int32_t)0xdeadbeef)
+runT(uint32_t, 0xdeadbeef)
+
+int main (void)
+{
+ run_uint8_t ();
+ run_int8_t ();
+ run_uint16_t ();
+ run_int16_t ();
+ run_uint32_t ();
+ run_int32_t ();
+ return 0;
+}
"" "$op"
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/cond/*.\[cS\]]] \
"" "$op"
+ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/builtin/*.\[cS\]]] \
+ "" "$op"
}
# widening operation only test on LMUL < 8