#include "aarch64-sve-builtins-shapes.h"
#include "aarch64-sve-builtins-base.h"
#include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
using namespace aarch64_sve;
insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
return e.use_contiguous_load_insn (icode);
}
+
+ gimple *
+ fold (gimple_folder &f) const override
+ {
+ tree arg0 = gimple_call_arg (f.call, 0);
+ tree arg1 = gimple_call_arg (f.call, 1);
+
+ /* Transform:
+ lhs = svld1rq ({-1, -1, ... }, arg1)
+ into:
+ tmp = mem_ref<vectype> [(elem * {ref-all}) arg1]
+ lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>
+ on little-endian targets, where vectype is the corresponding
+ Advanced SIMD type. */
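+ /* For example (an illustrative sketch; the exact gimple dump text
+ is approximate), an svld1rq_s32 call
+ lhs = svld1rq_s32 ({-1, -1, ...}, ptr)
+ would become roughly
+ tmp = MEM <int32x4_t> [(int * {ref-all}) ptr]
+ lhs = VEC_PERM_EXPR <tmp, tmp, {0, 1, 2, 3, 0, 1, 2, 3, ...}>. */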
+
+ if (!BYTES_BIG_ENDIAN
+ && integer_all_onesp (arg0))
+ {
+ tree lhs = gimple_call_lhs (f.call);
+ tree lhs_type = TREE_TYPE (lhs);
+ poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+ tree eltype = TREE_TYPE (lhs_type);
+
+ scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
+ machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
+ tree vectype = build_vector_type_for_mode (eltype, vq_mode);
+
+ tree elt_ptr_type
+ = build_pointer_type_for_mode (eltype, VOIDmode, true);
+ tree zero = build_zero_cst (elt_ptr_type);
+
+ /* Use element type alignment. */
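+ /* (The intrinsic's pointer argument is only guaranteed to be
+ aligned to the element type, so the 128-bit access must not
+ assume any stricter alignment.)  */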
+ tree access_type
+ = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+ tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+ tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+ gimple *mem_ref_stmt
+ = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+ gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+ int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+ vec_perm_builder sel (lhs_len, source_nelts, 1);
+ for (int i = 0; i < source_nelts; i++)
+ sel.quick_push (i);
+
+ vec_perm_indices indices (sel, 1, source_nelts);
+ gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+ TYPE_MODE (access_type),
+ indices));
+ tree mask_type = build_vector_type (ssizetype, lhs_len);
+ tree mask = vec_perm_indices_to_tree (mask_type, indices);
+ return gimple_build_assign (lhs, VEC_PERM_EXPR,
+ mem_ref_lhs, mem_ref_lhs, mask);
+ }
+
+ return NULL;
+ }
};
class svld1ro_impl : public load_replicate
rtx target, op0, op1;
vec_perm_indices perm;
machine_mode vmode;
+ machine_mode op_mode;
unsigned int vec_flags;
+ unsigned int op_vec_flags;
bool one_vector_p;
bool testing_p;
};
newd.vmode = new_mode;
newd.vec_flags = VEC_ADVSIMD;
+ newd.op_mode = newd.vmode;
+ newd.op_vec_flags = newd.vec_flags;
newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
return true;
}
+/* Try to implement D using SVE DUP instruction. */
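+/* For example, selecting a VNx4SI result from a V4SI operand with the
+ constant permutation {0, 1, 2, 3, 0, 1, 2, 3, ...} can be implemented
+ as a single "dup z0.q, z1.q[0]" (an illustrative case).  */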
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+ if (BYTES_BIG_ENDIAN
+ || !d->one_vector_p
+ || d->vec_flags != VEC_SVE_DATA
+ || d->op_vec_flags != VEC_ADVSIMD
+ || d->perm.encoding ().nelts_per_pattern () != 1
+ || !known_eq (d->perm.encoding ().npatterns (),
+ GET_MODE_NUNITS (d->op_mode))
+ || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+ return false;
+
+ int npatterns = d->perm.encoding ().npatterns ();
+ for (int i = 0; i < npatterns; i++)
+ if (!known_eq (d->perm[i], i))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+ return true;
+}
+
/* Try to implement D using SVE SEL instruction. */
static bool
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
+ gcc_assert (d->op_mode != E_VOIDmode);
+
/* The pattern matching functions above are written to look for a small
number to begin the sequence (0, 1, N/2). If we begin with an index
from the second operand, we can swap the operands. */
|| d->vec_flags == VEC_SVE_PRED)
&& known_gt (nelt, 1))
{
- if (aarch64_evpc_rev_local (d))
- return true;
- else if (aarch64_evpc_rev_global (d))
- return true;
- else if (aarch64_evpc_ext (d))
- return true;
- else if (aarch64_evpc_dup (d))
- return true;
- else if (aarch64_evpc_zip (d))
- return true;
- else if (aarch64_evpc_uzp (d))
- return true;
- else if (aarch64_evpc_trn (d))
- return true;
- else if (aarch64_evpc_sel (d))
- return true;
- else if (aarch64_evpc_ins (d))
- return true;
- else if (aarch64_evpc_reencode (d))
- return true;
- if (d->vec_flags == VEC_SVE_DATA)
- return aarch64_evpc_sve_tbl (d);
- else if (d->vec_flags == VEC_ADVSIMD)
- return aarch64_evpc_tbl (d);
+ if (d->vmode == d->op_mode)
+ {
+ if (aarch64_evpc_rev_local (d))
+ return true;
+ else if (aarch64_evpc_rev_global (d))
+ return true;
+ else if (aarch64_evpc_ext (d))
+ return true;
+ else if (aarch64_evpc_dup (d))
+ return true;
+ else if (aarch64_evpc_zip (d))
+ return true;
+ else if (aarch64_evpc_uzp (d))
+ return true;
+ else if (aarch64_evpc_trn (d))
+ return true;
+ else if (aarch64_evpc_sel (d))
+ return true;
+ else if (aarch64_evpc_ins (d))
+ return true;
+ else if (aarch64_evpc_reencode (d))
+ return true;
+
+ if (d->vec_flags == VEC_SVE_DATA)
+ return aarch64_evpc_sve_tbl (d);
+ else if (d->vec_flags == VEC_ADVSIMD)
+ return aarch64_evpc_tbl (d);
+ }
+ else
+ {
+ if (aarch64_evpc_sve_dup (d))
+ return true;
+ }
}
return false;
}
rtx target, rtx op0, rtx op1,
const vec_perm_indices &sel)
{
- if (vmode != op_mode)
- return false;
-
struct expand_vec_perm_d d;
/* Check whether the mask can be applied to a single vector. */
sel.nelts_per_input ());
d.vmode = vmode;
d.vec_flags = aarch64_classify_vector_mode (d.vmode);
+ d.op_mode = op_mode;
+ d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
d.target = target;
d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
if (op0 == op1)
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type x) \
+{ \
+ return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8x16_t, s8)
+TEST(svint16_t, int16x8_t, s16)
+TEST(svint32_t, int32x4_t, s32)
+TEST(svint64_t, int64x2_t, s64)
+
+TEST(svuint8_t, uint8x16_t, u8)
+TEST(svuint16_t, uint16x8_t, u16)
+TEST(svuint32_t, uint32x4_t, u32)
+TEST(svuint64_t, uint64x2_t, u64)
+
+TEST(svfloat16_t, float16x8_t, f16)
+TEST(svfloat32_t, float32x4_t, f32)
+TEST(svfloat64_t, float64x2_t, f64)
+
+TEST(svbfloat16_t, bfloat16x8_t, bf16)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type *x) \
+{ \
+ return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8_t, s8)
+TEST(svint16_t, int16_t, s16)
+TEST(svint32_t, int32_t, s32)
+TEST(svint64_t, int64_t, s64)
+
+TEST(svuint8_t, uint8_t, u8)
+TEST(svuint16_t, uint16_t, u16)
+TEST(svuint32_t, uint32_t, u32)
+TEST(svuint64_t, uint64_t, u64)
+
+TEST(svfloat16_t, float16_t, f16)
+TEST(svfloat32_t, float32_t, f32)
+TEST(svfloat64_t, float64_t, f64)
+
+TEST(svbfloat16_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */
break;
case VEC_PERM_EXPR:
- if (!useless_type_conversion_p (lhs_type, rhs1_type)
- || !useless_type_conversion_p (lhs_type, rhs2_type))
- {
- error ("type mismatch in %qs", code_name);
- debug_generic_expr (lhs_type);
- debug_generic_expr (rhs1_type);
- debug_generic_expr (rhs2_type);
- debug_generic_expr (rhs3_type);
- return true;
- }
+ /* If the permutation vector is constant, then lhs and rhs are
+ allowed to have different vector types, provided:
+ (1) lhs, rhs1 and rhs2 have the same element type.
+ (2) the rhs3 vector is constant and has an integer element type.
+ (3) len(lhs) == len(rhs3) and len(rhs1) == len(rhs2). */
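+ /* For instance, lhs may be a variable-length VNx4SI vector while
+ rhs1 and rhs2 are fixed-length V4SI vectors, with rhs3 a constant
+ integer selector whose length matches lhs (an illustrative case). */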
- if (TREE_CODE (rhs1_type) != VECTOR_TYPE
+ if (TREE_CODE (lhs_type) != VECTOR_TYPE
+ || TREE_CODE (rhs1_type) != VECTOR_TYPE
|| TREE_CODE (rhs2_type) != VECTOR_TYPE
|| TREE_CODE (rhs3_type) != VECTOR_TYPE)
{
return true;
}
+ /* If rhs3 is constant, lhs, rhs1 and rhs2 are allowed to have
+ different vector types, as long as they all have the same
+ element type. */
+ if (TREE_CONSTANT (rhs3)
+ ? (!useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs1_type))
+ || !useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs2_type)))
+ : (!useless_type_conversion_p (lhs_type, rhs1_type)
+ || !useless_type_conversion_p (lhs_type, rhs2_type)))
+ {
+ error ("type mismatch in %qs", code_name);
+ debug_generic_expr (lhs_type);
+ debug_generic_expr (rhs1_type);
+ debug_generic_expr (rhs2_type);
+ debug_generic_expr (rhs3_type);
+ return true;
+ }
+
+ /* If rhs3 is constant, skip the check len(rhs2) == len(rhs3). */
if (maybe_ne (TYPE_VECTOR_SUBPARTS (rhs1_type),
TYPE_VECTOR_SUBPARTS (rhs2_type))
- || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
- TYPE_VECTOR_SUBPARTS (rhs3_type))
+ || (!TREE_CONSTANT (rhs3)
+ && maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
+ TYPE_VECTOR_SUBPARTS (rhs3_type)))
|| maybe_ne (TYPE_VECTOR_SUBPARTS (rhs3_type),
TYPE_VECTOR_SUBPARTS (lhs_type)))
{