Certain permutes that blend a vector with zero can be interpreted as an AND
with a mask. This idea was suggested by Richard Sandiford while reviewing my
patch that optimizes certain vector permutes with the FMOV instruction on the
aarch64 target.
For example, for the aarch64 target, at present:
v4hi
f_v4hi (v4hi x)
{
return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
}
generates:
f_v4hi:
uzp1 v0.2d, v0.2d, v0.2d
adrp x0, .LC0
ldr d31, [x0, #:lo12:.LC0]
tbl v0.8b, {v0.16b}, v31.8b
ret
.LC0:
.byte -1
.byte -1
.byte 2
.byte 3
.byte -1
.byte -1
.byte 6
.byte 7
With this patch, it generates:
f_v4hi:
mvni v31.2s, 0xff, msl 8
and v0.8b, v0.8b, v31.8b
ret
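The two forms are equivalent at the source level, which can be checked with a
small self-contained program (illustration only, not part of the patch; the
helper names are invented for the example): the shuffle keeps lanes 1 and 3 of
x and zeros the rest, which is exactly a bitwise AND with a constant lane mask.
#include <assert.h>

typedef short v4hi __attribute__ ((vector_size (8)));

/* Lanes 0 and 2 of the selector pick elements of the zero vector; lanes 1
   and 3 pick the corresponding elements of x.  */
static v4hi
perm_with_zero (v4hi x)
{
  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
}

/* The equivalent AND: all-ones for the kept lanes, zero for the rest.  */
static v4hi
and_with_mask (v4hi x)
{
  return x & (v4hi){ 0, -1, 0, -1 };
}

int
main (void)
{
  v4hi x = { 1, 2, 3, 4 };
  v4hi a = perm_with_zero (x);
  v4hi b = and_with_mask (x);
  for (int i = 0; i < 4; i++)
    assert (a[i] == b[i]);
  return 0;
}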
This patch also provides a target-independent routine for detecting vector
permute patterns that can be interpreted as an AND.
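Roughly speaking, the routine walks the permute selector and, assuming the
other operand is the zero vector, keeps a lane when the index selects the
matching element of the non-zero operand and clears it when the index points
into the zero vector; any other index means the permute is not a plain AND.
A scalar sketch of that mapping for the zero-second-operand case (illustrative
only; the function below is not the one in the patch, which works on
vec_perm_indices and builds an RTL constant vector):

/* Fill KEEP[0..NELT-1] with the AND lane mask (1 = keep, 0 = clear) and
   return 1 if selector SEL blends operand 0 with a zero operand 1;
   return 0 otherwise.  */
static int
perm_to_and_mask (const unsigned *sel, unsigned nelt, unsigned char *keep)
{
  for (unsigned i = 0; i < nelt; i++)
    {
      if (sel[i] == i)
	keep[i] = 1;	/* Lane i comes from operand 0: keep it.  */
      else if (sel[i] >= nelt)
	keep[i] = 0;	/* Lane comes from the zero vector: clear it.  */
      else
	return 0;	/* Element taken from a different lane: not an AND.  */
    }
  return 1;
}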
Changes since v1:
* v2: Rework the patch to only perform the optimization for aarch64 by calling
the target-independent routine vec_perm_and_mask.
PR target/100165
gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_evpc_and): New.
(aarch64_expand_vec_perm_const_1): Call aarch64_evpc_and.
* optabs.cc (vec_perm_and_mask): New.
* optabs.h (vec_perm_and_mask): New prototype.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/and-be.c: New test.
* gcc.target/aarch64/and-le.c: New test.
Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
return true;
}
+/* Recognize patterns suitable for the AND instruction.  */
+static bool
+aarch64_evpc_and (struct expand_vec_perm_d *d)
+{
+ /* Either d->op0 or d->op1 should be a vector of all zeros. */
+ if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
+ return false;
+
+ machine_mode mode = d->vmode;
+ machine_mode sel_mode;
+ if (!related_int_vector_mode (mode).exists (&sel_mode))
+ return false;
+
+ insn_code and_code = optab_handler (and_optab, sel_mode);
+ rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p);
+ if (and_code == CODE_FOR_nothing || !and_mask)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ class expand_operand ops[3];
+ rtx in = d->zero_op0_p ? d->op1 : d->op0;
+ create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target), sel_mode);
+ create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode);
+ create_input_operand (&ops[2], and_mask, sel_mode);
+ expand_insn (and_code, 3, ops);
+ rtx result = gen_lowpart (mode, ops[0].value);
+ if (!rtx_equal_p (d->target, result))
+ emit_move_insn (d->target, result);
+
+ return true;
+}
+
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
return true;
else if (aarch64_evpc_uzp (d))
return true;
+ else if (aarch64_evpc_and (d))
+ return true;
else if (aarch64_evpc_trn (d))
return true;
else if (aarch64_evpc_sel (d))
return NULL_RTX;
}
+/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
+ the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
+ assuming the other vec_perm operand is a constant vector of zeros. Return
+ the mask for the equivalent and operation, or NULL_RTX if the vec_perm
+ cannot be modeled as an and. MODE is the mode of the value being anded.
+ ZERO_OP0_P is true if the first operand of the vec_perm is a constant vector
+ of zeros or false if the second operand of the vec_perm is a constant vector
+ of zeros. */
+rtx
+vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+ bool zero_op0_p)
+{
+ unsigned int nelt;
+ if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
+ return NULL_RTX;
+
+ rtx_vector_builder builder (mode, nelt, 1);
+ machine_mode emode = GET_MODE_INNER (mode);
+
+ for (unsigned int i = 0; i < nelt; i++)
+ {
+ if (zero_op0_p)
+ {
+ if (known_eq (sel[i], nelt + i))
+ builder.quick_push (CONSTM1_RTX (emode));
+ else if (known_lt (sel[i], nelt))
+ builder.quick_push (CONST0_RTX (emode));
+ else
+ return NULL_RTX;
+ }
+ else
+ {
+ if (known_eq (sel[i], i))
+ builder.quick_push (CONSTM1_RTX (emode));
+ else if (known_ge (sel[i], nelt))
+ builder.quick_push (CONST0_RTX (emode));
+ else
+ return NULL_RTX;
+ }
+ }
+
+ return builder.build ();
+}
+
/* Implement a permutation of vectors v0 and v1 using the permutation
vector in SEL and return the result. Use TARGET to hold the result
if nonnull and convenient.
/* Generate a conditional trap instruction. */
extern rtx_insn *gen_cond_trap (enum rtx_code, rtx, rtx, rtx);
+/* Check whether the vec_perm can be interpreted as an and operation. */
+extern rtx vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+ bool zero_op0_p);
+
/* Generate code for VEC_PERM_EXPR. */
extern rtx expand_vec_perm_var (machine_mode, rtx, rtx, rtx, rtx);
extern rtx expand_vec_perm_const (machine_mode, rtx, rtx,
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+** movi v([0-9]+).2s, 0xff, msl 8
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+ return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+** mvni v([0-9]+).2s, 0xff, msl 8
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+ return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+** ...
+** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+** ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+ return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+** movi v([0-9]+).2d, 0xffffffff00000000
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+f_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+** movi v([0-9]+).2d, 0xffffffff
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+g_v4si (v4si x)
+{
+ return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+** movi v([0-9]+).2d, 0xffffffff
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+h_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+** movi v([0-9]+).2d, 0xffffffff00000000
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+ return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+** movi d([0-9]+), 0xff00ff00ff000000
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+ return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+** ...
+** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+** ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+ return __builtin_shuffle (
+ x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+** mvni v([0-9]+).2s, 0xff, msl 8
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+ return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+** movi v([0-9]+).2s, 0xff, msl 8
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+ return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+** ...
+** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+** ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+ return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+** movi v([0-9]+).2d, 0xffffffff
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+f_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+** movi v([0-9]+).2d, 0xffffffff00000000
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+g_v4si (v4si x)
+{
+ return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+** movi v([0-9]+).2d, 0xffffffff00000000
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+h_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+** movi v([0-9]+).2d, 0xffffffff
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+ return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+** movi d([0-9]+), 0xff00ff00ff
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+ return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+** ...
+** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+** ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+ return __builtin_shuffle (
+ x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}