rtx rperm[8], vperm;
unsigned i;
- if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
+ if (!TARGET_AVX || !d->one_operand_p
+ || (d->vmode != V8SImode && d->vmode != V8SFmode))
return false;
/* We can only permute within the 128-bit lane. */
vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
vperm = force_reg (V8SImode, vperm);
- emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
+ rtx target = d->target;
+ rtx op0 = d->op0;
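+ /* gen_avx_vpermilvarv8sf3 expects V8SFmode operands; for V8SImode,
+    reinterpret target and op0 as V8SF via lowpart subregs.  This is
+    safe since vpermilps only rearranges 32-bit elements and never
+    alters their bits.  */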
+ if (d->vmode == V8SImode)
+ {
+ target = lowpart_subreg (V8SFmode, target, V8SImode);
+ op0 = lowpart_subreg (V8SFmode, op0, V8SImode);
+ }
+
+ emit_insn (gen_avx_vpermilvarv8sf3 (target, op0, vperm));
return true;
}
switch (mode)
{
case E_V8DFmode:
+ case E_V8DImode:
/* In the 512-bit DFmode case, we can only move elements within
a 128-bit lane. First fill the second part of the mask,
then fallthru. */
/* FALLTHRU */
case E_V4DFmode:
+ case E_V4DImode:
/* In the 256-bit DFmode case, we can only move elements within
a 128-bit lane. */
for (i = 0; i < 2; ++i)
break;
case E_V16SFmode:
+ case E_V16SImode:
/* In 512 bit SFmode case, permutation in the upper 256 bits
must mirror the permutation in the lower 256-bits. */
for (i = 0; i < 8; ++i)
/* FALLTHRU */
case E_V8SFmode:
+ case E_V8SImode:
/* In 256 bit SFmode case, we have full freedom of
movement within the low 128-bit lane, but the high 128-bit
lane must mirror the exact same pattern. */
/* FALLTHRU */
case E_V2DFmode:
+ case E_V2DImode:
case E_V4SFmode:
+ case E_V4SImode:
/* In the 128-bit case, we've full freedom in the placement of
the elements from the source operand. */
for (i = 0; i < nelt; ++i)
V16SF (V8SF "TARGET_AVX512VL")
V8DF (V4DF "TARGET_AVX512VL")])
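+;; All 128-, 256- and 512-bit vector modes with 4- or 8-byte elements.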
+(define_mode_iterator V48_AVX
+ [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+ (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2")
+ (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+ (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+
;; All AVX-512{F,VL} vector modes. Supposed TARGET_AVX512F baseline.
(define_mode_iterator V48H_AVX512VL
[V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
(define_mode_attr DOUBLEMASKMODE
[(HI "SI") (SI "DI")])
+;; Float-domain (ps/pd) mode suffix, used so that instructions such as
+;; vpermilps/vpermilpd can also be generated for integer vector modes.
+(define_mode_attr ssefltmodesuffix
+ [(V2DI "pd") (V4DI "pd") (V8DI "pd") (V4SI "ps") (V8SI "ps") (V16SI "ps")
+ (V2DF "pd") (V4DF "pd") (V8DF "pd") (V4SF "ps") (V8SF "ps") (V16SF "ps")])
;; Include define_subst patterns for instructions with mask
(include "subst.md")
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "<MODE>")])
-(define_mode_attr ssefltmodesuffix
- [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")
- (V2DF "pd") (V4DF "pd") (V4SF "ps") (V8SF "ps")])
-
(define_mode_attr ssefltvecmode
[(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
;; being a subset of what vpermp* can do), but vpermilp* has shorter
;; latency as it never crosses lanes.
(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>"
- [(set (match_operand:VF 0 "register_operand" "=v")
- (vec_select:VF
- (match_operand:VF 1 "nonimmediate_operand" "vm")
+ [(set (match_operand:V48_AVX 0 "register_operand" "=v")
+ (vec_select:V48_AVX
+ (match_operand:V48_AVX 1 "nonimmediate_operand" "vm")
(match_parallel 2 ""
[(match_operand 3 "const_int_operand")])))]
"TARGET_AVX && <mask_mode512bit_condition>
{
int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1;
operands[2] = GEN_INT (mask);
- return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
+ return "vpermil<ssefltmodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
}
[(set_attr "type" "sselog")
(set_attr "prefix_extra" "1")
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sierraforest -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump "loop vectorized using 32 byte vectors" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler "vpermilps" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not "vpermd" } } */
+
+int a[256], b[256];
+
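+/* Every element of b is taken from the same 128-bit lane of a, so the
+   permutation is in-lane and should not need the cross-lane vpermd.  */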
+void __attribute__((noinline))
+foo (void)
+{
+ int i;
+ for (i = 0; i < 32; ++i)
+ {
+ b[i*8+0] = a[i*8+0];
+ b[i*8+1] = a[i*8+0];
+ b[i*8+2] = a[i*8+3];
+ b[i*8+3] = a[i*8+3];
+ b[i*8+4] = a[i*8+4];
+ b[i*8+5] = a[i*8+6];
+ b[i*8+6] = a[i*8+4];
+ b[i*8+7] = a[i*8+6];
+ }
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vpermilp[ds]} 2 } } */
+/* { dg-final { scan-assembler-not {(?n)vperm[dq]} } } */
+
+typedef long long v4di __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+
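+/* Both shuffles stay within their 128-bit lanes, so vpermilpd/vpermilps
+   suffice and no cross-lane vpermq/vpermd is needed.  */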
+v4di
+foo (v4di a)
+{
+ return __builtin_shufflevector (a, a, 1, 0, 3, 2);
+}
+
+v8si
+foo1 (v8si a)
+{
+ return __builtin_shufflevector (a, a, 1, 0, 3, 2, 7, 6, 5, 4);
+}
+
asm volatile ("" : "+v" (a));
}
-/* { dg-final { scan-assembler-times "vpalignr\[^\n\r]*\\\$8\[^\n\r]*%xmm16\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
-
typedef float V5 __attribute__((vector_size (16)));
void
asm volatile ("" : "+v" (a));
}
-/* { dg-final { scan-assembler-times "vpermilpd\[^\n\r]*\\\$1\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
+/* { dg-final { scan-assembler-times "vpermilpd\[^\n\r]*\\\$1\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 2 } } */
asm volatile ("" : "+v" (a));
}
-/* { dg-final { scan-assembler-times "vpermilpd\[^\n\r]*\\\$1\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
+/* { dg-final { scan-assembler-times "vpermilpd\[^\n\r]*\\\$1\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 2 } } */