1. Support vpermw/vpermb in ix86_expand_vec_one_operand_perm_avx512.
2. Support 256/128-bit vpermi2b in ix86_expand_vec_perm_vpermt2.
3. Add define_insn_and_split to optimize specific vector permutation to opmov{dw,wb,qd}.
gcc/ChangeLog:
PR target/101846
* config/i386/i386-expand.c (ix86_expand_vec_perm_vpermt2):
Support vpermi2b for V32QI/V16QImode.
(ix86_extract_perm_from_pool_constant): New function.
(ix86_expand_vec_one_operand_perm_avx512): Support
vpermw/vpermb under TARGET_AVX512BW/TARGET_AVX512VBMI.
(expand_vec_perm_1): Adjust comments for the changes above.
* config/i386/i386-protos.h (ix86_extract_perm_from_pool_constant):
New declaration.
* config/i386/predicates.md (permvar_truncate_operand): New predicate.
(pshufb_truncv4siv4hi_operand): Ditto.
(pshufb_truncv8hiv8qi_operand): Ditto.
* config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1):
New pre_reload define_insn_and_split.
(*avx512f_permvar_truncv8siv8hi_1): Ditto.
(*avx512f_vpermvar_truncv8div8si_1): Ditto.
(*avx512f_permvar_truncv32hiv32qi_1): Ditto.
(*avx512f_permvar_truncv16hiv16qi_1): Ditto.
(*avx512f_permvar_truncv4div4si_1): Ditto.
(*avx512f_pshufb_truncv8hiv8qi_1): Ditto.
(*avx512f_pshufb_truncv4siv4hi_1): Ditto.
(*avx512f_pshufd_truncv2div2si_1): Ditto.
gcc/testsuite/ChangeLog:
PR target/101846
* gcc.target/i386/pr101846-2.c: New test.
* gcc.target/i386/pr101846-3.c: New test.
* gcc.target/i386/pr101846-4.c: New test.
switch (mode)
{
+ case E_V16QImode:
+ if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+ gen = gen_avx512vl_vpermt2varv16qi3;
+ break;
+ case E_V32QImode:
+ if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+ gen = gen_avx512vl_vpermt2varv32qi3;
+ break;
+ case E_V64QImode:
+ if (TARGET_AVX512VBMI)
+ gen = gen_avx512bw_vpermt2varv64qi3;
+ break;
case E_V8HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermt2varv8hi3;
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermt2varv16hi3;
break;
- case E_V64QImode:
- if (TARGET_AVX512VBMI)
- gen = gen_avx512bw_vpermt2varv64qi3;
- break;
case E_V32HImode:
if (TARGET_AVX512BW)
gen = gen_avx512bw_vpermt2varv32hi3;
}
}
+/* Return true if MEM is a reference to a constant-pool entry whose
+   constant is a CONST_VECTOR of integer permutation indices; on
+   success copy the NELT indices into PERM.  PERM must have room for
+   GET_MODE_NUNITS (GET_MODE (mem)) entries -- callers are expected to
+   size it for the widest vector mode they pass (TODO confirm).  */
+bool
+ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
+{
+  machine_mode mode = GET_MODE (mem);
+  int nelt = GET_MODE_NUNITS (mode);
+
+  /* Only integral vector modes can describe permutation indices.  */
+  if (!INTEGRAL_MODE_P (mode))
+    return false;
+
+  /* Needs to be constant pool.  */
+  if (!(MEM_P (mem))
+      || !SYMBOL_REF_P (XEXP (mem, 0))
+      || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
+    return false;
+
+  rtx constant = get_pool_constant (XEXP (mem, 0));
+
+  if (GET_CODE (constant) != CONST_VECTOR)
+    return false;
+
+  /* There could be some rtx like
+     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+     but with "*.LC1" refer to V2DI constant vector.  Reinterpret the
+     pool constant in the mode of the MEM before reading elements.  */
+  if (GET_MODE (constant) != mode)
+    {
+      constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
+
+      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+	return false;
+    }
+
+  /* Extract each element as an unsigned index (truncated to int).  */
+  for (int i = 0; i != nelt; i++)
+    perm[i] = UINTVAL (XVECEXP (constant, 0, i));
+
+  return true;
+}
+
/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
but works for floating pointer parameters and nonoffsetable memories.
For pushes, it returns just stack offsets; the values will be saved
{
machine_mode mode = GET_MODE (d->op0);
machine_mode maskmode = mode;
+ unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
rtx (*gen) (rtx, rtx, rtx) = NULL;
rtx target, op0, mask;
rtx vec[64];
if (!TARGET_AVX512F)
return false;
+ /* Accept VNxHImode and VNxQImode now. */
+ if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
+ return false;
+
+ /* vpermw. */
+ if (!TARGET_AVX512BW && inner_size == 2)
+ return false;
+
+ /* vpermb. */
+ if (!TARGET_AVX512VBMI && inner_size == 1)
+ return false;
+
switch (mode)
{
case E_V16SImode:
gen = gen_avx512f_permvarv8df;
maskmode = V8DImode;
break;
+ case E_V32HImode:
+ gen = gen_avx512bw_permvarv32hi;
+ break;
+ case E_V16HImode:
+ gen = gen_avx512vl_permvarv16hi;
+ break;
+ case E_V8HImode:
+ gen = gen_avx512vl_permvarv8hi;
+ break;
+ case E_V64QImode:
+ gen = gen_avx512bw_permvarv64qi;
+ break;
+ case E_V32QImode:
+ gen = gen_avx512vl_permvarv32qi;
+ break;
+ case E_V16QImode:
+ gen = gen_avx512vl_permvarv16qi;
+ break;
+
default:
return false;
}
if (expand_vec_perm_palignr (d, true))
return true;
- /* Try the AVX512F vperm{s,d} instructions. */
+  /* Try the AVX512F vperm{w,b,s,d} instructions.  */
if (ix86_expand_vec_one_operand_perm_avx512 (d))
return true;
extern void ix86_expand_sse2_abs (rtx, rtx);
extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
rtx);
+extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
/* In i386-c.c */
extern void ix86_target_macros (void);
return true;
})
+;; Return true if OP is a constant pool for vperm{w,d,b} which contains
+;; an index equivalent to pmov{dw,wb,qd}.
+(define_predicate "permvar_truncate_operand"
+  (match_code "mem")
+{
+  int nelt = GET_MODE_NUNITS (mode);
+  int perm[128];
+  int id;
+
+  if (!INTEGRAL_MODE_P (mode) || !VECTOR_MODE_P (mode))
+    return false;
+
+  if (nelt < 2)
+    return false;
+
+  if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+    return false;
+
+  id = exact_log2 (nelt);
+
+  /* Check that the permutation selects the even elements, i.e. matches
+     the truncations pmov{dw,wb,qd}.  For example V16HImode to V8HImode
+     { 0 2 4 6 8 10 12 14 * * * * * * * * }.
+     Only the low log2 (nelt) bits of each index are significant to the
+     vperm instruction, so mask off the rest; the upper nelt/2 results
+     are discarded by the enclosing vec_select.  */
+  for (int i = 0; i != nelt / 2; i++)
+    if ((perm[i] & ((1 << id) - 1)) != i * 2)
+      return false;
+
+  return true;
+})
+
+;; Return true if OP is a constant pool for pshufb which contains an
+;; index equivalent to pmovdw.
+(define_predicate "pshufb_truncv4siv4hi_operand"
+  (match_code "mem")
+{
+  int perm[128];
+
+  if (mode != E_V16QImode)
+    return false;
+
+  if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+    return false;
+
+  /* Check that the permutation is suitable for pmovdw.
+     For example V4SImode to V4HImode
+     { 0 1 4 5 8 9 12 13 * * * * * * * * }.
+     index = i % 2 + (i / 2) * 4.  */
+  for (int i = 0; i != 8; i++)
+    {
+      /* A set high bit zeroes the result byte:
+	 if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0;
+	 which does not match a plain truncation.  */
+      if (perm[i] & 128)
+	return false;
+
+      /* Only the low 4 bits select a byte within the 128-bit lane.  */
+      if ((perm[i] & 15) != ((i & 1) + (i & 0xFE) * 2))
+	return false;
+    }
+
+  return true;
+})
+
+;; Return true if OP is a constant pool for pshufb which contains an
+;; index equivalent to pmovwb.
+(define_predicate "pshufb_truncv8hiv8qi_operand"
+  (match_code "mem")
+{
+  int perm[128];
+
+  if (mode != E_V16QImode)
+    return false;
+
+  if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+    return false;
+
+  /* Check that the permutation is suitable for pmovwb.
+     For example V16QImode to V8QImode
+     { 0 2 4 6 8 10 12 14 * * * * * * * * }.
+     index = i * 2.  */
+  for (int i = 0; i != 8; i++)
+    {
+      /* A set high bit zeroes the result byte:
+	 if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0;
+	 which does not match a plain truncation.  */
+      if (perm[i] & 128)
+	return false;
+
+      /* Only the low 4 bits select a byte within the 128-bit lane.  */
+      if ((perm[i] & 15) != i * 2)
+	return false;
+    }
+
+  return true;
+})
+
;; Return true if OP is a parallel for an pmovz{bw,wd,dq} vec_select,
;; where one of the two operands of the vec_concat is const0_operand.
(define_predicate "pmovzx_parallel"
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+;; A vpermw selecting the even elements of a V32HI, of which only the
+;; low half is used, is a V16SI -> V16HI truncation (vpmovdw).
+(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1"
+  [(set (match_operand:V16HI 0 "nonimmediate_operand")
+	(vec_select:V16HI
+	  (unspec:V32HI
+	    [(match_operand:V32HI 1 "register_operand")
+	     (match_operand:V32HI 2 "permvar_truncate_operand")]
+	    UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V16HI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")
+
+;; A vpermw selecting the even elements of a V16HI, of which only the
+;; low half is used, is a V8SI -> V8HI truncation (vpmovdw).
+(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"
+  [(set (match_operand:V8HI 0 "nonimmediate_operand")
+	(vec_select:V8HI
+	  (unspec:V16HI
+	    [(match_operand:V16HI 1 "register_operand")
+	     (match_operand:V16HI 2 "permvar_truncate_operand")]
+	    UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V8HI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")
+
+;; A vpermd selecting the even elements of a V16SI, of which only the
+;; low half is used, is a V8DI -> V8SI truncation (vpmovqd).
+(define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"
+  [(set (match_operand:V8SI 0 "nonimmediate_operand")
+	(vec_select:V8SI
+	  (unspec:V16SI
+	    [(match_operand:V16SI 1 "register_operand")
+	     (match_operand:V16SI 2 "permvar_truncate_operand")]
+	    UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V8SI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V8DImode, operands[1], V16SImode);")
+
(define_insn "avx512f_<code><pmov_src_lower><mode>2_mask"
[(set (match_operand:PMOV_DST_MODE_1 0 "nonimmediate_operand" "=v,m")
(vec_merge:PMOV_DST_MODE_1
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
+;; A vpermb selecting the even elements of a V64QI, of which only the
+;; low half is used, is a V32HI -> V32QI truncation (vpmovwb).
+(define_insn_and_split "*avx512f_permvar_truncv32hiv32qi_1"
+  [(set (match_operand:V32QI 0 "nonimmediate_operand")
+	(vec_select:V32QI
+	  (unspec:V64QI
+	    [(match_operand:V64QI 1 "register_operand")
+	     (match_operand:V64QI 2 "permvar_truncate_operand")]
+	    UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)
+		     (const_int 16) (const_int 17)
+		     (const_int 18) (const_int 19)
+		     (const_int 20) (const_int 21)
+		     (const_int 22) (const_int 23)
+		     (const_int 24) (const_int 25)
+		     (const_int 26) (const_int 27)
+		     (const_int 28) (const_int 29)
+		     (const_int 30) (const_int 31)])))]
+  "TARGET_AVX512VBMI && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V32QI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V32HImode, operands[1], V64QImode);")
+
(define_insn "avx512bw_<code>v32hiv32qi2_mask"
[(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
(vec_merge:V32QI
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+;; A vpermb selecting the even elements of a V32QI, of which only the
+;; low half is used, is a V16HI -> V16QI truncation (vpmovwb).
+(define_insn_and_split "*avx512f_permvar_truncv16hiv16qi_1"
+  [(set (match_operand:V16QI 0 "nonimmediate_operand")
+	(vec_select:V16QI
+	  (unspec:V32QI
+	    [(match_operand:V32QI 1 "register_operand")
+	     (match_operand:V32QI 2 "permvar_truncate_operand")]
+	    UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512VL && TARGET_AVX512VBMI
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V16QI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V16HImode, operands[1], V32QImode);")
+
+;; A vpermd selecting the even elements of a V8SI, of which only the
+;; low half is used, is a V4DI -> V4SI truncation (vpmovqd).
+(define_insn_and_split "*avx512f_permvar_truncv4div4si_1"
+  [(set (match_operand:V4SI 0 "nonimmediate_operand")
+	(vec_select:V4SI
+	  (unspec:V8SI
+	    [(match_operand:V8SI 1 "register_operand")
+	     (match_operand:V8SI 2 "permvar_truncate_operand")]
+	    UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)])))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V4SI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V4DImode, operands[1], V8SImode);")
+
(define_insn "<avx512>_<code><ssedoublemodelower><mode>2_mask"
[(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m")
(vec_merge:PMOV_DST_MODE_2
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
+;; A pshufb gathering the even bytes of a V16QI, of which only the low
+;; 64 bits are used, is a V8HI -> V8QI truncation (vpmovwb).
+(define_insn_and_split "*avx512f_pshufb_truncv8hiv8qi_1"
+  [(set (match_operand:DI 0 "register_operand")
+	(vec_select:DI
+	  (subreg:V2DI
+	    (unspec:V16QI
+	      [(match_operand:V16QI 1 "register_operand")
+	       (match_operand:V16QI 2 "pshufb_truncv8hiv8qi_operand")]
+	      UNSPEC_PSHUFB) 0)
+	  (parallel [(const_int 0)])))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op1 = gen_reg_rtx (V8QImode);
+  operands[1] = lowpart_subreg (V8HImode, operands[1], V16QImode);
+  emit_insn (gen_truncv8hiv8qi2 (op1, operands[1]));
+  emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V8QImode));
+  DONE;
+})
+
(define_insn "*avx512vl_<code>v2div2qi2_store_1"
[(set (match_operand:V2QI 0 "memory_operand" "=m")
(any_truncate:V2QI
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
+;; A pshufb gathering the low word of each dword of a V16QI, of which
+;; only the low 64 bits are used, is a V4SI -> V4HI truncation
+;; (vpmovdw).
+(define_insn_and_split "*avx512f_pshufb_truncv4siv4hi_1"
+  [(set (match_operand:DI 0 "register_operand")
+	(vec_select:DI
+	  (subreg:V2DI
+	    (unspec:V16QI
+	      [(match_operand:V16QI 1 "register_operand")
+	       (match_operand:V16QI 2 "pshufb_truncv4siv4hi_operand")]
+	      UNSPEC_PSHUFB) 0)
+	  (parallel [(const_int 0)])))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op1 = gen_reg_rtx (V4HImode);
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V16QImode);
+  emit_insn (gen_truncv4siv4hi2 (op1, operands[1]));
+  emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V4HImode));
+  DONE;
+})
+
(define_insn "*avx512vl_<code><mode>v4hi2_store_1"
[(set (match_operand:V4HI 0 "memory_operand" "=m")
(any_truncate:V4HI
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
+;; A pshufd selecting elements { 0 2 * * } of a V4SI, of which only the
+;; low 64 bits are used, is a V2DI -> V2SI truncation (vpmovqd).
+(define_insn_and_split "*avx512f_pshufd_truncv2div2si_1"
+  [(set (match_operand:DI 0 "register_operand")
+	(vec_select:DI
+	  (subreg:V2DI
+	    (vec_select:V4SI
+	      (match_operand:V4SI 1 "register_operand")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 2) (const_int 3)])) 0)
+	  (parallel [(const_int 0)])))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op1 = gen_reg_rtx (V2SImode);
+  operands[1] = lowpart_subreg (V2DImode, operands[1], V4SImode);
+  emit_insn (gen_truncv2div2si2 (op1, operands[1]));
+  emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V2SImode));
+  DONE;
+})
+
(define_insn "*avx512vl_<code>v2div2si2_store_1"
[(set (match_operand:V2SI 0 "memory_operand" "=m")
(any_truncate:V2SI
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpmovwb" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovqd" "3" } } */
+
+/* PR target/101846: an even-index shuffle that keeps only the low half
+   of the element count should be emitted as a truncation
+   vpmov{wb,dw,qd}, not as a full permutation.  */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v16hi
+foo_dw_512 (v32hi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30);
+}
+
+v8hi
+foo_dw_256 (v16hi x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14);
+}
+
+v4hi
+foo_dw_128 (v8hi x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6);
+}
+
+v8si
+foo_qd_512 (v16si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14);
+}
+
+v4si
+foo_qd_256 (v8si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6);
+}
+
+v2si
+foo_qd_128 (v4si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2);
+}
+
+v32qi
+foo_wb_512 (v64qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  32, 34, 36, 38, 40, 42, 44, 46,
+				  48, 50, 52, 54, 56, 58, 60, 62);
+}
+
+v16qi
+foo_wb_256 (v32qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30);
+}
+
+v8qi
+foo_wb_128 (v16qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpermb" "2" } } */
+/* { dg-final { scan-assembler-times "vpermw" "2" } } */
+/* { dg-final { scan-assembler-times "vpermd" "2" } } */
+
+/* PR target/101846: full-width one-operand shuffles whose index is not
+   a plain truncation should use the single-instruction vperm{b,w,d}
+   forms enabled by AVX512BW/AVX512VBMI.  */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v32hi
+foow_512 (v32hi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  16, 17, 18, 19, 20, 21, 22, 23,
+				  24, 25, 26, 27, 28, 29, 30, 31);
+}
+
+v16hi
+foow_256 (v16hi x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14,
+				  8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+
+v16si
+food_512 (v16si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14,
+				  8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+v8si
+food_256 (v8si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 4, 5, 6, 7);
+}
+
+v64qi
+foob_512 (v64qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  32, 34, 36, 38, 40, 42, 44, 46,
+				  48, 50, 52, 54, 56, 58, 60, 62,
+				  32, 33, 34, 35, 36, 37, 38, 39,
+				  40, 41, 42, 43, 44, 45, 46, 47,
+				  48, 49, 50, 51, 52, 53, 54, 55,
+				  56, 57, 58, 59, 60, 61, 62, 63);
+}
+
+v32qi
+foob_256 (v32qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  16, 17, 18, 19, 20, 21, 22, 23,
+				  24, 25, 26, 27, 28, 29, 30, 31);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpermi2b" "3" } } */
+
+/* PR target/101846: two-operand byte shuffles should use vpermi2b at
+   512, 256 and 128 bits under AVX512VBMI/AVX512VL.  */
+
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+
+
+v64qi
+foob_512 (v64qi x, v64qi y)
+{
+  return __builtin_shufflevector (x, y,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  32, 34, 36, 38, 40, 42, 44, 46,
+				  48, 50, 52, 54, 56, 58, 60, 62,
+				  64, 65, 66, 67, 68, 69, 70, 71,
+				  72, 73, 74, 77, 79, 74, 72, 70,
+				  89, 88, 78, 86, 85, 75, 83, 82,
+				  112, 108, 101, 100, 86, 96, 97, 95);
+}
+
+v32qi
+foob_256 (v32qi x, v32qi y)
+{
+  return __builtin_shufflevector (x, y,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  32, 34, 36, 38, 40, 42, 44, 46,
+				  48, 50, 52, 54, 56, 58, 60, 62);
+}
+
+v16qi
+foob_128 (v16qi x, v16qi y)
+{
+  return __builtin_shufflevector (x, y,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30);
+}