For comparison NEQ/LT/NLE, it's simplified to 0.
For comparison LE/EQ/NLT, it's simplified to (1u << nelt) - 1
gcc/ChangeLog:
PR target/122320
* config/i386/sse.md (*<avx512>_cmp<mode>3_dup_op): New define_insn_and_split.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr122320-mask16.c: New test.
* gcc.target/i386/pr122320-mask2.c: New test.
* gcc.target/i386/pr122320-mask32.c: New test.
* gcc.target/i386/pr122320-mask4.c: New test.
* gcc.target/i386/pr122320-mask64.c: New test.
* gcc.target/i386/pr122320-mask8.c: New test.
UNSPEC_PCMP_ITER))]
"operands[4] = GEN_INT (INTVAL (operands[3]) ^ 4);")
+;; Fold a vector comparison of a register with itself into a constant
+;; mask (PR target/122320).  EQ/LE/NLT are always satisfied when both
+;; operands are identical, TRUE is satisfied unconditionally; every
+;; other predicate (NE/LT/NLE/FALSE) yields the zero mask.
+(define_insn_and_split "*<avx512>_cmp<mode>3_dup_op"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI1248_AVX512VLBW 1 "general_operand")
+ (match_operand:VI1248_AVX512VLBW 2 "general_operand")
+ (match_operand:SI 3 "<cmp_imm_predicate>")]
+ UNSPEC_PCMP_ITER))]
+ "TARGET_AVX512F && ix86_pre_reload_split ()
+ && rtx_equal_p (operands[1], operands[2])"
+ "#"
+ "&& 1"
+ [(set (match_dup 0) (match_dup 4))]
+{
+  int cmp_imm = INTVAL (operands[3]);
+  rtx res = CONST0_RTX (<avx512fmaskmode>mode);
+  /* EQ (0), LE (2) and NLT (5) always hold for identical operands;
+     TRUE (7) holds for any operands.  All other predicates fold to
+     the zero mask set up above.  */
+  if (cmp_imm == 0 || cmp_imm == 2 || cmp_imm == 5 || cmp_imm == 7)
+    {
+      int nelts = GET_MODE_NUNITS (<MODE>mode);
+      /* For nelts >= 8 the mask mode has exactly nelts bits, so the
+	 all-ones constant is the full mask; this also avoids the
+	 undefined 1u << nelts for nelts == 32.  */
+      if (nelts >= 8)
+	res = CONSTM1_RTX (<avx512fmaskmode>mode);
+      else
+	res = gen_int_mode ((1u << nelts) - 1, QImode);
+    }
+  operands[4] = res;
+})
+
(define_insn "*<avx512>_eq<mode>3<mask_scalar_merge_name>_1"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k,k")
(unspec:<avx512fmaskmode>
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vpcmp" } } */
+
+#include <immintrin.h>
+
+/* PR target/122320: a compare of a register with itself folds to a
+   constant mask, so no vpcmp instruction should be emitted.  */
+
+__mmask16 dumpy_eq (__m512i vx)
+{
+  return _mm512_cmp_epi32_mask (vx, vx, 0);
+}
+
+__mmask16 dumpy_lt (__m512i vx)
+{
+  return _mm512_cmp_epi32_mask (vx, vx, 1);
+}
+
+__mmask16 dumpy_le (__m512i vx)
+{
+  return _mm512_cmp_epi32_mask (vx, vx, 2);
+}
+
+__mmask16 dumpy_ne (__m512i vx)
+{
+  return _mm512_cmp_epi32_mask (vx, vx, 4);
+}
+
+__mmask16 dumpy_nlt (__m512i vx)
+{
+  return _mm512_cmp_epi32_mask (vx, vx, 5);
+}
+
+__mmask16 dumpy_nle (__m512i vx)
+{
+  return _mm512_cmp_epi32_mask (vx, vx, 6);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vpcmp" } } */
+
+#include <immintrin.h>
+
+/* PR target/122320: a compare of a register with itself folds to a
+   constant mask, so no vpcmp instruction should be emitted.  */
+
+__mmask8 dumpy_eq (__m128i vx)
+{
+  return _mm_cmp_epi64_mask (vx, vx, 0);
+}
+
+__mmask8 dumpy_lt (__m128i vx)
+{
+  return _mm_cmp_epi64_mask (vx, vx, 1);
+}
+
+__mmask8 dumpy_le (__m128i vx)
+{
+  return _mm_cmp_epi64_mask (vx, vx, 2);
+}
+
+__mmask8 dumpy_ne (__m128i vx)
+{
+  return _mm_cmp_epi64_mask (vx, vx, 4);
+}
+
+__mmask8 dumpy_nlt (__m128i vx)
+{
+  return _mm_cmp_epi64_mask (vx, vx, 5);
+}
+
+__mmask8 dumpy_nle (__m128i vx)
+{
+  return _mm_cmp_epi64_mask (vx, vx, 6);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vpcmp" } } */
+
+#include <immintrin.h>
+
+/* PR target/122320: a compare of a register with itself folds to a
+   constant mask, so no vpcmp instruction should be emitted.  */
+
+__mmask32 dumpy_eq (__m512i vx)
+{
+  return _mm512_cmp_epi16_mask (vx, vx, 0);
+}
+
+__mmask32 dumpy_lt (__m512i vx)
+{
+  return _mm512_cmp_epi16_mask (vx, vx, 1);
+}
+
+__mmask32 dumpy_le (__m512i vx)
+{
+  return _mm512_cmp_epi16_mask (vx, vx, 2);
+}
+
+__mmask32 dumpy_ne (__m512i vx)
+{
+  return _mm512_cmp_epi16_mask (vx, vx, 4);
+}
+
+__mmask32 dumpy_nlt (__m512i vx)
+{
+  return _mm512_cmp_epi16_mask (vx, vx, 5);
+}
+
+__mmask32 dumpy_nle (__m512i vx)
+{
+  return _mm512_cmp_epi16_mask (vx, vx, 6);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vpcmp" } } */
+
+#include <immintrin.h>
+
+/* PR target/122320: a compare of a register with itself folds to a
+   constant mask, so no vpcmp instruction should be emitted.  */
+
+__mmask8 dumpy_eq (__m256i vx)
+{
+  return _mm256_cmp_epi64_mask (vx, vx, 0);
+}
+
+__mmask8 dumpy_lt (__m256i vx)
+{
+  return _mm256_cmp_epi64_mask (vx, vx, 1);
+}
+
+__mmask8 dumpy_le (__m256i vx)
+{
+  return _mm256_cmp_epi64_mask (vx, vx, 2);
+}
+
+__mmask8 dumpy_ne (__m256i vx)
+{
+  return _mm256_cmp_epi64_mask (vx, vx, 4);
+}
+
+__mmask8 dumpy_nlt (__m256i vx)
+{
+  return _mm256_cmp_epi64_mask (vx, vx, 5);
+}
+
+__mmask8 dumpy_nle (__m256i vx)
+{
+  return _mm256_cmp_epi64_mask (vx, vx, 6);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vpcmp" } } */
+
+#include <immintrin.h>
+
+/* PR target/122320: a compare of a register with itself folds to a
+   constant mask, so no vpcmp instruction should be emitted.  */
+
+__mmask64 dumpy_eq (__m512i vx)
+{
+  return _mm512_cmp_epi8_mask (vx, vx, 0);
+}
+
+__mmask64 dumpy_lt (__m512i vx)
+{
+  return _mm512_cmp_epi8_mask (vx, vx, 1);
+}
+
+__mmask64 dumpy_le (__m512i vx)
+{
+  return _mm512_cmp_epi8_mask (vx, vx, 2);
+}
+
+__mmask64 dumpy_ne (__m512i vx)
+{
+  return _mm512_cmp_epi8_mask (vx, vx, 4);
+}
+
+__mmask64 dumpy_nlt (__m512i vx)
+{
+  return _mm512_cmp_epi8_mask (vx, vx, 5);
+}
+
+__mmask64 dumpy_nle (__m512i vx)
+{
+  return _mm512_cmp_epi8_mask (vx, vx, 6);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vpcmp" } } */
+
+#include <immintrin.h>
+
+/* PR target/122320: a compare of a register with itself folds to a
+   constant mask, so no vpcmp instruction should be emitted.  */
+
+__mmask8 dumpy_eq (__m512i vx)
+{
+  return _mm512_cmp_epi64_mask (vx, vx, 0);
+}
+
+__mmask8 dumpy_lt (__m512i vx)
+{
+  return _mm512_cmp_epi64_mask (vx, vx, 1);
+}
+
+__mmask8 dumpy_le (__m512i vx)
+{
+  return _mm512_cmp_epi64_mask (vx, vx, 2);
+}
+
+__mmask8 dumpy_ne (__m512i vx)
+{
+  return _mm512_cmp_epi64_mask (vx, vx, 4);
+}
+
+__mmask8 dumpy_nlt (__m512i vx)
+{
+  return _mm512_cmp_epi64_mask (vx, vx, 5);
+}
+
+__mmask8 dumpy_nle (__m512i vx)
+{
+  return _mm512_cmp_epi64_mask (vx, vx, 6);
+}