From 48cf8d0773fee2f7acf2b9778c7af7e344de07e6 Mon Sep 17 00:00:00 2001
From: liuhongt
Date: Mon, 4 Dec 2023 11:47:32 +0800
Subject: [PATCH] Support udot_prodv*qi with emulation sdot_prodv*hi

Like r14-5990-gb4a7c1c8c59d19, but this patch optimizes udot_prod.

Since (zero_extend) (unsigned char) -> int is equal to
(zero_extend) (unsigned char) -> short
+ (sign_extend) (short) -> int,
it should be safe to emulate udot_prodv*qi with

 vec_unpacku_lo_v32qi
 vec_unpacku_lo_v32qi
 vec_unpacku_hi_v32qi
 vec_unpacku_hi_v32qi
 sdot_prodv16hi
 sdot_prodv16hi
 addv8si3

gcc/ChangeLog:

	* config/i386/sse.md (udot_prodv64qi): New expander.
	(udot_prod<mode>): Emulate with VEC_UNPACKU_EXPR + DOT_PROD (short, int).

gcc/testsuite/ChangeLog:

	* gcc.target/i386/udotprodint8_emulate.c: New test.
---
 gcc/config/i386/sse.md                        | 82 ++++++++++++++++---
 .../gcc.target/i386/udotprodint8_emulate.c    | 15 ++++
 2 files changed, 85 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0df33419f73a..4c81f669c4b8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30838,20 +30838,78 @@
 
 (define_expand "udot_prod<mode>"
   [(match_operand:<ssedvecmode> 0 "register_operand")
-   (match_operand:VI1 1 "register_operand")
-   (match_operand:VI1 2 "register_operand")
+   (match_operand:VI1_AVX2 1 "register_operand")
+   (match_operand:VI1_AVX2 2 "register_operand")
    (match_operand:<ssedvecmode> 3 "register_operand")]
-  "TARGET_AVXVNNIINT8"
+  "TARGET_SSE2"
 {
-  operands[1] = lowpart_subreg (<ssedvecmode>mode,
-				force_reg (<MODE>mode, operands[1]),
-				<MODE>mode);
-  operands[2] = lowpart_subreg (<ssedvecmode>mode,
-				force_reg (<MODE>mode, operands[2]),
-				<MODE>mode);
-  emit_insn (gen_rtx_SET (operands[0], operands[3]));
-  emit_insn (gen_vpdpbuud_<ssedvecmodelower> (operands[0], operands[3],
-					      operands[1], operands[2]));
+  if (TARGET_AVXVNNIINT8)
+    {
+      operands[1] = lowpart_subreg (<ssedvecmode>mode,
+				    force_reg (<MODE>mode, operands[1]),
+				    <MODE>mode);
+      operands[2] = lowpart_subreg (<ssedvecmode>mode,
+				    force_reg (<MODE>mode, operands[2]),
+				    <MODE>mode);
+      emit_insn (gen_rtx_SET (operands[0], operands[3]));
+      emit_insn (gen_vpdpbuud_<ssedvecmodelower> (operands[0], operands[3],
+						  operands[1], operands[2]));
+    }
+  else
+    {
+      /* Emulate with vpdpwssd.  */
+      rtx op1_lo = gen_reg_rtx (<sseunpackmode>mode);
+      rtx op1_hi = gen_reg_rtx (<sseunpackmode>mode);
+      rtx op2_lo = gen_reg_rtx (<sseunpackmode>mode);
+      rtx op2_hi = gen_reg_rtx (<sseunpackmode>mode);
+
+      emit_insn (gen_vec_unpacku_lo_<mode> (op1_lo, operands[1]));
+      emit_insn (gen_vec_unpacku_lo_<mode> (op2_lo, operands[2]));
+      emit_insn (gen_vec_unpacku_hi_<mode> (op1_hi, operands[1]));
+      emit_insn (gen_vec_unpacku_hi_<mode> (op2_hi, operands[2]));
+
+      rtx res1 = gen_reg_rtx (<ssedvecmode>mode);
+      rtx res2 = gen_reg_rtx (<ssedvecmode>mode);
+      rtx sum = gen_reg_rtx (<ssedvecmode>mode);
+
+      emit_move_insn (sum, CONST0_RTX (<ssedvecmode>mode));
+      emit_insn (gen_sdot_prod<sseunpackmodelower> (res1, op1_lo,
+						    op2_lo, sum));
+      emit_insn (gen_sdot_prod<sseunpackmodelower> (res2, op1_hi,
+						    op2_hi, operands[3]));
+      emit_insn (gen_add<ssedvecmodelower>3 (operands[0], res1, res2));
+    }
+
+  DONE;
+})
+
+(define_expand "udot_prodv64qi"
+  [(match_operand:V16SI 0 "register_operand")
+   (match_operand:V64QI 1 "register_operand")
+   (match_operand:V64QI 2 "register_operand")
+   (match_operand:V16SI 3 "register_operand")]
+  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
+{
+  /* Emulate with vpdpwssd.  */
+  rtx op1_lo = gen_reg_rtx (V32HImode);
+  rtx op1_hi = gen_reg_rtx (V32HImode);
+  rtx op2_lo = gen_reg_rtx (V32HImode);
+  rtx op2_hi = gen_reg_rtx (V32HImode);
+
+  emit_insn (gen_vec_unpacku_lo_v64qi (op1_lo, operands[1]));
+  emit_insn (gen_vec_unpacku_lo_v64qi (op2_lo, operands[2]));
+  emit_insn (gen_vec_unpacku_hi_v64qi (op1_hi, operands[1]));
+  emit_insn (gen_vec_unpacku_hi_v64qi (op2_hi, operands[2]));
+
+  rtx res1 = gen_reg_rtx (V16SImode);
+  rtx res2 = gen_reg_rtx (V16SImode);
+  rtx sum = gen_reg_rtx (V16SImode);
+
+  emit_move_insn (sum, CONST0_RTX (V16SImode));
+  emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum));
+  emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3]));
+
+  emit_insn (gen_addv16si3 (operands[0], res1, res2));
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c b/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c
new file mode 100644
index 000000000000..1e8f2cfe5212
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mavxvnni -O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "DOT_PROD_EXPR" 1 "optimized" } } */
+/* { dg-final { scan-assembler-times "vpdpwssd" 2 } } */
+
+int
+foo (unsigned char* a, unsigned char* b)
+{
+  int sum = 0;
+  for (int i = 0; i != 16; i++)
+    {
+      sum += a[i] * b[i];
+    }
+  return sum;
+}
-- 
2.47.2