From: konglin1
Date: Tue, 19 Oct 2021 01:35:30 +0000 (+0800)
Subject: Combine the FADD(A, FMA(B, C, 0)) to FMA(B, C, A) and combine FADD(A, FMUL(B, C)...
X-Git-Tag: basepoints/gcc-13~3664
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7c20a9b738a2257bed4e2b0593275336d1e2047a;p=thirdparty%2Fgcc.git

Combine the FADD(A, FMA(B, C, 0)) to FMA(B, C, A) and combine FADD(A, FMUL(B, C)) to FMA(B, C, A).

This patch supports, under fast-math, transforming something like
_mm512_add_ph(x1, _mm512_fmadd_pch(a, b, _mm512_setzero_ph()))
to _mm512_fmadd_pch(a, b, x1).

It also supports transforming _mm512_add_ph(x1, _mm512_fmul_pch(a, b))
to _mm512_fmadd_pch(a, b, x1).

gcc/ChangeLog:

	* config/i386/sse.md (fma_<mode>_fadd_fmul): Add new
	define_insn_and_split.
	(fma_<mode>_fadd_fcmul): Likewise.
	(fma_<complexopname>_<mode>_fma_zero): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-complex-fma.c: New test.
---
Reviewer note (not part of the change): this copy was recovered from a web
rendering that had collapsed all line breaks and dropped every
angle-bracketed token.  The <mode>, <MODE>, <complexopname>, <avx512>,
<round_name> and <immintrin.h> tokens below were restored by inference from
the doubled underscores / empty strings left behind, and leading whitespace
was re-created following GCC conventions.  The author's e-mail address on
the From: line was also dropped and has NOT been reconstructed.
TODO: confirm all of the above against the upstream commit before applying.

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f37c5c0e7062..431236ab3a43 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5958,6 +5958,58 @@
   [(set_attr "type" "ssemuladd")
    (set_attr "mode" "<MODE>")])
 
+(define_insn_and_split "fma_<mode>_fadd_fmul"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+	(plus:VF_AVX512FP16VL
+	  (unspec:VF_AVX512FP16VL
+	    [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+	     (match_operand:VF_AVX512FP16VL 2 "vector_operand")]
+	     UNSPEC_COMPLEX_FMUL)
+	  (match_operand:VF_AVX512FP16VL 3 "vector_operand")))]
+  "TARGET_AVX512FP16 && flag_unsafe_math_optimizations
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:VF_AVX512FP16VL
+	  [(match_dup 1) (match_dup 2) (match_dup 3)]
+	   UNSPEC_COMPLEX_FMA))])
+
+(define_insn_and_split "fma_<mode>_fadd_fcmul"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+	(plus:VF_AVX512FP16VL
+	  (unspec:VF_AVX512FP16VL
+	    [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+	     (match_operand:VF_AVX512FP16VL 2 "vector_operand")]
+	     UNSPEC_COMPLEX_FCMUL)
+	  (match_operand:VF_AVX512FP16VL 3 "vector_operand")))]
+  "TARGET_AVX512FP16 && flag_unsafe_math_optimizations
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:VF_AVX512FP16VL
+	  [(match_dup 1) (match_dup 2) (match_dup 3)]
+	   UNSPEC_COMPLEX_FCMA))])
+
+(define_insn_and_split "fma_<complexopname>_<mode>_fma_zero"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+	(plus:VF_AVX512FP16VL
+	  (unspec:VF_AVX512FP16VL
+	    [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+	     (match_operand:VF_AVX512FP16VL 2 "vector_operand")
+	     (match_operand:VF_AVX512FP16VL 3 "const0_operand")]
+	     UNSPEC_COMPLEX_F_C_MA)
+	  (match_operand:VF_AVX512FP16VL 4 "vector_operand")))]
+  "TARGET_AVX512FP16 && flag_unsafe_math_optimizations
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:VF_AVX512FP16VL
+	  [(match_dup 1) (match_dup 2) (match_dup 4)]
+	   UNSPEC_COMPLEX_F_C_MA))])
+
 (define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
 	(vec_merge:VF_AVX512FP16VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-complex-fma.c b/gcc/testsuite/gcc.target/i386/avx512fp16-complex-fma.c
new file mode 100644
index 000000000000..2dfd369e7852
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-complex-fma.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -O2 -Ofast" } */
+/* { dg-final { scan-assembler-times "vfmaddcph\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-not "vaddph\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)"} } */
+/* { dg-final { scan-assembler-not "vfmulcph\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)"} } */
+/* { dg-final { scan-assembler-times "vfcmaddcph\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+
+#include <immintrin.h>
+volatile __m512h x1, x2, res, a, b;
+void extern
+avx512f_test (void)
+{
+  res = _mm512_add_ph (x1, _mm512_fmadd_pch (a, b, _mm512_setzero_ph()));
+  res = _mm512_add_ph (x1, _mm512_fcmadd_pch (a, b, _mm512_setzero_ph()));
+
+  res = _mm512_add_ph (x1, _mm512_fmul_pch (a, b));
+  res = _mm512_add_ph (x1, _mm512_fcmul_pch (a, b));
+}