From: liuhongt Date: Mon, 1 Dec 2025 08:51:26 +0000 (-0800) Subject: Transform std::max(t, float(0)) into vmaxps w/o fast-math. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=106e001b4bb5313a8d59b858e80d5633f90fcec0;p=thirdparty%2Fgcc.git Transform std::max(t, float(0)) into vmaxps w/o fast-math. The pattern is simplified to the form below since there's a const0_operand (set (reg:V8SF 124) (and:V8SF (not:V8SF (lt:V8SF (reg:V8SF 123 [ MEM [(const float *)input_12(D) + ivtmp.30_4 * 1] ]) (const_vector:V8SF [ (const_double:SF 0.0 [0x0.0p+0]) repeated x8 ]))) (reg:V8SF 123 [ MEM [(const float *)input_12(D) + ivtmp.30_4 * 1] ])) Add new combine pattern to match it. Also extend related avx512 pattern to accept immediate_operand, so that the optimization is also available under AVX512. The codegen is now better than before; however, vpxor is not hoisted outside of the loop after it's created in split1. gcc/ChangeLog: PR target/71921 * config/i386/predicates.md (ieee_maxmin_comparison_operator): New predicate. * config/i386/sse.md (*minmax<mode>3_3): New define_insn_and_split. (*minmax<mode>3_4): Ditto. (*minmax<mode>3_1): Extend operands[2]/operands[4] to handle immediate_operand. gcc/testsuite/ChangeLog: * g++.target/i386/avx512-pr71921.C: New test. * g++.target/i386/pr71921.C: New test. --- diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index c468f5ad26e..2863b3ec333 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1584,6 +1584,9 @@ (define_predicate "add_comparison_operator" (match_code "geu,ltu")) +(define_predicate "ieee_maxmin_comparison_operator" + (match_code "lt,gt")) + ;; Return true if OP is a valid comparison operator in valid mode. 
(define_predicate "ix86_comparison_operator" (match_operand 0 "comparison_operator") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 9a8d1767ec9..0be898c789e 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3338,10 +3338,10 @@ [(set (match_operand:VFH 0 "register_operand") (vec_merge:VFH (match_operand:VFH 1 "nonimmediate_operand") - (match_operand:VFH 2 "nonimmediate_operand") + (match_operand:VFH 2 "general_operand") (unspec:<avx512fmaskmode> [(match_operand:VFH 3 "nonimmediate_operand") - (match_operand:VFH 4 "nonimmediate_operand") + (match_operand:VFH 4 "general_operand") (match_operand:SI 5 "const_0_to_31_operand")] UNSPEC_PCMP)))] "TARGET_SSE && ix86_pre_reload_split () @@ -3352,19 +3352,21 @@ && (INTVAL (operands[5]) == 1 || INTVAL (operands[5]) == 14)" "#" "&& 1" - [(const_int 0)] + [(set (match_dup 0) (match_dup 6))] { int u = UNSPEC_IEEE_MIN; + rtx tmp = operands[2]; if ((INTVAL (operands[5]) == 1 && rtx_equal_p (operands[1], operands[4])) || (INTVAL (operands[5]) == 14 && rtx_equal_p (operands[1], operands[3]))) u = UNSPEC_IEEE_MAX; if (MEM_P (operands[1])) operands[1] = force_reg (<MODE>mode, operands[1]); - rtvec v = gen_rtvec (2, operands[1], operands[2]); - rtx tmp = gen_rtx_UNSPEC (<MODE>mode, v, u); - emit_move_insn (operands[0], tmp); - DONE; + + if (immediate_operand (operands[2], <MODE>mode)) + tmp = force_reg (<MODE>mode, operands[2]); + rtvec v = gen_rtvec (2, operands[1], tmp); + operands[6] = gen_rtx_UNSPEC (<MODE>mode, v, u); }) (define_insn_and_split "*minmax<mode>3_2" @@ -3383,7 +3385,7 @@ && rtx_equal_p (operands[2], operands[3])))" "#" "&& 1" - [(const_int 0)] + [(set (match_dup 0) (match_dup 5))] { int u = UNSPEC_IEEE_MIN; if (rtx_equal_p (operands[1], operands[3])) @@ -3392,9 +3394,53 @@ if (MEM_P (operands[2])) operands[2] = force_reg (<MODE>mode, operands[2]); rtvec v = gen_rtvec (2, operands[2], operands[1]); - rtx tmp = gen_rtx_UNSPEC (<MODE>mode, v, u); - emit_move_insn (operands[0], tmp); - DONE; + operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u); + }) + + 
+(define_insn_and_split "*minmax<mode>3_3" + [(set (match_operand:VF_128_256 0 "register_operand") + (and:VF_128_256 + (not:VF_128_256 + (match_operator:VF_128_256 1 "ieee_maxmin_comparison_operator" + [(match_operand:VF_128_256 2 "nonimmediate_operand") + (match_operand:VF_128_256 3 "const0_operand")])) + (match_operand:VF_128_256 4 "nonimmediate_operand")))] + "TARGET_SSE && ix86_pre_reload_split () + && rtx_equal_p (operands[2], operands[4])" + "#" + "&& 1" + [(set (match_dup 0) (match_dup 5))] + { + int u = UNSPEC_IEEE_MIN; + if (GET_CODE (operands[1]) == LT) + u = UNSPEC_IEEE_MAX; + + rtx tmp = force_reg (<MODE>mode, operands[3]); + rtvec v = gen_rtvec (2, tmp, operands[2]); + operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u); + }) + +(define_insn_and_split "*minmax<mode>3_4" + [(set (match_operand:VF_128_256 0 "register_operand") + (and:VF_128_256 + (match_operator:VF_128_256 1 "ieee_maxmin_comparison_operator" + [(match_operand:VF_128_256 2 "nonimmediate_operand") + (match_operand:VF_128_256 3 "const0_operand")]) + (match_operand:VF_128_256 4 "nonimmediate_operand")))] + "TARGET_SSE && ix86_pre_reload_split () + && rtx_equal_p (operands[2], operands[4])" + "#" + "&& 1" + [(set (match_dup 0) (match_dup 5))] + { + int u = UNSPEC_IEEE_MIN; + if (GET_CODE (operands[1]) == GT) + u = UNSPEC_IEEE_MAX; + + rtx tmp = force_reg (<MODE>mode, operands[3]); + rtvec v = gen_rtvec (2, operands[2], tmp); + operands[5] = gen_rtx_UNSPEC (<MODE>mode, v, u); }) ;; These versions of the min/max patterns implement exactly the operations diff --git a/gcc/testsuite/g++.target/i386/avx512-pr71921.C b/gcc/testsuite/g++.target/i386/avx512-pr71921.C new file mode 100644 index 00000000000..a3e458fa741 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/avx512-pr71921.C @@ -0,0 +1,60 @@ +// PR target/116925 +// { dg-do compile } +// { dg-options "-O2 -march=x86-64-v4" } +// { dg-final { scan-assembler-not "vcmpltps" } } +// { dg-final { scan-assembler-times "vminps" 2 } } +// { dg-final { scan-assembler-times "vmaxps" 2 } } + 
+#include <algorithm> + +void relu(float * __restrict__ output, const float * __restrict__ input, int size) +{ + int i; + int s2; + + s2 = size / 4; + for (i = 0; i < 10000; i++) { + float t; + t = input[i]; + output[i] = std::max(t, float(0)); + } +} + +void relu1(float * __restrict__ output, const float * __restrict__ input, int size) +{ + int i; + int s2; + + s2 = size / 4; + for (i = 0; i < 10000; i++) { + float t; + t = input[i]; + output[i] = std::max(float(0), t); + } +} + +void relu2(float * __restrict__ output, const float * __restrict__ input, int size) +{ + int i; + int s2; + + s2 = size / 4; + for (i = 0; i < 10000; i++) { + float t; + t = input[i]; + output[i] = std::min(t, float(0)); + } +} + +void relu3(float * __restrict__ output, const float * __restrict__ input, int size) +{ + int i; + int s2; + + s2 = size / 4; + for (i = 0; i < 10000; i++) { + float t; + t = input[i]; + output[i] = std::min(float(0), t); + } +} diff --git a/gcc/testsuite/g++.target/i386/pr71921.C b/gcc/testsuite/g++.target/i386/pr71921.C new file mode 100644 index 00000000000..baf23526402 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr71921.C @@ -0,0 +1,60 @@ +// PR target/116925 +// { dg-do compile } +// { dg-options "-O2 -march=x86-64-v3" } +// { dg-final { scan-assembler-not "vcmpltps" } } +// { dg-final { scan-assembler-times "vminps" 2 } } +// { dg-final { scan-assembler-times "vmaxps" 2 } } + +#include <algorithm> + +void relu(float * __restrict__ output, const float * __restrict__ input, int size) +{ + int i; + int s2; + + s2 = size / 4; + for (i = 0; i < 10000; i++) { + float t; + t = input[i]; + output[i] = std::max(t, float(0)); + } +} + +void relu1(float * __restrict__ output, const float * __restrict__ input, int size) +{ + int i; + int s2; + + s2 = size / 4; + for (i = 0; i < 10000; i++) { + float t; + t = input[i]; + output[i] = std::max(float(0), t); + } +} + +void relu2(float * __restrict__ output, const float * __restrict__ input, int size) +{ + int i; + int s2; + + s2 = size / 4; 
+ for (i = 0; i < 10000; i++) { + float t; + t = input[i]; + output[i] = std::min(t, float(0)); + } +} + +void relu3(float * __restrict__ output, const float * __restrict__ input, int size) +{ + int i; + int s2; + + s2 = size / 4; + for (i = 0; i < 10000; i++) { + float t; + t = input[i]; + output[i] = std::min(float(0), t); + } +}