From: liuhongt
Date: Wed, 28 Jul 2021 08:24:52 +0000 (+0800)
Subject: Add a separate function to calculate cost for WIDEN_MULT_EXPR.
X-Git-Tag: basepoints/gcc-13~5727
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=231bcc77b953406b8381c7f55a3ec181da67d1e7;p=thirdparty%2Fgcc.git

Add a separate function to calculate cost for WIDEN_MULT_EXPR.

gcc/ChangeLog:

	PR target/39821
	* config/i386/i386.c (ix86_widen_mult_cost): New function.
	(ix86_add_stmt_cost): Use ix86_widen_mult_cost for WIDEN_MULT_EXPR.

gcc/testsuite/ChangeLog:

	PR target/39821
	* gcc.target/i386/sse2-pr39821.c: New test.
	* gcc.target/i386/sse4-pr39821.c: New test.
---

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 12ae37e71030..a0285e659ad9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19845,6 +19845,44 @@ ix86_vec_cost (machine_mode mode, int cost)
   return cost;
 }
 
+/* Return cost of vec_widen_<s>mult_hi/lo_<mode>,
+   vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2.  */
+static int
+ix86_widen_mult_cost (const struct processor_costs *cost,
+		      enum machine_mode mode, bool uns_p)
+{
+  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+  int extra_cost = 0;
+  int basic_cost = 0;
+  switch (mode)
+    {
+    case V8HImode:
+    case V16HImode:
+      if (!uns_p || mode == V16HImode)
+	extra_cost = cost->sse_op * 2;
+      basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+      break;
+    case V4SImode:
+    case V8SImode:
+      /* pmulhw/pmullw can be used.  */
+      basic_cost = cost->mulss * 2 + cost->sse_op * 2;
+      break;
+    case V2DImode:
+      /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
+	 require extra 4 mul, 4 add, 4 cmp and 2 shift.  */
+      if (!TARGET_SSE4_1 && !uns_p)
+	extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+		      + cost->sse_op * 2;
+      /* Fallthru.  */
+    case V4DImode:
+      basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+      break;
+    default:
+      gcc_unreachable();
+    }
+  return ix86_vec_cost (mode, basic_cost + extra_cost);
+}
+
 /* Return cost of multiplication in MODE.  */
 
 static int
@@ -22575,10 +22613,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
 	  break;
 
 	case MULT_EXPR:
-	case WIDEN_MULT_EXPR:
+	  /* For MULT_HIGHPART_EXPR, x86 only supports pmulhw,
+	     take it as MULT_EXPR.  */
 	case MULT_HIGHPART_EXPR:
 	  stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
 	  break;
+	  /* There's no direct instruction for WIDEN_MULT_EXPR,
+	     take emulation into account.  */
+	case WIDEN_MULT_EXPR:
+	  stmt_cost = ix86_widen_mult_cost (ix86_cost, mode,
+					    TYPE_UNSIGNED (vectype));
+	  break;
+
 	case NEGATE_EXPR:
 	  if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
 	    stmt_cost = ix86_cost->sse_op;
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
new file mode 100644
index 000000000000..bcd4b772c985
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */
+#include <stdint.h>
+void
+vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
+{
+  while (order--)
+    *v3++ = (int16_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order)
+{
+  while (order--)
+    *v3++ = (uint16_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order)
+{
+  while (order--)
+    *v3++ = (int32_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order)
+{
+  while (order--)
+    *v3++ = (uint32_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order)
+{
+  while (order--)
+    *v3++ = (int64_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order)
+{
+  while (order--)
+    *v3++ = (uint64_t) *v1++ * *v2++;
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
new file mode 100644
index 000000000000..4456c31e43ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */
+#include "sse2-pr39821.c"