From: Hu, Lin1 Date: Mon, 16 Jan 2023 03:23:09 +0000 (+0800) Subject: Optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm X-Git-Tag: basepoints/gcc-15~10043 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c2dac2e5fbbcdda013aa7b0609d579abec8120ec;p=thirdparty%2Fgcc.git Optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk. We can optimze them to vblend, vmovaps when there's no cross-lane. gcc/ChangeLog: * config/i386/sse.md: Modify insn vperm{i,f} and vshuf{i,f}. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test. * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-1.c: New test. * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto. --- diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5dca8dd1e278..b0d9c025fbe6 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -18438,6 +18438,8 @@ mask = INTVAL (operands[3]) / 2; mask |= (INTVAL (operands[5]) - 4) / 2 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; return "vshuf64x2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -18596,6 +18598,9 @@ mask |= (INTVAL (operands[7]) - 8) / 4 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + return "vshuf32x4\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -25664,7 +25669,28 @@ (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPERMTI))] "TARGET_AVX2" - "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" + { + int mask = INTVAL (operands[3]); + if ((mask & 0xbb) == 16) + { + if (rtx_equal_p (operands[0], operands[1])) + return ""; + else + return "vmovaps\t{%1, %0|%0, %1}"; + } + if ((mask & 0xbb) == 50) + { + if (rtx_equal_p (operands[0], operands[2])) + return ""; + else + return "vmovaps\t{%2, %0|%0, %2}"; + } + if ((mask & 0xbb) == 18) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 48) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + } [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -26227,9 +26253,11 @@ && avx_vperm2f128_parallel (operands[3], mode)" { int mask = avx_vperm2f128_parallel (operands[3], mode) - 1; - if (mask == 0x12) - return "vinsert\t{$0, %x2, %1, %0|%0, %1, %x2, 0}"; - if (mask == 0x20) + if ((mask & 0xbb) == 0x12) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 0x30) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + if ((mask & 0xbb) == 0x20) return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; operands[3] = GEN_INT (mask); return "vperm2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c index 6c2fb2f184a4..02aecf4edcea 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f32x4 (x, x, 2); + x = _mm256_shuffle_f32x4 (x, x, 3); x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c index 1191b4001344..563ded5d9dfd 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f64x2 (x, x, 2); + x = _mm256_shuffle_f64x2 (x, x, 3); x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c index ef9a441e7a50..e89c4140d370 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_i32x4 (x, x, 2); + x = _mm256_shuffle_i32x4 (x, x, 3); x = _mm256_mask_shuffle_i32x4 (x, m, x, x, 2); x = _mm256_maskz_shuffle_i32x4 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c index 0bd117e85d49..8e8e47eda384 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_i64x2 (x, x, 2); + x = _mm256_shuffle_i64x2 (x, x, 3); x = _mm256_mask_shuffle_i64x2 (x, m, x, x, 2); x = _mm256_maskz_shuffle_i64x2 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c new file mode 100644 index 000000000000..1ee00b6b4a10 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=sapphirerapids" } */ +/* { dg-final { scan-assembler-times "vmovaps" 1 } } */ +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 1 } } */ +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 5 } } */ + +#include + +/* Vpermi128/Vpermf128 */ +__m256i +perm0 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 50); +} + +__m256i +perm1 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 18); +} + +__m256i +perm2 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 48); +} + +/* vshuf{i,f}{32x4,64x2} ymm .*/ +__m256i +shuff0 (__m256i a, __m256i b) +{ + return _mm256_shuffle_i32x4(a, b, 2); +} + +__m256 +shuff1 (__m256 a, __m256 b) +{ + return _mm256_shuffle_f32x4(a, b, 2); +} + +__m256i +shuff2 (__m256i a, __m256i b) +{ + return _mm256_shuffle_i64x2(a, b, 2); +} + +__m256d +shuff3 (__m256d a, __m256d b) +{ + return _mm256_shuffle_f64x2(a, b, 2); +} diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c new file mode 100644 index 000000000000..9775072b97ae --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c @@ -0,0 +1,68 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=sapphirerapids" } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vblendps" } } */ +/* { dg-final { scan-assembler-not "vperm2i128" } } */ +/* { dg-final { scan-assembler-not "vperm2f128" } } */ + +#include + +__m256i +perm0 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 16); +} + +__m256d +perm1 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 16); +} + +__m256 +perm2 (__m256 a, __m256 b) +{ + return _mm256_permute2f128_ps (a, b, 16); +} + +__m256i +perm3 (__m256i a, __m256i b) +{ + return _mm256_permute2f128_si256 (a, b, 16); +} + +__m256i +perm4 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 20); +} + +__m256d +perm5 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 20); +} + +__m256i +perm6 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 80); +} + +__m256d +perm7 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 80); +} + +__m256i +perm8 (__m256i a, __m256i b) +{ + return _mm256_permute2x128_si256 (a, b, 84); +} + +__m256d +perm9 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 84); +} diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c new file mode 100644 index 000000000000..a330b14caca0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=sapphirerapids" } */ +/* { dg-final { scan-assembler-times "vmov..." 3 } } */ +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 3 } } */ +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 3 } } */ +/* { dg-final { scan-assembler-not "vperm2f128" } } */ + +#include + +/* Vpermf128 */ +__m256 +perm0 (__m256 a, __m256 b) +{ + return _mm256_permute2f128_ps (a, b, 50); +} + +__m256 +perm1 (__m256 a, __m256 b) +{ + return _mm256_permute2f128_ps (a, b, 18); +} + +__m256 +perm2 (__m256 a, __m256 b) +{ + return _mm256_permute2f128_ps (a, b, 48); +} + +__m256i +perm3 (__m256i a, __m256i b) +{ + return _mm256_permute2f128_si256 (a, b, 50); +} + +__m256i +perm4 (__m256i a, __m256i b) +{ + return _mm256_permute2f128_si256 (a, b, 18); +} + +__m256i +perm5 (__m256i a, __m256i b) +{ + return _mm256_permute2f128_si256 (a, b, 48); +} + +__m256d +perm6 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 50); +} + +__m256d +perm7 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 18); +} + +__m256d +perm8 (__m256d a, __m256d b) +{ + return _mm256_permute2f128_pd (a, b, 48); +}