Emulated gather/scatter behave similarly to strided elementwise
accesses in that they need to decompose the offset vector
and construct or decompose the data vector, so handle them
the same way, pessimizing the cases with many elements.
For pr88531-2c.c instead of
.L4:
leaq (%r15,%rcx), %rdx
incl %edi
movl 16(%rdx), %r13d
movl 24(%rdx), %r14d
movl (%rdx), %r10d
movl 4(%rdx), %r9d
movl 8(%rdx), %ebx
movl 12(%rdx), %r11d
movl 20(%rdx), %r12d
vmovss (%rax,%r14,4), %xmm2
movl 28(%rdx), %edx
vmovss (%rax,%r13,4), %xmm1
vmovss (%rax,%r10,4), %xmm0
vinsertps $0x10, (%rax,%rdx,4), %xmm2, %xmm2
vinsertps $0x10, (%rax,%r12,4), %xmm1, %xmm1
vinsertps $0x10, (%rax,%r9,4), %xmm0, %xmm0
vmovlhps %xmm2, %xmm1, %xmm1
vmovss (%rax,%rbx,4), %xmm2
vinsertps $0x10, (%rax,%r11,4), %xmm2, %xmm2
vmovlhps %xmm2, %xmm0, %xmm0
vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
vmulps %ymm3, %ymm0, %ymm0
vmovups %ymm0, (%r8,%rcx)
addq $32, %rcx
cmpl %esi, %edi
jb .L4
we now prefer
.L4:
leaq 0(%rbp,%rdx,8), %rcx
movl (%rcx), %r10d
movl 4(%rcx), %ecx
vmovss (%rsi,%r10,4), %xmm0
vinsertps $0x10, (%rsi,%rcx,4), %xmm0, %xmm0
vmulps %xmm1, %xmm0, %xmm0
vmovlps %xmm0, (%rbx,%rdx,8)
incq %rdx
cmpl %edi, %edx
jb .L4
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Tame down element extracts and scalar loads for gather/scatter
similar to elementwise strided accesses.
* gcc.target/i386/pr89618-2.c: New testcase.
* gcc.target/i386/pr88531-2b.c: Adjust.
* gcc.target/i386/pr88531-2c.c: Likewise.
&& stmt_info
&& (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
|| STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
+ && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ != INTEGER_CST))
+ || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
{
stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
#include "pr88531-2a.c"
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
#include "pr88531-2a.c"
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+
+void foo (int n, int *off, double *a)
+{
+ const int m = 32;
+
+ for (int j = 0; j < n/m; ++j)
+ {
+ int const start = j*m;
+ int const end = (j+1)*m;
+
+#pragma GCC ivdep
+ for (int i = start; i < end; ++i)
+ {
+ a[off[i]] = a[i] < 0 ? a[i] : 0;
+ }
+ }
+}
+
+/* Make sure the cost model selects SSE vectors rather than AVX to avoid
+ too many scalar ops for the address computes in the loop. */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */