git.ipfire.org Git - thirdparty/gcc.git/commit
x86: Improve vector_loop/unrolled_loop for memset/memcpy
author H.J. Lu <hjl.tools@gmail.com>
Tue, 17 Jun 2025 02:17:17 +0000 (10:17 +0800)
committer H.J. Lu <hjl.tools@gmail.com>
Mon, 7 Jul 2025 07:39:31 +0000 (15:39 +0800)
commit 401199377c50045ede560daf3f6e8b51749c2a87
tree a50c14081e7b7a8c59dc89f694c2ee49942af85f
parent 66455591fac1e80b5acc615598cbf556d565e080
x86: Improve vector_loop/unrolled_loop for memset/memcpy

1. Don't generate the loop if the loop count is 1.
2. For memset with vector on small size, use vector if small size supports
vector, otherwise use the scalar value.
3. Always expand vector-version of memset for vector_loop.
4. Always duplicate the promoted scalar value for vector_loop if it is
neither 0 nor -1.
5. Use misaligned prologue if alignment isn't needed.  When misaligned
prologue is used, check if destination is actually aligned and update
destination alignment if aligned.
6. Use move_by_pieces and store_by_pieces for memcpy and memset epilogues
with the fixed epilogue size to enable overlapping moves and stores.

The included tests show that codegen of vector_loop/unrolled_loop for
memset/memcpy is significantly improved.  For

void
foo (void *p1, size_t len)
{
  __builtin_memset (p1, 0, len);
}

with

-O2 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -march=x86-64

we used to generate

foo:
.LFB0:
.cfi_startproc
movq %rdi, %rax
pxor %xmm0, %xmm0
cmpq $64, %rsi
jnb .L18
.L2:
andl $63, %esi
je .L1
xorl %edx, %edx
testb $1, %sil
je .L5
movl $1, %edx
movb $0, (%rax)
cmpq %rsi, %rdx
jnb .L19
.L5:
movb $0, (%rax,%rdx)
movb $0, 1(%rax,%rdx)
addq $2, %rdx
cmpq %rsi, %rdx
jb .L5
.L1:
ret
.p2align 4,,10
.p2align 3
.L18:
movq %rsi, %rdx
xorl %eax, %eax
andq $-64, %rdx
.L3:
movups %xmm0, (%rdi,%rax)
movups %xmm0, 16(%rdi,%rax)
movups %xmm0, 32(%rdi,%rax)
movups %xmm0, 48(%rdi,%rax)
addq $64, %rax
cmpq %rdx, %rax
jb .L3
addq %rdi, %rax
jmp .L2
.L19:
ret
.cfi_endproc

with a very poor prologue/epilogue.  With this patch, we now generate:

foo:
.LFB0:
.cfi_startproc
pxor %xmm0, %xmm0
cmpq $64, %rsi
jnb .L2
testb $32, %sil
jne .L19
testb $16, %sil
jne .L20
testb $8, %sil
jne .L21
testb $4, %sil
jne .L22
testq %rsi, %rsi
jne .L23
.L1:
ret
.p2align 4,,10
.p2align 3
.L2:
movups %xmm0, -64(%rdi,%rsi)
movups %xmm0, -48(%rdi,%rsi)
movups %xmm0, -32(%rdi,%rsi)
movups %xmm0, -16(%rdi,%rsi)
subq $1, %rsi
cmpq $64, %rsi
jb .L1
andq $-64, %rsi
xorl %eax, %eax
.L9:
movups %xmm0, (%rdi,%rax)
movups %xmm0, 16(%rdi,%rax)
movups %xmm0, 32(%rdi,%rax)
movups %xmm0, 48(%rdi,%rax)
addq $64, %rax
cmpq %rsi, %rax
jb .L9
ret
.p2align 4,,10
.p2align 3
.L23:
movb $0, (%rdi)
testb $2, %sil
je .L1
xorl %eax, %eax
movw %ax, -2(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L19:
movups %xmm0, (%rdi)
movups %xmm0, 16(%rdi)
movups %xmm0, -32(%rdi,%rsi)
movups %xmm0, -16(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L20:
movups %xmm0, (%rdi)
movups %xmm0, -16(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L21:
movq $0, (%rdi)
movq $0, -8(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L22:
movl $0, (%rdi)
movl $0, -4(%rdi,%rsi)
ret
.cfi_endproc

gcc/

PR target/120670
PR target/120683
* config/i386/i386-expand.cc (expand_set_or_cpymem_via_loop):
Don't generate the loop if the loop count is 1.
(expand_cpymem_epilogue): Use move_by_pieces.
(setmem_epilogue_gen_val): New.
(expand_setmem_epilogue): Use store_by_pieces.
(expand_small_cpymem_or_setmem): Choose cpymem mode from MOVE_MAX.
For memset with vector and the size is smaller than the vector
size, first try the narrower vector, otherwise, use the scalar
value.
(promote_duplicated_reg): Duplicate the scalar value for vector.
(ix86_expand_set_or_cpymem): Always expand vector-version of
memset for vector_loop.  Use misaligned prologue if alignment
isn't needed and destination isn't aligned.  Always initialize
vec_promoted_val from the promoted scalar value for vector_loop.

gcc/testsuite/

PR target/120670
PR target/120683
* gcc.target/i386/auto-init-padding-9.c: Updated.
* gcc.target/i386/memcpy-strategy-12.c: Likewise.
* gcc.target/i386/memset-strategy-25.c: Likewise.
* gcc.target/i386/memset-strategy-29.c: Likewise.
* gcc.target/i386/memset-strategy-30.c: Likewise.
* gcc.target/i386/memset-strategy-31.c: Likewise.
* gcc.target/i386/memcpy-pr120683-1.c: New test.
* gcc.target/i386/memcpy-pr120683-2.c: Likewise.
* gcc.target/i386/memcpy-pr120683-3.c: Likewise.
* gcc.target/i386/memcpy-pr120683-4.c: Likewise.
* gcc.target/i386/memcpy-pr120683-5.c: Likewise.
* gcc.target/i386/memcpy-pr120683-6.c: Likewise.
* gcc.target/i386/memcpy-pr120683-7.c: Likewise.
* gcc.target/i386/memset-pr120683-1.c: Likewise.
* gcc.target/i386/memset-pr120683-2.c: Likewise.
* gcc.target/i386/memset-pr120683-3.c: Likewise.
* gcc.target/i386/memset-pr120683-4.c: Likewise.
* gcc.target/i386/memset-pr120683-5.c: Likewise.
* gcc.target/i386/memset-pr120683-6.c: Likewise.
* gcc.target/i386/memset-pr120683-7.c: Likewise.
* gcc.target/i386/memset-pr120683-8.c: Likewise.
* gcc.target/i386/memset-pr120683-9.c: Likewise.
* gcc.target/i386/memset-pr120683-10.c: Likewise.
* gcc.target/i386/memset-pr120683-11.c: Likewise.
* gcc.target/i386/memset-pr120683-12.c: Likewise.
* gcc.target/i386/memset-pr120683-13.c: Likewise.
* gcc.target/i386/memset-pr120683-14.c: Likewise.
* gcc.target/i386/memset-pr120683-15.c: Likewise.
* gcc.target/i386/memset-pr120683-16.c: Likewise.
* gcc.target/i386/memset-pr120683-17.c: Likewise.
* gcc.target/i386/memset-pr120683-18.c: Likewise.
* gcc.target/i386/memset-pr120683-19.c: Likewise.
* gcc.target/i386/memset-pr120683-20.c: Likewise.
* gcc.target/i386/memset-pr120683-21.c: Likewise.
* gcc.target/i386/memset-pr120683-22.c: Likewise.
* gcc.target/i386/memset-pr120683-23.c: Likewise.
37 files changed:
gcc/config/i386/i386-expand.cc
gcc/testsuite/gcc.target/i386/auto-init-padding-9.c
gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
gcc/testsuite/gcc.target/i386/memset-pr120683-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-10.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-11.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-12.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-13.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-14.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-15.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-16.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-17.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-18.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-19.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-20.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-21.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-22.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-23.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-6.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-7.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-8.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-pr120683-9.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-strategy-25.c
gcc/testsuite/gcc.target/i386/memset-strategy-29.c
gcc/testsuite/gcc.target/i386/memset-strategy-30.c
gcc/testsuite/gcc.target/i386/memset-strategy-31.c