commit b41f96465190751561f6909e858604ceab00595b
Author: H.J. Lu <hjl.tools@gmail.com>

    x86-64: Inline memmove with overlapping unaligned loads and stores

has
rtx_code_label *last_4x_vec_label = nullptr;
if (min_size == 0 || min_size < 4 * move_max)
last_4x_vec_label = gen_label_rtx ();
/* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX. */
if (last_4x_vec_label)
emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU,
nullptr, count_mode, 1,
last_4x_vec_label);
...
if (last_4x_vec_label)
{
/* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX. */
emit_label (last_4x_vec_label);
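The last_4x_vec_label block handles sizes up to and including
4 * MOVE_MAX (size <= 4 * MOVE_MAX), not just size < 4 * MOVE_MAX.
When MOVE_MAX == 16 bytes and min_size == 64, last_4x_vec_label isn't
generated. Change min_size < 4 * move_max to min_size <= 4 * move_max
to correct the last_4x_vec_label condition, and change the jump
comparison from LTU to LEU so that size == 4 * MOVE_MAX also branches
to last_4x_vec_label.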
Tested on Linux/x86-64.
gcc/
PR target/125117
* config/i386/i386-expand.cc (ix86_expand_movmem): Generate
last_4x_vec_label when min_size <= 4 * MOVE_MAX.
gcc/testsuite/
PR target/125117
* gcc.dg/pr125117.c: New test.
* gfortran.dg/pr125117.f90: Likewise.
* gcc.target/i386/builtin-memmove-10.c: Updated.
* gcc.target/i386/builtin-memmove-15.c: Likewise.
* gcc.target/i386/builtin-memmove-2a.c: Likewise.
* gcc.target/i386/builtin-memmove-2b.c: Likewise.
* gcc.target/i386/builtin-memmove-2c.c: Likewise.
* gcc.target/i386/builtin-memmove-2d.c: Likewise.
* gcc.target/i386/builtin-memmove-3a.c: Likewise.
* gcc.target/i386/builtin-memmove-3b.c: Likewise.
* gcc.target/i386/builtin-memmove-3c.c: Likewise.
* gcc.target/i386/builtin-memmove-4a.c: Likewise.
* gcc.target/i386/builtin-memmove-4b.c: Likewise.
* gcc.target/i386/builtin-memmove-4c.c: Likewise.
* gcc.target/i386/builtin-memmove-5b.c: Likewise.
* gcc.target/i386/builtin-memmove-5c.c: Likewise.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
more_8x_vec_label);
rtx_code_label *last_4x_vec_label = nullptr;
- if (min_size == 0 || min_size < 4 * move_max)
+ if (min_size == 0 || min_size <= 4 * move_max)
last_4x_vec_label = gen_label_rtx ();
- /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX. */
+ /* Jump to LAST_4X_VEC_LABEL if size <= 4 * MOVE_MAX. */
if (last_4x_vec_label)
- emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU,
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LEU,
nullptr, count_mode, 1,
last_4x_vec_label);
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O0" } */
+
+#include <string.h>
+#include <stdlib.h>
+
+#define OFFSET 32
+
+char buffer[128];
+char expected[64];
+
+void
+foo (char *dest, char *src, size_t n)
+{
+ if (n == 64)
+ __builtin_memmove (dest, src, n);
+}
+
+int
+main ()
+{
+ memset (buffer, -2, sizeof (buffer));
+ memset (buffer + OFFSET, -1, sizeof (buffer) - OFFSET);
+ memset (expected, -2, sizeof (expected));
+ memset (expected + OFFSET / 2, -1, OFFSET + OFFSET / 2);
+ foo (buffer, buffer + OFFSET / 2, 64);
+ if (memcmp (expected, buffer, 64) != 0)
+ abort ();
+
+ return 0;
+}
**.LFB0:
** .cfi_startproc
** cmpq \$63, %rdx
-** ja .L12
+** ja .L13
**.L1:
** ret
** .p2align 4,,10
** .p2align 3
-**.L12:
+**.L13:
** movq %rdi, %rcx
** movq %rsi, %rax
** cmpq \$128, %rdx
-** jbe .L13
+** jbe .L14
** movq %rdx, %rsi
** cmpq %rdi, %rax
-** jb .L6
+** jb .L7
** je .L1
** movdqu -16\(%rax,%rdx\), %xmm7
** movdqu -32\(%rax,%rdx\), %xmm6
** movdqu -48\(%rax,%rdx\), %xmm5
** movdqu -64\(%rax,%rdx\), %xmm4
-**.L7:
+**.L8:
** movdqu \(%rax\), %xmm3
** subq \$64, %rsi
** addq \$64, %rcx
** movups %xmm1, -32\(%rcx\)
** movups %xmm0, -16\(%rcx\)
** cmpq \$64, %rsi
-** ja .L7
+** ja .L8
** movups %xmm7, -16\(%rdi,%rdx\)
** movups %xmm6, -32\(%rdi,%rdx\)
** movups %xmm5, -48\(%rdi,%rdx\)
** ret
** .p2align 4,,10
** .p2align 3
-**.L13:
+**.L14:
+** cmpq \$64, %rdx
+** jbe .L6
** movdqu \(%rsi\), %xmm7
** movdqu 16\(%rsi\), %xmm6
** movdqu 32\(%rsi\), %xmm5
** .p2align 4,,10
** .p2align 3
**.L6:
+** movdqu \(%rsi\), %xmm3
+** movdqu 16\(%rsi\), %xmm2
+** movdqu -16\(%rsi,%rdx\), %xmm1
+** movdqu -32\(%rsi,%rdx\), %xmm0
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, -16\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
** movdqu \(%rax\), %xmm3
** movdqu 16\(%rax\), %xmm2
** leaq \(%rdi,%rdx\), %rcx
** movdqu 32\(%rax\), %xmm1
** movdqu 48\(%rax\), %xmm0
** addq %rdx, %rax
-**.L8:
+**.L9:
** movdqu -16\(%rax\), %xmm7
** movdqu -32\(%rax\), %xmm6
** subq \$64, %rsi
** movups %xmm5, 16\(%rcx\)
** movups %xmm4, \(%rcx\)
** cmpq \$64, %rsi
-** ja .L8
+** ja .L9
** movups %xmm3, \(%rdi\)
** movups %xmm2, 16\(%rdi\)
** movups %xmm1, 32\(%rdi\)
** .p2align 3
**.L5:
** cmpl \$64, %edx
-** jnb .L14
+** ja .L14
** movl %edx, %edx
** movdqu \(%rsi\), %xmm3
** movdqu 16\(%rsi\), %xmm2
** .p2align 3
**.L18:
** cmpq \$64, %rdx
-** jb .L10
+** jbe .L10
** movdqu \(%rsi\), %xmm7
** movdqu 16\(%rsi\), %xmm6
** movdqu 32\(%rsi\), %xmm5
** .p2align 3
**.L19:
** cmpq \$128, %rdx
-** jb .L11
+** jbe .L11
** vmovdqu \(%rsi\), %ymm7
** vmovdqu 32\(%rsi\), %ymm6
** vmovdqu 64\(%rsi\), %ymm5
** .p2align 3
**.L20:
** cmpq \$256, %rdx
-** jb .L12
+** jbe .L12
** vmovdqu64 \(%rsi\), %zmm7
** vmovdqu64 64\(%rsi\), %zmm6
** vmovdqu64 -64\(%rsi,%rdx\), %zmm3
**.L20:
** .cfi_def_cfa_offset 40
** cmpq \$32, %rdx
-** jb .L9
+** jbe .L9
** movq %rbx, \(%rsp\)
** movq %r14, 16\(%rsp\)
** .cfi_offset 3, -40
** cmpq \$128, %rdx
** ja .L5
** cmpq \$64, %rdx
-** jnb .L15
+** ja .L15
** movdqu \(%rsi\), %xmm3
** movdqu 16\(%rsi\), %xmm2
** movdqu -16\(%rsi,%rdx\), %xmm1
** .p2align 3
**.L17:
** cmpq \$128, %rdx
-** jb .L8
+** jbe .L8
** vmovdqu \(%rsi\), %ymm7
** vmovdqu 32\(%rsi\), %ymm6
** vmovdqu 64\(%rsi\), %ymm5
** .p2align 3
**.L20:
** cmpq \$256, %rdx
-** jb .L9
+** jbe .L9
** vmovdqu64 \(%rsi\), %zmm7
** vmovdqu64 64\(%rsi\), %zmm6
** vmovdqu64 -64\(%rsi,%rdx\), %zmm3
** .p2align 3
**.L14:
** cmpq \$64, %rdx
-** jb .L6
+** jbe .L6
** movdqu \(%rsi\), %xmm7
** movdqu 16\(%rsi\), %xmm6
** movdqu 32\(%rsi\), %xmm5
** cmpq \$256, %rdx
** ja .L5
** cmpq \$128, %rdx
-** jnb .L16
+** ja .L16
** vmovdqu \(%rsi\), %ymm3
** vmovdqu 32\(%rsi\), %ymm2
** vmovdqu -32\(%rsi,%rdx\), %ymm1
** .p2align 3
**.L17:
** cmpq \$256, %rdx
-** jb .L8
+** jbe .L8
** vmovdqu64 \(%rsi\), %zmm7
** vmovdqu64 64\(%rsi\), %zmm6
** vmovdqu64 -64\(%rsi,%rdx\), %zmm3
** .p2align 3
**.L15:
** cmpq \$128, %rdx
-** jb .L6
+** jbe .L6
** vmovdqu \(%rsi\), %ymm7
** vmovdqu 32\(%rsi\), %ymm6
** vmovdqu 64\(%rsi\), %ymm5
** cmpq \$512, %rdx
** ja .L5
** cmpq \$256, %rdx
-** jnb .L16
+** ja .L16
** vmovdqu64 \(%rsi\), %zmm3
** vmovdqu64 64\(%rsi\), %zmm2
** vmovdqu64 -64\(%rsi,%rdx\), %zmm1
--- /dev/null
+! { dg-do run }
+! { dg-options "-O0" }
+! PR target/125117
+
+program test
+ implicit none
+ character(len=64) :: fixed
+ character(:), allocatable :: got
+ fixed = 'hello world'
+ got = getName(fixed)
+ if (trim(got) == 'hello world') then
+ print *, 'PASS'
+ else
+ print *, 'FAIL: got=[', trim(got), ']'
+ call abort()
+ end if
+contains
+ function getName(fixed) result(name)
+ character(len=64), intent(in) :: fixed
+ character(:), allocatable :: name
+ name = fixed
+ end function
+end program