From b41f96465190751561f6909e858604ceab00595b Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 20 Oct 2025 16:14:34 +0800 Subject: [PATCH] x86-64: Inline memmove with overlapping unaligned loads and stores Inline memmove in 64-bit since there are much less registers available in 32-bit: 1. Load all sources into registers and store them together to avoid possible address overlap between source and destination. 2. For known size, first try to fully unroll with 8 registers. 3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first and then store them together. 4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources into 4 registers first and then store them together. 5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources into 8 registers first and then store them together. 6. For size > 8 * MOVE_MAX, a. If address of destination > address of source, copy backward with a 4 * MOVE_MAX loop with unaligned loads and stores. Load the first 4 * MOVE_MAX into 4 registers before the loop and store them after the loop to support overlapping addresses. b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned loads and stores. Load the last 4 * MOVE_MAX into 4 registers before the loop and store them after the loop to support overlapping addresses. Verified and benchmarked memmove implementations inlined with GPR, SSE2, AVX2 and AVX512 using glibc memmove tests. It is available at https://gitlab.com/x86-glibc/glibc/-/commits/users/hjl/test/memmove Their performances are comparable with optimized memmove implementations in glibc on Intel Core i7-1195G7. gcc/ PR target/90262 * config/i386/i386-expand.cc (ix86_expand_unroll_movmem): New. (ix86_expand_n_move_movmem): Likewise. (ix86_expand_load_movmem): Likewise. (ix86_expand_store_movmem): Likewise. (ix86_expand_n_overlapping_move_movmem): Likewise. (ix86_expand_less_move_movmem): Likewise. (ix86_expand_movmem): Likewise. 
* config/i386/i386-protos.h (ix86_expand_movmem): Likewise. * config/i386/i386.md (movmem): Likewise. gcc/testsuite/ * gcc.target/i386/builtin-memmove-1a.c: New test. * gcc.target/i386/builtin-memmove-1b.c: Likewise. * gcc.target/i386/builtin-memmove-1c.c: Likewise. * gcc.target/i386/builtin-memmove-1d.c: Likewise. * gcc.target/i386/builtin-memmove-2a.c: Likewise. * gcc.target/i386/builtin-memmove-2b.c: Likewise. * gcc.target/i386/builtin-memmove-2c.c: Likewise. * gcc.target/i386/builtin-memmove-2d.c: Likewise. * gcc.target/i386/builtin-memmove-3a.c: Likewise. * gcc.target/i386/builtin-memmove-3b.c: Likewise. * gcc.target/i386/builtin-memmove-3c.c: Likewise. * gcc.target/i386/builtin-memmove-4a.c: Likewise. * gcc.target/i386/builtin-memmove-4b.c: Likewise. * gcc.target/i386/builtin-memmove-4c.c: Likewise. * gcc.target/i386/builtin-memmove-5a.c: Likewise. * gcc.target/i386/builtin-memmove-5b.c: Likewise. * gcc.target/i386/builtin-memmove-5c.c: Likewise. * gcc.target/i386/builtin-memmove-6.c: Likewise. * gcc.target/i386/builtin-memmove-7.c: Likewise. * gcc.target/i386/builtin-memmove-8.c: Likewise. * gcc.target/i386/builtin-memmove-9.c: Likewise. * gcc.target/i386/builtin-memmove-10.c: Likewise. * gcc.target/i386/builtin-memmove-11a.c: Likewise. * gcc.target/i386/builtin-memmove-11b.c: Likewise. * gcc.target/i386/builtin-memmove-11c.c: Likewise. * gcc.target/i386/builtin-memmove-12.c: Likewise. * gcc.target/i386/builtin-memmove-13.c: Likewise. * gcc.target/i386/builtin-memmove-14.c: Likewise. * gcc.target/i386/builtin-memmove-15.c: Likewise. Signed-off-by: H.J. 
Lu --- gcc/config/i386/i386-expand.cc | 748 ++++++++++++++++++ gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.md | 17 + .../gcc.target/i386/builtin-memmove-10.c | 105 +++ .../gcc.target/i386/builtin-memmove-11a.c | 79 ++ .../gcc.target/i386/builtin-memmove-11b.c | 74 ++ .../gcc.target/i386/builtin-memmove-11c.c | 33 + .../gcc.target/i386/builtin-memmove-12.c | 41 + .../gcc.target/i386/builtin-memmove-13.c | 25 + .../gcc.target/i386/builtin-memmove-14.c | 90 +++ .../gcc.target/i386/builtin-memmove-15.c | 114 +++ .../gcc.target/i386/builtin-memmove-1a.c | 123 +++ .../gcc.target/i386/builtin-memmove-1b.c | 98 +++ .../gcc.target/i386/builtin-memmove-1c.c | 94 +++ .../gcc.target/i386/builtin-memmove-1d.c | 226 ++++++ .../gcc.target/i386/builtin-memmove-2a.c | 165 ++++ .../gcc.target/i386/builtin-memmove-2b.c | 173 ++++ .../gcc.target/i386/builtin-memmove-2c.c | 184 +++++ .../gcc.target/i386/builtin-memmove-2d.c | 195 +++++ .../gcc.target/i386/builtin-memmove-3a.c | 133 ++++ .../gcc.target/i386/builtin-memmove-3b.c | 140 ++++ .../gcc.target/i386/builtin-memmove-3c.c | 151 ++++ .../gcc.target/i386/builtin-memmove-4a.c | 123 +++ .../gcc.target/i386/builtin-memmove-4b.c | 130 +++ .../gcc.target/i386/builtin-memmove-4c.c | 141 ++++ .../gcc.target/i386/builtin-memmove-5a.c | 109 +++ .../gcc.target/i386/builtin-memmove-5b.c | 120 +++ .../gcc.target/i386/builtin-memmove-5c.c | 130 +++ .../gcc.target/i386/builtin-memmove-6.c | 52 ++ .../gcc.target/i386/builtin-memmove-7.c | 42 + .../gcc.target/i386/builtin-memmove-8.c | 90 +++ .../gcc.target/i386/builtin-memmove-9.c | 63 ++ 32 files changed, 4009 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-10.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-11a.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-11b.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-11c.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-12.c create mode 
100644 gcc/testsuite/gcc.target/i386/builtin-memmove-13.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-14.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-15.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-1a.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-1b.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-1c.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-1d.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-2a.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-2b.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-2c.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-2d.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-3a.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-3b.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-3c.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-4a.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-4b.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-4c.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-5a.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-5b.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-5c.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-6.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-7.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-8.c create mode 100644 gcc/testsuite/gcc.target/i386/builtin-memmove-9.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index a1f1b26f78a..c131f7c44c1 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -9995,6 +9995,754 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, return true; } +/* Fully unroll memmove of known 
size with up to 8 registers. */ + +static bool +ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg, + unsigned HOST_WIDE_INT count, + machine_mode mode) +{ + /* If 8 registers can cover all memory, load them into + registers and store them together to avoid possible address + overlap between source and destination. */ + unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode); + if (moves == 0) + { + mode = smallest_int_mode_for_size + (count * BITS_PER_UNIT).require (); + if (count == GET_MODE_SIZE (mode)) + moves = 1; + else + { + /* Reduce the smallest move size by half so that MOVES == 1. */ + mode = smallest_int_mode_for_size + (GET_MODE_BITSIZE (mode) / 2).require (); + moves = count / GET_MODE_SIZE (mode); + gcc_assert (moves == 1); + } + } + else if (moves > 8) + return false; + + unsigned int i; + rtx tmp[9]; + + for (i = 0; i < moves; i++) + tmp[i] = gen_reg_rtx (mode); + + rtx srcmem = change_address (src, mode, srcreg); + for (i = 0; i < moves; i++) + { + emit_move_insn (tmp[i], srcmem); + srcmem = offset_address (srcmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + + unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1); + machine_mode epilogue_mode = VOIDmode; + if (epilogue_size) + { + /* Handle the remaining bytes with overlapping move. 
*/ + epilogue_mode = smallest_int_mode_for_size + (epilogue_size * BITS_PER_UNIT).require (); + tmp[8] = gen_reg_rtx (epilogue_mode); + srcmem = adjust_address (srcmem, epilogue_mode, 0); + srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1); + srcmem = offset_address (srcmem, + GEN_INT (-GET_MODE_SIZE (epilogue_mode)), + GET_MODE_SIZE (epilogue_mode)); + emit_move_insn (tmp[8], srcmem); + } + + rtx destmem = change_address (dst, mode, destreg); + for (i = 0; i < moves; i++) + { + emit_move_insn (destmem, tmp[i]); + destmem = offset_address (destmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + + if (epilogue_size) + { + /* Use overlapping move. */ + destmem = adjust_address (destmem, epilogue_mode, 0); + destmem = offset_address (destmem, GEN_INT (epilogue_size), 1); + destmem = offset_address (destmem, + GEN_INT (-GET_MODE_SIZE (epilogue_mode)), + GET_MODE_SIZE (epilogue_mode)); + emit_move_insn (destmem, tmp[8]); + } + + return true; +} + +/* Expand memmove of size with MOVES * mode size and MOVES <= 4. If + FORWARD is true, copy forward. Otherwise copy backward. */ + +static void +ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode, + unsigned int moves, bool forward) +{ + gcc_assert (moves <= 4); + + unsigned int i; + rtx tmp[8]; + + for (i = 0; i < moves; i++) + tmp[i] = gen_reg_rtx (mode); + + rtx step; + if (forward) + step = GEN_INT (GET_MODE_SIZE (mode)); + else + step = GEN_INT (-GET_MODE_SIZE (mode)); + + /* Load MOVES. */ + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (tmp[i], srcmem); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + } + emit_move_insn (tmp[i], srcmem); + + /* Store MOVES. */ + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (destmem, tmp[i]); + destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmp[i]); +} + +/* Load MOVES of mode size into REGS. If LAST is true, load the + last MOVES. 
Otherwise, load the first MOVES. */ + +static void +ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp, + machine_mode mode, unsigned int moves, + rtx regs[], bool last) +{ + unsigned int i; + + for (i = 0; i < moves; i++) + regs[i] = gen_reg_rtx (mode); + + rtx srcmem = change_address (src, mode, srcreg); + rtx step; + if (last) + { + srcmem = offset_address (srcmem, count_exp, 1); + step = GEN_INT (-GET_MODE_SIZE (mode)); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + } + else + step = GEN_INT (GET_MODE_SIZE (mode)); + + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (regs[i], srcmem); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + } + emit_move_insn (regs[i], srcmem); +} + +/* Store MOVES of mode size into REGS. If LAST is true, store the + last MOVES. Otherwise, store the first MOVES. */ + +static void +ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp, + machine_mode mode, unsigned int moves, + rtx regs[], bool last) +{ + unsigned int i; + + rtx destmem = change_address (dst, mode, destreg); + rtx step; + if (last) + { + destmem = offset_address (destmem, count_exp, 1); + step = GEN_INT (-GET_MODE_SIZE (mode)); + destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); + } + else + step = GEN_INT (GET_MODE_SIZE (mode)); + + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (destmem, regs[i]); + destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, regs[i]); +} + +/* Expand memmove of size between (MOVES / 2) * mode size and + MOVES * mode size with overlapping load and store. MOVES is even. + MOVES >= 2 and MOVES <= 8. 
*/ + +static void +ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg, + rtx srcreg, rtx count_exp, + machine_mode mode, + unsigned int moves) +{ + gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0); + + unsigned int half_moves = moves / 2; + unsigned int i, j; + rtx tmp[8]; + + for (i = 0; i < moves; i++) + tmp[i] = gen_reg_rtx (mode); + + rtx base_srcmem = change_address (src, mode, srcreg); + + /* Load the first half. */ + rtx srcmem = base_srcmem; + for (i = 0; i < half_moves - 1; i++) + { + emit_move_insn (tmp[i], srcmem); + srcmem = offset_address (srcmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (tmp[i], srcmem); + + /* Load the second half. */ + srcmem = offset_address (base_srcmem, count_exp, 1); + srcmem = offset_address (srcmem, + GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + for (j = half_moves, i = 0; i < half_moves - 1; i++, j++) + { + emit_move_insn (tmp[j], srcmem); + srcmem = offset_address (srcmem, + GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (tmp[j], srcmem); + + rtx base_destmem = change_address (dst, mode, destreg); + + /* Store the first half. */ + rtx destmem = base_destmem; + for (i = 0; i < half_moves - 1; i++) + { + emit_move_insn (destmem, tmp[i]); + destmem = offset_address (destmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmp[i]); + + /* Store the second half. */ + destmem = offset_address (base_destmem, count_exp, 1); + destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + for (j = half_moves, i = 0; i < half_moves - 1; i++, j++) + { + emit_move_insn (destmem, tmp[j]); + destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmp[j]); +} + +/* Expand memmove of size < mode size which is <= 64. 
*/ + +static void +ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg, + rtx srcreg, rtx count_exp, + unsigned HOST_WIDE_INT min_size, + machine_mode mode, + rtx_code_label *done_label) +{ + bool skip = false; + machine_mode count_mode = counter_mode (count_exp); + + rtx_code_label *between_32_63_label + = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_32_63_LABEL if size >= 32 and size < 64. */ + if (between_32_63_label) + { + if (min_size && min_size >= 32) + { + emit_jump_insn (gen_jump (between_32_63_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU, + nullptr, count_mode, 1, + between_32_63_label); + } + + rtx_code_label *between_16_31_label + = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size < 32. */ + if (between_16_31_label) + { + if (min_size && min_size >= 16) + { + emit_jump_insn (gen_jump (between_16_31_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU, + nullptr, count_mode, 1, + between_16_31_label); + } + + rtx_code_label *between_8_15_label + = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size < 16. */ + if (between_8_15_label) + { + if (min_size && min_size >= 8) + { + emit_jump_insn (gen_jump (between_8_15_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU, + nullptr, count_mode, 1, + between_8_15_label); + } + + rtx_code_label *between_4_7_label + = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size < 8. 
*/ + if (between_4_7_label) + { + if (min_size && min_size >= 4) + { + emit_jump_insn (gen_jump (between_4_7_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU, + nullptr, count_mode, 1, + between_4_7_label); + } + + rtx_code_label *between_2_3_label + = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr; + /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size < 4. */ + if (between_2_3_label) + { + if (min_size && min_size >= 2) + { + emit_jump_insn (gen_jump (between_2_3_label)); + emit_barrier (); + skip = true; + } + else + emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT, + nullptr, count_mode, 1, + between_2_3_label); + } + + if (!skip) + { + rtx_code_label *zero_label + = min_size == 0 ? gen_label_rtx () : nullptr; + /* Skip if size == 0. */ + if (zero_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT, + nullptr, count_mode, 1, + zero_label, + profile_probability::unlikely ()); + + /* Move 1 byte. */ + rtx tmp0 = gen_reg_rtx (QImode); + rtx srcmem = change_address (src, QImode, srcreg); + emit_move_insn (tmp0, srcmem); + rtx destmem = change_address (dst, QImode, destreg); + emit_move_insn (destmem, tmp0); + + if (zero_label) + emit_label (zero_label); + + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_32_63_label) + { + emit_label (between_32_63_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, OImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_16_31_label) + { + emit_label (between_16_31_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, TImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_8_15_label) + { + emit_label (between_8_15_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, DImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier 
(); + } + + if (between_4_7_label) + { + emit_label (between_4_7_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, SImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (between_2_3_label) + { + emit_label (between_2_3_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, HImode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } +} + +/* Expand movmem with overlapping unaligned loads and stores: + 1. Load all sources into registers and store them together to avoid + possible address overlap between source and destination. + 2. For known size, first try to fully unroll with 8 registers. + 3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first + and then store them together. + 4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources + into 4 registers first and then store them together. + 5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources + into 8 registers first and then store them together. + 6. For size > 8 * MOVE_MAX, + a. If address of destination > address of source, copy backward + with a 4 * MOVE_MAX loop with unaligned loads and stores. Load + the first 4 * MOVE_MAX into 4 registers before the loop and + store them after the loop to support overlapping addresses. + b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned + loads and stores. Load the last 4 * MOVE_MAX into 4 registers + before the loop and store them after the loop to support + overlapping addresses. + */ + +bool +ix86_expand_movmem (rtx operands[]) +{ + /* Since there are much less registers available in 32-bit mode, don't + inline movmem in 32-bit mode. 
*/ + if (!TARGET_64BIT) + return false; + + rtx dst = operands[0]; + rtx src = operands[1]; + rtx count_exp = operands[2]; + rtx expected_size_exp = operands[5]; + rtx min_size_exp = operands[6]; + rtx probable_max_size_exp = operands[8]; + unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U; + HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U; + unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U; + unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U; + + if (CONST_INT_P (count_exp)) + { + min_size = probable_max_size = count = expected_size + = INTVAL (count_exp); + /* When COUNT is 0, there is nothing to do. */ + if (!count) + return true; + } + else + { + if (min_size_exp) + min_size = INTVAL (min_size_exp); + if (probable_max_size_exp) + probable_max_size = INTVAL (probable_max_size_exp); + if (CONST_INT_P (expected_size_exp)) + expected_size = INTVAL (expected_size_exp); + } + + /* Make sure we don't need to care about overflow later on. */ + if (count > (HOST_WIDE_INT_1U << 30)) + return false; + + addr_space_t dst_as = MEM_ADDR_SPACE (dst); + addr_space_t src_as = MEM_ADDR_SPACE (src); + int dynamic_check; + bool noalign; + enum stringop_alg alg = decide_alg (count, expected_size, min_size, + probable_max_size, false, false, + dst_as, src_as, &dynamic_check, + &noalign, false); + if (alg == libcall) + return false; + + rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); + rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); + + unsigned int move_max = MOVE_MAX; + machine_mode mode = smallest_int_mode_for_size + (move_max * BITS_PER_UNIT).require (); + if (probable_max_size && probable_max_size < move_max) + { + /* Get a usable MOVE_MAX. */ + mode = smallest_int_mode_for_size + (probable_max_size * BITS_PER_UNIT).require (); + /* Reduce MOVE_MAX by half so that MOVE_MAX can be used. 
*/ + if (GET_MODE_SIZE (mode) > probable_max_size) + mode = smallest_int_mode_for_size + (GET_MODE_BITSIZE (mode) / 2).require (); + move_max = GET_MODE_SIZE (mode); + } + + /* Try to fully unroll memmove of known size first. */ + if (count + && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count, + mode)) + return true; + + rtx_code_label *done_label = gen_label_rtx (); + + rtx_code_label *less_vec_label = nullptr; + if (min_size == 0 || min_size < move_max) + less_vec_label = gen_label_rtx (); + + machine_mode count_mode = counter_mode (count_exp); + + /* Jump to LESS_VEC_LABEL if size < MOVE_MAX. */ + if (less_vec_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU, + nullptr, count_mode, 1, + less_vec_label); + + rtx_code_label *more_2x_vec_label = nullptr; + if (probable_max_size == 0 || probable_max_size > 2 * move_max) + more_2x_vec_label = gen_label_rtx (); + + /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX. */ + if (more_2x_vec_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU, + nullptr, count_mode, 1, + more_2x_vec_label); + + if (min_size == 0 || min_size <= 2 * move_max) + { + /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX. */ + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, + count_exp, mode, 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (less_vec_label) + { + /* Size < MOVE_MAX. */ + emit_label (less_vec_label); + ix86_expand_less_move_movmem (dst, src, destreg, srcreg, + count_exp, min_size, mode, + done_label); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (more_2x_vec_label) + { + /* Size > 2 * MOVE_MAX and destination may overlap with source. */ + emit_label (more_2x_vec_label); + + rtx_code_label *more_8x_vec_label = nullptr; + if (probable_max_size == 0 || probable_max_size > 8 * move_max) + more_8x_vec_label = gen_label_rtx (); + + /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX. 
*/ + if (more_8x_vec_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU, + nullptr, count_mode, 1, + more_8x_vec_label); + + rtx_code_label *last_4x_vec_label = nullptr; + if (min_size == 0 || min_size < 4 * move_max) + last_4x_vec_label = gen_label_rtx (); + + /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX. */ + if (last_4x_vec_label) + emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU, + nullptr, count_mode, 1, + last_4x_vec_label); + + if (probable_max_size == 0 || probable_max_size > 4 * move_max) + { + /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX. */ + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, + srcreg, count_exp, + mode, 8); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (last_4x_vec_label) + { + /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX. */ + emit_label (last_4x_vec_label); + ix86_expand_n_overlapping_move_movmem (dst, src, destreg, + srcreg, count_exp, + mode, 4); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + + if (more_8x_vec_label) + { + /* Size > 8 * MOVE_MAX. */ + emit_label (more_8x_vec_label); + + rtx loop_count = gen_reg_rtx (count_mode); + emit_move_insn (loop_count, count_exp); + + /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is + lower than destination address. */ + rtx_code_label *more_8x_vec_backward_label = gen_label_rtx (); + emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr, + GET_MODE (destreg), 1, + more_8x_vec_backward_label); + + /* Skip if source == destination which is less common. */ + emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr, + GET_MODE (destreg), 1, done_label, + profile_probability::unlikely ()); + + rtx base_destreg = gen_reg_rtx (GET_MODE (destreg)); + emit_move_insn (base_destreg, destreg); + + /* Load the last 4 * MOVE_MAX. 
*/ + rtx regs[4]; + ix86_expand_load_movmem (src, srcreg, count_exp, mode, + ARRAY_SIZE (regs), regs, true); + + rtx srcmem = change_address (src, mode, srcreg); + rtx destmem = change_address (dst, mode, destreg); + + /* Copy forward with a 4 * MOVE_MAX loop. */ + rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx (); + emit_label (loop_4x_vec_forward_label); + + ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true); + + rtx tmp; + rtx delta = GEN_INT (4 * MOVE_MAX); + + /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (loop_count), MINUS, + loop_count, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != loop_count) + emit_move_insn (loop_count, tmp); + + /* Increment DESTREG and SRCREG by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (destreg), PLUS, + destreg, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != destreg) + emit_move_insn (destreg, tmp); + tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg, + delta, nullptr, 1, OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + + /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */ + emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr, + GET_MODE (loop_count), 1, + loop_4x_vec_forward_label); + + /* Store the last 4 * MOVE_MAX. */ + ix86_expand_store_movmem (dst, base_destreg, count_exp, mode, + ARRAY_SIZE (regs), regs, true); + + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + + /* Copy backward with a 4 * MOVE_MAX loop. */ + emit_label (more_8x_vec_backward_label); + + base_destreg = gen_reg_rtx (GET_MODE (destreg)); + emit_move_insn (base_destreg, destreg); + + /* Load the first 4 * MOVE_MAX. */ + ix86_expand_load_movmem (src, srcreg, count_exp, mode, + ARRAY_SIZE (regs), regs, false); + + /* Increment DESTREG and SRCREG by COUNT_EXP. 
*/ + tmp = expand_simple_binop (GET_MODE (destreg), PLUS, + destreg, count_exp, nullptr, 1, + OPTAB_DIRECT); + if (tmp != destreg) + emit_move_insn (destreg, tmp); + tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg, + count_exp, nullptr, 1, OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + + srcmem = change_address (src, mode, srcreg); + destmem = change_address (dst, mode, destreg); + rtx step = GEN_INT (-GET_MODE_SIZE (mode)); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); + + rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx (); + emit_label (loop_4x_vec_backward_label); + + ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false); + + /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (loop_count), MINUS, + loop_count, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != loop_count) + emit_move_insn (loop_count, tmp); + + /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (destreg), MINUS, + destreg, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != destreg) + emit_move_insn (destreg, tmp); + tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg, + delta, nullptr, 1, OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + + /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */ + emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr, + GET_MODE (loop_count), 1, + loop_4x_vec_backward_label); + + /* Store the first 4 * MOVE_MAX. */ + ix86_expand_store_movmem (dst, base_destreg, count_exp, mode, + ARRAY_SIZE (regs), regs, false); + + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } + } + + emit_label (done_label); + + return true; +} + /* Expand cmpstrn or memcmp. 
*/ bool diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index bdb8bb963b5..5ff414a22a2 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -78,6 +78,7 @@ extern void substitute_vpternlog_operands (rtx[]); extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, bool); +extern bool ix86_expand_movmem (rtx[]); extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool); extern enum reg_class ix86_insn_base_reg_class (rtx_insn *); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b812d8b3823..7ec028c8264 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -25708,6 +25708,23 @@ (set_attr "length_immediate" "0") (set_attr "modrm" "0")]) +(define_expand "movmem" + [(use (match_operand:BLK 0 "memory_operand")) + (use (match_operand:BLK 1 "memory_operand")) + (use (match_operand:SWI48 2 "nonmemory_operand")) + (use (match_operand:SWI48 3 "const_int_operand")) + (use (match_operand:SI 4 "const_int_operand")) + (use (match_operand:SI 5 "const_int_operand")) + (use (match_operand:SI 6 "")) + (use (match_operand:SI 7 "")) + (use (match_operand:SI 8 ""))] + "" +{ + if (ix86_expand_movmem (operands)) + DONE; + FAIL; +}) + (define_expand "cpymem" [(use (match_operand:BLK 0 "memory_operand")) (use (match_operand:BLK 1 "memory_operand")) diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-10.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-10.c new file mode 100644 index 00000000000..43d6489de62 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-10.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove: +**.LFB0: +** .cfi_startproc +** cmpq \$63, %rdx +** ja .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$128, %rdx +** jbe .L13 +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L6 +** je .L1 +** movdqu -16\(%rax,%rdx\), %xmm7 +** movdqu -32\(%rax,%rdx\), %xmm6 +** movdqu -48\(%rax,%rdx\), %xmm5 +** movdqu -64\(%rax,%rdx\), %xmm4 +**.L7: +** movdqu \(%rax\), %xmm3 +** subq \$64, %rsi +** addq \$64, %rcx +** addq \$64, %rax +** movdqu -48\(%rax\), %xmm2 +** movdqu -32\(%rax\), %xmm1 +** movdqu -16\(%rax\), %xmm0 +** movups %xmm3, -64\(%rcx\) +** movups %xmm2, -48\(%rcx\) +** movups %xmm1, -32\(%rcx\) +** movups %xmm0, -16\(%rcx\) +** cmpq \$64, %rsi +** ja .L7 +** movups %xmm7, -16\(%rdi,%rdx\) +** movups %xmm6, -32\(%rdi,%rdx\) +** movups %xmm5, -48\(%rdi,%rdx\) +** movups %xmm4, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movdqu \(%rsi\), %xmm7 +** movdqu 16\(%rsi\), %xmm6 +** movdqu 32\(%rsi\), %xmm5 +** movdqu 48\(%rsi\), %xmm4 +** movdqu -16\(%rsi,%rdx\), %xmm3 +** movdqu -32\(%rsi,%rdx\), %xmm2 +** movdqu -48\(%rsi,%rdx\), %xmm1 +** movdqu -64\(%rsi,%rdx\), %xmm0 +** movups %xmm7, \(%rdi\) +** movups %xmm6, 16\(%rdi\) +** movups %xmm5, 32\(%rdi\) +** movups %xmm4, 48\(%rdi\) +** movups %xmm3, -16\(%rdi,%rdx\) +** movups %xmm2, -32\(%rdi,%rdx\) +** movups %xmm1, -48\(%rdi,%rdx\) +** movups %xmm0, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movdqu \(%rax\), %xmm3 +** movdqu 16\(%rax\), %xmm2 +** leaq \(%rdi,%rdx\), %rcx +** movdqu 32\(%rax\), %xmm1 +** movdqu 48\(%rax\), %xmm0 +** addq %rdx, %rax +**.L8: +** movdqu -16\(%rax\), %xmm7 +** movdqu -32\(%rax\), %xmm6 +** subq \$64, %rsi +** subq \$64, %rcx +** movdqu -48\(%rax\), %xmm5 +** movdqu -64\(%rax\), %xmm4 +** subq \$64, %rax +** movups %xmm7, 48\(%rcx\) +** movups %xmm6, 
32\(%rcx\) +** movups %xmm5, 16\(%rcx\) +** movups %xmm4, \(%rcx\) +** cmpq \$64, %rsi +** ja .L8 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, 32\(%rdi\) +** movups %xmm0, 48\(%rdi\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n >= 64) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-11a.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-11a.c new file mode 100644 index 00000000000..3f4e2cac1ce --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-11a.c @@ -0,0 +1,79 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_xmm: +**.LFB0: +** .cfi_startproc +** movq %rdi, %rax +** movl \$512, %edx +** cmpq %rdi, %rsi +** jb .L5 +** je .L1 +** movdqu 496\(%rsi\), %xmm7 +** movdqu 480\(%rsi\), %xmm6 +** movdqu 464\(%rsi\), %xmm5 +** movdqu 448\(%rsi\), %xmm4 +**.L6: +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** subl \$64, %edx +** addq \$64, %rax +** movdqu 32\(%rsi\), %xmm1 +** movdqu 48\(%rsi\), %xmm0 +** addq \$64, %rsi +** movups %xmm3, -64\(%rax\) +** movups %xmm2, -48\(%rax\) +** movups %xmm1, -32\(%rax\) +** movups %xmm0, -16\(%rax\) +** cmpl \$64, %edx +** ja .L6 +** movups %xmm7, 496\(%rdi\) +** movups %xmm6, 480\(%rdi\) +** movups %xmm5, 464\(%rdi\) +** movups %xmm4, 448\(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movdqu \(%rsi\), %xmm7 +** movdqu 16\(%rsi\), %xmm6 +** leaq 512\(%rdi\), %rax +** addq \$512, %rsi +** movdqu -480\(%rsi\), %xmm5 +** movdqu -464\(%rsi\), %xmm4 +**.L7: +** movdqu -16\(%rsi\), %xmm3 +** subl \$64, %edx +** subq \$64, %rax +** subq \$64, %rsi +** movdqu 32\(%rsi\), %xmm2 +** movdqu 16\(%rsi\), %xmm1 +** movdqu \(%rsi\), %xmm0 +** 
movups %xmm3, 48\(%rax\) +** movups %xmm2, 32\(%rax\) +** movups %xmm1, 16\(%rax\) +** movups %xmm0, \(%rax\) +** cmpl \$64, %edx +** ja .L7 +** movups %xmm7, \(%rdi\) +** movups %xmm6, 16\(%rdi\) +** movups %xmm5, 32\(%rdi\) +** movups %xmm4, 48\(%rdi\) +**.L1: +** ret +** .cfi_endproc +**... +*/ + +#ifndef gcc_memmove +#define gcc_memmove gcc_memmove_xmm +#endif + +void +gcc_memmove (void *a, void *b) +{ + __builtin_memmove (a, b, 512); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-11b.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-11b.c new file mode 100644 index 00000000000..031dd12658e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-11b.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_ymm: +**.LFB0: +** .cfi_startproc +** movq %rdi, %rax +** movl \$512, %edx +** cmpq %rdi, %rsi +** jb .L5 +** je .L10 +** vmovdqu 480\(%rsi\), %ymm7 +** vmovdqu 448\(%rsi\), %ymm6 +** vmovdqu 416\(%rsi\), %ymm5 +** vmovdqu 384\(%rsi\), %ymm4 +**.L6: +** vmovdqu \(%rsi\), %ymm3 +** vmovdqu 32\(%rsi\), %ymm2 +** addl \$-128, %edx +** subq \$-128, %rax +** vmovdqu 64\(%rsi\), %ymm1 +** vmovdqu 96\(%rsi\), %ymm0 +** subq \$-128, %rsi +** vmovdqu %ymm3, -128\(%rax\) +** vmovdqu %ymm2, -96\(%rax\) +** vmovdqu %ymm1, -64\(%rax\) +** vmovdqu %ymm0, -32\(%rax\) +** cmpl \$128, %edx +** ja .L6 +** vmovdqu %ymm7, 480\(%rdi\) +** vmovdqu %ymm6, 448\(%rdi\) +** vmovdqu %ymm5, 416\(%rdi\) +** vmovdqu %ymm4, 384\(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** vmovdqu \(%rsi\), %ymm7 +** vmovdqu 32\(%rsi\), %ymm6 +** leaq 512\(%rdi\), %rax +** addq \$512, %rsi +** vmovdqu -448\(%rsi\), %ymm5 +** vmovdqu -416\(%rsi\), %ymm4 +**.L7: +** vmovdqu -32\(%rsi\), %ymm3 +** 
addl \$-128, %edx +** addq \$-128, %rax +** addq \$-128, %rsi +** vmovdqu 64\(%rsi\), %ymm2 +** vmovdqu 32\(%rsi\), %ymm1 +** vmovdqu \(%rsi\), %ymm0 +** vmovdqu %ymm3, 96\(%rax\) +** vmovdqu %ymm2, 64\(%rax\) +** vmovdqu %ymm1, 32\(%rax\) +** vmovdqu %ymm0, \(%rax\) +** cmpl \$128, %edx +** ja .L7 +** vmovdqu %ymm7, \(%rdi\) +** vmovdqu %ymm6, 32\(%rdi\) +** vmovdqu %ymm5, 64\(%rdi\) +** vmovdqu %ymm4, 96\(%rdi\) +** vzeroupper +**.L10: +** ret +** .cfi_endproc +**... +*/ + +#define gcc_memmove gcc_memmove_ymm +#include "builtin-memmove-11a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-11c.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-11c.c new file mode 100644 index 00000000000..9c5e2c6f3e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-11c.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */ + +/* +**gcc_memmove_zmm: +**.LFB0: +** .cfi_startproc +** vmovdqu64 \(%(e|r)si\), %zmm7 +** vmovdqu64 64\(%(e|r)si\), %zmm6 +** vmovdqu64 128\(%(e|r)si\), %zmm5 +** vmovdqu64 192\(%(e|r)si\), %zmm4 +** vmovdqu64 256\(%(e|r)si\), %zmm3 +** vmovdqu64 320\(%(e|r)si\), %zmm2 +** vmovdqu64 384\(%(e|r)si\), %zmm1 +** vmovdqu64 448\(%(e|r)si\), %zmm0 +** vmovdqu64 %zmm7, \(%(e|r)di\) +** vmovdqu64 %zmm6, 64\(%(e|r)di\) +** vmovdqu64 %zmm5, 128\(%(e|r)di\) +** vmovdqu64 %zmm4, 192\(%(e|r)di\) +** vmovdqu64 %zmm3, 256\(%(e|r)di\) +** vmovdqu64 %zmm2, 320\(%(e|r)di\) +** vmovdqu64 %zmm1, 384\(%(e|r)di\) +** vmovdqu64 %zmm0, 448\(%(e|r)di\) +** vzeroupper +** ret +** .cfi_endproc +**... 
+*/ + +#define gcc_memmove gcc_memmove_zmm +#include "builtin-memmove-11a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-12.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-12.c new file mode 100644 index 00000000000..270df03f290 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-12.c @@ -0,0 +1,41 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movdqu a\+20\(%rip\), %xmm5 +** movdqu a\+36\(%rip\), %xmm4 +** movdqu a\+52\(%rip\), %xmm3 +** movdqu a\+68\(%rip\), %xmm2 +** movdqu a\+84\(%rip\), %xmm1 +** movdqu a\+100\(%rip\), %xmm0 +** movups %xmm5, a\+24\(%rip\) +** movq a\+116\(%rip\), %rax +** movdqu a\+4\(%rip\), %xmm6 +** movups %xmm4, a\+40\(%rip\) +** movl %edi, a\+4\(%rip\) +** movq %rax, a\+120\(%rip\) +** movups %xmm6, a\+8\(%rip\) +** movups %xmm3, a\+56\(%rip\) +** movups %xmm2, a\+72\(%rip\) +** movups %xmm1, a\+88\(%rip\) +** movups %xmm0, a\+104\(%rip\) +** ret +** .cfi_endproc +**... +*/ + +#define N 32 + +int a[N]; + +void +foo (int x) +{ + __builtin_memmove (a + 2, a + 1, sizeof a - 2 * sizeof *a); + a[1] = x; +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-13.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-13.c new file mode 100644 index 00000000000..1c71cce021e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-13.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movl a\+3\(%rip\), %eax +** movl %eax, a\(%rip\) +** movzbl a\+7\(%rip\), %eax +** movb %al, a\+4\(%rip\) +** ret +** .cfi_endproc +**... +*/ + +char a[8] = "12345678"; + +void +foo (void) +{ + __builtin_memmove (a, a + 3, sizeof a - 3); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-14.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-14.c new file mode 100644 index 00000000000..009c61d8b14 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-14.c @@ -0,0 +1,90 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove: +**.LFB0: +** .cfi_startproc +** cmpq \$64, %rdx +** jbe .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** cmpl \$16, %edx +** jnb .L13 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** cmpl \$32, %edx +** ja .L5 +** movl %edx, %edx +** movdqu \(%rsi\), %xmm1 +** movdqu -16\(%rsi,%rdx\), %xmm0 +** movups %xmm1, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** addq %rdx, %rsi +** movdqu -16\(%rsi\), %xmm1 +** movdqu -32\(%rsi\), %xmm0 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** 
.p2align 3 +**.L7: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n <= 64) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-15.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-15.c new file mode 100644 index 00000000000..c1ccf4427fa --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-15.c @@ -0,0 +1,114 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove: +**.LFB0: +** .cfi_startproc +** cmpq \$66, %rdx +** jbe .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** cmpl \$16, %edx +** jnb .L13 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** cmpl \$32, %edx +** ja .L5 +** movl %edx, %edx +** movdqu \(%rsi\), %xmm1 +** movdqu -16\(%rsi,%rdx\), %xmm0 +** movups %xmm1, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** cmpl \$64, %edx +** jnb .L14 +** movl %edx, %edx +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** addq %rdx, %rsi +** movdqu -16\(%rsi\), %xmm1 +** movdqu -32\(%rsi\), %xmm0 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq 
\(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movl %edx, %edx +** movdqu \(%rsi\), %xmm7 +** movdqu 16\(%rsi\), %xmm6 +** movdqu 32\(%rsi\), %xmm5 +** movdqu 48\(%rsi\), %xmm4 +** addq %rdx, %rsi +** movdqu -16\(%rsi\), %xmm3 +** movdqu -32\(%rsi\), %xmm2 +** movdqu -48\(%rsi\), %xmm1 +** movdqu -64\(%rsi\), %xmm0 +** movups %xmm7, \(%rdi\) +** movups %xmm6, 16\(%rdi\) +** movups %xmm5, 32\(%rdi\) +** movups %xmm4, 48\(%rdi\) +** movups %xmm3, -16\(%rdi,%rdx\) +** movups %xmm2, -32\(%rdi,%rdx\) +** movups %xmm1, -48\(%rdi,%rdx\) +** movups %xmm0, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n <= 66) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-1a.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-1a.c new file mode 100644 index 00000000000..34598753c0d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-1a.c @@ -0,0 +1,123 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */ + +/* +**memmove7: +**.LFB[0-9]+: +** .cfi_startproc +** movl \(%(?:r|e)si\), %edx +** movl 3\(%(?:r|e)si\), %eax +** movl %edx, \(%(?:r|e)di\) +** movl %eax, 3\(%(?:r|e)di\) +** ret +**... 
+*/ + +/* +**memmove13: +**.LFB[0-9]+: +** .cfi_startproc +** movq \(%(?:r|e)si\), %rdx +** movq 5\(%(?:r|e)si\), %rax +** movq %rdx, \(%(?:r|e)di\) +** movq %rax, 5\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove31: +**.LFB[0-9]+: +** .cfi_startproc +** movdqu \(%(?:r|e)si\), %xmm1 +** movdqu 15\(%(?:r|e)si\), %xmm0 +** movups %xmm1, \(%(?:r|e)di\) +** movups %xmm0, 15\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove39: +**.LFB[0-9]+: +** .cfi_startproc +** movdqu \(%(?:r|e)si\), %xmm1 +** movdqu 16\(%(?:r|e)si\), %xmm0 +** movq 31\(%(?:r|e)si\), %rax +** movups %xmm0, 16\(%(?:r|e)di\) +** movups %xmm1, \(%(?:r|e)di\) +** movq %rax, 31\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove61: +**.LFB[0-9]+: +** .cfi_startproc +** movdqu \(%(?:r|e)si\), %xmm3 +** movdqu 16\(%(?:r|e)si\), %xmm2 +** movdqu 32\(%(?:r|e)si\), %xmm1 +** movdqu 45\(%(?:r|e)si\), %xmm0 +** movups %xmm3, \(%(?:r|e)di\) +** movups %xmm1, 32\(%(?:r|e)di\) +** movups %xmm2, 16\(%(?:r|e)di\) +** movups %xmm0, 45\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove69: +**.LFB[0-9]+: +** .cfi_startproc +** movdqu \(%(?:r|e)si\), %xmm3 +** movdqu 16\(%(?:r|e)si\), %xmm2 +** movdqu 32\(%(?:r|e)si\), %xmm1 +** movdqu 48\(%(?:r|e)si\), %xmm0 +** movq 61\(%(?:r|e)si\), %rax +** movups %xmm3, \(%(?:r|e)di\) +** movups %xmm0, 48\(%(?:r|e)di\) +** movups %xmm2, 16\(%(?:r|e)di\) +** movq %rax, 61\(%(?:r|e)di\) +** movups %xmm1, 32\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove93: +**.LFB[0-9]+: +** .cfi_startproc +** movdqu \(%(?:r|e)si\), %xmm5 +** movdqu 16\(%(?:r|e)si\), %xmm4 +** movdqu 32\(%(?:r|e)si\), %xmm3 +** movdqu 48\(%(?:r|e)si\), %xmm2 +** movdqu 64\(%(?:r|e)si\), %xmm1 +** movdqu 77\(%(?:r|e)si\), %xmm0 +** movups %xmm5, \(%(?:r|e)di\) +** movups %xmm4, 16\(%(?:r|e)di\) +** movups %xmm1, 64\(%(?:r|e)di\) +** movups %xmm3, 32\(%(?:r|e)di\) +** movups %xmm2, 48\(%(?:r|e)di\) +** movups %xmm0, 77\(%(?:r|e)di\) +** ret +**... 
+*/ + +#define TEST(n) \ + void \ + memmove##n (void *a, void *b) \ + { \ + __builtin_memmove (a, b, n); \ + } + +TEST (7) +TEST (13) +TEST (31) +TEST (39) +TEST (61) +TEST (69) +TEST (93) diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-1b.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-1b.c new file mode 100644 index 00000000000..25d008c5122 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-1b.c @@ -0,0 +1,98 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */ + +/* +**memmove7: +**.LFB[0-9]+: +** .cfi_startproc +** movl \(%(?:r|e)si\), %edx +** movl 3\(%(?:r|e)si\), %eax +** movl %edx, \(%(?:r|e)di\) +** movl %eax, 3\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove13: +**.LFB[0-9]+: +** .cfi_startproc +** movq \(%(?:r|e)si\), %rdx +** movq 5\(%(?:r|e)si\), %rax +** movq %rdx, \(%(?:r|e)di\) +** movq %rax, 5\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove31: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu \(%(?:r|e)si\), %xmm1 +** vmovdqu 15\(%(?:r|e)si\), %xmm0 +** vmovdqu %xmm1, \(%(?:r|e)di\) +** vmovdqu %xmm0, 15\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove39: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu \(%(?:r|e)si\), %ymm0 +** movq 31\(%(?:r|e)si\), %rax +** vmovdqu %ymm0, \(%(?:r|e)di\) +** movq %rax, 31\(%(?:r|e)di\) +** vzeroupper +** ret +**... +*/ + +/* +**memmove61: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu \(%(?:r|e)si\), %ymm1 +** vmovdqu 29\(%(?:r|e)si\), %ymm0 +** vmovdqu %ymm1, \(%(?:r|e)di\) +** vmovdqu %ymm0, 29\(%(?:r|e)di\) +** vzeroupper +** ret +**... 
+*/ + +/* +**memmove69: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu 32\(%(?:r|e)si\), %ymm0 +** movq 61\(%(?:r|e)si\), %rax +** vmovdqu \(%(?:r|e)si\), %ymm1 +** vmovdqu %ymm0, 32\(%(?:r|e)di\) +** movq %rax, 61\(%(?:r|e)di\) +** vmovdqu %ymm1, \(%(?:r|e)di\) +** vzeroupper +** ret +**... +*/ + +/* +**memmove93: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu \(%(?:r|e)si\), %ymm2 +** vmovdqu 32\(%(?:r|e)si\), %ymm1 +** vmovdqu 61\(%(?:r|e)si\), %ymm0 +** vmovdqu %ymm1, 32\(%(?:r|e)di\) +** vmovdqu %ymm2, \(%(?:r|e)di\) +** vmovdqu %ymm0, 61\(%(?:r|e)di\) +** vzeroupper +** ret +**... +*/ + +#include "builtin-memmove-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-1c.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-1c.c new file mode 100644 index 00000000000..9eb9a39871d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-1c.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */ + +/* +**memmove7: +**.LFB[0-9]+: +** .cfi_startproc +** movl \(%(?:r|e)si\), %edx +** movl 3\(%(?:r|e)si\), %eax +** movl %edx, \(%(?:r|e)di\) +** movl %eax, 3\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove13: +**.LFB[0-9]+: +** .cfi_startproc +** movq \(%(?:r|e)si\), %rdx +** movq 5\(%(?:r|e)si\), %rax +** movq %rdx, \(%(?:r|e)di\) +** movq %rax, 5\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove31: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu \(%(?:r|e)si\), %xmm1 +** vmovdqu 15\(%(?:r|e)si\), %xmm0 +** vmovdqu %xmm1, \(%(?:r|e)di\) +** vmovdqu %xmm0, 15\(%(?:r|e)di\) +** ret +**... +*/ + +/* +**memmove39: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu \(%(?:r|e)si\), %ymm0 +** movq 31\(%(?:r|e)si\), %rax +** vmovdqu %ymm0, \(%(?:r|e)di\) +** movq %rax, 31\(%(?:r|e)di\) +** vzeroupper +** ret +**... 
+*/ + +/* +**memmove61: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu \(%(?:r|e)si\), %ymm1 +** vmovdqu 29\(%(?:r|e)si\), %ymm0 +** vmovdqu %ymm1, \(%(?:r|e)di\) +** vmovdqu %ymm0, 29\(%(?:r|e)di\) +** vzeroupper +** ret +**... +*/ + +/* +**memmove69: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu64 \(%(?:r|e)si\), %zmm0 +** movq 61\(%(?:r|e)si\), %rax +** vmovdqu64 %zmm0, \(%(?:r|e)di\) +** movq %rax, 61\(%(?:r|e)di\) +** vzeroupper +** ret +**... +*/ + +/* +**memmove93: +**.LFB[0-9]+: +** .cfi_startproc +** vmovdqu64 \(%(?:r|e)si\), %zmm1 +** vmovdqu 61\(%(?:r|e)si\), %ymm0 +** vmovdqu64 %zmm1, \(%(?:r|e)di\) +** vmovdqu %ymm0, 61\(%(?:r|e)di\) +** vzeroupper +** ret +**... +*/ + +#include "builtin-memmove-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-1d.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-1d.c new file mode 100644 index 00000000000..ffa757584e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-1d.c @@ -0,0 +1,226 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mgeneral-regs-only -march=x86-64 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**memmove7: +**.LFB[0-9]+: +** .cfi_startproc +** movl \(%rsi\), %edx +** movl 3\(%rsi\), %eax +** movl %edx, \(%rdi\) +** movl %eax, 3\(%rdi\) +** ret +**... +*/ + +/* +**memmove13: +**.LFB[0-9]+: +** .cfi_startproc +** movq \(%rsi\), %rdx +** movq 5\(%rsi\), %rax +** movq %rdx, \(%rdi\) +** movq %rax, 5\(%rdi\) +** ret +**... +*/ + +/* +**memmove31: +**.LFB[0-9]+: +** .cfi_startproc +** movq \(%(e|r)si\), %r8 +** movq 8\(%(e|r)si\), %rcx +** movq 16\(%(e|r)si\), %rdx +** movq 23\(%(e|r)si\), %rax +** movq %r8, \(%(e|r)di\) +** movq %rdx, 16\(%(e|r)di\) +** movq %rcx, 8\(%(e|r)di\) +** movq %rax, 23\(%(e|r)di\) +** ret +**... 
+*/ + +/* +**memmove39: +**.LFB[0-9]+: +** .cfi_startproc +** movq \(%rsi\), %r9 +** movq 8\(%rsi\), %r8 +** movq 16\(%rsi\), %rcx +** movq 24\(%rsi\), %rdx +** movq 31\(%rsi\), %rax +** movq %r9, \(%rdi\) +** movq %rdx, 24\(%rdi\) +** movq %r8, 8\(%rdi\) +** movq %rcx, 16\(%rdi\) +** movq %rax, 31\(%rdi\) +** ret +**... +*/ + +/* +**memmove61: +**.LFB[0-9]+: +** .cfi_startproc +** movq 8\(%rsi\), %r11 +** movq 16\(%rsi\), %r10 +** pushq %rbx +** .cfi_def_cfa_offset 16 +** .cfi_offset 3, -16 +** movq 24\(%rsi\), %r9 +** movq \(%rsi\), %rbx +** movq 32\(%rsi\), %r8 +** movq 40\(%rsi\), %rcx +** movq 48\(%rsi\), %rdx +** movq 53\(%rsi\), %rax +** movq %rbx, \(%rdi\) +** movq %r11, 8\(%rdi\) +** popq %rbx +** .cfi_def_cfa_offset 8 +** movq %rdx, 48\(%rdi\) +** movq %r10, 16\(%rdi\) +** movq %r9, 24\(%rdi\) +** movq %r8, 32\(%rdi\) +** movq %rcx, 40\(%rdi\) +** movq %rax, 53\(%rdi\) +** ret +**... +*/ + +/* +**memmove69: +**.LFB5: +** .cfi_startproc +** movq 16\(%rsi\), %r11 +** movq 24\(%rsi\), %r10 +** pushq %rbp +** .cfi_def_cfa_offset 16 +** .cfi_offset 6, -16 +** movq 32\(%rsi\), %r9 +** movq \(%rsi\), %rbp +** pushq %rbx +** .cfi_def_cfa_offset 24 +** .cfi_offset 3, -24 +** movq 40\(%rsi\), %r8 +** movq 8\(%rsi\), %rbx +** movq 48\(%rsi\), %rcx +** movq 56\(%rsi\), %rdx +** movq 61\(%rsi\), %rax +** movq %rbp, \(%rdi\) +** movq %rbx, 8\(%rdi\) +** popq %rbx +** .cfi_def_cfa_offset 16 +** movq %rdx, 56\(%rdi\) +** popq %rbp +** .cfi_def_cfa_offset 8 +** movq %r11, 16\(%rdi\) +** movq %r10, 24\(%rdi\) +** movq %r9, 32\(%rdi\) +** movq %r8, 40\(%rdi\) +** movq %rcx, 48\(%rdi\) +** movq %rax, 61\(%rdi\) +** ret +**... 
+*/ + +/* +**memmove93: +**.LFB[0-9]+: +** .cfi_startproc +** sub(l|q) \$24, %(e|r)sp +** .cfi_def_cfa_offset 32 +** mov(l|q) %(e|r)si, %(e|r)ax +** movl \$93, %ecx +** cmp(l|q) %(e|r)di, %(e|r)si +** jb .L14 +** je .L10 +** movq %rbx, \(%(e|r)sp\) +** mov(l|q) %(e|r)di, %(e|r)dx +** movq %r14, 8\(%(e|r)sp\) +** movq %r15, 16\(%(e|r)sp\) +** .cfi_offset 3, -32 +** .cfi_offset 14, -24 +** .cfi_offset 15, -16 +** movq 85\(%(e|r)si\), %r14 +** movq 77\(%(e|r)si\), %r15 +** movq 69\(%(e|r)si\), %r10 +** movq 61\(%(e|r)si\), %r11 +**.L15: +** movq 8\(%(e|r)ax\), %r9 +** movq 16\(%(e|r)ax\), %r8 +** subl \$32, %ecx +** add(l|q) \$32, %(e|r)dx +** movq 24\(%(e|r)ax\), %rsi +** movq \(%(e|r)ax\), %rbx +** add(l|q) \$32, %(e|r)ax +** movq %r9, -24\(%(e|r)dx\) +** movq %rbx, -32\(%(e|r)dx\) +** movq %r8, -16\(%(e|r)dx\) +** movq %rsi, -8\(%(e|r)dx\) +** cmpl \$32, %ecx +** ja .L15 +** movq %r10, 69\(%(e|r)di\) +** movq \(%(e|r)sp\), %rbx +** .cfi_restore 3 +** movq %r11, 61\(%(e|r)di\) +** movq %r14, 85\(%(e|r)di\) +** movq 8\(%(e|r)sp\), %r14 +** .cfi_restore 14 +** movq %r15, 77\(%(e|r)di\) +** movq 16\(%(e|r)sp\), %r15 +** .cfi_restore 15 +**.L10: +** add(l|q) \$24, %(e|r)sp +** .cfi_remember_state +** .cfi_def_cfa_offset 8 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** .cfi_restore_state +** movq %rbx, \(%(e|r)sp\) +** lea(l|q) 93\(%(e|r)di\), %(e|r)dx +** add(l|q) \$93, %(e|r)ax +** movq %r14, 8\(%(e|r)sp\) +** movq %r15, 16\(%(e|r)sp\) +** .cfi_offset 3, -32 +** .cfi_offset 14, -24 +** .cfi_offset 15, -16 +** movq \(%(e|r)si\), %r14 +** movq 8\(%(e|r)si\), %r15 +** movq 16\(%(e|r)si\), %r10 +** movq 24\(%(e|r)si\), %r11 +**.L16: +** movq -16\(%(e|r)ax\), %r9 +** movq -24\(%(e|r)ax\), %r8 +** subl \$32, %ecx +** sub(l|q) \$32, %(e|r)dx +** movq -32\(%(e|r)ax\), %rsi +** movq -8\(%(e|r)ax\), %rbx +** sub(l|q) \$32, %(e|r)ax +** movq %r9, 16\(%(e|r)dx\) +** movq %rbx, 24\(%(e|r)dx\) +** movq %r8, 8\(%(e|r)dx\) +** movq %rsi, \(%(e|r)dx\) +** cmpl \$32, %ecx +** ja 
.L16 +** movq %r14, \(%(e|r)di\) +** movq \(%(e|r)sp\), %rbx +** .cfi_restore 3 +** movq %r15, 8\(%(e|r)di\) +** movq 8\(%(e|r)sp\), %r14 +** .cfi_restore 14 +** movq %r10, 16\(%(e|r)di\) +** movq 16\(%(e|r)sp\), %r15 +** .cfi_restore 15 +** movq %r11, 24\(%(e|r)di\) +** add(l|q) \$24, %(e|r)sp +** .cfi_def_cfa_offset 8 +** ret +**... +*/ + +#include "builtin-memmove-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-2a.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-2a.c new file mode 100644 index 00000000000..0a7e7048d60 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-2a.c @@ -0,0 +1,165 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_xmm: +**.LFB0: +** .cfi_startproc +** movq %rdi, %rax +** movq %rsi, %rcx +** cmpq \$16, %rdx +** jb .L3 +** cmpq \$32, %rdx +** jbe .L17 +** cmpq \$128, %rdx +** jbe .L18 +** movq %rdx, %rsi +** cmpq %rdi, %rcx +** jb .L11 +** je .L2 +** movdqu -16\(%rcx,%rdx\), %xmm7 +** movdqu -32\(%rcx,%rdx\), %xmm6 +** movdqu -48\(%rcx,%rdx\), %xmm5 +** movdqu -64\(%rcx,%rdx\), %xmm4 +**.L12: +** movdqu \(%rcx\), %xmm3 +** subq \$64, %rsi +** addq \$64, %rdi +** addq \$64, %rcx +** movdqu -48\(%rcx\), %xmm2 +** movdqu -32\(%rcx\), %xmm1 +** movdqu -16\(%rcx\), %xmm0 +** movups %xmm3, -64\(%rdi\) +** movups %xmm2, -48\(%rdi\) +** movups %xmm1, -32\(%rdi\) +** movups %xmm0, -16\(%rdi\) +** cmpq \$64, %rsi +** ja .L12 +** movups %xmm7, -16\(%rax,%rdx\) +** movups %xmm6, -32\(%rax,%rdx\) +** movups %xmm5, -48\(%rax,%rdx\) +** movups %xmm4, -64\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L3: +** cmpq \$8, %rdx +** jb .L19 +** movq \(%rsi\), %rdi +** movq -8\(%rsi,%rdx\), %rcx +** movq %rdi, \(%rax\) +** movq %rcx, -8\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** 
.p2align 3 +**.L19: +** cmpq \$4, %rdx +** jnb .L6 +** cmpq \$1, %rdx +** ja .L7 +** jb .L2 +** movzbl \(%rsi\), %edx +** movb %dl, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L17: +** movdqu \(%rsi\), %xmm1 +** movdqu -16\(%rsi,%rdx\), %xmm0 +** movups %xmm1, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L18: +** cmpq \$64, %rdx +** jb .L10 +** movdqu \(%rsi\), %xmm7 +** movdqu 16\(%rsi\), %xmm6 +** movdqu 32\(%rsi\), %xmm5 +** movdqu 48\(%rsi\), %xmm4 +** movdqu -16\(%rsi,%rdx\), %xmm3 +** movdqu -32\(%rsi,%rdx\), %xmm2 +** movdqu -48\(%rsi,%rdx\), %xmm1 +** movdqu -64\(%rsi,%rdx\), %xmm0 +** movups %xmm7, \(%rdi\) +** movups %xmm6, 16\(%rdi\) +** movups %xmm5, 32\(%rdi\) +** movups %xmm4, 48\(%rdi\) +** movups %xmm3, -16\(%rdi,%rdx\) +** movups %xmm2, -32\(%rdi,%rdx\) +** movups %xmm1, -48\(%rdi,%rdx\) +** movups %xmm0, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl \(%rsi\), %edi +** movl -4\(%rsi,%rdx\), %ecx +** movl %edi, \(%rax\) +** movl %ecx, -4\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movdqu \(%rcx\), %xmm7 +** movdqu 16\(%rcx\), %xmm6 +** leaq \(%rdi,%rdx\), %rdi +** movdqu 32\(%rcx\), %xmm5 +** movdqu 48\(%rcx\), %xmm4 +** addq %rdx, %rcx +**.L13: +** movdqu -16\(%rcx\), %xmm3 +** movdqu -32\(%rcx\), %xmm2 +** subq \$64, %rsi +** subq \$64, %rdi +** movdqu -48\(%rcx\), %xmm1 +** movdqu -64\(%rcx\), %xmm0 +** subq \$64, %rcx +** movups %xmm3, 48\(%rdi\) +** movups %xmm2, 32\(%rdi\) +** movups %xmm1, 16\(%rdi\) +** movups %xmm0, \(%rdi\) +** cmpq \$64, %rsi +** ja .L13 +** movups %xmm7, \(%rax\) +** movups %xmm6, 16\(%rax\) +** movups %xmm5, 32\(%rax\) +** movups %xmm4, 48\(%rax\) +**.L2: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** movdqu -16\(%rsi,%rdx\), %xmm1 +** movdqu -32\(%rsi,%rdx\), %xmm0 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, 
-16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movzwl \(%rsi\), %edi +** movzwl -2\(%rsi,%rdx\), %ecx +** movw %di, \(%rax\) +** movw %cx, -2\(%rax,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#ifndef gcc_memmove +#define gcc_memmove gcc_memmove_xmm +#endif + +void * +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + return __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-2b.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-2b.c new file mode 100644 index 00000000000..0596ca75841 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-2b.c @@ -0,0 +1,173 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_ymm: +**.LFB0: +** .cfi_startproc +** movq %rdi, %rax +** movq %rsi, %rcx +** cmpq \$32, %rdx +** jb .L3 +** cmpq \$64, %rdx +** jbe .L18 +** cmpq \$256, %rdx +** jbe .L19 +** movq %rdx, %rsi +** cmpq %rdi, %rcx +** jb .L12 +** je .L2 +** vmovdqu -32\(%rcx,%rdx\), %ymm7 +** vmovdqu -64\(%rcx,%rdx\), %ymm6 +** vmovdqu -96\(%rcx,%rdx\), %ymm5 +** vmovdqu -128\(%rcx,%rdx\), %ymm4 +**.L13: +** vmovdqu \(%rcx\), %ymm3 +** addq \$-128, %rsi +** subq \$-128, %rdi +** subq \$-128, %rcx +** vmovdqu -96\(%rcx\), %ymm2 +** vmovdqu -64\(%rcx\), %ymm1 +** vmovdqu -32\(%rcx\), %ymm0 +** vmovdqu %ymm3, -128\(%rdi\) +** vmovdqu %ymm2, -96\(%rdi\) +** vmovdqu %ymm1, -64\(%rdi\) +** vmovdqu %ymm0, -32\(%rdi\) +** cmpq \$128, %rsi +** ja .L13 +** vmovdqu %ymm7, -32\(%rax,%rdx\) +** vmovdqu %ymm6, -64\(%rax,%rdx\) +** vmovdqu %ymm5, -96\(%rax,%rdx\) +** vmovdqu %ymm4, -128\(%rax,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L3: +** cmpq \$16, %rdx +** jb .L20 +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu 
-16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** cmpq \$8, %rdx +** jnb .L6 +** cmpq \$4, %rdx +** jnb .L7 +** cmpq \$1, %rdx +** ja .L8 +** jb .L2 +** movzbl \(%rsi\), %edx +** movb %dl, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L18: +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** cmpq \$128, %rdx +** jb .L11 +** vmovdqu \(%rsi\), %ymm7 +** vmovdqu 32\(%rsi\), %ymm6 +** vmovdqu 64\(%rsi\), %ymm5 +** vmovdqu 96\(%rsi\), %ymm4 +** vmovdqu -32\(%rsi,%rdx\), %ymm3 +** vmovdqu -64\(%rsi,%rdx\), %ymm2 +** vmovdqu -96\(%rsi,%rdx\), %ymm1 +** vmovdqu -128\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm7, \(%rdi\) +** vmovdqu %ymm6, 32\(%rdi\) +** vmovdqu %ymm5, 64\(%rdi\) +** vmovdqu %ymm4, 96\(%rdi\) +** vmovdqu %ymm3, -32\(%rdi,%rdx\) +** vmovdqu %ymm2, -64\(%rdi,%rdx\) +** vmovdqu %ymm1, -96\(%rdi,%rdx\) +** vmovdqu %ymm0, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movq \(%rsi\), %rdi +** movq -8\(%rsi,%rdx\), %rcx +** movq %rdi, \(%rax\) +** movq %rcx, -8\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** vmovdqu \(%rcx\), %ymm7 +** vmovdqu 32\(%rcx\), %ymm6 +** leaq \(%rdi,%rdx\), %rdi +** vmovdqu 64\(%rcx\), %ymm5 +** vmovdqu 96\(%rcx\), %ymm4 +** addq %rdx, %rcx +**.L14: +** vmovdqu -32\(%rcx\), %ymm3 +** vmovdqu -64\(%rcx\), %ymm2 +** addq \$-128, %rsi +** addq \$-128, %rdi +** vmovdqu -96\(%rcx\), %ymm1 +** vmovdqu -128\(%rcx\), %ymm0 +** addq \$-128, %rcx +** vmovdqu %ymm3, 96\(%rdi\) +** vmovdqu %ymm2, 64\(%rdi\) +** vmovdqu %ymm1, 32\(%rdi\) +** vmovdqu %ymm0, \(%rdi\) +** cmpq \$128, %rsi +** ja .L14 +** vmovdqu %ymm7, \(%rax\) +** vmovdqu %ymm6, 32\(%rax\) +** vmovdqu %ymm5, 64\(%rax\) +** vmovdqu %ymm4, 96\(%rax\) +** vzeroupper +**.L2: +** ret +** 
.p2align 4,,10 +** .p2align 3 +**.L11: +** vmovdqu \(%rsi\), %ymm3 +** vmovdqu 32\(%rsi\), %ymm2 +** vmovdqu -32\(%rsi,%rdx\), %ymm1 +** vmovdqu -64\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm3, \(%rdi\) +** vmovdqu %ymm2, 32\(%rdi\) +** vmovdqu %ymm1, -32\(%rdi,%rdx\) +** vmovdqu %ymm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl \(%rsi\), %edi +** movl -4\(%rsi,%rdx\), %ecx +** movl %edi, \(%rax\) +** movl %ecx, -4\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movzwl \(%rsi\), %edi +** movzwl -2\(%rsi,%rdx\), %ecx +** movw %di, \(%rax\) +** movw %cx, -2\(%rax,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#define gcc_memmove gcc_memmove_ymm +#include "builtin-memmove-2a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-2c.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-2c.c new file mode 100644 index 00000000000..cb3cb9e786e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-2c.c @@ -0,0 +1,184 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_zmm: +**.LFB0: +** .cfi_startproc +** movq %rdi, %rax +** movq %rsi, %rcx +** cmpq \$64, %rdx +** jb .L3 +** cmpq \$128, %rdx +** jbe .L19 +** cmpq \$512, %rdx +** jbe .L20 +** movq %rdx, %rsi +** cmpq %rdi, %rcx +** jb .L13 +** je .L2 +** vmovdqu64 -64\(%rcx,%rdx\), %zmm7 +** vmovdqu64 -128\(%rcx,%rdx\), %zmm6 +** vmovdqu64 -192\(%rcx,%rdx\), %zmm5 +** vmovdqu64 -256\(%rcx,%rdx\), %zmm4 +**.L14: +** vmovdqu64 \(%rcx\), %zmm3 +** vmovdqu64 64\(%rcx\), %zmm2 +** subq \$256, %rsi +** addq \$256, %rdi +** vmovdqu64 128\(%rcx\), %zmm1 +** addq \$256, %rcx +** vmovdqu64 -64\(%rcx\), %zmm0 +** vmovdqu64 %zmm3, -256\(%rdi\) +** vmovdqu64 %zmm2, -192\(%rdi\) +** vmovdqu64 %zmm1, -128\(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi\) +** cmpq \$256, %rsi +** ja .L14 +** vmovdqu64 %zmm7, -64\(%rax,%rdx\) +** vmovdqu64 %zmm6, -128\(%rax,%rdx\) +** vmovdqu64 %zmm5, -192\(%rax,%rdx\) +** vmovdqu64 %zmm4, -256\(%rax,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L3: +** cmpq \$32, %rdx +** jb .L21 +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L21: +** cmpq \$16, %rdx +** jnb .L6 +** cmpq \$8, %rdx +** jnb .L7 +** cmpq \$4, %rdx +** jnb .L8 +** cmpq \$1, %rdx +** ja .L9 +** jb .L2 +** movzbl \(%rsi\), %edx +** movb %dl, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** vmovdqu64 \(%rsi\), %zmm1 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm1, \(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** cmpq \$256, %rdx +** jb .L12 +** vmovdqu64 \(%rsi\), %zmm7 +** vmovdqu64 64\(%rsi\), %zmm6 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm3 +** vmovdqu64 -128\(%rsi,%rdx\), %zmm2 +** vmovdqu64 128\(%rsi\), %zmm5 +** vmovdqu64 192\(%rsi\), %zmm4 +** 
vmovdqu64 -192\(%rsi,%rdx\), %zmm1 +** vmovdqu64 -256\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm7, \(%rdi\) +** vmovdqu64 %zmm6, 64\(%rdi\) +** vmovdqu64 %zmm5, 128\(%rdi\) +** vmovdqu64 %zmm4, 192\(%rdi\) +** vmovdqu64 %zmm3, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm2, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm1, -192\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -256\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** vmovdqu64 \(%rcx\), %zmm7 +** leaq \(%rdi,%rdx\), %rdi +** vmovdqu64 64\(%rcx\), %zmm6 +** vmovdqu64 128\(%rcx\), %zmm5 +** vmovdqu64 192\(%rcx\), %zmm4 +** addq %rdx, %rcx +**.L15: +** vmovdqu64 -64\(%rcx\), %zmm3 +** vmovdqu64 -128\(%rcx\), %zmm2 +** subq \$256, %rsi +** subq \$256, %rdi +** vmovdqu64 -192\(%rcx\), %zmm1 +** subq \$256, %rcx +** vmovdqu64 \(%rcx\), %zmm0 +** vmovdqu64 %zmm3, 192\(%rdi\) +** vmovdqu64 %zmm2, 128\(%rdi\) +** vmovdqu64 %zmm1, 64\(%rdi\) +** vmovdqu64 %zmm0, \(%rdi\) +** cmpq \$256, %rsi +** ja .L15 +** vmovdqu64 %zmm7, \(%rax\) +** vmovdqu64 %zmm6, 64\(%rax\) +** vmovdqu64 %zmm5, 128\(%rax\) +** vmovdqu64 %zmm4, 192\(%rax\) +** vzeroupper +**.L2: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** vmovdqu64 \(%rsi\), %zmm3 +** vmovdqu64 64\(%rsi\), %zmm2 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm1 +** vmovdqu64 -128\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm3, \(%rdi\) +** vmovdqu64 %zmm2, 64\(%rdi\) +** vmovdqu64 %zmm1, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movq \(%rsi\), %rdi +** movq -8\(%rsi,%rdx\), %rcx +** movq %rdi, \(%rax\) +** movq %rcx, -8\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl \(%rsi\), %edi +** movl -4\(%rsi,%rdx\), %ecx +** movl %edi, \(%rax\) +** movl %ecx, -4\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 
+**.L9: +** movzwl \(%rsi\), %edi +** movzwl -2\(%rsi,%rdx\), %ecx +** movw %di, \(%rax\) +** movw %cx, -2\(%rax,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#define gcc_memmove gcc_memmove_zmm +#include "builtin-memmove-2a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-2d.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-2d.c new file mode 100644 index 00000000000..c27edfeaf0b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-2d.c @@ -0,0 +1,195 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mgeneral-regs-only -march=x86-64 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_gpr: +**.LFB0: +** .cfi_startproc +** movq %rdi, %rax +** cmpq \$8, %rdx +** jb .L3 +** cmpq \$16, %rdx +** jbe .L19 +** subq \$32, %rsp +** .cfi_def_cfa_offset 40 +** cmpq \$64, %rdx +** jbe .L20 +** movq %rsi, %rcx +** movq %rdx, %rsi +** cmpq %rdi, %rcx +** jb .L10 +** je .L2 +** movq %rbx, \(%rsp\) +** movq %rbp, 8\(%rsp\) +** movq %r14, 16\(%rsp\) +** movq %r15, 24\(%rsp\) +** .cfi_offset 3, -40 +** .cfi_offset 6, -32 +** .cfi_offset 14, -24 +** .cfi_offset 15, -16 +** movq -8\(%rcx,%rdx\), %r15 +** movq -16\(%rcx,%rdx\), %r14 +** movq -24\(%rcx,%rdx\), %rbp +** movq -32\(%rcx,%rdx\), %r11 +**.L11: +** movq 8\(%rcx\), %r10 +** movq 16\(%rcx\), %r9 +** subq \$32, %rsi +** addq \$32, %rdi +** movq 24\(%rcx\), %r8 +** movq \(%rcx\), %rbx +** addq \$32, %rcx +** movq %r10, -24\(%rdi\) +** movq %rbx, -32\(%rdi\) +** movq %r9, -16\(%rdi\) +** movq %r8, -8\(%rdi\) +** cmpq \$32, %rsi +** ja .L11 +** movq %r15, -8\(%rax,%rdx\) +** movq %r14, -16\(%rax,%rdx\) +** movq %rbp, -24\(%rax,%rdx\) +** movq %r11, -32\(%rax,%rdx\) +** movq \(%rsp\), %rbx +** .cfi_restore 3 +** movq 8\(%rsp\), %rbp +** .cfi_restore 6 +** movq 16\(%rsp\), %r14 +** .cfi_restore 14 +** movq 24\(%rsp\), %r15 +** .cfi_restore 
15 +** jmp .L2 +** .p2align 4,,10 +** .p2align 3 +**.L3: +** .cfi_def_cfa_offset 8 +** cmpq \$4, %rdx +** jb .L21 +** movl \(%rsi\), %edi +** movl -4\(%rsi,%rdx\), %ecx +** movl %edi, \(%rax\) +** movl %ecx, -4\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L21: +** cmpq \$1, %rdx +** ja .L6 +** jb .L16 +** movzbl \(%rsi\), %edx +** movb %dl, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** movq \(%rsi\), %rdi +** movq -8\(%rsi,%rdx\), %rcx +** movq %rdi, \(%rax\) +** movq %rcx, -8\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** .cfi_def_cfa_offset 40 +** cmpq \$32, %rdx +** jb .L9 +** movq %rbx, \(%rsp\) +** movq %r14, 16\(%rsp\) +** .cfi_offset 3, -40 +** .cfi_offset 14, -24 +** movq \(%rsi\), %rbx +** movq 8\(%rsi\), %r14 +** movq 16\(%rsi\), %r11 +** movq 24\(%rsi\), %r10 +** movq -8\(%rsi,%rdx\), %r9 +** movq -16\(%rsi,%rdx\), %r8 +** movq -24\(%rsi,%rdx\), %rdi +** movq -32\(%rsi,%rdx\), %rcx +** movq %rbx, \(%rax\) +** movq %r14, 8\(%rax\) +** movq %r11, 16\(%rax\) +** movq %r10, 24\(%rax\) +** movq %r9, -8\(%rax,%rdx\) +** movq %r8, -16\(%rax,%rdx\) +** movq %rdi, -24\(%rax,%rdx\) +** movq %rcx, -32\(%rax,%rdx\) +** movq \(%rsp\), %rbx +** .cfi_restore 3 +** movq 16\(%rsp\), %r14 +** .cfi_restore 14 +**.L2: +** addq \$32, %rsp +** .cfi_def_cfa_offset 8 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movzwl \(%rsi\), %edi +** movzwl -2\(%rsi,%rdx\), %ecx +** movw %di, \(%rax\) +** movw %cx, -2\(%rax,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** .cfi_def_cfa_offset 40 +** movq \(%rsi\), %r9 +** movq 8\(%rsi\), %r8 +** movq -8\(%rsi,%rdx\), %rdi +** movq -16\(%rsi,%rdx\), %rcx +** movq %r9, \(%rax\) +** movq %r8, 8\(%rax\) +** movq %rdi, -8\(%rax,%rdx\) +** movq %rcx, -16\(%rax,%rdx\) +** jmp .L2 +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movq %rbx, \(%rsp\) +** leaq \(%rdi,%rdx\), %rdi +** movq %r14, 16\(%rsp\) +** movq %r15, 24\(%rsp\) 
+** .cfi_offset 3, -40 +** .cfi_offset 14, -24 +** .cfi_offset 15, -16 +** movq \(%rcx\), %r14 +** movq 8\(%rcx\), %r15 +** movq 16\(%rcx\), %r10 +** movq 24\(%rcx\), %r11 +** addq %rdx, %rcx +**.L12: +** movq -16\(%rcx\), %r9 +** movq -24\(%rcx\), %r8 +** subq \$32, %rsi +** subq \$32, %rdi +** movq -32\(%rcx\), %rdx +** movq -8\(%rcx\), %rbx +** subq \$32, %rcx +** movq %r9, 16\(%rdi\) +** movq %rbx, 24\(%rdi\) +** movq %r8, 8\(%rdi\) +** movq %rdx, \(%rdi\) +** cmpq \$32, %rsi +** ja .L12 +** movq %r14, \(%rax\) +** movq \(%rsp\), %rbx +** .cfi_restore 3 +** movq %r15, 8\(%rax\) +** movq 16\(%rsp\), %r14 +** .cfi_restore 14 +** movq %r10, 16\(%rax\) +** movq 24\(%rsp\), %r15 +** .cfi_restore 15 +** movq %r11, 24\(%rax\) +** jmp .L2 +** .cfi_endproc +**... +*/ + +#define gcc_memmove gcc_memmove_gpr +#include "builtin-memmove-2a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-3a.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-3a.c new file mode 100644 index 00000000000..83cb8e1a446 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-3a.c @@ -0,0 +1,133 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_xmm: +**.LFB0: +** .cfi_startproc +** cmpq \$16, %rdx +** ja .L13 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$32, %rdx +** jbe .L14 +** cmpq \$128, %rdx +** ja .L5 +** cmpq \$64, %rdx +** jnb .L15 +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** movdqu -16\(%rsi,%rdx\), %xmm1 +** movdqu -32\(%rsi,%rdx\), %xmm0 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movdqu \(%rsi\), %xmm1 +** movdqu -16\(%rsi,%rdx\), %xmm0 +** movups %xmm1, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L7 +** je .L1 +** movdqu -16\(%rax,%rdx\), %xmm7 +** movdqu -32\(%rax,%rdx\), %xmm6 +** movdqu -48\(%rax,%rdx\), %xmm5 +** movdqu -64\(%rax,%rdx\), %xmm4 +**.L8: +** movdqu \(%rax\), %xmm3 +** subq \$64, %rsi +** addq \$64, %rcx +** addq \$64, %rax +** movdqu -48\(%rax\), %xmm2 +** movdqu -32\(%rax\), %xmm1 +** movdqu -16\(%rax\), %xmm0 +** movups %xmm3, -64\(%rcx\) +** movups %xmm2, -48\(%rcx\) +** movups %xmm1, -32\(%rcx\) +** movups %xmm0, -16\(%rcx\) +** cmpq \$64, %rsi +** ja .L8 +** movups %xmm7, -16\(%rdi,%rdx\) +** movups %xmm6, -32\(%rdi,%rdx\) +** movups %xmm5, -48\(%rdi,%rdx\) +** movups %xmm4, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movdqu \(%rax\), %xmm3 +** movdqu 16\(%rax\), %xmm2 +** leaq \(%rdi,%rdx\), %rcx +** movdqu 32\(%rax\), %xmm1 +** movdqu 48\(%rax\), %xmm0 +** addq %rdx, %rax +**.L9: +** movdqu -16\(%rax\), %xmm7 +** movdqu -32\(%rax\), %xmm6 +** subq \$64, %rsi +** subq \$64, %rcx +** movdqu -48\(%rax\), %xmm5 +** movdqu -64\(%rax\), %xmm4 +** subq \$64, %rax +** movups %xmm7, 48\(%rcx\) +** movups %xmm6, 32\(%rcx\) +** movups 
%xmm5, 16\(%rcx\) +** movups %xmm4, \(%rcx\) +** cmpq \$64, %rsi +** ja .L9 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, 32\(%rdi\) +** movups %xmm0, 48\(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** movdqu \(%rsi\), %xmm7 +** movdqu 16\(%rsi\), %xmm6 +** movdqu 32\(%rsi\), %xmm5 +** movdqu 48\(%rsi\), %xmm4 +** movdqu -16\(%rsi,%rdx\), %xmm3 +** movdqu -32\(%rsi,%rdx\), %xmm2 +** movdqu -48\(%rsi,%rdx\), %xmm1 +** movdqu -64\(%rsi,%rdx\), %xmm0 +** movups %xmm7, \(%rdi\) +** movups %xmm6, 16\(%rdi\) +** movups %xmm5, 32\(%rdi\) +** movups %xmm4, 48\(%rdi\) +** movups %xmm3, -16\(%rdi,%rdx\) +** movups %xmm2, -32\(%rdi,%rdx\) +** movups %xmm1, -48\(%rdi,%rdx\) +** movups %xmm0, -64\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#ifndef gcc_memmove +#define gcc_memmove gcc_memmove_xmm +#endif + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n > 16) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-3b.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-3b.c new file mode 100644 index 00000000000..43fae5c134f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-3b.c @@ -0,0 +1,140 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_ymm: +**.LFB0: +** .cfi_startproc +** cmpq \$16, %rdx +** ja .L16 +**.L14: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$32, %rdx +** jb .L6 +** cmpq \$64, %rdx +** ja .L5 +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** cmpq \$256, %rdx +** jbe .L17 +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L9 +** je .L14 +** vmovdqu -32\(%rax,%rdx\), %ymm7 +** vmovdqu -64\(%rax,%rdx\), %ymm6 +** vmovdqu -96\(%rax,%rdx\), %ymm5 +** vmovdqu -128\(%rax,%rdx\), %ymm4 +**.L10: +** vmovdqu \(%rax\), %ymm3 +** addq \$-128, %rsi +** subq \$-128, %rcx +** subq \$-128, %rax +** vmovdqu -96\(%rax\), %ymm2 +** vmovdqu -64\(%rax\), %ymm1 +** vmovdqu -32\(%rax\), %ymm0 +** vmovdqu %ymm3, -128\(%rcx\) +** vmovdqu %ymm2, -96\(%rcx\) +** vmovdqu %ymm1, -64\(%rcx\) +** vmovdqu %ymm0, -32\(%rcx\) +** cmpq \$128, %rsi +** ja .L10 +** vmovdqu %ymm7, -32\(%rdi,%rdx\) +** vmovdqu %ymm6, -64\(%rdi,%rdx\) +** vmovdqu %ymm5, -96\(%rdi,%rdx\) +** vmovdqu %ymm4, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L17: +** cmpq \$128, %rdx +** jb .L8 +** vmovdqu \(%rsi\), %ymm7 +** vmovdqu 32\(%rsi\), %ymm6 +** vmovdqu 64\(%rsi\), %ymm5 +** vmovdqu 96\(%rsi\), %ymm4 +** vmovdqu -32\(%rsi,%rdx\), %ymm3 +** vmovdqu -64\(%rsi,%rdx\), %ymm2 +** vmovdqu -96\(%rsi,%rdx\), %ymm1 +** vmovdqu -128\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm7, \(%rdi\) +** vmovdqu %ymm6, 32\(%rdi\) +** vmovdqu %ymm5, 64\(%rdi\) +** vmovdqu %ymm4, 96\(%rdi\) +** vmovdqu %ymm3, -32\(%rdi,%rdx\) +** vmovdqu %ymm2, 
-64\(%rdi,%rdx\) +** vmovdqu %ymm1, -96\(%rdi,%rdx\) +** vmovdqu %ymm0, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** vmovdqu \(%rsi\), %ymm3 +** vmovdqu 32\(%rsi\), %ymm2 +** vmovdqu -32\(%rsi,%rdx\), %ymm1 +** vmovdqu -64\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm3, \(%rdi\) +** vmovdqu %ymm2, 32\(%rdi\) +** vmovdqu %ymm1, -32\(%rdi,%rdx\) +** vmovdqu %ymm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** vmovdqu \(%rax\), %ymm3 +** vmovdqu 32\(%rax\), %ymm2 +** leaq \(%rdi,%rdx\), %rcx +** vmovdqu 64\(%rax\), %ymm1 +** vmovdqu 96\(%rax\), %ymm0 +** addq %rdx, %rax +**.L11: +** vmovdqu -32\(%rax\), %ymm7 +** vmovdqu -64\(%rax\), %ymm6 +** addq \$-128, %rsi +** addq \$-128, %rcx +** vmovdqu -96\(%rax\), %ymm5 +** vmovdqu -128\(%rax\), %ymm4 +** addq \$-128, %rax +** vmovdqu %ymm7, 96\(%rcx\) +** vmovdqu %ymm6, 64\(%rcx\) +** vmovdqu %ymm5, 32\(%rcx\) +** vmovdqu %ymm4, \(%rcx\) +** cmpq \$128, %rsi +** ja .L11 +** vmovdqu %ymm3, \(%rdi\) +** vmovdqu %ymm2, 32\(%rdi\) +** vmovdqu %ymm1, 64\(%rdi\) +** vmovdqu %ymm0, 96\(%rdi\) +** vzeroupper +** ret +** .cfi_endproc +**... +*/ + +#define gcc_memmove gcc_memmove_ymm +#include "builtin-memmove-3a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-3c.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-3c.c new file mode 100644 index 00000000000..11ccb698e43 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-3c.c @@ -0,0 +1,151 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_zmm: +**.LFB0: +** .cfi_startproc +** cmpq \$16, %rdx +** ja .L18 +**.L16: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L18: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$64, %rdx +** jnb .L19 +** cmpq \$32, %rdx +** jb .L15 +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** cmpq \$128, %rdx +** ja .L5 +** vmovdqu64 \(%rsi\), %zmm1 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm1, \(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** cmpq \$512, %rdx +** jbe .L20 +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L10 +** je .L16 +** vmovdqu64 -64\(%rax,%rdx\), %zmm7 +** vmovdqu64 -128\(%rax,%rdx\), %zmm6 +** vmovdqu64 -192\(%rax,%rdx\), %zmm5 +** vmovdqu64 -256\(%rax,%rdx\), %zmm4 +**.L11: +** vmovdqu64 \(%rax\), %zmm3 +** addq \$256, %rax +** vmovdqu64 -192\(%rax\), %zmm2 +** subq \$256, %rsi +** vmovdqu64 -128\(%rax\), %zmm1 +** vmovdqu64 -64\(%rax\), %zmm0 +** addq \$256, %rcx +** vmovdqu64 %zmm3, -256\(%rcx\) +** vmovdqu64 %zmm2, -192\(%rcx\) +** vmovdqu64 %zmm1, -128\(%rcx\) +** vmovdqu64 %zmm0, -64\(%rcx\) +** cmpq \$256, %rsi +** ja .L11 +** vmovdqu64 %zmm7, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm6, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm5, -192\(%rdi,%rdx\) +** vmovdqu64 %zmm4, -256\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** cmpq \$256, %rdx +** jb .L9 +** vmovdqu64 \(%rsi\), %zmm7 +** vmovdqu64 64\(%rsi\), %zmm6 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm3 +** vmovdqu64 -128\(%rsi,%rdx\), %zmm2 +** vmovdqu64 128\(%rsi\), %zmm5 +** 
vmovdqu64 192\(%rsi\), %zmm4 +** vmovdqu64 -192\(%rsi,%rdx\), %zmm1 +** vmovdqu64 -256\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm7, \(%rdi\) +** vmovdqu64 %zmm6, 64\(%rdi\) +** vmovdqu64 %zmm5, 128\(%rdi\) +** vmovdqu64 %zmm4, 192\(%rdi\) +** vmovdqu64 %zmm3, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm2, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm1, -192\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -256\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** vmovdqu64 \(%rsi\), %zmm3 +** vmovdqu64 64\(%rsi\), %zmm2 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm1 +** vmovdqu64 -128\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm3, \(%rdi\) +** vmovdqu64 %zmm2, 64\(%rdi\) +** vmovdqu64 %zmm1, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** vmovdqu64 \(%rax\), %zmm3 +** leaq \(%rdi,%rdx\), %rcx +** vmovdqu64 64\(%rax\), %zmm2 +** vmovdqu64 128\(%rax\), %zmm1 +** vmovdqu64 192\(%rax\), %zmm0 +** addq %rdx, %rax +**.L12: +** vmovdqu64 -64\(%rax\), %zmm7 +** subq \$256, %rax +** vmovdqu64 128\(%rax\), %zmm6 +** subq \$256, %rsi +** vmovdqu64 64\(%rax\), %zmm5 +** vmovdqu64 \(%rax\), %zmm4 +** subq \$256, %rcx +** vmovdqu64 %zmm7, 192\(%rcx\) +** vmovdqu64 %zmm6, 128\(%rcx\) +** vmovdqu64 %zmm5, 64\(%rcx\) +** vmovdqu64 %zmm4, \(%rcx\) +** cmpq \$256, %rsi +** ja .L12 +** vmovdqu64 %zmm3, \(%rdi\) +** vmovdqu64 %zmm2, 64\(%rdi\) +** vmovdqu64 %zmm1, 128\(%rdi\) +** vmovdqu64 %zmm0, 192\(%rdi\) +** vzeroupper +** ret +** .cfi_endproc +**... 
+*/ + +#define gcc_memmove gcc_memmove_zmm +#include "builtin-memmove-3a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-4a.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-4a.c new file mode 100644 index 00000000000..c437a537127 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-4a.c @@ -0,0 +1,123 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_xmm: +**.LFB0: +** .cfi_startproc +** cmpq \$32, %rdx +** ja .L13 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$128, %rdx +** jbe .L14 +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L7 +** je .L1 +** movdqu -16\(%rax,%rdx\), %xmm7 +** movdqu -32\(%rax,%rdx\), %xmm6 +** movdqu -48\(%rax,%rdx\), %xmm5 +** movdqu -64\(%rax,%rdx\), %xmm4 +**.L8: +** movdqu \(%rax\), %xmm3 +** subq \$64, %rsi +** addq \$64, %rcx +** addq \$64, %rax +** movdqu -48\(%rax\), %xmm2 +** movdqu -32\(%rax\), %xmm1 +** movdqu -16\(%rax\), %xmm0 +** movups %xmm3, -64\(%rcx\) +** movups %xmm2, -48\(%rcx\) +** movups %xmm1, -32\(%rcx\) +** movups %xmm0, -16\(%rcx\) +** cmpq \$64, %rsi +** ja .L8 +** movups %xmm7, -16\(%rdi,%rdx\) +** movups %xmm6, -32\(%rdi,%rdx\) +** movups %xmm5, -48\(%rdi,%rdx\) +** movups %xmm4, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** cmpq \$64, %rdx +** jb .L6 +** movdqu \(%rsi\), %xmm7 +** movdqu 16\(%rsi\), %xmm6 +** movdqu 32\(%rsi\), %xmm5 +** movdqu 48\(%rsi\), %xmm4 +** movdqu -16\(%rsi,%rdx\), %xmm3 +** movdqu -32\(%rsi,%rdx\), %xmm2 +** movdqu -48\(%rsi,%rdx\), %xmm1 +** movdqu -64\(%rsi,%rdx\), %xmm0 +** movups %xmm7, \(%rdi\) +** movups %xmm6, 16\(%rdi\) +** movups %xmm5, 32\(%rdi\) +** movups %xmm4, 48\(%rdi\) +** movups %xmm3, -16\(%rdi,%rdx\) +** movups %xmm2, 
-32\(%rdi,%rdx\) +** movups %xmm1, -48\(%rdi,%rdx\) +** movups %xmm0, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** movdqu -16\(%rsi,%rdx\), %xmm1 +** movdqu -32\(%rsi,%rdx\), %xmm0 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movdqu \(%rax\), %xmm3 +** movdqu 16\(%rax\), %xmm2 +** leaq \(%rdi,%rdx\), %rcx +** movdqu 32\(%rax\), %xmm1 +** movdqu 48\(%rax\), %xmm0 +** addq %rdx, %rax +**.L9: +** movdqu -16\(%rax\), %xmm7 +** movdqu -32\(%rax\), %xmm6 +** subq \$64, %rsi +** subq \$64, %rcx +** movdqu -48\(%rax\), %xmm5 +** movdqu -64\(%rax\), %xmm4 +** subq \$64, %rax +** movups %xmm7, 48\(%rcx\) +** movups %xmm6, 32\(%rcx\) +** movups %xmm5, 16\(%rcx\) +** movups %xmm4, \(%rcx\) +** cmpq \$64, %rsi +** ja .L9 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, 32\(%rdi\) +** movups %xmm0, 48\(%rdi\) +** ret +** .cfi_endproc +**... +*/ + +#ifndef gcc_memmove +#define gcc_memmove gcc_memmove_xmm +#endif + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n > 32) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-4b.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-4b.c new file mode 100644 index 00000000000..4b65fca11a4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-4b.c @@ -0,0 +1,130 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_ymm: +**.LFB0: +** .cfi_startproc +** cmpq \$32, %rdx +** ja .L14 +**.L12: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$64, %rdx +** jbe .L15 +** cmpq \$256, %rdx +** ja .L5 +** cmpq \$128, %rdx +** jnb .L16 +** vmovdqu \(%rsi\), %ymm3 +** vmovdqu 32\(%rsi\), %ymm2 +** vmovdqu -32\(%rsi,%rdx\), %ymm1 +** vmovdqu -64\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm3, \(%rdi\) +** vmovdqu %ymm2, 32\(%rdi\) +** vmovdqu %ymm1, -32\(%rdi,%rdx\) +** vmovdqu %ymm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L7 +** je .L12 +** vmovdqu -32\(%rax,%rdx\), %ymm7 +** vmovdqu -64\(%rax,%rdx\), %ymm6 +** vmovdqu -96\(%rax,%rdx\), %ymm5 +** vmovdqu -128\(%rax,%rdx\), %ymm4 +**.L8: +** vmovdqu \(%rax\), %ymm3 +** addq \$-128, %rsi +** subq \$-128, %rcx +** subq \$-128, %rax +** vmovdqu -96\(%rax\), %ymm2 +** vmovdqu -64\(%rax\), %ymm1 +** vmovdqu -32\(%rax\), %ymm0 +** vmovdqu %ymm3, -128\(%rcx\) +** vmovdqu %ymm2, -96\(%rcx\) +** vmovdqu %ymm1, -64\(%rcx\) +** vmovdqu %ymm0, -32\(%rcx\) +** cmpq \$128, %rsi +** ja .L8 +** vmovdqu %ymm7, -32\(%rdi,%rdx\) +** vmovdqu %ymm6, -64\(%rdi,%rdx\) +** vmovdqu %ymm5, -96\(%rdi,%rdx\) +** vmovdqu %ymm4, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** vmovdqu \(%rax\), %ymm3 +** vmovdqu 32\(%rax\), %ymm2 +** leaq \(%rdi,%rdx\), %rcx +** vmovdqu 64\(%rax\), %ymm1 +** vmovdqu 96\(%rax\), %ymm0 +** addq %rdx, %rax +**.L9: +** vmovdqu -32\(%rax\), %ymm7 +** vmovdqu -64\(%rax\), %ymm6 +** addq \$-128, %rsi +** addq \$-128, %rcx +** vmovdqu -96\(%rax\), %ymm5 +** vmovdqu 
-128\(%rax\), %ymm4 +** addq \$-128, %rax +** vmovdqu %ymm7, 96\(%rcx\) +** vmovdqu %ymm6, 64\(%rcx\) +** vmovdqu %ymm5, 32\(%rcx\) +** vmovdqu %ymm4, \(%rcx\) +** cmpq \$128, %rsi +** ja .L9 +** vmovdqu %ymm3, \(%rdi\) +** vmovdqu %ymm2, 32\(%rdi\) +** vmovdqu %ymm1, 64\(%rdi\) +** vmovdqu %ymm0, 96\(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** vmovdqu \(%rsi\), %ymm7 +** vmovdqu 32\(%rsi\), %ymm6 +** vmovdqu 64\(%rsi\), %ymm5 +** vmovdqu 96\(%rsi\), %ymm4 +** vmovdqu -32\(%rsi,%rdx\), %ymm3 +** vmovdqu -64\(%rsi,%rdx\), %ymm2 +** vmovdqu -96\(%rsi,%rdx\), %ymm1 +** vmovdqu -128\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm7, \(%rdi\) +** vmovdqu %ymm6, 32\(%rdi\) +** vmovdqu %ymm5, 64\(%rdi\) +** vmovdqu %ymm4, 96\(%rdi\) +** vmovdqu %ymm3, -32\(%rdi,%rdx\) +** vmovdqu %ymm2, -64\(%rdi,%rdx\) +** vmovdqu %ymm1, -96\(%rdi,%rdx\) +** vmovdqu %ymm0, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .cfi_endproc +**... +*/ + +#define gcc_memmove gcc_memmove_ymm +#include "builtin-memmove-4a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-4c.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-4c.c new file mode 100644 index 00000000000..fea3e496c0c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-4c.c @@ -0,0 +1,141 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_zmm: +**.LFB0: +** .cfi_startproc +** cmpq \$32, %rdx +** ja .L16 +**.L14: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$64, %rdx +** jb .L6 +** cmpq \$128, %rdx +** ja .L5 +** vmovdqu64 \(%rsi\), %zmm1 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm1, \(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** cmpq \$512, %rdx +** jbe .L17 +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L9 +** je .L14 +** vmovdqu64 -64\(%rax,%rdx\), %zmm7 +** vmovdqu64 -128\(%rax,%rdx\), %zmm6 +** vmovdqu64 -192\(%rax,%rdx\), %zmm5 +** vmovdqu64 -256\(%rax,%rdx\), %zmm4 +**.L10: +** vmovdqu64 \(%rax\), %zmm3 +** addq \$256, %rax +** vmovdqu64 -192\(%rax\), %zmm2 +** subq \$256, %rsi +** vmovdqu64 -128\(%rax\), %zmm1 +** vmovdqu64 -64\(%rax\), %zmm0 +** addq \$256, %rcx +** vmovdqu64 %zmm3, -256\(%rcx\) +** vmovdqu64 %zmm2, -192\(%rcx\) +** vmovdqu64 %zmm1, -128\(%rcx\) +** vmovdqu64 %zmm0, -64\(%rcx\) +** cmpq \$256, %rsi +** ja .L10 +** vmovdqu64 %zmm7, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm6, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm5, -192\(%rdi,%rdx\) +** vmovdqu64 %zmm4, -256\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L17: +** cmpq \$256, %rdx +** jb .L8 +** vmovdqu64 \(%rsi\), %zmm7 +** vmovdqu64 64\(%rsi\), %zmm6 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm3 +** vmovdqu64 -128\(%rsi,%rdx\), %zmm2 +** vmovdqu64 128\(%rsi\), %zmm5 +** vmovdqu64 192\(%rsi\), %zmm4 +** vmovdqu64 -192\(%rsi,%rdx\), %zmm1 +** vmovdqu64 -256\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm7, \(%rdi\) +** vmovdqu64 %zmm6, 64\(%rdi\) +** vmovdqu64 %zmm5, 128\(%rdi\) +** 
vmovdqu64 %zmm4, 192\(%rdi\) +** vmovdqu64 %zmm3, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm2, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm1, -192\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -256\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** vmovdqu64 \(%rsi\), %zmm3 +** vmovdqu64 64\(%rsi\), %zmm2 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm1 +** vmovdqu64 -128\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm3, \(%rdi\) +** vmovdqu64 %zmm2, 64\(%rdi\) +** vmovdqu64 %zmm1, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** vmovdqu64 \(%rax\), %zmm3 +** leaq \(%rdi,%rdx\), %rcx +** vmovdqu64 64\(%rax\), %zmm2 +** vmovdqu64 128\(%rax\), %zmm1 +** vmovdqu64 192\(%rax\), %zmm0 +** addq %rdx, %rax +**.L11: +** vmovdqu64 -64\(%rax\), %zmm7 +** subq \$256, %rax +** vmovdqu64 128\(%rax\), %zmm6 +** subq \$256, %rsi +** vmovdqu64 64\(%rax\), %zmm5 +** vmovdqu64 \(%rax\), %zmm4 +** subq \$256, %rcx +** vmovdqu64 %zmm7, 192\(%rcx\) +** vmovdqu64 %zmm6, 128\(%rcx\) +** vmovdqu64 %zmm5, 64\(%rcx\) +** vmovdqu64 %zmm4, \(%rcx\) +** cmpq \$256, %rsi +** ja .L11 +** vmovdqu64 %zmm3, \(%rdi\) +** vmovdqu64 %zmm2, 64\(%rdi\) +** vmovdqu64 %zmm1, 128\(%rdi\) +** vmovdqu64 %zmm0, 192\(%rdi\) +** vzeroupper +** ret +** .cfi_endproc +**... +*/ + +#define gcc_memmove gcc_memmove_zmm +#include "builtin-memmove-4a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-5a.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-5a.c new file mode 100644 index 00000000000..c86defbabd5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-5a.c @@ -0,0 +1,109 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_xmm: +**.LFB0: +** .cfi_startproc +** cmpq \$67, %rdx +** ja .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$128, %rdx +** jbe .L13 +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L6 +** je .L1 +** movdqu -16\(%rax,%rdx\), %xmm7 +** movdqu -32\(%rax,%rdx\), %xmm6 +** movdqu -48\(%rax,%rdx\), %xmm5 +** movdqu -64\(%rax,%rdx\), %xmm4 +**.L7: +** movdqu \(%rax\), %xmm3 +** subq \$64, %rsi +** addq \$64, %rcx +** addq \$64, %rax +** movdqu -48\(%rax\), %xmm2 +** movdqu -32\(%rax\), %xmm1 +** movdqu -16\(%rax\), %xmm0 +** movups %xmm3, -64\(%rcx\) +** movups %xmm2, -48\(%rcx\) +** movups %xmm1, -32\(%rcx\) +** movups %xmm0, -16\(%rcx\) +** cmpq \$64, %rsi +** ja .L7 +** movups %xmm7, -16\(%rdi,%rdx\) +** movups %xmm6, -32\(%rdi,%rdx\) +** movups %xmm5, -48\(%rdi,%rdx\) +** movups %xmm4, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movdqu \(%rsi\), %xmm7 +** movdqu 16\(%rsi\), %xmm6 +** movdqu 32\(%rsi\), %xmm5 +** movdqu 48\(%rsi\), %xmm4 +** movdqu -16\(%rsi,%rdx\), %xmm3 +** movdqu -32\(%rsi,%rdx\), %xmm2 +** movdqu -48\(%rsi,%rdx\), %xmm1 +** movdqu -64\(%rsi,%rdx\), %xmm0 +** movups %xmm7, \(%rdi\) +** movups %xmm6, 16\(%rdi\) +** movups %xmm5, 32\(%rdi\) +** movups %xmm4, 48\(%rdi\) +** movups %xmm3, -16\(%rdi,%rdx\) +** movups %xmm2, -32\(%rdi,%rdx\) +** movups %xmm1, -48\(%rdi,%rdx\) +** movups %xmm0, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movdqu \(%rax\), %xmm3 +** movdqu 16\(%rax\), %xmm2 +** leaq \(%rdi,%rdx\), %rcx +** movdqu 32\(%rax\), %xmm1 +** movdqu 48\(%rax\), %xmm0 +** addq %rdx, %rax +**.L8: +** movdqu -16\(%rax\), %xmm7 +** movdqu -32\(%rax\), %xmm6 +** subq \$64, %rsi +** subq \$64, %rcx +** movdqu -48\(%rax\), %xmm5 +** movdqu -64\(%rax\), %xmm4 +** subq \$64, %rax +** movups %xmm7, 48\(%rcx\) +** movups %xmm6, 
32\(%rcx\) +** movups %xmm5, 16\(%rcx\) +** movups %xmm4, \(%rcx\) +** cmpq \$64, %rsi +** ja .L8 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, 32\(%rdi\) +** movups %xmm0, 48\(%rdi\) +** ret +** .cfi_endproc +**... +*/ + +#ifndef gcc_memmove +#define gcc_memmove gcc_memmove_xmm +#endif + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n > 67) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-5b.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-5b.c new file mode 100644 index 00000000000..e5fc1569e12 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-5b.c @@ -0,0 +1,120 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_ymm: +**.LFB0: +** .cfi_startproc +** cmpq \$67, %rdx +** ja .L14 +**.L12: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$256, %rdx +** jbe .L15 +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L7 +** je .L12 +** vmovdqu -32\(%rax,%rdx\), %ymm7 +** vmovdqu -64\(%rax,%rdx\), %ymm6 +** vmovdqu -96\(%rax,%rdx\), %ymm5 +** vmovdqu -128\(%rax,%rdx\), %ymm4 +**.L8: +** vmovdqu \(%rax\), %ymm3 +** addq \$-128, %rsi +** subq \$-128, %rcx +** subq \$-128, %rax +** vmovdqu -96\(%rax\), %ymm2 +** vmovdqu -64\(%rax\), %ymm1 +** vmovdqu -32\(%rax\), %ymm0 +** vmovdqu %ymm3, -128\(%rcx\) +** vmovdqu %ymm2, -96\(%rcx\) +** vmovdqu %ymm1, -64\(%rcx\) +** vmovdqu %ymm0, -32\(%rcx\) +** cmpq \$128, %rsi +** ja .L8 +** vmovdqu %ymm7, -32\(%rdi,%rdx\) +** vmovdqu %ymm6, -64\(%rdi,%rdx\) +** vmovdqu %ymm5, -96\(%rdi,%rdx\) +** vmovdqu %ymm4, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** cmpq \$128, %rdx +** jb .L6 +** vmovdqu 
\(%rsi\), %ymm7 +** vmovdqu 32\(%rsi\), %ymm6 +** vmovdqu 64\(%rsi\), %ymm5 +** vmovdqu 96\(%rsi\), %ymm4 +** vmovdqu -32\(%rsi,%rdx\), %ymm3 +** vmovdqu -64\(%rsi,%rdx\), %ymm2 +** vmovdqu -96\(%rsi,%rdx\), %ymm1 +** vmovdqu -128\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm7, \(%rdi\) +** vmovdqu %ymm6, 32\(%rdi\) +** vmovdqu %ymm5, 64\(%rdi\) +** vmovdqu %ymm4, 96\(%rdi\) +** vmovdqu %ymm3, -32\(%rdi,%rdx\) +** vmovdqu %ymm2, -64\(%rdi,%rdx\) +** vmovdqu %ymm1, -96\(%rdi,%rdx\) +** vmovdqu %ymm0, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** vmovdqu \(%rsi\), %ymm3 +** vmovdqu 32\(%rsi\), %ymm2 +** vmovdqu -32\(%rsi,%rdx\), %ymm1 +** vmovdqu -64\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm3, \(%rdi\) +** vmovdqu %ymm2, 32\(%rdi\) +** vmovdqu %ymm1, -32\(%rdi,%rdx\) +** vmovdqu %ymm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** vmovdqu \(%rax\), %ymm3 +** vmovdqu 32\(%rax\), %ymm2 +** leaq \(%rdi,%rdx\), %rcx +** vmovdqu 64\(%rax\), %ymm1 +** vmovdqu 96\(%rax\), %ymm0 +** addq %rdx, %rax +**.L9: +** vmovdqu -32\(%rax\), %ymm7 +** vmovdqu -64\(%rax\), %ymm6 +** addq \$-128, %rsi +** addq \$-128, %rcx +** vmovdqu -96\(%rax\), %ymm5 +** vmovdqu -128\(%rax\), %ymm4 +** addq \$-128, %rax +** vmovdqu %ymm7, 96\(%rcx\) +** vmovdqu %ymm6, 64\(%rcx\) +** vmovdqu %ymm5, 32\(%rcx\) +** vmovdqu %ymm4, \(%rcx\) +** cmpq \$128, %rsi +** ja .L9 +** vmovdqu %ymm3, \(%rdi\) +** vmovdqu %ymm2, 32\(%rdi\) +** vmovdqu %ymm1, 64\(%rdi\) +** vmovdqu %ymm0, 96\(%rdi\) +** vzeroupper +** ret +** .cfi_endproc +**.LFE0: +**... 
+*/ + +#define gcc_memmove gcc_memmove_ymm +#include "builtin-memmove-5a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-5c.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-5c.c new file mode 100644 index 00000000000..a8443f6a67f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-5c.c @@ -0,0 +1,130 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove_zmm: +**.LFB0: +** .cfi_startproc +** cmpq \$67, %rdx +** ja .L14 +**.L12: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movq %rdi, %rcx +** movq %rsi, %rax +** cmpq \$128, %rdx +** jbe .L15 +** cmpq \$512, %rdx +** ja .L5 +** cmpq \$256, %rdx +** jnb .L16 +** vmovdqu64 \(%rsi\), %zmm3 +** vmovdqu64 64\(%rsi\), %zmm2 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm1 +** vmovdqu64 -128\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm3, \(%rdi\) +** vmovdqu64 %zmm2, 64\(%rdi\) +** vmovdqu64 %zmm1, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -128\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** vmovdqu64 \(%rsi\), %zmm1 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm1, \(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movq %rdx, %rsi +** cmpq %rdi, %rax +** jb .L7 +** je .L12 +** vmovdqu64 -64\(%rax,%rdx\), %zmm7 +** vmovdqu64 -128\(%rax,%rdx\), %zmm6 +** vmovdqu64 -192\(%rax,%rdx\), %zmm5 +** vmovdqu64 -256\(%rax,%rdx\), %zmm4 +**.L8: +** vmovdqu64 \(%rax\), %zmm3 +** addq \$256, %rax +** vmovdqu64 -192\(%rax\), %zmm2 +** subq \$256, %rsi +** vmovdqu64 -128\(%rax\), %zmm1 +** vmovdqu64 -64\(%rax\), %zmm0 +** addq \$256, %rcx +** vmovdqu64 %zmm3, -256\(%rcx\) +** vmovdqu64 %zmm2, -192\(%rcx\) +** vmovdqu64 %zmm1, -128\(%rcx\) +** vmovdqu64 %zmm0, -64\(%rcx\) 
+** cmpq \$256, %rsi +** ja .L8 +** vmovdqu64 %zmm7, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm6, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm5, -192\(%rdi,%rdx\) +** vmovdqu64 %zmm4, -256\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** vmovdqu64 \(%rax\), %zmm3 +** leaq \(%rdi,%rdx\), %rcx +** vmovdqu64 64\(%rax\), %zmm2 +** vmovdqu64 128\(%rax\), %zmm1 +** vmovdqu64 192\(%rax\), %zmm0 +** addq %rdx, %rax +**.L9: +** vmovdqu64 -64\(%rax\), %zmm7 +** subq \$256, %rax +** vmovdqu64 128\(%rax\), %zmm6 +** subq \$256, %rsi +** vmovdqu64 64\(%rax\), %zmm5 +** vmovdqu64 \(%rax\), %zmm4 +** subq \$256, %rcx +** vmovdqu64 %zmm7, 192\(%rcx\) +** vmovdqu64 %zmm6, 128\(%rcx\) +** vmovdqu64 %zmm5, 64\(%rcx\) +** vmovdqu64 %zmm4, \(%rcx\) +** cmpq \$256, %rsi +** ja .L9 +** vmovdqu64 %zmm3, \(%rdi\) +** vmovdqu64 %zmm2, 64\(%rdi\) +** vmovdqu64 %zmm1, 128\(%rdi\) +** vmovdqu64 %zmm0, 192\(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** vmovdqu64 \(%rsi\), %zmm7 +** vmovdqu64 64\(%rsi\), %zmm6 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm3 +** vmovdqu64 -128\(%rsi,%rdx\), %zmm2 +** vmovdqu64 128\(%rsi\), %zmm5 +** vmovdqu64 192\(%rsi\), %zmm4 +** vmovdqu64 -192\(%rsi,%rdx\), %zmm1 +** vmovdqu64 -256\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm7, \(%rdi\) +** vmovdqu64 %zmm6, 64\(%rdi\) +** vmovdqu64 %zmm5, 128\(%rdi\) +** vmovdqu64 %zmm4, 192\(%rdi\) +** vmovdqu64 %zmm3, -64\(%rdi,%rdx\) +** vmovdqu64 %zmm2, -128\(%rdi,%rdx\) +** vmovdqu64 %zmm1, -192\(%rdi,%rdx\) +** vmovdqu64 %zmm0, -256\(%rdi,%rdx\) +** vzeroupper +** ret +** .cfi_endproc +**... 
+*/ + +#define gcc_memmove gcc_memmove_zmm +#include "builtin-memmove-5a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-6.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-6.c new file mode 100644 index 00000000000..6d159160135 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-6.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove: +**.LFB0: +** .cfi_startproc +** cmpq \$7, %rdx +** jbe .L8 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** cmpl \$4, %edx +** jnb .L9 +** cmpl \$1, %edx +** ja .L5 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n < 8) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-7.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-7.c new file mode 100644 index 00000000000..4118b1328ff --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-7.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove: +**.LFB0: +** .cfi_startproc +** cmpq \$3, %rdx +** jbe .L7 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** cmpl \$2, %edx +** jnb .L8 +** cmpl \$1, %edx +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n < 4) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-8.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-8.c new file mode 100644 index 00000000000..aa57a109fbe --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-8.c @@ -0,0 +1,90 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove: +**.LFB0: +** .cfi_startproc +** cmpq \$33, %rdx +** jbe .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** cmpl \$16, %edx +** jnb .L13 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** cmpl \$32, %edx +** ja .L5 +** movl %edx, %edx +** movdqu \(%rsi\), %xmm1 +** movdqu -16\(%rsi,%rdx\), %xmm0 +** movups %xmm1, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** addq %rdx, %rsi +** movdqu -16\(%rsi\), %xmm1 +** movdqu -32\(%rsi\), %xmm0 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... 
+*/ + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n < 34) + __builtin_memmove (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memmove-9.c b/gcc/testsuite/gcc.target/i386/builtin-memmove-9.c new file mode 100644 index 00000000000..f84565ed96a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memmove-9.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memmove: +**.LFB0: +** .cfi_startproc +** cmpq \$15, %rdx +** jbe .L9 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** cmpl \$8, %edx +** jnb .L10 +** cmpl \$4, %edx +** jnb .L5 +** cmpl \$1, %edx +** ja .L6 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memmove (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n < 16) + __builtin_memmove (a, b, n); +} -- 2.47.3