From: H.J. Lu Date: Wed, 17 Jun 2026 09:32:21 +0000 (+0800) Subject: x86-64: Expand bounded memset and memcpy like memmove X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b64fc298a3538c3d7d85ca05fef822acba23e4a1;p=thirdparty%2Fgcc.git x86-64: Expand bounded memset and memcpy like memmove commit 401199377c50045ede560daf3f6e8b51749c2a87 Author: H.J. Lu Date: Tue Jun 17 10:17:17 2025 +0800 x86: Improve vector_loop/unrolled_loop for memset/memcpy uses move_by_pieces and store_by_pieces for memcpy and memset epilogues with the fixed epilogue size. Since move_by_pieces and store_by_pieces don't use the maximum size info, they generate extra branches and moves for bounded memcpy and memset. Commit commit b41f96465190751561f6909e858604ceab00595b Author: H.J. Lu Date: Mon Oct 20 16:14:34 2025 +0800 x86-64: Inline memmove with overlapping unaligned loads and stores. inlines memmove with overlapping unaligned loads and stores, which reduces the number of branches as well as moves when the maximum size is known. Rename ix86_expand_movmem to ix86_expand_set_or_movmem and extend it to inline bounded memcpy and memset. Update ix86_expand_set_or_cpymem to call ix86_expand_set_or_movmem for bounded memset and memcpy as memmove if misaligned moves are preferred. In addition to reducing 727.cppcheck_r O2 code size by ~9%, there are another ~8 benchmarks whose code sizes are reduced by >2% across spec2026 and spec2017 with march=x86-64-v3 O2. There is no big code size impact for Ofast, and the performance impact is negligible (slightly better, but all within the noise range) for both O2 and Ofast. gcc/ PR target/125856 PR target/125865 * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): Call ix86_expand_set_or_movmem for TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES. (ix86_expand_memset_val): New. (ix86_expand_n_move_movmem): Renamed to ... (ix86_expand_n_move_set_or_movmem): This. Add a pointer to rtx argument for memset. Also expand memset. 
(ix86_expand_n_overlapping_move_movmem): Renamed to ... (ix86_expand_n_overlapping_move_set_or_movmem): This. Add a pointer to rtx argument for memset. Also expand memset. (ix86_expand_less_move_movmem): Renamed to ... (ix86_expand_less_move_set_or_movmem): This. Add a pointer to rtx argument for memset. Also expand memset. (ix86_expand_movmem): Renamed to ... (ix86_expand_set_or_movmem): This. Add bool arguments for memcpy and memset. Also expand memcpy and memset. * config/i386/i386-protos.h (ix86_expand_movmem): Renamed to ... (ix86_expand_set_or_movmem): This. Add bool arguments for memcpy and memset. * config/i386/i386.md (movmem): Replace ix86_expand_movmem with ix86_expand_set_or_movmem. gcc/testsuite/ PR target/125856 PR target/125865 * gcc.target/i386/builtin-memcpy-1a.c: New test. * gcc.target/i386/builtin-memcpy-1b.c: Likewise. * gcc.target/i386/builtin-memcpy-2a.c: Likewise. * gcc.target/i386/builtin-memcpy-2b.c: Likewise. * gcc.target/i386/builtin-memcpy-2c.c: Likewise. * gcc.target/i386/builtin-memcpy-3a.c: Likewise. * gcc.target/i386/builtin-memcpy-3b.c: Likewise. * gcc.target/i386/builtin-memcpy-3c.c: Likewise. * gcc.target/i386/builtin-memcpy-4a.c: Likewise. * gcc.target/i386/builtin-memcpy-4b.c: Likewise. * gcc.target/i386/builtin-memcpy-4c.c: Likewise. * gcc.target/i386/builtin-memcpy-5a.c: Likewise. * gcc.target/i386/builtin-memcpy-5b.c: Likewise. * gcc.target/i386/builtin-memcpy-5c.c: Likewise. * gcc.target/i386/builtin-memcpy-6.c: Likewise. * gcc.target/i386/builtin-memcpy-bounded-1a.c: Likewise. * gcc.target/i386/builtin-memcpy-bounded-1b.c: Likewise. * gcc.target/i386/builtin-memset-1a.c: Likewise. * gcc.target/i386/builtin-memset-1b.c: Likewise. * gcc.target/i386/builtin-memset-2a.c: Likewise. * gcc.target/i386/builtin-memset-2b.c: Likewise. * gcc.target/i386/builtin-memset-2c.c: Likewise. * gcc.target/i386/builtin-memset-3a.c: Likewise. * gcc.target/i386/builtin-memset-3b.c: Likewise. * gcc.target/i386/builtin-memset-3c.c: Likewise. 
* gcc.target/i386/builtin-memset-4a.c: Likewise. * gcc.target/i386/builtin-memset-4b.c: Likewise. * gcc.target/i386/builtin-memset-4c.c: Likewise. * gcc.target/i386/builtin-memset-5a.c: Likewise. * gcc.target/i386/builtin-memset-5b.c: Likewise. * gcc.target/i386/builtin-memset-5c.c: Likewise. * gcc.target/i386/builtin-memset-6a.c: Likewise. * gcc.target/i386/builtin-memset-6b.c: Likewise. * gcc.target/i386/builtin-memset-6c.c: Likewise. * gcc.target/i386/builtin-memset-7a.c: Likewise. * gcc.target/i386/builtin-memset-7b.c: Likewise. * gcc.target/i386/builtin-memset-7c.c: Likewise. * gcc.target/i386/builtin-memset-8.c: Likewise. * gcc.target/i386/builtin-memset-bounded-1a.c: Likewise. * gcc.target/i386/builtin-memset-bounded-1b.c: Likewise. Co-Authored-By: Hongtao Liu Signed-off-by: H.J. Lu --- diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 8e7b90c9744..2303de120ab 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -9547,6 +9547,34 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, rtx max_size_exp, rtx probable_max_size_exp, bool issetmem) { + if (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES) + { + /* Expand bounded memset and memcpy as memmove if misaligned moves + are preferred. Since + + commit b41f96465190751561f6909e858604ceab00595b + Author: H.J. Lu + Date: Mon Oct 20 16:14:34 2025 +0800 + + x86-64: Inline memmove with overlapping unaligned loads and stores. + + inlines memmove with overlapping unaligned and stores, which + reduces the numbers of branches and memory moves, comparing + against the regular memset and memcpy inlining. */ + rtx operands[9]; + operands[0] = dst; + operands[1] = issetmem ? 
val_exp : src; + operands[2] = count_exp; + operands[3] = align_exp; + operands[4] = expected_align_exp; + operands[5] = expected_size_exp; + operands[6] = min_size_exp; + operands[7] = max_size_exp; + operands[8] = probable_max_size_exp; + if (ix86_expand_set_or_movmem (operands, !issetmem, issetmem)) + return true; + } + rtx destreg; rtx srcreg = NULL; rtx_code_label *label = NULL; @@ -10130,34 +10158,91 @@ ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg, return true; } +/* Value kind in MEMSET_VALS: + + memset_val_byte: The value rtx in QImode. + memset_val_word: The value rtx in word_mode. + memset_val_vector: The value rtx in QI vector mode. + + */ +enum memset_val_kind +{ + memset_val_byte = 0, + memset_val_word = 1, + memset_val_vector = 2, + memset_val_max = 3 +}; + +/* Return a value rtx in MODE for memset from MEMSET_VALS. */ + +static rtx +ix86_expand_memset_val (rtx *memset_vals, machine_mode mode) +{ + rtx byte_val = memset_vals[memset_val_byte]; + + if (mode == QImode) + return byte_val; + else if (mode == word_mode) + return memset_vals[memset_val_word]; + + /* All-zero/all-ones is a property of the original byte value, so + detect it once here rather than re-deriving it from each slot. */ + if (byte_val == const0_rtx) + return CONST0_RTX (mode); + if (byte_val == constm1_rtx) + return CONSTM1_RTX (mode); + + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + if (GET_MODE (memset_vals[memset_val_vector]) == mode) + return memset_vals[memset_val_vector]; + return gen_rtx_SUBREG (mode, memset_vals[memset_val_vector], 0); + } + + gcc_assert (mode == HImode || mode == SImode); + return gen_rtx_SUBREG (mode, memset_vals[memset_val_word], 0); +} + /* Expand memmove of size with MOVES * mode size and MOVES <= 4. If FORWARD is true, copy forward. Otherwise copy backward. 
*/ static void -ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode, - unsigned int moves, bool forward) +ix86_expand_n_move_set_or_movmem (rtx destmem, rtx srcmem, + rtx *memset_vals, machine_mode mode, + unsigned int moves, bool forward) { gcc_assert (moves <= 4); unsigned int i; rtx tmp[8]; - for (i = 0; i < moves; i++) - tmp[i] = gen_reg_rtx (mode); - rtx step; if (forward) step = GEN_INT (GET_MODE_SIZE (mode)); else step = GEN_INT (-GET_MODE_SIZE (mode)); - /* Load MOVES. */ - for (i = 0; i < moves - 1; i++) + if (memset_vals) { + /* Expand memset. */ + rtx val = ix86_expand_memset_val (memset_vals, mode); + for (i = 0; i < moves; i++) + tmp[i] = val; + } + else + { + /* Expand memmove. */ + for (i = 0; i < moves; i++) + tmp[i] = gen_reg_rtx (mode); + + /* Load MOVES. */ + for (i = 0; i < moves - 1; i++) + { + emit_move_insn (tmp[i], srcmem); + srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); + } emit_move_insn (tmp[i], srcmem); - srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); } - emit_move_insn (tmp[i], srcmem); /* Store MOVES. */ for (i = 0; i < moves - 1; i++) @@ -10234,10 +10319,12 @@ ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp, MOVES >= 2 and MOVES <= 8. */ static void -ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg, - rtx srcreg, rtx count_exp, - machine_mode mode, - unsigned int moves) +ix86_expand_n_overlapping_move_set_or_movmem (rtx dst, rtx src, + rtx *memset_vals, + rtx destreg, rtx srcreg, + rtx count_exp, + machine_mode mode, + unsigned int moves) { gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0); @@ -10245,35 +10332,46 @@ ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg, unsigned int i, j; rtx tmp[8]; - for (i = 0; i < moves; i++) - tmp[i] = gen_reg_rtx (mode); + if (memset_vals) + { + /* Expand memset. 
*/ + rtx val = ix86_expand_memset_val (memset_vals, mode); + for (i = 0; i < moves; i++) + tmp[i] = val; + } + else + { + /* Expand memmove. */ + for (i = 0; i < moves; i++) + tmp[i] = gen_reg_rtx (mode); - rtx base_srcmem = change_address (src, mode, srcreg); + rtx base_srcmem = change_address (src, mode, srcreg); - /* Load the first half. */ - rtx srcmem = base_srcmem; - for (i = 0; i < half_moves - 1; i++) - { + /* Load the first half. */ + rtx srcmem = base_srcmem; + for (i = 0; i < half_moves - 1; i++) + { + emit_move_insn (tmp[i], srcmem); + srcmem = offset_address (srcmem, + GEN_INT (GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } emit_move_insn (tmp[i], srcmem); - srcmem = offset_address (srcmem, - GEN_INT (GET_MODE_SIZE (mode)), - GET_MODE_SIZE (mode)); - } - emit_move_insn (tmp[i], srcmem); - /* Load the second half. */ - srcmem = offset_address (base_srcmem, count_exp, 1); - srcmem = offset_address (srcmem, - GEN_INT (-GET_MODE_SIZE (mode)), - GET_MODE_SIZE (mode)); - for (j = half_moves, i = 0; i < half_moves - 1; i++, j++) - { - emit_move_insn (tmp[j], srcmem); + /* Load the second half. */ + srcmem = offset_address (base_srcmem, count_exp, 1); srcmem = offset_address (srcmem, GEN_INT (-GET_MODE_SIZE (mode)), GET_MODE_SIZE (mode)); + for (j = half_moves, i = 0; i < half_moves - 1; i++, j++) + { + emit_move_insn (tmp[j], srcmem); + srcmem = offset_address (srcmem, + GEN_INT (-GET_MODE_SIZE (mode)), + GET_MODE_SIZE (mode)); + } + emit_move_insn (tmp[j], srcmem); } - emit_move_insn (tmp[j], srcmem); rtx base_destmem = change_address (dst, mode, destreg); @@ -10304,11 +10402,12 @@ ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg, /* Expand memmove of size < mode size which is <= 64. 
*/ static void -ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg, - rtx srcreg, rtx count_exp, - unsigned HOST_WIDE_INT min_size, - machine_mode mode, - rtx_code_label *done_label) +ix86_expand_less_move_set_or_movmem (rtx dst, rtx src, rtx *memset_vals, + rtx destreg, rtx srcreg, + rtx count_exp, + unsigned HOST_WIDE_INT min_size, + machine_mode mode, + rtx_code_label *done_label) { bool skip = false; machine_mode count_mode = counter_mode (count_exp); @@ -10410,9 +10509,16 @@ ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg, profile_probability::unlikely ()); /* Move 1 byte. */ - rtx tmp0 = gen_reg_rtx (QImode); - rtx srcmem = change_address (src, QImode, srcreg); - emit_move_insn (tmp0, srcmem); + rtx tmp0; + /* Use the value rtx in QImode for memset. */ + if (memset_vals) + tmp0 = memset_vals[memset_val_byte]; + else + { + tmp0 = gen_reg_rtx (QImode); + rtx srcmem = change_address (src, QImode, srcreg); + emit_move_insn (tmp0, srcmem); + } rtx destmem = change_address (dst, QImode, destreg); emit_move_insn (destmem, tmp0); @@ -10423,50 +10529,35 @@ ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg, emit_barrier (); } - if (between_32_63_label) - { - emit_label (between_32_63_label); - ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, - count_exp, OImode, 2); - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - } - - if (between_16_31_label) - { - emit_label (between_16_31_label); - ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, - count_exp, TImode, 2); - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - } - - if (between_8_15_label) - { - emit_label (between_8_15_label); - ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, - count_exp, DImode, 2); - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - } - - if (between_4_7_label) - { - emit_label (between_4_7_label); - ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, - count_exp, 
SImode, 2); - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - } - - if (between_2_3_label) - { - emit_label (between_2_3_label); - ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, - count_exp, HImode, 2); - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - } + /* For each size band, memset uses a QI-vector mode above a word so it + can broadcast the fill value, while memmove uses the same-size + scalar integer mode; at and below a word both use the scalar + integer mode. */ + struct { + rtx_code_label *label; + machine_mode set_mode; + machine_mode move_mode; + } bands[] = { + { between_32_63_label, V32QImode, OImode }, + { between_16_31_label, V16QImode, TImode }, + { between_8_15_label, DImode, DImode }, + { between_4_7_label, SImode, SImode }, + { between_2_3_label, HImode, HImode }, + }; + + for (auto &band : bands) + if (band.label) + { + emit_label (band.label); + machine_mode bmode = memset_vals ? band.set_mode : band.move_mode; + ix86_expand_n_overlapping_move_set_or_movmem (dst, src, + memset_vals, + destreg, srcreg, + count_exp, bmode, + 2); + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } } /* Expand movmem with overlapping unaligned loads and stores: @@ -10491,7 +10582,7 @@ ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg, */ bool -ix86_expand_movmem (rtx operands[]) +ix86_expand_set_or_movmem (rtx operands[], bool iscpymem, bool issetmem) { /* Since there are much less registers available in 32-bit mode, don't inline movmem in 32-bit mode. 
*/ @@ -10499,19 +10590,31 @@ ix86_expand_movmem (rtx operands[]) return false; rtx dst = operands[0]; - rtx src = operands[1]; + rtx src, memset_val_exp; + if (issetmem) + { + src = nullptr; + memset_val_exp = operands[1]; + } + else + { + src = operands[1]; + memset_val_exp = nullptr; + } rtx count_exp = operands[2]; rtx expected_size_exp = operands[5]; rtx min_size_exp = operands[6]; + rtx max_size_exp = operands[7]; rtx probable_max_size_exp = operands[8]; unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U; HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U; unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U; + unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U; unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U; if (CONST_INT_P (count_exp)) { - min_size = probable_max_size = count = expected_size + min_size = max_size = probable_max_size = count = expected_size = INTVAL (count_exp); /* When COUNT is 0, there is nothing to do. */ if (!count) @@ -10521,6 +10624,8 @@ ix86_expand_movmem (rtx operands[]) { if (min_size_exp) min_size = INTVAL (min_size_exp); + if (max_size_exp) + max_size = INTVAL (max_size_exp); if (probable_max_size_exp) probable_max_size = INTVAL (probable_max_size_exp); if (CONST_INT_P (expected_size_exp)) @@ -10532,18 +10637,35 @@ ix86_expand_movmem (rtx operands[]) return false; addr_space_t dst_as = MEM_ADDR_SPACE (dst); - addr_space_t src_as = MEM_ADDR_SPACE (src); + addr_space_t src_as = (issetmem + ? ADDR_SPACE_GENERIC + : MEM_ADDR_SPACE (src)); int dynamic_check; bool noalign; enum stringop_alg alg = decide_alg (count, expected_size, min_size, - probable_max_size, false, false, + probable_max_size, issetmem, + (issetmem + && memset_val_exp == const0_rtx), dst_as, src_as, &dynamic_check, &noalign, false); if (alg == libcall) return false; + /* Expand memcpy and memset like memmove only for bounded size. 
*/ + if (iscpymem || issetmem) + { + unsigned HOST_WIDE_INT unbounded + = GET_MODE_MASK (counter_mode (count_exp)); + if (count != 0 /* Fixed size. */ + || max_size == 0 /* Unbounded size. */ + || max_size == unbounded) /* Unbounded size. */ + return false; + } + rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); - rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); + rtx srcreg = (issetmem + ? nullptr + : ix86_copy_addr_to_reg (XEXP (src, 0))); unsigned int move_max = MOVE_MAX; machine_mode mode = smallest_int_mode_for_size @@ -10566,6 +10688,53 @@ ix86_expand_movmem (rtx operands[]) mode)) return true; + rtx memset_vals[memset_val_max]; + rtx *memset_vals_p; + if (issetmem) + { + /* Use vector mode if MODE size > word size. */ + unsigned int size = GET_MODE_SIZE (mode); + poly_uint64 nunits; + if (size > UNITS_PER_WORD) + { + nunits = size / GET_MODE_SIZE (QImode); + mode = mode_for_vector (QImode, nunits).require (); + } + + /* Populate MEMSET_VALS to expand memset. */ + rtx val_word; + memset_vals[memset_val_byte] = memset_val_exp; + if (memset_val_exp == const0_rtx || memset_val_exp == constm1_rtx) + val_word = memset_val_exp; + else + val_word = promote_duplicated_reg (word_mode, memset_val_exp); + memset_vals[memset_val_word] = val_word; + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + { + if (memset_val_exp == const0_rtx) + memset_vals[memset_val_vector] = CONST0_RTX (mode); + else if (memset_val_exp == constm1_rtx) + memset_vals[memset_val_vector] = CONSTM1_RTX (mode); + else + { + /* Use the vector mode based on WORD_MODE to avoid extra + GPR moves. 
*/ + nunits = size / GET_MODE_SIZE (word_mode); + machine_mode vector_mode + = mode_for_vector (word_mode, nunits).require (); + rtx vector = promote_duplicated_reg (vector_mode, + val_word); + memset_vals[memset_val_vector] + = convert_to_mode (mode, vector, 1); + } + } + else + memset_vals[memset_val_vector] = nullptr; + memset_vals_p = memset_vals; + } + else + memset_vals_p = nullptr; + rtx_code_label *done_label = gen_label_rtx (); rtx_code_label *less_vec_label = nullptr; @@ -10593,8 +10762,10 @@ ix86_expand_movmem (rtx operands[]) if (min_size == 0 || min_size <= 2 * move_max) { /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX. */ - ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg, - count_exp, mode, 2); + ix86_expand_n_overlapping_move_set_or_movmem (dst, src, + memset_vals_p, + destreg, srcreg, + count_exp, mode, 2); emit_jump_insn (gen_jump (done_label)); emit_barrier (); } @@ -10603,9 +10774,9 @@ ix86_expand_movmem (rtx operands[]) { /* Size < MOVE_MAX. */ emit_label (less_vec_label); - ix86_expand_less_move_movmem (dst, src, destreg, srcreg, - count_exp, min_size, mode, - done_label); + ix86_expand_less_move_set_or_movmem (dst, src, memset_vals_p, + destreg, srcreg, count_exp, + min_size, mode, done_label); emit_jump_insn (gen_jump (done_label)); emit_barrier (); } @@ -10638,9 +10809,11 @@ ix86_expand_movmem (rtx operands[]) if (probable_max_size == 0 || probable_max_size > 4 * move_max) { /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX. */ - ix86_expand_n_overlapping_move_movmem (dst, src, destreg, - srcreg, count_exp, - mode, 8); + ix86_expand_n_overlapping_move_set_or_movmem (dst, src, + memset_vals_p, + destreg, srcreg, + count_exp, mode, + 8); emit_jump_insn (gen_jump (done_label)); emit_barrier (); } @@ -10649,9 +10822,11 @@ ix86_expand_movmem (rtx operands[]) { /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX. 
*/ emit_label (last_4x_vec_label); - ix86_expand_n_overlapping_move_movmem (dst, src, destreg, - srcreg, count_exp, - mode, 4); + ix86_expand_n_overlapping_move_set_or_movmem (dst, src, + memset_vals_p, + destreg, srcreg, + count_exp, mode, + 4); emit_jump_insn (gen_jump (done_label)); emit_barrier (); } @@ -10664,34 +10839,66 @@ ix86_expand_movmem (rtx operands[]) rtx loop_count = gen_reg_rtx (count_mode); emit_move_insn (loop_count, count_exp); - /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is - lower than destination address. */ - rtx_code_label *more_8x_vec_backward_label = gen_label_rtx (); - emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr, - GET_MODE (destreg), 1, - more_8x_vec_backward_label); - - /* Skip if source == destination which is less common. */ - emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr, - GET_MODE (destreg), 1, done_label, - profile_probability::unlikely ()); - - rtx base_destreg = gen_reg_rtx (GET_MODE (destreg)); - emit_move_insn (base_destreg, destreg); - - /* Load the last 4 * MOVE_MAX. */ + rtx_code_label *more_8x_vec_backward_label; + rtx base_destreg; + rtx srcmem; rtx regs[4]; - ix86_expand_load_movmem (src, srcreg, count_exp, mode, - ARRAY_SIZE (regs), regs, true); + if (iscpymem || issetmem) + { + /* Always store forward for memcpy and memset. */ + more_8x_vec_backward_label = nullptr; + if (iscpymem) + { + /* Load the last 4 * MOVE_MAX for memcpy. */ + ix86_expand_load_movmem (src, srcreg, count_exp, mode, + ARRAY_SIZE (regs), regs, + true); + srcmem = change_address (src, mode, srcreg); + } + else + { + /* Fill REGS with MEMSET_VALS for memset. */ + rtx val = ix86_expand_memset_val (memset_vals, mode); + for (unsigned int i = 0; i < 4; i++) + regs[i] = val; + srcmem = nullptr; + } + base_destreg = gen_reg_rtx (GET_MODE (destreg)); + emit_move_insn (base_destreg, destreg); + } + else + { + /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is + lower than destination address. 
*/ + more_8x_vec_backward_label = gen_label_rtx (); + emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr, + GET_MODE (destreg), 1, + more_8x_vec_backward_label); + + /* Skip if source == destination which is less common. */ + emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr, + GET_MODE (destreg), 1, done_label, + profile_probability::unlikely ()); + + base_destreg = gen_reg_rtx (GET_MODE (destreg)); + emit_move_insn (base_destreg, destreg); + + /* Load the last 4 * MOVE_MAX. */ + ix86_expand_load_movmem (src, srcreg, count_exp, mode, + ARRAY_SIZE (regs), regs, true); + + srcmem = change_address (src, mode, srcreg); + } - rtx srcmem = change_address (src, mode, srcreg); rtx destmem = change_address (dst, mode, destreg); /* Copy forward with a 4 * MOVE_MAX loop. */ rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx (); emit_label (loop_4x_vec_forward_label); - ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true); + ix86_expand_n_move_set_or_movmem (destmem, srcmem, + memset_vals_p, mode, 4, + true); rtx tmp; rtx delta = GEN_INT (4 * MOVE_MAX); @@ -10709,10 +10916,14 @@ ix86_expand_movmem (rtx operands[]) OPTAB_DIRECT); if (tmp != destreg) emit_move_insn (destreg, tmp); - tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg, - delta, nullptr, 1, OPTAB_DIRECT); - if (tmp != srcreg) - emit_move_insn (srcreg, tmp); + if (!issetmem) + { + tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, + srcreg, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + } /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */ emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr, @@ -10726,67 +10937,78 @@ ix86_expand_movmem (rtx operands[]) emit_jump_insn (gen_jump (done_label)); emit_barrier (); - /* Copy backward with a 4 * MOVE_MAX loop. */ - emit_label (more_8x_vec_backward_label); - - base_destreg = gen_reg_rtx (GET_MODE (destreg)); - emit_move_insn (base_destreg, destreg); - - /* Load the first 4 * MOVE_MAX. 
*/ - ix86_expand_load_movmem (src, srcreg, count_exp, mode, - ARRAY_SIZE (regs), regs, false); - - /* Increment DESTREG and SRCREG by COUNT_EXP. */ - tmp = expand_simple_binop (GET_MODE (destreg), PLUS, - destreg, count_exp, nullptr, 1, - OPTAB_DIRECT); - if (tmp != destreg) - emit_move_insn (destreg, tmp); - tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg, - count_exp, nullptr, 1, OPTAB_DIRECT); - if (tmp != srcreg) - emit_move_insn (srcreg, tmp); - - srcmem = change_address (src, mode, srcreg); - destmem = change_address (dst, mode, destreg); - rtx step = GEN_INT (-GET_MODE_SIZE (mode)); - srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode)); - destmem = offset_address (destmem, step, GET_MODE_SIZE (mode)); - - rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx (); - emit_label (loop_4x_vec_backward_label); - - ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false); - - /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */ - tmp = expand_simple_binop (GET_MODE (loop_count), MINUS, - loop_count, delta, nullptr, 1, - OPTAB_DIRECT); - if (tmp != loop_count) - emit_move_insn (loop_count, tmp); - - /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX. */ - tmp = expand_simple_binop (GET_MODE (destreg), MINUS, - destreg, delta, nullptr, 1, - OPTAB_DIRECT); - if (tmp != destreg) - emit_move_insn (destreg, tmp); - tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg, - delta, nullptr, 1, OPTAB_DIRECT); - if (tmp != srcreg) - emit_move_insn (srcreg, tmp); - - /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */ - emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr, - GET_MODE (loop_count), 1, - loop_4x_vec_backward_label); - - /* Store the first 4 * MOVE_MAX. */ - ix86_expand_store_movmem (dst, base_destreg, count_exp, mode, - ARRAY_SIZE (regs), regs, false); - - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); + if (more_8x_vec_backward_label) + { + /* Copy backward with a 4 * MOVE_MAX loop. 
*/ + emit_label (more_8x_vec_backward_label); + + base_destreg = gen_reg_rtx (GET_MODE (destreg)); + emit_move_insn (base_destreg, destreg); + + /* Load the first 4 * MOVE_MAX. */ + ix86_expand_load_movmem (src, srcreg, count_exp, mode, + ARRAY_SIZE (regs), regs, false); + + /* Increment DESTREG and SRCREG by COUNT_EXP. */ + tmp = expand_simple_binop (GET_MODE (destreg), PLUS, + destreg, count_exp, nullptr, 1, + OPTAB_DIRECT); + if (tmp != destreg) + emit_move_insn (destreg, tmp); + tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg, + count_exp, nullptr, 1, + OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + + srcmem = change_address (src, mode, srcreg); + destmem = change_address (dst, mode, destreg); + rtx step = GEN_INT (-GET_MODE_SIZE (mode)); + srcmem = offset_address (srcmem, step, + GET_MODE_SIZE (mode)); + destmem = offset_address (destmem, step, + GET_MODE_SIZE (mode)); + + rtx_code_label *loop_4x_vec_backward_label + = gen_label_rtx (); + emit_label (loop_4x_vec_backward_label); + + ix86_expand_n_move_set_or_movmem (destmem, srcmem, + memset_vals_p, mode, 4, + false); + + /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (loop_count), MINUS, + loop_count, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != loop_count) + emit_move_insn (loop_count, tmp); + + /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX. */ + tmp = expand_simple_binop (GET_MODE (destreg), MINUS, + destreg, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != destreg) + emit_move_insn (destreg, tmp); + tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, + srcreg, delta, nullptr, 1, + OPTAB_DIRECT); + if (tmp != srcreg) + emit_move_insn (srcreg, tmp); + + /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */ + emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr, + GET_MODE (loop_count), 1, + loop_4x_vec_backward_label); + + /* Store the first 4 * MOVE_MAX. 
*/ + ix86_expand_store_movmem (dst, base_destreg, count_exp, + mode, ARRAY_SIZE (regs), regs, + false); + + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + } } } diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 15d71debbf5..d11fa2df35d 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -78,7 +78,7 @@ extern void substitute_vpternlog_operands (rtx[]); extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, bool); -extern bool ix86_expand_movmem (rtx[]); +extern bool ix86_expand_set_or_movmem (rtx[], bool, bool); extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool); extern enum reg_class ix86_insn_base_reg_class (rtx_insn *); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 9d334f9115f..00bc37b1cb0 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -26547,7 +26547,7 @@ (use (match_operand:SI 8 ""))] "" { - if (ix86_expand_movmem (operands)) + if (ix86_expand_set_or_movmem (operands, false, false)) DONE; FAIL; }) diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-1a.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-1a.c new file mode 100644 index 00000000000..c4710b41139 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-1a.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movdqu \(%rsi\), %xmm0 +** movups %xmm0, \(%rdi\) +** movdqu 16\(%rsi\), %xmm0 +** movups %xmm0, 16\(%rdi\) +** movdqu 32\(%rsi\), %xmm0 +** movups %xmm0, 32\(%rdi\) +** movdqu 48\(%rsi\), %xmm0 +** movups %xmm0, 48\(%rdi\) +** movdqu 64\(%rsi\), %xmm0 +** movups %xmm0, 64\(%rdi\) +** movdqu 80\(%rsi\), %xmm0 +** movups %xmm0, 80\(%rdi\) +** movdqu 96\(%rsi\), %xmm0 +** movups %xmm0, 96\(%rdi\) +** movdqu 112\(%rsi\), %xmm0 +** movups %xmm0, 112\(%rdi\) +** movzbl 128\(%rsi\), %eax +** movb %al, 128\(%rdi\) +** ret +** .cfi_endproc +**... +*/ + +void +foo (char *dest, const char *src) +{ + __builtin_memcpy (dest, src, 129); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-1b.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-1b.c new file mode 100644 index 00000000000..7c27f8c0c2e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-1b.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mtune=znver3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movl \(%rsi\), %eax +** movq %rdi, %rcx +** leaq 4\(%rdi\), %rdi +** movl %eax, -4\(%rdi\) +** movl 125\(%rsi\), %eax +** movl %eax, 121\(%rdi\) +** andq \$-4, %rdi +** subq %rdi, %rcx +** subq %rcx, %rsi +** addl \$129, %ecx +** shrl \$2, %ecx +** rep movsl +** ret +** .cfi_endproc +**... 
+*/ + +#include "builtin-memcpy-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-2a.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-2a.c new file mode 100644 index 00000000000..6fe5102fe91 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-2a.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$15, %rdx +** jbe .L9 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** cmpl \$8, %edx +** jnb .L10 +** cmpl \$4, %edx +** jnb .L5 +** cmpl \$1, %edx +** ja .L6 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memcpy (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n < 16) + __builtin_memcpy (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-2b.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-2b.c new file mode 100644 index 00000000000..1b0a64e5ea5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-2b.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$15, %rdx +** jbe .L10 +**.L8: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** cmpl \$8, %edx +** jnb .L11 +** cmpl \$4, %edx +** jnb .L5 +** cmpl \$1, %edx +** ja .L6 +** jb .L8 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memcpy-2a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-2c.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-2c.c new file mode 100644 index 00000000000..761c377557b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-2c.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$15, %rdx +** jbe .L10 +**.L8: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** cmpl \$8, %edx +** jnb .L11 +** cmpl \$4, %edx +** jnb .L5 +** cmpl \$1, %edx +** ja .L6 +** jb .L8 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memcpy-2a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-3a.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-3a.c new file mode 100644 index 00000000000..707424163ea --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-3a.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$31, %rdx +** jbe .L10 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** cmpl \$16, %edx +** jnb .L11 +** cmpl \$8, %edx +** jnb .L5 +** cmpl \$4, %edx +** jnb .L6 +** cmpl \$1, %edx +** ja .L7 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movl %edx, %edx +** movdqu \(%rsi\), %xmm1 +** movdqu -16\(%rsi,%rdx\), %xmm0 +** movups %xmm1, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memcpy (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n < 32) + __builtin_memcpy (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-3b.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-3b.c new file mode 100644 index 00000000000..65eb7d4b3d3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-3b.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$31, %rdx +** jbe .L11 +**.L9: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** cmpl \$16, %edx +** jnb .L12 +** cmpl \$8, %edx +** jnb .L5 +** cmpl \$4, %edx +** jnb .L6 +** cmpl \$1, %edx +** ja .L7 +** jb .L9 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** movl %edx, %edx +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memcpy-3a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-3c.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-3c.c new file mode 100644 index 00000000000..b4df4399bfe --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-3c.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$31, %rdx +** jbe .L11 +**.L9: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** cmpl \$16, %edx +** jnb .L12 +** cmpl \$8, %edx +** jnb .L5 +** cmpl \$4, %edx +** jnb .L6 +** cmpl \$1, %edx +** ja .L7 +** jb .L9 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** movl %edx, %edx +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memcpy-3a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-4a.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-4a.c new file mode 100644 index 00000000000..7c349807db6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-4a.c @@ -0,0 +1,90 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$63, %rdx +** jbe .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** cmpl \$16, %edx +** jnb .L13 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** cmpl \$32, %edx +** ja .L5 +** movl %edx, %edx +** movdqu \(%rsi\), %xmm1 +** movdqu -16\(%rsi,%rdx\), %xmm0 +** movups %xmm1, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** addq %rdx, %rsi +** movdqu -16\(%rsi\), %xmm1 +** movdqu -32\(%rsi\), %xmm0 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... 
+*/ + +void +gcc_memcpy (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n < 64) + __builtin_memcpy (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-4b.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-4b.c new file mode 100644 index 00000000000..0c95c7017c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-4b.c @@ -0,0 +1,80 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$63, %rdx +** jbe .L12 +**.L10: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** cmpl \$32, %edx +** jnb .L13 +** cmpl \$16, %edx +** jnb .L5 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L10 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movl %edx, %edx +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +**... 
+*/ + +#include "builtin-memcpy-4a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-4c.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-4c.c new file mode 100644 index 00000000000..d7d771a5507 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-4c.c @@ -0,0 +1,80 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$63, %rdx +** jbe .L12 +**.L10: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** cmpl \$32, %edx +** jnb .L13 +** cmpl \$16, %edx +** jnb .L5 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L10 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movl %edx, %edx +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +**... 
+*/ + +#include "builtin-memcpy-4a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-5a.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-5a.c new file mode 100644 index 00000000000..780015f6782 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-5a.c @@ -0,0 +1,114 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$127, %rdx +** jbe .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** cmpl \$16, %edx +** jnb .L13 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L1 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** cmpl \$32, %edx +** ja .L5 +** movl %edx, %edx +** movdqu \(%rsi\), %xmm1 +** movdqu -16\(%rsi,%rdx\), %xmm0 +** movups %xmm1, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** cmpl \$64, %edx +** ja .L14 +** movl %edx, %edx +** movdqu \(%rsi\), %xmm3 +** movdqu 16\(%rsi\), %xmm2 +** addq %rdx, %rsi +** movdqu -16\(%rsi\), %xmm1 +** movdqu -32\(%rsi\), %xmm0 +** movups %xmm3, \(%rdi\) +** movups %xmm2, 16\(%rdi\) +** movups %xmm1, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movl %edx, %edx +** movdqu \(%rsi\), %xmm7 +** movdqu 16\(%rsi\), %xmm6 +** movdqu 32\(%rsi\), %xmm5 +** movdqu 48\(%rsi\), %xmm4 +** addq %rdx, %rsi +** movdqu -16\(%rsi\), %xmm3 +** movdqu -32\(%rsi\), %xmm2 +** movdqu -48\(%rsi\), %xmm1 +** movdqu -64\(%rsi\), %xmm0 +** movups %xmm7, \(%rdi\) +** movups 
%xmm6, 16\(%rdi\) +** movups %xmm5, 32\(%rdi\) +** movups %xmm4, 48\(%rdi\) +** movups %xmm3, -16\(%rdi,%rdx\) +** movups %xmm2, -32\(%rdi,%rdx\) +** movups %xmm1, -48\(%rdi,%rdx\) +** movups %xmm0, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memcpy (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n < 128) + __builtin_memcpy (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-5b.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-5b.c new file mode 100644 index 00000000000..683740c2f7a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-5b.c @@ -0,0 +1,97 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$127, %rdx +** jbe .L14 +**.L12: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** cmpl \$32, %edx +** jnb .L15 +** cmpl \$16, %edx +** jnb .L6 +** cmpl \$8, %edx +** jnb .L7 +** cmpl \$4, %edx +** jnb .L8 +** cmpl \$1, %edx +** ja .L9 +** jb .L12 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** cmpl \$64, %edx +** ja .L5 +** movl %edx, %edx +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** vmovdqu \(%rsi\), %ymm3 +** vmovdqu 32\(%rsi\), %ymm2 +** addq %rdx, %rsi +** vmovdqu -32\(%rsi\), %ymm1 +** vmovdqu -64\(%rsi\), %ymm0 +** vmovdqu %ymm3, \(%rdi\) +** vmovdqu %ymm2, 32\(%rdi\) +** vmovdqu %ymm1, -32\(%rdi,%rdx\) +** vmovdqu %ymm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +**... 
+*/ + +#include "builtin-memcpy-5a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-5c.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-5c.c new file mode 100644 index 00000000000..31540fdd101 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-5c.c @@ -0,0 +1,91 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */ + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpq \$127, %rdx +** jbe .L13 +**.L11: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** cmpl \$64, %edx +** jnb .L14 +** cmpl \$32, %edx +** jnb .L5 +** cmpl \$16, %edx +** jnb .L6 +** cmpl \$8, %edx +** jnb .L7 +** cmpl \$4, %edx +** jnb .L8 +** cmpl \$1, %edx +** ja .L9 +** jb .L11 +** movzbl \(%rsi\), %eax +** movb %al, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movl %edx, %edx +** vmovdqu64 \(%rsi\), %zmm1 +** vmovdqu64 -64\(%rsi,%rdx\), %zmm0 +** vmovdqu64 %zmm1, \(%rdi\) +** vmovdqu64 %zmm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** vmovdqu \(%rsi\), %ymm1 +** vmovdqu -32\(%rsi,%rdx\), %ymm0 +** vmovdqu %ymm1, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** vmovdqu \(%rsi\), %xmm1 +** vmovdqu -16\(%rsi,%rdx\), %xmm0 +** vmovdqu %xmm1, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movq \(%rsi\), %rcx +** movq -8\(%rsi,%rdx\), %rax +** movq %rcx, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movl \(%rsi\), %ecx +** movl -4\(%rsi,%rdx\), %eax +** movl %ecx, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +**.L9: +** movl %edx, %edx +** movzwl \(%rsi\), %ecx +** 
movzwl -2\(%rsi,%rdx\), %eax +** movw %cx, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memcpy-5a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-6.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-6.c new file mode 100644 index 00000000000..a024c456524 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-6.c @@ -0,0 +1,109 @@ +/* { dg-do compile { target { maybe_x32 && lp64 } } } */ +/* { dg-options "-O2 -mx32 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + + +/* +**gcc_memcpy: +**.LFB0: +** .cfi_startproc +** cmpl \$64, %edx +** jnb .L2 +** testb \$32, %dl +** jne .L19 +** testb \$16, %dl +** jne .L20 +** testb \$8, %dl +** jne .L21 +** testb \$4, %dl +** jne .L22 +** testl %edx, %edx +** jne .L23 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** movdqu -64\(%edx,%esi\), %xmm0 +** subl \$1, %edx +** movups %xmm0, -63\(%edx,%edi\) +** movdqu -47\(%edx,%esi\), %xmm0 +** movups %xmm0, -47\(%edx,%edi\) +** movdqu -31\(%edx,%esi\), %xmm0 +** movups %xmm0, -31\(%edx,%edi\) +** movdqu -15\(%edx,%esi\), %xmm0 +** movups %xmm0, -15\(%edx,%edi\) +** cmpl \$64, %edx +** jb .L1 +** andl \$-64, %edx +** xorl %eax, %eax +**.L9: +** movdqu \(%eax,%esi\), %xmm3 +** movdqu 16\(%eax,%esi\), %xmm2 +** addl \$64, %eax +** movdqu -32\(%eax,%esi\), %xmm1 +** movdqu -16\(%eax,%esi\), %xmm0 +** movups %xmm3, -64\(%eax,%edi\) +** movups %xmm2, -48\(%eax,%edi\) +** movups %xmm1, -32\(%eax,%edi\) +** movups %xmm0, -16\(%eax,%edi\) +** cmpl %edx, %eax +** jb .L9 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L23: +** movzbl \(%esi\), %eax +** movb %al, \(%edi\) +** testb \$2, %dl +** je .L1 +** movzwl -2\(%edx,%esi\), %eax +** movw %ax, -2\(%edx,%edi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** movdqu \(%esi\), %xmm0 +** movups %xmm0, \(%edi\) +** 
movdqu 16\(%esi\), %xmm0 +** movups %xmm0, 16\(%edi\) +** movdqu -32\(%edx,%esi\), %xmm0 +** movups %xmm0, -32\(%edx,%edi\) +** movdqu -16\(%edx,%esi\), %xmm0 +** movups %xmm0, -16\(%edx,%edi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** movdqu \(%esi\), %xmm0 +** movups %xmm0, \(%edi\) +** movdqu -16\(%edx,%esi\), %xmm0 +** movups %xmm0, -16\(%edx,%edi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L21: +** movq \(%esi\), %rax +** movq %rax, \(%edi\) +** movq -8\(%edx,%esi\), %rax +** movq %rax, -8\(%edx,%edi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L22: +** movl \(%esi\), %eax +** movl %eax, \(%edi\) +** movl -4\(%edx,%esi\), %eax +** movl %eax, -4\(%edx,%edi\) +** ret +** .cfi_endproc +**... +*/ + +void +gcc_memcpy (void *a, void *b, __SIZE_TYPE__ n) +{ + if (n <= (__SIZE_TYPE__) -1) + __builtin_memcpy (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-bounded-1a.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-bounded-1a.c new file mode 100644 index 00000000000..200c3682f59 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-bounded-1a.c @@ -0,0 +1,122 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -minline-all-stringops" } */ + +/* Test -O2 -minline-all-stringops on memcpy with various bounds. 
*/ + +#include <stdlib.h> + +#define MAX_LENGTH 4096 +#define EXTRA 64 +#define TOTAL (EXTRA + MAX_LENGTH + EXTRA) + +#define MEMCPY_BOUND(BOUND) \ +__attribute__ ((noipa, noinline)) \ +static char * \ +memcpy_##BOUND (char *dest, char *src, size_t len) \ +{ \ + if (len <= BOUND) \ + return __builtin_memcpy (dest + EXTRA, src, len); \ + else \ + return dest + EXTRA; \ +} + +#define CHECK_MEMCPY_BOUND(BOUND, SIZE) \ + { \ + for (i = 0; i < TOTAL; i++) \ + { \ + dest[i] = 'a'; \ + src[i] = 'A'; \ + } \ + p = memcpy_##BOUND (dest, src, SIZE); \ + if (p != dest + EXTRA) \ + abort (); \ + for (i = 0; i < SIZE; i++, p++) \ + if (*p != 'A') \ + abort (); \ + for (; i < (TOTAL - EXTRA); i++, p++) \ + if (*p != 'a') \ + abort (); \ + p = dest; \ + for (i = 0; i < EXTRA; i++, p++) \ + if (*p != 'a') \ + abort (); \ + } + +#define CHECK_MEMCPY(SIZE) \ + CHECK_MEMCPY_BOUND (SIZE, SIZE) \ + if (SIZE > 1) \ + CHECK_MEMCPY_BOUND (SIZE, SIZE - 1) \ + if (SIZE > 2) \ + CHECK_MEMCPY_BOUND (SIZE, SIZE - 2) \ + if (SIZE > 3) \ + CHECK_MEMCPY_BOUND (SIZE, SIZE - 3) \ + if (SIZE > 4) \ + CHECK_MEMCPY_BOUND (SIZE, SIZE - 4) \ + if (SIZE > 5) \ + CHECK_MEMCPY_BOUND (SIZE, SIZE - 5) \ + if (SIZE > 6) \ + CHECK_MEMCPY_BOUND (SIZE, SIZE - 6) \ + if (SIZE > 7) \ + CHECK_MEMCPY_BOUND (SIZE, SIZE - 7) + +char dest[TOTAL]; +char src[TOTAL]; + +MEMCPY_BOUND (0); +MEMCPY_BOUND (1); +MEMCPY_BOUND (2); +MEMCPY_BOUND (3); +MEMCPY_BOUND (4); +MEMCPY_BOUND (5); +MEMCPY_BOUND (7); +MEMCPY_BOUND (8); +MEMCPY_BOUND (9); +MEMCPY_BOUND (15); +MEMCPY_BOUND (16); +MEMCPY_BOUND (17); +MEMCPY_BOUND (31); +MEMCPY_BOUND (32); +MEMCPY_BOUND (33); +MEMCPY_BOUND (63); +MEMCPY_BOUND (64); +MEMCPY_BOUND (65); +MEMCPY_BOUND (127); +MEMCPY_BOUND (128); +MEMCPY_BOUND (129); +MEMCPY_BOUND (255); +MEMCPY_BOUND (256); +MEMCPY_BOUND (257); + +int +main (void) +{ + unsigned int i; + char *p; + + CHECK_MEMCPY (0); + CHECK_MEMCPY (1); + CHECK_MEMCPY (2); + CHECK_MEMCPY (3); + CHECK_MEMCPY (4); + CHECK_MEMCPY (5); + CHECK_MEMCPY (7); +
CHECK_MEMCPY (8); + CHECK_MEMCPY (9); + CHECK_MEMCPY (15); + CHECK_MEMCPY (16); + CHECK_MEMCPY (17); + CHECK_MEMCPY (31); + CHECK_MEMCPY (32); + CHECK_MEMCPY (33); + CHECK_MEMCPY (63); + CHECK_MEMCPY (64); + CHECK_MEMCPY (65); + CHECK_MEMCPY (127); + CHECK_MEMCPY (128); + CHECK_MEMCPY (129); + CHECK_MEMCPY (255); + CHECK_MEMCPY (256); + CHECK_MEMCPY (257); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memcpy-bounded-1b.c b/gcc/testsuite/gcc.target/i386/builtin-memcpy-bounded-1b.c new file mode 100644 index 00000000000..aa7263ecdfb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memcpy-bounded-1b.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O0 -minline-all-stringops" } */ + +/* Test -O0 -minline-all-stringops on memcpy with various bounds. */ + +#include "builtin-memcpy-bounded-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-1a.c b/gcc/testsuite/gcc.target/i386/builtin-memset-1a.c new file mode 100644 index 00000000000..33f813cdbcf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-1a.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movd %esi, %xmm0 +** movb %sil, 128\(%rdi\) +** punpcklbw %xmm0, %xmm0 +** punpcklwd %xmm0, %xmm0 +** pshufd \$0, %xmm0, %xmm0 +** movups %xmm0, \(%rdi\) +** movups %xmm0, 16\(%rdi\) +** movups %xmm0, 32\(%rdi\) +** movups %xmm0, 48\(%rdi\) +** movups %xmm0, 64\(%rdi\) +** movups %xmm0, 80\(%rdi\) +** movups %xmm0, 96\(%rdi\) +** movups %xmm0, 112\(%rdi\) +** ret +** .cfi_endproc +**... 
+*/ + +void +foo (char *dest, int c) +{ + __builtin_memset (dest, c, 129); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-1b.c b/gcc/testsuite/gcc.target/i386/builtin-memset-1b.c new file mode 100644 index 00000000000..0e1edf200d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-1b.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mtune=znver3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movzbl %sil, %eax +** movabsq \$72340172838076673, %rdx +** movq %rdi, %rcx +** leaq 8\(%rdi\), %rdi +** imulq %rdx, %rax +** movq %rax, -8\(%rdi\) +** movq %rax, 113\(%rdi\) +** andq \$-8, %rdi +** subq %rdi, %rcx +** addl \$129, %ecx +** shrl \$3, %ecx +** rep stosq +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memset-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-2a.c b/gcc/testsuite/gcc.target/i386/builtin-memset-2a.c new file mode 100644 index 00000000000..9c72189a40d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-2a.c @@ -0,0 +1,59 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$15, %rdx +** jbe .L9 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** cmpl \$8, %edx +** jnb .L10 +** cmpl \$4, %edx +** jnb .L5 +** cmpl \$1, %edx +** ja .L6 +** jb .L1 +** movb %sil, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +foo (char *dest, int c, __SIZE_TYPE__ n) +{ + if (n < 16) + __builtin_memset (dest, c, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-2b.c b/gcc/testsuite/gcc.target/i386/builtin-memset-2b.c new file mode 100644 index 00000000000..49a7e49dc4f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-2b.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$15, %rdx +** jbe .L10 +**.L8: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** cmpl \$8, %edx +** jnb .L11 +** cmpl \$4, %edx +** jnb .L5 +** cmpl \$1, %edx +** ja .L6 +** jb .L8 +** movb %sil, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memset-2a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-2c.c b/gcc/testsuite/gcc.target/i386/builtin-memset-2c.c new file mode 100644 index 00000000000..f3dfb845918 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-2c.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$15, %rdx +** jbe .L10 +**.L8: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** cmpl \$8, %edx +** jnb .L11 +** cmpl \$4, %edx +** jnb .L5 +** cmpl \$1, %edx +** ja .L6 +** jb .L8 +** movb %sil, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memset-2a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-3a.c b/gcc/testsuite/gcc.target/i386/builtin-memset-3a.c new file mode 100644 index 00000000000..83601dd34a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-3a.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$31, %rdx +** jbe .L10 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** movq %rax, %xmm0 +** punpcklqdq %xmm0, %xmm0 +** cmpl \$16, %edx +** jnb .L11 +** cmpl \$8, %edx +** jnb .L5 +** cmpl \$4, %edx +** jnb .L6 +** cmpl \$1, %edx +** ja .L7 +** jb .L1 +** movb %sil, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movl %edx, %edx +** movups %xmm0, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +void +foo (char *dest, int c, __SIZE_TYPE__ n) +{ + if (n < 32) + __builtin_memset (dest, c, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-3b.c b/gcc/testsuite/gcc.target/i386/builtin-memset-3b.c new file mode 100644 index 00000000000..25f96ada02c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-3b.c @@ -0,0 +1,65 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$31, %rdx +** jbe .L11 +**.L9: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** vmovq %rax, %xmm1 +** vpunpcklqdq %xmm1, %xmm1, %xmm0 +** cmpl \$16, %edx +** jnb .L12 +** cmpl \$8, %edx +** jnb .L5 +** cmpl \$4, %edx +** jnb .L6 +** cmpl \$1, %edx +** ja .L7 +** jb .L9 +** movb %sil, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** movl %edx, %edx +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memset-3a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-3c.c b/gcc/testsuite/gcc.target/i386/builtin-memset-3c.c new file mode 100644 index 00000000000..cce2f5062dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-3c.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**... 
+*/ + +#include "builtin-memset-3a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-4a.c b/gcc/testsuite/gcc.target/i386/builtin-memset-4a.c new file mode 100644 index 00000000000..e80d31dc71e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-4a.c @@ -0,0 +1,80 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$63, %rdx +** jbe .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** movq %rax, %xmm0 +** punpcklqdq %xmm0, %xmm0 +** cmpl \$16, %edx +** jnb .L13 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L1 +** movb %sil, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movups %xmm0, \(%rdi\) +** cmpl \$32, %edx +** ja .L5 +** movl %edx, %edx +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** movups %xmm0, 16\(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... 
+*/ + +void +foo (char *dest, int c, __SIZE_TYPE__ n) +{ + if (n < 64) + __builtin_memset (dest, c, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-4b.c b/gcc/testsuite/gcc.target/i386/builtin-memset-4b.c new file mode 100644 index 00000000000..df566f25a35 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-4b.c @@ -0,0 +1,82 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$63, %rdx +** jbe .L13 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** vmovq %rax, %xmm1 +** vpbroadcastq %xmm1, %ymm0 +** cmpl \$32, %edx +** jnb .L14 +** cmpl \$16, %edx +** jnb .L5 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L10 +** movb %sil, \(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movl %edx, %edx +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** vzeroupper +** ret +**.L10: +** vzeroupper +** ret +** .cfi_endproc +**... 
+*/ + +#include "builtin-memset-4a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-4c.c b/gcc/testsuite/gcc.target/i386/builtin-memset-4c.c new file mode 100644 index 00000000000..965a299f0db --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-4c.c @@ -0,0 +1,81 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$63, %rdx +** jbe .L13 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** vpbroadcastq %rax, %ymm0 +** cmpl \$32, %edx +** jnb .L14 +** cmpl \$16, %edx +** jnb .L5 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L10 +** movb %sil, \(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movl %edx, %edx +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** vzeroupper +** ret +**.L10: +** vzeroupper +** ret +** .cfi_endproc +**... 
+*/ + +#include "builtin-memset-4a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-5a.c b/gcc/testsuite/gcc.target/i386/builtin-memset-5a.c new file mode 100644 index 00000000000..d66f9e96742 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-5a.c @@ -0,0 +1,93 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$127, %rdx +** jbe .L12 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L12: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** movq %rax, %xmm0 +** punpcklqdq %xmm0, %xmm0 +** cmpl \$16, %edx +** jnb .L13 +** cmpl \$8, %edx +** jnb .L6 +** cmpl \$4, %edx +** jnb .L7 +** cmpl \$1, %edx +** ja .L8 +** jb .L1 +** movb %sil, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L13: +** movups %xmm0, \(%rdi\) +** cmpl \$32, %edx +** ja .L5 +** movl %edx, %edx +** movups %xmm0, -16\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movups %xmm0, 16\(%rdi\) +** cmpl \$64, %edx +** ja .L14 +** movl %edx, %edx +** movups %xmm0, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movl %edx, %edx +** movups %xmm0, 32\(%rdi\) +** movups %xmm0, 48\(%rdi\) +** movups %xmm0, -16\(%rdi,%rdx\) +** movups %xmm0, -32\(%rdi,%rdx\) +** movups %xmm0, -48\(%rdi,%rdx\) +** movups %xmm0, -64\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** ret +** .cfi_endproc +**... 
+*/ + +void +foo (char *dest, int c, __SIZE_TYPE__ n) +{ + if (n < 128) + __builtin_memset (dest, c, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-5b.c b/gcc/testsuite/gcc.target/i386/builtin-memset-5b.c new file mode 100644 index 00000000000..a6a04cd2651 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-5b.c @@ -0,0 +1,93 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$127, %rdx +** jbe .L15 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** vmovq %rax, %xmm1 +** vpbroadcastq %xmm1, %ymm0 +** cmpl \$32, %edx +** jnb .L16 +** cmpl \$16, %edx +** jnb .L6 +** cmpl \$8, %edx +** jnb .L7 +** cmpl \$4, %edx +** jnb .L8 +** cmpl \$1, %edx +** ja .L9 +** jb .L12 +** movb %sil, \(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** vmovdqu %ymm0, \(%rdi\) +** cmpl \$64, %edx +** ja .L5 +** movl %edx, %edx +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** vmovdqu %ymm0, 32\(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vmovdqu %ymm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** 
vzeroupper +** ret +**.L12: +** vzeroupper +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memset-5a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-5c.c b/gcc/testsuite/gcc.target/i386/builtin-memset-5c.c new file mode 100644 index 00000000000..7e91aa865e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-5c.c @@ -0,0 +1,89 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$127, %rdx +** jbe .L14 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** movabsq \$72340172838076673, %rcx +** movzbl %sil, %eax +** imulq %rcx, %rax +** vpbroadcastq %rax, %zmm0 +** cmpl \$64, %edx +** jnb .L15 +** cmpl \$32, %edx +** jnb .L5 +** cmpl \$16, %edx +** jnb .L6 +** cmpl \$8, %edx +** jnb .L7 +** cmpl \$4, %edx +** jnb .L8 +** cmpl \$1, %edx +** ja .L9 +** jb .L11 +** movb %sil, \(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** movl %edx, %edx +** vmovdqu8 %zmm0, \(%rdi\) +** vmovdqu8 %zmm0, -64\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %edx, %edx +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %edx, %edx +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %edx, %edx +** movq %rax, \(%rdi\) +** movq %rax, -8\(%rdi,%rdx\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %edx, %edx +** movl %eax, \(%rdi\) +** movl %eax, -4\(%rdi,%rdx\) +** vzeroupper +** ret +**.L9: +** movl %edx, %edx +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rdx\) +** vzeroupper +** ret +**.L11: +** vzeroupper +** ret +** .cfi_endproc +**... 
+*/ + +#include "builtin-memset-5a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-6a.c b/gcc/testsuite/gcc.target/i386/builtin-memset-6a.c new file mode 100644 index 00000000000..f4a76ade2d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-6a.c @@ -0,0 +1,109 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$247, %rsi +** jbe .L16 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** movq %rdi, %rax +** cmpl \$16, %esi +** jnb .L17 +** cmpl \$8, %esi +** jnb .L6 +** cmpl \$4, %esi +** jnb .L7 +** cmpl \$1, %esi +** ja .L8 +** jb .L1 +** movb \$0, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L17: +** pxor %xmm0, %xmm0 +** cmpl \$32, %esi +** ja .L5 +** movl %esi, %esi +** movups %xmm0, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** cmpl \$128, %esi +** ja .L10 +** movups %xmm0, \(%rdi\) +** movups %xmm0, 16\(%rdi\) +** cmpl \$64, %esi +** jbe .L11 +** movups %xmm0, 32\(%rdi\) +** movups %xmm0, 48\(%rdi\) +**.L14: +** movl %esi, %esi +** movups %xmm0, -16\(%rdi,%rsi\) +** movups %xmm0, -32\(%rdi,%rsi\) +** movups %xmm0, -48\(%rdi,%rsi\) +** movups %xmm0, -64\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %esi, %esi +** movq \$0, \(%rdi\) +** movq \$0, -8\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movl %esi, %edx +**.L12: +** subl \$64, %edx +** movups %xmm0, \(%rax\) +** addq \$64, %rax +** movups %xmm0, -48\(%rax\) +** movups %xmm0, -32\(%rax\) +** movups %xmm0, -16\(%rax\) +** cmpl \$64, %edx +** ja .L12 +** jmp .L14 +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %esi, %esi +** movl \$0, \(%rdi\) +** movl \$0, -4\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 
+**.L11: +** movl %esi, %esi +** movups %xmm0, -16\(%rdi,%rsi\) +** movups %xmm0, -32\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** xorl %eax, %eax +** movl %esi, %esi +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rsi\) +** ret +** .cfi_endproc +**... +*/ + +void +foo (char *dst, __SIZE_TYPE__ n) +{ + if (n <= 247) + __builtin_memset(dst, 0, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-6b.c b/gcc/testsuite/gcc.target/i386/builtin-memset-6b.c new file mode 100644 index 00000000000..75b8d40a09f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-6b.c @@ -0,0 +1,97 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$247, %rsi +** jbe .L14 +**.L12: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L14: +** vpxor %xmm0, %xmm0, %xmm0 +** cmpl \$32, %esi +** jnb .L15 +** cmpl \$16, %esi +** jnb .L6 +** cmpl \$8, %esi +** jnb .L7 +** cmpl \$4, %esi +** jnb .L8 +** cmpl \$1, %esi +** ja .L9 +** jb .L12 +** movb \$0, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** vmovdqu %ymm0, \(%rdi\) +** cmpl \$64, %esi +** ja .L5 +** movl %esi, %esi +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** vmovdqu %ymm0, 32\(%rdi\) +** cmpl \$128, %esi +** ja .L16 +** movl %esi, %esi +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vmovdqu %ymm0, -64\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %esi, %esi +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** movl %esi, %esi +** vmovdqu %ymm0, 64\(%rdi\) +** vmovdqu %ymm0, 96\(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vmovdqu %ymm0, -64\(%rdi,%rsi\) +** vmovdqu %ymm0, 
-96\(%rdi,%rsi\) +** vmovdqu %ymm0, -128\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %esi, %esi +** movq \$0, \(%rdi\) +** movq \$0, -8\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %esi, %esi +** movl \$0, \(%rdi\) +** movl \$0, -4\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** xorl %eax, %eax +** movl %esi, %esi +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rsi\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memset-6a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-6c.c b/gcc/testsuite/gcc.target/i386/builtin-memset-6c.c new file mode 100644 index 00000000000..d93a518bbe2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-6c.c @@ -0,0 +1,91 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$247, %rsi +** jbe .L15 +**.L13: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** vpxor %xmm0, %xmm0, %xmm0 +** cmpl \$64, %esi +** jnb .L16 +** cmpl \$32, %esi +** jnb .L6 +** cmpl \$16, %esi +** jnb .L7 +** cmpl \$8, %esi +** jnb .L8 +** cmpl \$4, %esi +** jnb .L9 +** cmpl \$1, %esi +** ja .L10 +** jb .L13 +** movb \$0, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** vmovdqu8 %zmm0, \(%rdi\) +** cmpl \$128, %esi +** ja .L5 +** movl %esi, %esi +** vmovdqu8 %zmm0, -64\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %esi, %esi +** vmovdqu8 %zmm0, 64\(%rdi\) +** vmovdqu8 %zmm0, -64\(%rdi,%rsi\) +** vmovdqu8 %zmm0, -128\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %esi, %esi +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** 
movl %esi, %esi +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %esi, %esi +** movq \$0, \(%rdi\) +** movq \$0, -8\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** movl %esi, %esi +** movl \$0, \(%rdi\) +** movl \$0, -4\(%rdi,%rsi\) +** ret +**.L10: +** xorl %eax, %eax +** movl %esi, %esi +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rsi\) +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memset-6a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-7a.c b/gcc/testsuite/gcc.target/i386/builtin-memset-7a.c new file mode 100644 index 00000000000..ca7ffba0b23 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-7a.c @@ -0,0 +1,108 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$247, %rsi +** jbe .L16 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** movq %rdi, %rax +** cmpl \$16, %esi +** jnb .L17 +** cmpl \$8, %esi +** jnb .L6 +** cmpl \$4, %esi +** jnb .L7 +** cmpl \$1, %esi +** ja .L8 +** jb .L1 +** movb \$-1, \(%rdi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L17: +** pcmpeqd %xmm0, %xmm0 +** cmpl \$32, %esi +** ja .L5 +** movl %esi, %esi +** movups %xmm0, \(%rdi\) +** movups %xmm0, -16\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** cmpl \$128, %esi +** ja .L10 +** movups %xmm0, \(%rdi\) +** movups %xmm0, 16\(%rdi\) +** cmpl \$64, %esi +** jbe .L11 +** movups %xmm0, 32\(%rdi\) +** movups %xmm0, 48\(%rdi\) +**.L14: +** movl %esi, %esi +** movups %xmm0, -16\(%rdi,%rsi\) +** movups %xmm0, -32\(%rdi,%rsi\) +** movups %xmm0, -48\(%rdi,%rsi\) +** movups %xmm0, -64\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %esi, %esi +** movq \$-1, 
\(%rdi\) +** movq \$-1, -8\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L10: +** movl %esi, %edx +**.L12: +** subl \$64, %edx +** movups %xmm0, \(%rax\) +** addq \$64, %rax +** movups %xmm0, -48\(%rax\) +** movups %xmm0, -32\(%rax\) +** movups %xmm0, -16\(%rax\) +** cmpl \$64, %edx +** ja .L12 +** jmp .L14 +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %esi, %esi +** movl \$-1, \(%rdi\) +** movl \$-1, -4\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L11: +** movl %esi, %esi +** movups %xmm0, -16\(%rdi,%rsi\) +** movups %xmm0, -32\(%rdi,%rsi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl \$-1, %eax +** movl %esi, %esi +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rsi\) +** ret +**... +*/ + +void +foo (char *dst, __SIZE_TYPE__ n) +{ + if (n <= 247) + __builtin_memset(dst, -1, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-7b.c b/gcc/testsuite/gcc.target/i386/builtin-memset-7b.c new file mode 100644 index 00000000000..1054e5f0c2e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-7b.c @@ -0,0 +1,103 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). 
*/ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$247, %rsi +** jbe .L15 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L15: +** vpcmpeqd %ymm0, %ymm0, %ymm0 +** cmpl \$32, %esi +** jnb .L16 +** cmpl \$16, %esi +** jnb .L6 +** cmpl \$8, %esi +** jnb .L7 +** cmpl \$4, %esi +** jnb .L8 +** cmpl \$1, %esi +** ja .L9 +** jb .L12 +** movb \$-1, \(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** vmovdqu %ymm0, \(%rdi\) +** cmpl \$64, %esi +** ja .L5 +** movl %esi, %esi +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** vmovdqu %ymm0, 32\(%rdi\) +** cmpl \$128, %esi +** ja .L17 +** movl %esi, %esi +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vmovdqu %ymm0, -64\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %esi, %esi +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L17: +** movl %esi, %esi +** vmovdqu %ymm0, 64\(%rdi\) +** vmovdqu %ymm0, 96\(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vmovdqu %ymm0, -64\(%rdi,%rsi\) +** vmovdqu %ymm0, -96\(%rdi,%rsi\) +** vmovdqu %ymm0, -128\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %esi, %esi +** movq \$-1, \(%rdi\) +** movq \$-1, -8\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %esi, %esi +** movl \$-1, \(%rdi\) +** movl \$-1, -4\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** movl \$-1, %eax +** movl %esi, %esi +** movw %ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rsi\) +** vzeroupper +** ret +**.L12: +** vzeroupper +** ret +**... 
+*/ + +#include "builtin-memset-7a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-7c.c b/gcc/testsuite/gcc.target/i386/builtin-memset-7c.c new file mode 100644 index 00000000000..6c99eafea20 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-7c.c @@ -0,0 +1,99 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** cmpq \$247, %rsi +** jbe .L16 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L16: +** vpxor %xmm0, %xmm0, %xmm0 +** vpternlogd \$0xFF, %zmm0, %zmm0, %zmm0 +** cmpl \$64, %esi +** jnb .L17 +** cmpl \$32, %esi +** jnb .L6 +** cmpl \$16, %esi +** jnb .L7 +** cmpl \$8, %esi +** jnb .L8 +** cmpl \$4, %esi +** jnb .L9 +** cmpl \$1, %esi +** ja .L10 +** jb .L13 +** movb \$-1, \(%rdi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L17: +** vmovdqu8 %zmm0, \(%rdi\) +** cmpl \$128, %esi +** ja .L5 +** movl %esi, %esi +** vmovdqu8 %zmm0, -64\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L5: +** movl %esi, %esi +** vmovdqu8 %zmm0, 64\(%rdi\) +** vmovdqu8 %zmm0, -64\(%rdi,%rsi\) +** vmovdqu8 %zmm0, -128\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L6: +** movl %esi, %esi +** vmovdqu %ymm0, \(%rdi\) +** vmovdqu %ymm0, -32\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L7: +** movl %esi, %esi +** vmovdqu %xmm0, \(%rdi\) +** vmovdqu %xmm0, -16\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L8: +** movl %esi, %esi +** movq \$-1, \(%rdi\) +** movq \$-1, -8\(%rdi,%rsi\) +** vzeroupper +** ret +** .p2align 4,,10 +** .p2align 3 +**.L9: +** movl %esi, %esi +** movl \$-1, \(%rdi\) +** movl \$-1, -4\(%rdi,%rsi\) +** vzeroupper +** ret +**.L10: +** movl \$-1, %eax +** movl %esi, %esi +** movw 
%ax, \(%rdi\) +** movw %ax, -2\(%rdi,%rsi\) +** vzeroupper +** ret +**.L13: +** vzeroupper +** ret +** .cfi_endproc +**... +*/ + +#include "builtin-memset-7a.c" diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-8.c b/gcc/testsuite/gcc.target/i386/builtin-memset-8.c new file mode 100644 index 00000000000..e51f4aaa578 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-8.c @@ -0,0 +1,93 @@ +/* { dg-do compile { target { maybe_x32 && lp64 } } } */ +/* { dg-options "-O2 -mx32 -march=x86-64 -minline-all-stringops" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**foo: +**.LFB0: +** .cfi_startproc +** movabsq \$72340172838076673, %rax +** movzbl %sil, %esi +** imulq %rax, %rsi +** movq %rsi, %xmm0 +** punpcklqdq %xmm0, %xmm0 +** cmpl \$64, %edx +** jnb .L2 +** testb \$32, %dl +** jne .L19 +** testb \$16, %dl +** jne .L20 +** testb \$8, %dl +** jne .L21 +** testb \$4, %dl +** jne .L22 +** testl %edx, %edx +** jne .L23 +**.L1: +** ret +** .p2align 4,,10 +** .p2align 3 +**.L2: +** movups %xmm0, -64\(%edx,%edi\) +** subl \$1, %edx +** movups %xmm0, -47\(%edx,%edi\) +** movups %xmm0, -31\(%edx,%edi\) +** movups %xmm0, -15\(%edx,%edi\) +** cmpl \$64, %edx +** jb .L1 +** andl \$-64, %edx +** xorl %eax, %eax +**.L9: +** movups %xmm0, \(%eax,%edi\) +** addl \$64, %eax +** movups %xmm0, -48\(%eax,%edi\) +** movups %xmm0, -32\(%eax,%edi\) +** movups %xmm0, -16\(%eax,%edi\) +** cmpl %edx, %eax +** jb .L9 +** ret +** .p2align 4,,10 +** .p2align 3 +**.L23: +** movb %sil, \(%edi\) +** testb \$2, %dl +** je .L1 +** movw %si, -2\(%edx,%edi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L19: +** movups %xmm0, \(%edi\) +** movups %xmm0, 16\(%edi\) +** movups %xmm0, -32\(%edx,%edi\) +** movups %xmm0, -16\(%edx,%edi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L20: +** movups %xmm0, \(%edi\) +** movups %xmm0, -16\(%edx,%edi\) +** ret +** .p2align 4,,10 
+** .p2align 3 +**.L21: +** movq %rsi, \(%edi\) +** movq %rsi, -8\(%edx,%edi\) +** ret +** .p2align 4,,10 +** .p2align 3 +**.L22: +** movl %esi, \(%edi\) +** movl %esi, -4\(%edx,%edi\) +** ret +** .cfi_endproc +**... +*/ + +void +foo (char *dst, int c, __SIZE_TYPE__ n) +{ + if (n <= (__SIZE_TYPE__) -1) + __builtin_memset (dst, c, n); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-bounded-1a.c b/gcc/testsuite/gcc.target/i386/builtin-memset-bounded-1a.c new file mode 100644 index 00000000000..ef2c4d5d967 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-bounded-1a.c @@ -0,0 +1,118 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -minline-all-stringops" } */ + +/* Test -O2 -minline-all-stringops on memset with various bounds. */ + +#include <stdlib.h> + +#define MAX_LENGTH 4096 +#define EXTRA 64 +#define TOTAL (EXTRA + MAX_LENGTH + EXTRA) + +#define MEMSET_BOUND(BOUND) \ +__attribute__ ((noipa, noinline)) \ +static char * \ +memset_##BOUND (char *buf, size_t len) \ +{ \ + if (len <= BOUND) \ + return __builtin_memset (buf + EXTRA, 'A', len); \ + else \ + return buf + EXTRA; \ +} + +#define CHECK_MEMSET_BOUND(BOUND, SIZE) \ + { \ + for (i = 0; i < TOTAL; i++) \ + buf[i] = 'a'; \ + p = memset_##BOUND (buf, SIZE); \ + if (p != buf + EXTRA) \ + abort (); \ + for (i = 0; i < SIZE; i++, p++) \ + if (*p != 'A') \ + abort (); \ + for (; i < (TOTAL - EXTRA); i++, p++) \ + if (*p != 'a') \ + abort (); \ + p = buf; \ + for (i = 0; i < EXTRA; i++, p++) \ + if (*p != 'a') \ + abort (); \ + } + +#define CHECK_MEMSET(SIZE) \ + CHECK_MEMSET_BOUND (SIZE, SIZE) \ + if (SIZE > 1) \ + CHECK_MEMSET_BOUND (SIZE, SIZE - 1) \ + if (SIZE > 2) \ + CHECK_MEMSET_BOUND (SIZE, SIZE - 2) \ + if (SIZE > 3) \ + CHECK_MEMSET_BOUND (SIZE, SIZE - 3) \ + if (SIZE > 4) \ + CHECK_MEMSET_BOUND (SIZE, SIZE - 4) \ + if (SIZE > 5) \ + CHECK_MEMSET_BOUND (SIZE, SIZE - 5) \ + if (SIZE > 6) \ + CHECK_MEMSET_BOUND (SIZE, SIZE - 6) \ + if (SIZE > 7) \ + CHECK_MEMSET_BOUND (SIZE, SIZE - 7) + +char 
buf[TOTAL]; + +MEMSET_BOUND (0); +MEMSET_BOUND (1); +MEMSET_BOUND (2); +MEMSET_BOUND (3); +MEMSET_BOUND (4); +MEMSET_BOUND (5); +MEMSET_BOUND (7); +MEMSET_BOUND (8); +MEMSET_BOUND (9); +MEMSET_BOUND (15); +MEMSET_BOUND (16); +MEMSET_BOUND (17); +MEMSET_BOUND (31); +MEMSET_BOUND (32); +MEMSET_BOUND (33); +MEMSET_BOUND (63); +MEMSET_BOUND (64); +MEMSET_BOUND (65); +MEMSET_BOUND (127); +MEMSET_BOUND (128); +MEMSET_BOUND (129); +MEMSET_BOUND (255); +MEMSET_BOUND (256); +MEMSET_BOUND (257); + +int +main (void) +{ + unsigned int i; + char *p; + + CHECK_MEMSET (0); + CHECK_MEMSET (1); + CHECK_MEMSET (2); + CHECK_MEMSET (3); + CHECK_MEMSET (4); + CHECK_MEMSET (5); + CHECK_MEMSET (7); + CHECK_MEMSET (8); + CHECK_MEMSET (9); + CHECK_MEMSET (15); + CHECK_MEMSET (16); + CHECK_MEMSET (17); + CHECK_MEMSET (31); + CHECK_MEMSET (32); + CHECK_MEMSET (33); + CHECK_MEMSET (63); + CHECK_MEMSET (64); + CHECK_MEMSET (65); + CHECK_MEMSET (127); + CHECK_MEMSET (128); + CHECK_MEMSET (129); + CHECK_MEMSET (255); + CHECK_MEMSET (256); + CHECK_MEMSET (257); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/builtin-memset-bounded-1b.c b/gcc/testsuite/gcc.target/i386/builtin-memset-bounded-1b.c new file mode 100644 index 00000000000..302b571aa1e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/builtin-memset-bounded-1b.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O0 -minline-all-stringops" } */ + +/* Test -O0 -minline-all-stringops on memset with various bounds. */ + +#include "builtin-memset-bounded-1a.c"