From: Wilco Dijkstra
Date: Wed, 25 Oct 2023 15:28:04 +0000 (+0100)
Subject: AArch64: Fix strict-align cpymem/setmem [PR103100]
X-Git-Tag: basepoints/gcc-15~4127
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=318f5232cfb3e0c9694889565e1f5424d0354463;p=thirdparty%2Fgcc.git

AArch64: Fix strict-align cpymem/setmem [PR103100]

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
With STRICT_ALIGNMENT, block the expansion if the alignment is less than 16.
Clean up the conditions for using MOPS.

gcc/ChangeLog:
	PR target/103100
	* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
	(setmemdi): Likewise.
	* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
	strict-align.  Clean up condition for using MOPS.
	(aarch64_expand_setmem): Likewise.
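
A minimal sketch of the kind of code affected (a hypothetical example, not
taken from the PR testcase or the patch; compile with gcc -O2 -mstrict-align):

/* Under -mstrict-align only byte alignment may be assumed for dst/src,
   so these fixed-size calls must not be expanded with wider accesses;
   with this patch they go through MOPS or a library call instead.  */
#include <string.h>

void copy32 (char *dst, const char *src)
{
  memcpy (dst, src, 32);	/* cpymemdi, align < 16.  */
}

void zero32 (char *dst)
{
  memset (dst, 0, 32);		/* setmemdi, align < 16.  */
}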
---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 64684258b7bf..bc126cc60fc3 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23105,27 +23105,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);

-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_cpymem_mops (operands);

-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);

-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;

-  /* Large constant-sized cpymem should go through MOPS when possible.
-     It should be a win even for size optimization in the general case.
-     For speed optimization the choice between MOPS and the SIMD sequence
-     depends on the size of the copy, rather than number of instructions,
-     alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
     return aarch64_expand_cpymem_mops (operands);

   int copy_bits = 256;
@@ -23289,12 +23285,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;

-  /* If we don't have SIMD registers or the size is variable use the MOPS
-     inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+      || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);

   bool size_p = optimize_function_for_size_p (cfun);
@@ -23302,10 +23299,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;

-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+    return aarch64_expand_setmem_mops (operands);

   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -23318,12 +23318,6 @@ aarch64_expand_setmem (rtx *operands)
      the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;

-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
-     when available.  */
-  if (TARGET_MOPS
-      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
      Count the number of operations involved to see if it's worth it
      against the alternatives.  A simple counter simd_ops on the
@@ -23365,10 +23359,8 @@ aarch64_expand_setmem (rtx *operands)
       simd_ops++;
       n -= mode_bits;

-      /* Do certain trailing copies as overlapping if it's going to be
-	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
-	 byte copy it's more efficient to do two overlapping 8 byte copies than
-	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+      /* Emit trailing writes using overlapping unaligned accesses
+	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
       if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (n, MODE_INT);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 7be1de38b1c3..4a3af6df7e7c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1630,7 +1630,7 @@
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""
 {
   if (aarch64_expand_cpymem (operands))
     DONE;
@@ -1727,7 +1727,7 @@
 	    (match_operand:QI 2 "nonmemory_operand")) ;; Value
    (use (match_operand:DI 1 "general_operand")) ;; Length
    (match_operand 3 "immediate_operand")]	;; Align
-  "TARGET_SIMD || TARGET_MOPS"
+  ""
 {
   if (aarch64_expand_setmem (operands))
     DONE;
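
As a reading aid, the size dispatch the patch introduces in
aarch64_expand_cpymem can be modelled in plain C roughly as below
(a sketch with invented names, not code from GCC):

/* Illustrative model only; the real logic is in aarch64_expand_cpymem
   and aarch64_expand_setmem above.  */
enum strategy { EXPAND_INLINE, EXPAND_MOPS, EXPAND_LIBCALL };

static enum strategy
choose_strategy (int const_size_p, unsigned long size, unsigned align,
		 int strict_align_p, int have_mops_p,
		 unsigned long mops_threshold)
{
  /* Variable sizes, or under-aligned copies under -mstrict-align,
     use MOPS when available and a library call otherwise.  */
  if (!const_size_p || (strict_align_p && align < 16))
    return have_mops_p ? EXPAND_MOPS : EXPAND_LIBCALL;

  /* Large copies: beyond the 256-byte inline limit or the MOPS
     threshold, again MOPS if available, else a library call.  */
  if (size > 256 || (have_mops_p && size > mops_threshold))
    return have_mops_p ? EXPAND_MOPS : EXPAND_LIBCALL;

  return EXPAND_INLINE;
}

The setmem path has the same shape, with an additional !TARGET_SIMD check
that likewise falls back to MOPS or a library call.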