From: Naveen Date: Thu, 11 Jun 2026 13:26:23 +0000 (-0700) Subject: [AArch64]: Use MOVI for low‑64‑bit integer SIMD constant vectors [PR113926] X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f152cf1734f8087b982605ce93d4fb048283564e;p=thirdparty%2Fgcc.git [AArch64]: Use MOVI for low‑64‑bit integer SIMD constant vectors [PR113926] Extend AdvSIMD constant materialization to recognize 128‑bit integer vector constants where the low 64 bits contain a duplicated scalar value and the high 64 bits are zero. Bootstrapped and tested on aarch64-linux-gnu. gcc/ChangeLog: PR target/113926 * config/aarch64/aarch64.cc (struct simd_immediate_info): Add width field to record AdvSIMD output vector width. (simd_immediate_info::simd_immediate_info): Initialize width to zero in all constructors. (aarch64_simd_valid_imm): Allow 128-bit AdvSIMD MOV immediates with zero high 64 bits to be materialized using 64-bit MOVI. (aarch64_output_simd_imm): Use recorded immediate width when outputting AdvSIMD immediates. gcc/testsuite/ChangeLog: PR target/113926 * gcc.target/aarch64/pr113926.c: New test. * gcc.target/aarch64/pr113926_1.c: New test. Signed-off-by: Naveen --- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 983cef6d681..69530ea9d34 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -169,6 +169,9 @@ struct simd_immediate_info /* The mode of the elements. */ scalar_mode elt_mode; + /* If nonzero, the vector width to print the AdvSIMD immediate. */ + unsigned int width = 0; + /* The instruction to use to move the immediate into a vector. */ insn_type insn; @@ -203,7 +206,7 @@ struct simd_immediate_info ELT_MODE_IN and value VALUE_IN. 
*/ inline simd_immediate_info ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in) - : elt_mode (elt_mode_in), insn (MOV) + : elt_mode (elt_mode_in), width (0), insn (MOV) { u.mov.value = value_in; u.mov.modifier = LSL; @@ -218,7 +221,7 @@ inline simd_immediate_info unsigned HOST_WIDE_INT value_in, insn_type insn_in, modifier_type modifier_in, unsigned int shift_in) - : elt_mode (elt_mode_in), insn (insn_in) + : elt_mode (elt_mode_in), width (0), insn (insn_in) { u.mov.value = gen_int_mode (value_in, elt_mode_in); u.mov.modifier = modifier_in; @@ -229,7 +232,7 @@ inline simd_immediate_info and where element I is equal to BASE_IN + I * STEP_IN. */ inline simd_immediate_info ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in) - : elt_mode (elt_mode_in), insn (INDEX) + : elt_mode (elt_mode_in), width (0), insn (INDEX) { u.index.base = base_in; u.index.step = step_in; @@ -240,7 +243,7 @@ inline simd_immediate_info inline simd_immediate_info ::simd_immediate_info (scalar_int_mode elt_mode_in, aarch64_svpattern pattern_in) - : elt_mode (elt_mode_in), insn (PTRUE) + : elt_mode (elt_mode_in), width (0), insn (PTRUE) { u.pattern = pattern_in; } @@ -24635,11 +24638,32 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info *info, } } - /* The immediate must repeat every eight bytes. */ + /* The immediate must normally repeat every eight bytes. For MOV + also allow a 128-bit AdvSIMD constant whose high 64 bits are zero + since it can be materialized using a 64-bit MOVI. 
*/ unsigned int nbytes = bytes.length (); - for (unsigned i = 8; i < nbytes; ++i) + unsigned int output_width = 0; + bool repeats_every_8_bytes = true; + + for (unsigned int i = 8; i < nbytes; ++i) if (bytes[i] != bytes[i - 8]) - return false; + { + repeats_every_8_bytes = false; + break; + } + + if (!repeats_every_8_bytes) + { + if (which != AARCH64_CHECK_MOV || !(vec_flags & VEC_ADVSIMD) + || aarch64_sve_mode_p (mode) || nbytes != 16) + return false; + + for (unsigned int i = 8; i < nbytes; ++i) + if (bytes[i] != 0) + return false; + + output_width = 64; + } /* Get the repeating 8-byte value as an integer. No endian correction is needed here because bytes is already in lsb-first order. */ @@ -24692,6 +24716,7 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info *info, { rtx float_val = const_double_from_real_value (r, fmode); *info = simd_immediate_info (fmode, float_val); + info->width = output_width; } return true; } @@ -24701,7 +24726,12 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info *info, return aarch64_sve_valid_immediate (ival, imode, info, which); if (aarch64_advsimd_valid_immediate (val64, imode, info, which)) - return true; + { + if (info) + info->width = output_width; + return true; + } - if (TARGET_SVE) + /* An SVE immediate is replicated into every element, so it cannot be used when the high 64 bits must stay zero. */ + if (TARGET_SVE && output_width == 0) return aarch64_sve_valid_immediate (ival, imode, info, which); @@ -27235,6 +27265,9 @@ aarch64_output_simd_imm (rtx const_vector, unsigned width, is_valid = aarch64_simd_valid_imm (const_vector, &info, which); gcc_assert (is_valid); + if (info.width != 0) + width = info.width; + element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); lane_count = width / GET_MODE_BITSIZE (info.elt_mode); diff --git a/gcc/testsuite/gcc.target/aarch64/pr113926.c b/gcc/testsuite/gcc.target/aarch64/pr113926.c new file mode 100644 index 00000000000..d55ec77d57c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113926.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +typedef signed char v16qi __attribute__((vector_size(16))); 
+typedef short v8hi __attribute__((vector_size(16))); +typedef int v4si __attribute__((vector_size(16))); + +v16qi +f_qi (void) +{ + return (v16qi) + { 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +v8hi +f_hi (void) +{ + return (v8hi) + { 2, 2, 2, 2, + 0, 0, 0, 0 }; +} + +v4si +f_si (void) +{ + return (v4si) + { 1, 1, 0, 0 }; +} + +/* { dg-final { scan-assembler-times {\tmovi\tv[0-9]+\.8b, 0x3} 1 } } */ +/* { dg-final { scan-assembler-times {\tmovi\tv[0-9]+\.4h, 0x2} 1 } } */ +/* { dg-final { scan-assembler-times {\tmovi\tv[0-9]+\.2s, 0x1} 1 } } */ +/* { dg-final { scan-assembler-not {\tldr\tq[0-9]+,} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pr113926_1.c b/gcc/testsuite/gcc.target/aarch64/pr113926_1.c new file mode 100644 index 00000000000..8e76ca52b39 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113926_1.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv8.2-a+fp16" } */ + +typedef float v2sf __attribute__((vector_size(8))); +typedef float v4sf __attribute__((vector_size(16))); +typedef double v2df __attribute__((vector_size(16))); +typedef __fp16 v4hf __attribute__((vector_size(8))); +typedef __fp16 v8hf __attribute__((vector_size(16))); + +v2sf +f_v2sf (void) +{ + return (v2sf){ 1.0f, 1.0f }; +} + +v4sf +f_v4sf (void) +{ + return (v4sf){ 1.0f, 1.0f, 0.0f, 0.0f }; +} + +v2df +f_v2df (void) +{ + return (v2df){ 1.0, 0.0 }; +} + +v4hf +f_v4hf (void) +{ + return (v4hf){ (__fp16)1.0, (__fp16)1.0, (__fp16)1.0, (__fp16)1.0 }; +} + +v8hf +f_v8hf (void) +{ + return (v8hf){ (__fp16)1.0, (__fp16)1.0, (__fp16)1.0, (__fp16)1.0, 0, 0, 0, 0 }; +} + +/* Each function should use fmov, not a literal pool load (which would show up as an ldr into an s/d/q register). */ +/* { dg-final { scan-assembler-times {fmov[ \t]+v[0-9]+[.]2s,[ \t]+1[.]0e[+]0} 2 } } */ +/* { dg-final { scan-assembler-times {fmov[ \t]+v[0-9]+[.]4h,[ \t]+1[.]0e[+]0} 2 } } */ +/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+,} 1 } } */ +/* { dg-final { scan-assembler-not {\tldr\t[sdq][0-9]+,} } } */