Continuing the series of straightforward annotations, this one handles the normal (not widening or narrowing) vector shifts.
Tests included.
Bootstrapped and tested on aarch64-none-linux-gnu and aarch64_be-none-elf.
gcc/ChangeLog:
PR target/99195
* config/aarch64/aarch64-simd.md (aarch64_simd_lshr<mode>): Rename to...
(aarch64_simd_lshr<mode><vczle><vczbe>): ... This.
(aarch64_simd_ashr<mode>): Rename to...
(aarch64_simd_ashr<mode><vczle><vczbe>): ... This.
(aarch64_simd_imm_shl<mode>): Rename to...
(aarch64_simd_imm_shl<mode><vczle><vczbe>): ... This.
(aarch64_simd_reg_sshl<mode>): Rename to...
(aarch64_simd_reg_sshl<mode><vczle><vczbe>): ... This.
(aarch64_simd_reg_shl<mode>_unsigned): Rename to...
(aarch64_simd_reg_shl<mode>_unsigned<vczle><vczbe>): ... This.
(aarch64_simd_reg_shl<mode>_signed): Rename to...
(aarch64_simd_reg_shl<mode>_signed<vczle><vczbe>): ... This.
(vec_shr_<mode>): Rename to...
(vec_shr_<mode><vczle><vczbe>): ... This.
(aarch64_<sur>shl<mode>): Rename to...
(aarch64_<sur>shl<mode><vczle><vczbe>): ... This.
(aarch64_<sur>q<r>shl<mode>): Rename to...
(aarch64_<sur>q<r>shl<mode><vczle><vczbe>): ... This.
gcc/testsuite/ChangeLog:
PR target/99195
* gcc.target/aarch64/simd/pr99195_1.c: Add testing for shifts.
* gcc.target/aarch64/simd/pr99195_6.c: Likewise.
* gcc.target/aarch64/simd/pr99195_8.c: New test.
DONE;
})
-(define_insn "aarch64_simd_lshr<mode>"
+(define_insn "aarch64_simd_lshr<mode><vczle><vczbe>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
(match_operand:VDQ_I 2 "aarch64_simd_rshift_imm" "Dr")))]
[(set_attr "type" "neon_shift_imm<q>")]
)
-(define_insn "aarch64_simd_ashr<mode>"
+(define_insn "aarch64_simd_ashr<mode><vczle><vczbe>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w,w")
(ashiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w,w")
(match_operand:VDQ_I 2 "aarch64_simd_rshift_imm" "D1,Dr")))]
[(set_attr "type" "neon_shift_acc<q>")]
)
-(define_insn "aarch64_simd_imm_shl<mode>"
+(define_insn "aarch64_simd_imm_shl<mode><vczle><vczbe>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
(match_operand:VDQ_I 2 "aarch64_simd_lshift_imm" "Dl")))]
[(set_attr "type" "neon_shift_imm<q>")]
)
-(define_insn "aarch64_simd_reg_sshl<mode>"
+(define_insn "aarch64_simd_reg_sshl<mode><vczle><vczbe>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
(match_operand:VDQ_I 2 "register_operand" "w")))]
[(set_attr "type" "neon_shift_reg<q>")]
)
-(define_insn "aarch64_simd_reg_shl<mode>_unsigned"
+(define_insn "aarch64_simd_reg_shl<mode>_unsigned<vczle><vczbe>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand" "w")
(match_operand:VDQ_I 2 "register_operand" "w")]
[(set_attr "type" "neon_shift_reg<q>")]
)
-(define_insn "aarch64_simd_reg_shl<mode>_signed"
+(define_insn "aarch64_simd_reg_shl<mode>_signed<vczle><vczbe>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand" "w")
(match_operand:VDQ_I 2 "register_operand" "w")]
)
;; For 64-bit modes we use ushl/r, as this does not require a SIMD zero.
-(define_insn "vec_shr_<mode>"
+(define_insn "vec_shr_<mode><vczle><vczbe>"
[(set (match_operand:VD 0 "register_operand" "=w")
(unspec:VD [(match_operand:VD 1 "register_operand" "w")
(match_operand:SI 2 "immediate_operand" "i")]
;; vshl
-(define_insn "aarch64_<sur>shl<mode>"
+(define_insn "aarch64_<sur>shl<mode><vczle><vczbe>"
[(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w")
(unspec:VSDQ_I_DI
[(match_operand:VSDQ_I_DI 1 "register_operand" "w")
;; vqshl
-(define_insn "aarch64_<sur>q<r>shl<mode>"
+(define_insn "aarch64_<sur>q<r>shl<mode><vczle><vczbe>"
[(set (match_operand:VSDQ_I 0 "register_operand" "=w")
(unspec:VSDQ_I
[(match_operand:VSDQ_I 1 "register_operand" "w")
OPNINETEEN (int16, 4, 8, s16, padd, add, qadd, qsub, sub, mul, and, orr, eor, orn, bic, max, min, hadd, rhadd, hsub, abd, pmax, pmin)
OPNINETEEN (int32, 2, 4, s32, padd, add, qadd, qsub, sub, mul, and, orr, eor, orn, bic, max, min, hadd, rhadd, hsub, abd, pmax, pmin)
-OPFOUR (int8, 8, 16, s8, zip1, zip2, uzp1, uzp2)
-OPFOUR (int16, 4, 8, s16, zip1, zip2, uzp1, uzp2)
-OPFOUR (int32, 2, 4, s32, zip1, zip2, uzp1, uzp2)
+OPSIX (int8, 8, 16, s8, zip1, zip2, uzp1, uzp2, shl, qshl)
+OPSIX (int16, 4, 8, s16, zip1, zip2, uzp1, uzp2, shl, qshl)
+OPSIX (int32, 2, 4, s32, zip1, zip2, uzp1, uzp2, shl, qshl)
OPNINETEEN (uint8, 8, 16, u8, padd, add, qadd, qsub, sub, mul, and, orr, eor, orn, bic, max, min, hadd, rhadd, hsub, abd, pmax, pmin)
OPNINETEEN (uint16, 4, 8, u16, padd, add, qadd, qsub, sub, mul, and, orr, eor, orn, bic, max, min, hadd, rhadd, hsub, abd, pmax, pmin)
MYOP (uint32x4_t, uint32x2_t, int32x2_t, sqadd, u32)
MYOP (uint64x2_t, uint64x1_t, int64x1_t, sqadd, u64)
+MYOP (uint8x16_t, uint8x8_t, int8x8_t, shl, u8)
+MYOP (uint16x8_t, uint16x4_t, int16x4_t, shl, u16)
+MYOP (uint32x4_t, uint32x2_t, int32x2_t, shl, u32)
+MYOP (uint64x2_t, uint64x1_t, int64x1_t, shl, u64)
+
+MYOP (uint8x16_t, uint8x8_t, int8x8_t, qshl, u8)
+MYOP (uint16x8_t, uint16x4_t, int16x4_t, qshl, u16)
+MYOP (uint32x4_t, uint32x2_t, int32x2_t, qshl, u32)
+MYOP (uint64x2_t, uint64x1_t, int64x1_t, qshl, u64)
+
/* { dg-final { scan-assembler-not {\tfmov\t} } } */
/* { dg-final { scan-assembler-not {\tmov\t} } } */
--- /dev/null
+/* PR target/99195. */
+/* Check that we take advantage of 64-bit Advanced SIMD operations clearing
+ the top half of the vector register and no explicit zeroing instructions
+ are emitted. */
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+#define MYOP(OT,IT1,OP,S,OS) \
+OT \
+foo_##OP##_##S##OS (IT1 a) \
+{ \
+ IT1 zeros = vcreate_##S##OS (0); \
+ return vcombine_##S##OS (v##OP##_##S##OS (a, 3), zeros); \
+} \
+OT \
+foo_##OP##_##S##OS##_s (IT1 a) \
+{ \
+ IT1 zeros = vcreate_##S##OS (0); \
+ return vcombine_##S##OS (v##OP##_##S##OS (a, OS - 1), zeros); \
+}
+
+MYOP (int8x16_t, int8x8_t, shr_n, s, 8)
+MYOP (int16x8_t, int16x4_t, shr_n, s, 16)
+MYOP (int32x4_t, int32x2_t, shr_n, s, 32)
+MYOP (uint8x16_t, uint8x8_t, shr_n, u, 8)
+MYOP (uint16x8_t, uint16x4_t, shr_n, u, 16)
+MYOP (uint32x4_t, uint32x2_t, shr_n, u, 32)
+MYOP (int8x16_t, int8x8_t, shl_n, s, 8)
+MYOP (int16x8_t, int16x4_t, shl_n, s, 16)
+MYOP (int32x4_t, int32x2_t, shl_n, s, 32)
+MYOP (uint8x16_t, uint8x8_t, shl_n, u, 8)
+MYOP (uint16x8_t, uint16x4_t, shl_n, u, 16)
+MYOP (uint32x4_t, uint32x2_t, shl_n, u, 32)
+
+/* { dg-final { scan-assembler-not {\tfmov\t} } } */
+/* { dg-final { scan-assembler-not {\tmov\t} } } */
+