git.ipfire.org Git - thirdparty/gcc.git/commitdiff
aarch64: Remove SME2.1 forms of LUTI2/4
authorRichard Sandiford <richard.sandiford@arm.com>
Tue, 5 Mar 2024 17:51:24 +0000 (17:51 +0000)
committerRichard Sandiford <richard.sandiford@arm.com>
Tue, 5 Mar 2024 17:51:24 +0000 (17:51 +0000)
I was over-eager when adding support for strided SME2 instructions
and accidentally included forms of LUTI2 and LUTI4 that are only
available with SME2.1, not SME2.  This patch removes them for now.
We're planning to add proper support for SME2.1 in the GCC 15
timeframe.

Sorry for the blunder :(

gcc/
* config/aarch64/aarch64.md (stride_type): Remove luti_consecutive
and luti_strided.
* config/aarch64/aarch64-sme.md
(@aarch64_sme_lut<LUTI_BITS><mode>): Remove stride_type attribute.
(@aarch64_sme_lut<LUTI_BITS><mode>_strided2): Delete.
(@aarch64_sme_lut<LUTI_BITS><mode>_strided4): Likewise.
* config/aarch64/aarch64-early-ra.cc (is_stride_candidate)
(early_ra::maybe_convert_to_strided_access): Remove support for
strided LUTI2 and LUTI4.

gcc/testsuite/
* gcc.target/aarch64/sme/strided_1.c (test5): Remove.

gcc/config/aarch64/aarch64-early-ra.cc
gcc/config/aarch64/aarch64-sme.md
gcc/config/aarch64/aarch64.md
gcc/testsuite/gcc.target/aarch64/sme/strided_1.c

index 8530b0ae41ea723c8e1c93774213d63f9d64df60..1e2c823cb2eb47c1c06700708205fe93eb05e3f1 100644 (file)
@@ -1060,8 +1060,7 @@ is_stride_candidate (rtx_insn *insn)
     return false;
 
   auto stride_type = get_attr_stride_type (insn);
-  return (stride_type == STRIDE_TYPE_LUTI_CONSECUTIVE
-         || stride_type == STRIDE_TYPE_LD1_CONSECUTIVE
+  return (stride_type == STRIDE_TYPE_LD1_CONSECUTIVE
          || stride_type == STRIDE_TYPE_ST1_CONSECUTIVE);
 }
 
@@ -3212,8 +3211,7 @@ early_ra::maybe_convert_to_strided_access (rtx_insn *insn)
   auto stride_type = get_attr_stride_type (insn);
   rtx pat = PATTERN (insn);
   rtx op;
-  if (stride_type == STRIDE_TYPE_LUTI_CONSECUTIVE
-      || stride_type == STRIDE_TYPE_LD1_CONSECUTIVE)
+  if (stride_type == STRIDE_TYPE_LD1_CONSECUTIVE)
     op = SET_DEST (pat);
   else if (stride_type == STRIDE_TYPE_ST1_CONSECUTIVE)
     op = XVECEXP (SET_SRC (pat), 0, 1);
@@ -3263,20 +3261,6 @@ early_ra::maybe_convert_to_strided_access (rtx_insn *insn)
       XVECEXP (SET_SRC (pat), 0, XVECLEN (SET_SRC (pat), 0) - 1)
        = *recog_data.dup_loc[0];
     }
-  else if (stride_type == STRIDE_TYPE_LUTI_CONSECUTIVE)
-    {
-      auto bits = INTVAL (XVECEXP (SET_SRC (pat), 0, 4));
-      if (range.count == 2)
-       pat = gen_aarch64_sme_lut_strided2 (bits, single_mode,
-                                           regs[0], regs[1],
-                                           recog_data.operand[1],
-                                           recog_data.operand[2]);
-      else
-       pat = gen_aarch64_sme_lut_strided4 (bits, single_mode,
-                                           regs[0], regs[1], regs[2], regs[3],
-                                           recog_data.operand[1],
-                                           recog_data.operand[2]);
-    }
   else
     gcc_unreachable ();
   PATTERN (insn) = pat;
index c95d4aa696c644b8530c822f24399f9c1aa3c8c2..78ad2fc699f22c9483580c62a9a873ca7129a2c1 100644 (file)
   "TARGET_STREAMING_SME2
    && !(<LUTI_BITS> == 4 && <vector_count> == 4 && <elem_bits> == 8)"
   "luti<LUTI_BITS>\t%0, zt0, %1[%2]"
-  [(set_attr "stride_type" "luti_consecutive")]
-)
-
-(define_insn "@aarch64_sme_lut<LUTI_BITS><mode>_strided2"
-  [(set (match_operand:SVE_FULL_BHS 0 "aarch64_simd_register" "=Uwd")
-       (unspec:SVE_FULL_BHS
-         [(reg:V8DI ZT0_REGNUM)
-          (reg:DI SME_STATE_REGNUM)
-          (match_operand:VNx16QI 2 "register_operand" "w")
-          (match_operand:DI 3 "const_int_operand")
-          (const_int LUTI_BITS)
-          (const_int 0)]
-         UNSPEC_SME_LUTI))
-   (set (match_operand:SVE_FULL_BHS 1 "aarch64_simd_register" "=w")
-       (unspec:SVE_FULL_BHS
-         [(reg:V8DI ZT0_REGNUM)
-          (reg:DI SME_STATE_REGNUM)
-          (match_dup 2)
-          (match_dup 3)
-          (const_int LUTI_BITS)
-          (const_int 1)]
-         UNSPEC_SME_LUTI))]
-  "TARGET_STREAMING_SME2
-   && aarch64_strided_registers_p (operands, 2, 8)"
-  "luti<LUTI_BITS>\t{%0.<Vetype>, %1.<Vetype>}, zt0, %2[%3]"
-  [(set_attr "stride_type" "luti_strided")]
-)
-
-(define_insn "@aarch64_sme_lut<LUTI_BITS><mode>_strided4"
-  [(set (match_operand:SVE_FULL_BHS 0 "aarch64_simd_register" "=Uwt")
-       (unspec:SVE_FULL_BHS
-         [(reg:V8DI ZT0_REGNUM)
-          (reg:DI SME_STATE_REGNUM)
-          (match_operand:VNx16QI 4 "register_operand" "w")
-          (match_operand:DI 5 "const_int_operand")
-          (const_int LUTI_BITS)
-          (const_int 0)]
-         UNSPEC_SME_LUTI))
-   (set (match_operand:SVE_FULL_BHS 1 "aarch64_simd_register" "=w")
-       (unspec:SVE_FULL_BHS
-         [(reg:V8DI ZT0_REGNUM)
-          (reg:DI SME_STATE_REGNUM)
-          (match_dup 4)
-          (match_dup 5)
-          (const_int LUTI_BITS)
-          (const_int 1)]
-         UNSPEC_SME_LUTI))
-   (set (match_operand:SVE_FULL_BHS 2 "aarch64_simd_register" "=w")
-       (unspec:SVE_FULL_BHS
-         [(reg:V8DI ZT0_REGNUM)
-          (reg:DI SME_STATE_REGNUM)
-          (match_dup 4)
-          (match_dup 5)
-          (const_int LUTI_BITS)
-          (const_int 2)]
-         UNSPEC_SME_LUTI))
-   (set (match_operand:SVE_FULL_BHS 3 "aarch64_simd_register" "=w")
-       (unspec:SVE_FULL_BHS
-         [(reg:V8DI ZT0_REGNUM)
-          (reg:DI SME_STATE_REGNUM)
-          (match_dup 4)
-          (match_dup 5)
-          (const_int LUTI_BITS)
-          (const_int 3)]
-         UNSPEC_SME_LUTI))]
-  "TARGET_STREAMING_SME2
-   && !(<LUTI_BITS> == 4 && <elem_bits> == 8)
-   && aarch64_strided_registers_p (operands, 4, 4)"
-  "luti<LUTI_BITS>\t{%0.<Vetype>, %1.<Vetype>, %2.<Vetype>, %3.<Vetype>}, zt0, %4[%5]"
-  [(set_attr "stride_type" "luti_strided")]
 )
index 33fbe1b2e8db42d3bda284adbc6bd77df8e3b846..7d51d923bf68c558d951c640a675ae61831f32be 100644 (file)
 ;; The RTL mapping therefore applies at LD1 granularity, rather than
 ;; being broken down into individual types of load.
 (define_attr "stride_type"
-  "none,ld1_consecutive,ld1_strided,st1_consecutive,st1_strided,
-   luti_consecutive,luti_strided"
+  "none,ld1_consecutive,ld1_strided,st1_consecutive,st1_strided"
   (const_string "none"))
 
 ;; Attribute used to identify load pair and store pair instructions.
index 3620fff36687f75759c3e107672402c30f547b26..73aac0683ea532743c74ca96fdd86b39875bfd37 100644 (file)
@@ -180,61 +180,6 @@ void test4(int32_t *dest, int32_t *src) __arm_streaming
                       svget4(l2, 3), svget4(l3, 3)));
 }
 
-/*
-** test5:
-**     ptrue   [^\n]+
-**     ld1b    [^\n]+
-**     ld1b    [^\n]+
-**     ptrue   ([^\n]+)\.s
-**     ld1w    [^\n]+, \1/z, \[x0\]
-**     luti4   {z16\.s, z20\.s, z24\.s, z28\.s}, zt0, z[0-9]+\[0\]
-**     luti4   {z17\.s, z21\.s, z25\.s, z29\.s}, zt0, z[0-9]+\[1\]
-**     luti4   {z18\.s, z22\.s, z26\.s, z30\.s}, zt0, z[0-9]+\[0\]
-**     luti4   {z19\.s, z23\.s, z27\.s, z31\.s}, zt0, z[0-9]+\[1\]
-**     uclamp  {z16\.s - z19\.s}, z[0-9]+\.s, z[0-9]+\.s
-**     uclamp  {z20\.s - z23\.s}, z[0-9]+\.s, z[0-9]+\.s
-**     uclamp  {z24\.s - z27\.s}, z[0-9]+\.s, z[0-9]+\.s
-**     uclamp  {z28\.s - z31\.s}, z[0-9]+\.s, z[0-9]+\.s
-**     st1w    {z16\.s - z19\.s}, \1, \[x0\]
-**     st1w    {z20\.s - z23\.s}, \1, \[x0, #4, mul vl\]
-**     st1w    {z24\.s - z27\.s}, \1, \[x0, #8, mul vl\]
-**     st1w    {z28\.s - z31\.s}, \1, \[x0, #12, mul vl\]
-**     ret
-*/
-void test5(uint32_t *dest, uint8_t *indices)
-  __arm_streaming __arm_preserves("za") __arm_inout("zt0")
-{
-  svuint8_t indices1 = svld1_vnum(svptrue_b8(), indices, 0);
-  svuint8_t indices2 = svld1_vnum(svptrue_b8(), indices, 2);
-
-  svcount_t pg = svptrue_c32();
-  svuint32x4_t bounds = svld1_x4(pg, dest);
-
-  svuint32x4_t x0 = svluti4_lane_zt_u32_x4(0, indices1, 0);
-  svuint32x4_t x1 = svluti4_lane_zt_u32_x4(0, indices1, 1);
-  svuint32x4_t x2 = svluti4_lane_zt_u32_x4(0, indices2, 0);
-  svuint32x4_t x3 = svluti4_lane_zt_u32_x4(0, indices2, 1);
-
-  svuint32x4_t y0 = svcreate4(svget4(x0, 0), svget4(x1, 0),
-                             svget4(x2, 0), svget4(x3, 0));
-  svuint32x4_t y1 = svcreate4(svget4(x0, 1), svget4(x1, 1),
-                             svget4(x2, 1), svget4(x3, 1));
-  svuint32x4_t y2 = svcreate4(svget4(x0, 2), svget4(x1, 2),
-                             svget4(x2, 2), svget4(x3, 2));
-  svuint32x4_t y3 = svcreate4(svget4(x0, 3), svget4(x1, 3),
-                             svget4(x2, 3), svget4(x3, 3));
-
-  y0 = svclamp(y0, svget4(bounds, 0), svget4(bounds, 1));
-  y1 = svclamp(y1, svget4(bounds, 2), svget4(bounds, 3));
-  y2 = svclamp(y2, svget4(bounds, 0), svget4(bounds, 1));
-  y3 = svclamp(y3, svget4(bounds, 2), svget4(bounds, 3));
-
-  svst1_vnum(pg, dest, 0, y0);
-  svst1_vnum(pg, dest, 4, y1);
-  svst1_vnum(pg, dest, 8, y2);
-  svst1_vnum(pg, dest, 12, y3);
-}
-
 /*
 ** test6:
 **     ptrue   [^\n]+