]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
i386: Improve workaround for PR82524 LRA limitation [PR85730]
author: Uros Bizjak <ubizjak@gmail.com>
Tue, 12 Oct 2021 16:20:38 +0000 (18:20 +0200)
committer: Uros Bizjak <ubizjak@gmail.com>
Tue, 12 Oct 2021 16:21:33 +0000 (18:21 +0200)
As explained in PR82524, LRA is not able to reload a strict_low_part inout
operand that has a matched input operand.  The patch introduces a workaround:
we allow LRA to generate an instruction with a non-matched input operand,
which is split after reload into an instruction that inserts the non-matched
input operand into the low part of the inout operand, followed by the
instruction that uses the matched operand.

The generated code improves from:

        movsbl  %dil, %edx
        movl    %edi, %eax
        sall    $3, %edx
        movb    %dl, %al

to:

        movl    %edi, %eax
        movb    %dil, %al
        salb    $3, %al

which is still not optimal, but the code is one instruction shorter and
does not use a temporary register.

2021-10-12  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
PR target/85730
PR target/82524
* config/i386/i386.md (*add<mode>_1_slp): Rewrite as
define_insn_and_split pattern.  Add alternative 1 and split it
post reload to insert operand 1 into the low part of operand 0.
(*sub<mode>_1_slp): Ditto.
(*and<mode>_1_slp): Ditto.
(*<any_or:code><mode>_1_slp): Ditto.
(*ashl<mode>3_1_slp): Ditto.
(*<any_shiftrt:insn><mode>3_1_slp): Ditto.
(*<any_rotate:insn><mode>3_1_slp): Ditto.
(*neg<mode>_1_slp): New insn_and_split pattern.
(*one_cmpl<mode>_1_slp): Ditto.

gcc/testsuite/
PR target/85730
PR target/82524
* gcc.target/i386/pr85730.c: New test.

gcc/config/i386/i386.md
gcc/testsuite/gcc.target/i386/pr85730.c [new file with mode: 0644]

index c7ae4ac5fbcbcd3c31f6603a5092bbf0a9035398..e733a40fc90ef3d02623a60065bb87c2a517da1c 100644 (file)
              (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
           (symbol_ref "true")))])
 
-(define_insn "*add<mode>_1_slp"
-  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>"))
-       (plus:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0")
-                   (match_operand:SWI12 2 "general_operand" "<r>mn")))
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*add<mode>_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (plus:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0,!<r>")
+                   (match_operand:SWI12 2 "general_operand" "<r>mn,<r>mn")))
    (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && (rtx_equal_p (operands[0], operands[1])
-       || rtx_equal_p (operands[0], operands[2]))"
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
 {
+  if (which_alternative)
+    return "#";
+
   switch (get_attr_type (insn))
     {
     case TYPE_INCDEC:
       return "add{<imodesuffix>}\t{%2, %0|%0, %2}";
     }
 }
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (plus:SWI12 (match_dup 0) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set (attr "type")
      (if_then_else (match_operand:QI 2 "incdec_operand")
        (const_string "incdec")
   [(set_attr "type" "alu")
    (set_attr "mode" "SI")])
 
-(define_insn "*sub<mode>_1_slp"
-  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>"))
-       (minus:SWI12 (match_operand:SWI12 1 "register_operand" "0")
-                    (match_operand:SWI12 2 "general_operand" "<r>mn")))
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*sub<mode>_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (minus:SWI12 (match_operand:SWI12 1 "register_operand" "0,!<r>")
+                    (match_operand:SWI12 2 "general_operand" "<r>mn,<r>mn")))
    (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && rtx_equal_p (operands[0], operands[1])"
-  "sub{<imodesuffix>}\t{%2, %0|%0, %2}"
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "@
+   sub{<imodesuffix>}\t{%2, %0|%0, %2}
+   #"
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (minus:SWI12 (match_dup 0) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
              (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
           (symbol_ref "true")))])
 
-(define_insn "*and<mode>_1_slp"
-  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>"))
-       (and:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0")
-                  (match_operand:SWI12 2 "general_operand" "<r>mn")))
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*and<mode>_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (and:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0,!<r>")
+                  (match_operand:SWI12 2 "general_operand" "<r>mn,<r>mn")))
    (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && (rtx_equal_p (operands[0], operands[1])
-       || rtx_equal_p (operands[0], operands[2]))"
-  "and{<imodesuffix>}\t{%2, %0|%0, %2}"
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "@
+   and{<imodesuffix>}\t{%2, %0|%0, %2}
+   #"
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (and:SWI12 (match_dup 0) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
              (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
           (symbol_ref "true")))])
 
-(define_insn "*<code><mode>_1_slp"
-  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>"))
-       (any_or:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0")
-                     (match_operand:SWI12 2 "general_operand" "<r>mn")))
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*<code><mode>_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (any_or:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "%0,!<r>")
+                     (match_operand:SWI12 2 "general_operand" "<r>mn,<r>mn")))
    (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && (rtx_equal_p (operands[0], operands[1])
-       || rtx_equal_p (operands[0], operands[2]))"
-  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "@
+   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+   #"
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (any_or:SWI12 (match_dup 0) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set_attr "type" "alu")
    (set_attr "mode" "<MODE>")])
 
   [(set_attr "type" "negnot")
    (set_attr "mode" "SI")])
 
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*neg<mode>_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (neg:SWI12 (match_operand:SWI12 1 "register_operand" "0,!<r>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "@
+   neg{<imodesuffix>}\t%0
+   #"
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (neg:SWI12 (match_dup 0)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*neg<mode>_2"
   [(set (reg FLAGS_REG)
        (compare
              (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
           (symbol_ref "true")))])
 
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*one_cmpl<mode>_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (not:SWI12 (match_operand:SWI12 1 "register_operand" "0,!<r>")))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "@
+   not{<imodesuffix>}\t%0
+   #"
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (set (strict_low_part (match_dup 0))
+       (not:SWI12 (match_dup 0)))]
+  ""
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*one_cmpl<mode>2_2"
   [(set (reg FLAGS_REG)
        (compare (not:SWI (match_operand:SWI 1 "nonimmediate_operand" "0"))
              (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
           (symbol_ref "true")))])
 
-(define_insn "*ashl<mode>3_1_slp"
-  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>"))
-       (ashift:SWI12 (match_operand:SWI12 1 "register_operand" "0")
-                     (match_operand:QI 2 "nonmemory_operand" "cI")))
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*ashl<mode>3_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (ashift:SWI12 (match_operand:SWI12 1 "register_operand" "0,!<r>")
+                     (match_operand:QI 2 "nonmemory_operand" "cI,cI")))
    (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && rtx_equal_p (operands[0], operands[1])"
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
 {
+  if (which_alternative)
+    return "#";
+
   switch (get_attr_type (insn))
     {
     case TYPE_ALU:
        return "sal{<imodesuffix>}\t{%2, %0|%0, %2}";
     }
 }
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (ashift:SWI12 (match_dup 0) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set (attr "type")
      (cond [(and (match_test "TARGET_DOUBLE_WITH_ADD")
                 (match_operand 2 "const1_operand"))
        (const_string "*")))
    (set_attr "mode" "HI")])
 
-(define_insn "*<insn><mode>3_1_slp"
-  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>"))
-       (any_shiftrt:SWI12 (match_operand:SWI12 1 "register_operand" "0")
-                          (match_operand:QI 2 "nonmemory_operand" "cI")))
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*<insn><mode>3_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (any_shiftrt:SWI12 (match_operand:SWI12 1 "register_operand" "0,!<r>")
+                          (match_operand:QI 2 "nonmemory_operand" "cI,cI")))
    (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && rtx_equal_p (operands[0], operands[1])"
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
 {
+  if (which_alternative)
+    return "#";
+
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
     return "<shift>{<imodesuffix>}\t%0";
   else
     return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (any_shiftrt:SWI12 (match_dup 0) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
      (if_then_else
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<insn><mode>3_1_slp"
-  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>"))
-       (any_rotate:SWI12 (match_operand:SWI12 1 "register_operand" "0")
-                         (match_operand:QI 2 "nonmemory_operand" "cI")))
+;; Alternative 1 is needed to work around LRA limitation, see PR82524.
+(define_insn_and_split "*<insn><mode>3_1_slp"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,<r>"))
+       (any_rotate:SWI12 (match_operand:SWI12 1 "register_operand" "0,!<r>")
+                         (match_operand:QI 2 "nonmemory_operand" "cI,cI")))
    (clobber (reg:CC FLAGS_REG))]
-  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
-   /* FIXME: without this LRA can't reload this pattern, see PR82524.  */
-   && rtx_equal_p (operands[0], operands[1])"
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
 {
+  if (which_alternative)
+    return "#";
+
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
     return "<rotate>{<imodesuffix>}\t%0";
   else
     return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
+  "&& reload_completed"
+  [(set (strict_low_part (match_dup 0)) (match_dup 1))
+   (parallel
+     [(set (strict_low_part (match_dup 0))
+          (any_rotate:SWI12 (match_dup 0) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
   [(set_attr "type" "rotate")
    (set (attr "length_immediate")
      (if_then_else
diff --git a/gcc/testsuite/gcc.target/i386/pr85730.c b/gcc/testsuite/gcc.target/i386/pr85730.c
new file mode 100644 (file)
index 0000000..b279016
--- /dev/null
@@ -0,0 +1,95 @@
+/* PR target/85730 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse4" } */
+
+typedef char V __attribute__((vector_size(4)));
+
+V
+test_and (V v, char c)
+{
+  v[0] &= c;
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]andb" } } */
+
+V
+test_or (V v, char c)
+{
+  v[0] |= c;
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]orb" } } */
+
+V
+test_xor (V v, char c)
+{
+  v[0] ^= c;
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]xorb" } } */
+
+V
+test_not (V v)
+{
+  v[0] = ~v[0];
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]notb" } } */
+
+V
+test_sal (V v)
+{
+  v[0] <<= 3;
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]salb" } } */
+
+V
+test_sar (V v)
+{
+  v[0] >>= 3;
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]sarb" } } */
+
+V
+test_add (V v, char c)
+{
+  v[0] += c;
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]addb" } } */
+
+V
+test_sub (V v, char c)
+{
+  v[0] -= c;
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]subb" } } */
+
+V
+test_neg (V v)
+{
+  v[0] = -v[0];
+
+  return v;
+}
+
+/* { dg-final { scan-assembler "\[ \t\]negb" } } */