+2012-06-19 Richard Henderson <rth@redhat.com>
+
+ * config/i386/i386-protos.h (ix86_expand_sse2_mulv4si3): Declare.
+ * config/i386/i386.c (ix86_expand_sse2_mulv4si3): New.
+ * config/i386/predicates.md (nonimmediate_or_const_vector_operand): New.
+ * config/i386/sse.md (*sse2_mulv4si3): Delete.
+ (mul<VI4_AVX2>3): Use ix86_expand_sse2_mulv4si3 and
+ nonimmediate_or_const_vector_operand.
+
2012-06-19 Richard Henderson <rth@redhat.com>
* expmed.c (struct init_expmed_rtl): Split ...
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
extern bool ix86_expand_pinsr (rtx *);
+extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
/* In i386-c.c */
extern void ix86_target_macros (void);
expand_vec_perm_even_odd_1 (&d, odd);
}
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+ rtx op1_m1, op1_m2;
+ rtx op2_m1, op2_m2;
+ rtx res_1, res_2;
+
+ /* Shift both input vectors down one element, so that elements 3
+ and 1 are now in the slots for elements 2 and 0. For K8, at
+ least, this is faster than using a shuffle. */
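+ /* E.g. { x0, x1, x2, x3 } becomes { x1, x2, x3, 0 }, so the odd
+ elements now occupy the even slots. */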
+ op1_m1 = op1 = force_reg (V4SImode, op1);
+ op1_m2 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
+ gen_lowpart (V1TImode, op1),
+ GEN_INT (32)));
+
+ if (GET_CODE (op2) == CONST_VECTOR)
+ {
+ rtvec v;
+
+ /* Constant propagate the vector shift, leaving the don't-care
+ vector elements as zero. */
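+ /* E.g. for op2 = { c0, c1, c2, c3 } this builds op2_m1 = { c0, 0, c2, 0 }
+ and op2_m2 = { c1, 0, c3, 0 }. */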
+ v = rtvec_alloc (4);
+ RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
+ RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
+ RTVEC_ELT (v, 1) = const0_rtx;
+ RTVEC_ELT (v, 3) = const0_rtx;
+ op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
+ op2_m1 = force_reg (V4SImode, op2_m1);
+
+ v = rtvec_alloc (4);
+ RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
+ RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
+ RTVEC_ELT (v, 1) = const0_rtx;
+ RTVEC_ELT (v, 3) = const0_rtx;
+ op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
+ op2_m2 = force_reg (V4SImode, op2_m2);
+ }
+ else
+ {
+ op2_m1 = op2 = force_reg (V4SImode, op2);
+ op2_m2 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
+ gen_lowpart (V1TImode, op2),
+ GEN_INT (32)));
+ }
+
+ /* Widening multiplies of elements 0 and 2, and of elements 1 and 3. */
+ res_1 = gen_reg_rtx (V4SImode);
+ res_2 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
+ op1_m1, op2_m1));
+ emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
+ op1_m2, op2_m2));
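+ /* Viewed as V2DI, res_1 = { op1[0]*op2[0], op1[2]*op2[2] } and
+ res_2 = { op1[1]*op2[1], op1[3]*op2[3] }. */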
+
+ /* Move the results in element 2 down to element 1; we don't care
+ what goes in elements 2 and 3. Then we can merge the parts
+ back together with an interleave.
+
+ Note that two other sequences were tried:
+ (1) Use interleaves at the start instead of psrldq, which allows
+ us to use a single shufps to merge things back at the end.
+ (2) Use shufps here to combine the two vectors, then pshufd to
+ put the elements in the correct order.
+ In both cases the cost of the reformatting stall was too high
+ and the overall sequence slower. */
+
+ emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+ emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
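+ /* res_1 now holds the low halves of products 0 and 2 in elements 0 and 1,
+ res_2 the low halves of products 1 and 3; interleaving them puts
+ { p0, p1, p2, p3 } into op0. */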
+ res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+
+ set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+}
+
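For readers tracing the lane arithmetic, here is a minimal reference sketch, not part of the patch, of the computation the new expander emits, written with SSE2 intrinsics; the function name mulv4si3_ref is illustrative only.

#include <emmintrin.h>

/* Reference only: the lane arithmetic ix86_expand_sse2_mulv4si3 emits,
   spelled out with SSE2 intrinsics.  */
static __m128i
mulv4si3_ref (__m128i a, __m128i b)
{
  __m128i a_odd = _mm_srli_si128 (a, 4);          /* { a1, a2, a3, 0 }  */
  __m128i b_odd = _mm_srli_si128 (b, 4);          /* { b1, b2, b3, 0 }  */
  __m128i p_even = _mm_mul_epu32 (a, b);          /* { a0*b0, a2*b2 } as 2x64  */
  __m128i p_odd  = _mm_mul_epu32 (a_odd, b_odd);  /* { a1*b1, a3*b3 } as 2x64  */
  /* pshufd: keep the low 32-bit halves in elements 0 and 1.  */
  p_even = _mm_shuffle_epi32 (p_even, _MM_SHUFFLE (0, 0, 2, 0));
  p_odd  = _mm_shuffle_epi32 (p_odd,  _MM_SHUFFLE (0, 0, 2, 0));
  /* punpckldq: { lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3) }.  */
  return _mm_unpacklo_epi32 (p_even, p_odd);
}

The trailing pshufd/punpckldq pair is the merge strategy the comment in the expander settles on after rejecting the shufps-based alternatives.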
/* Expand an insert into a vector register through pinsr insn.
Return true if successful. */
return false;
})
+;; Return true when OP is a nonimmediate or a vector constant. Note
+;; that most vector constants are not legitimate operands, so we need
+;; to special-case this.
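+;; For example, a (const_vector:V4SI [1 2 3 4]) operand matches here even
+;; though it cannot appear directly in an insn; the expander using this
+;; predicate must legitimize it (e.g. via force_const_mem) or split the
+;; operation by hand.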
+(define_predicate "nonimmediate_or_const_vector_operand"
+ (ior (match_code "const_vector")
+ (match_operand 0 "nonimmediate_operand")))
+
;; Return true if OP is a register or a zero.
(define_predicate "reg_or_0_operand"
(ior (match_operand 0 "register_operand")
(define_expand "mul<mode>3"
[(set (match_operand:VI4_AVX2 0 "register_operand")
- (mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
- (match_operand:VI4_AVX2 2 "register_operand")))]
+ (mult:VI4_AVX2
+ (match_operand:VI4_AVX2 1 "nonimmediate_operand")
+ (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
"TARGET_SSE2"
{
if (TARGET_SSE4_1 || TARGET_AVX)
- ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+ {
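+ /* pmulld takes a nonimmediate operand, so force a vector constant
+ into the constant pool, where it becomes a valid memory operand. */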
+ if (CONSTANT_P (operands[2]))
+ operands[2] = force_const_mem (<MODE>mode, operands[2]);
+ ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+ }
+ else
+ {
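+ /* Without SSE4.1's pmulld, fall back to the pmuludq-based sequence.
+ V8SImode requires AVX2, so only V4SImode can reach this path. */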
+ ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
+ DONE;
+ }
})
(define_insn "*<sse4_1_avx2>_mul<mode>3"
(set_attr "prefix" "orig,vex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn_and_split "*sse2_mulv4si3"
- [(set (match_operand:V4SI 0 "register_operand")
- (mult:V4SI (match_operand:V4SI 1 "register_operand")
- (match_operand:V4SI 2 "register_operand")))]
- "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
- && can_create_pseudo_p ()"
- "#"
- "&& 1"
- [(const_int 0)]
-{
- rtx t1, t2, t3, t4, t5, t6, thirtytwo;
- rtx op0, op1, op2;
-
- op0 = operands[0];
- op1 = operands[1];
- op2 = operands[2];
- t1 = gen_reg_rtx (V4SImode);
- t2 = gen_reg_rtx (V4SImode);
- t3 = gen_reg_rtx (V4SImode);
- t4 = gen_reg_rtx (V4SImode);
- t5 = gen_reg_rtx (V4SImode);
- t6 = gen_reg_rtx (V4SImode);
- thirtytwo = GEN_INT (32);
-
- /* Multiply elements 2 and 0. */
- emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
- op1, op2));
-
- /* Shift both input vectors down one element, so that elements 3
- and 1 are now in the slots for elements 2 and 0. For K8, at
- least, this is faster than using a shuffle. */
- emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
- gen_lowpart (V1TImode, op1),
- thirtytwo));
- emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
- gen_lowpart (V1TImode, op2),
- thirtytwo));
- /* Multiply elements 3 and 1. */
- emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
- t2, t3));
-
- /* Move the results in element 2 down to element 1; we don't care
- what goes in elements 2 and 3. */
- emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
- const0_rtx, const0_rtx));
- emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
- const0_rtx, const0_rtx));
-
- /* Merge the parts back together. */
- emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
-
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_MULT (V4SImode, operands[1], operands[2]));
- DONE;
-})
-
(define_insn_and_split "mul<mode>3"
[(set (match_operand:VI8_AVX2 0 "register_operand")
(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")