From: vekumar <Venkataramanan.Kumar@amd.com>
Date: Thu, 25 Jun 2026 05:03:55 +0000 (+0530)
Subject: i386: Restrict AVX-VNNI-INT8 insns to VEX encoding [PR125949]
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=314bd1495ca78f5934f83a4dbc6028dd34fb0652;p=thirdparty%2Fgcc.git

i386: Restrict AVX-VNNI-INT8 insns to VEX encoding [PR125949]

The vpdpb{ss,su,uu}d[s] instructions from AVX-VNNI-INT8 only have a VEX
encoding; the EVEX form of these operations is provided by AVX10.2.  A
target such as -march=znver6 enables AVX-VNNI-INT8 together with the
AVX-512 xmm/ymm16-31 register file but does not have AVX10.2.  The
vpdp<vpdotprodtype>_<mode> pattern used a single "v" alternative with a
"maybe_evex" prefix, so under register pressure the allocator could pick
xmm/ymm16-31 and the instruction was promoted to its EVEX form, which is
not available on such targets.

Add a VEX-only alternative that keeps the operands in xmm/ymm0-15
(enabled for AVX-VNNI-INT8) and gate the existing EVEX alternative on
AVX10.2.  When AVX10.2 is not available only the VEX alternative is
enabled, so the high registers are no longer used and the EVEX form is
never emitted.

gcc/ChangeLog:

	PR target/125949
	* config/i386/i386.md ("isa"): Add avxvnniint8.
	("enabled"): Adjust for avxvnniint8.
	* config/i386/sse.md (vpdp<vpdotprodtype>_<mode>): Adjust for
	AVXVNNIINT8.

gcc/testsuite/ChangeLog:

	PR target/125949
	* gcc.target/i386/pr125949.c: New test.
---

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 912fc1af018..9d334f9115f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -583,7 +583,7 @@
 		    sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx,
 		    avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
 		    avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl,
-		    avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16,avxifma,
+		    avx512vl,noavx512vl,avxvnni,avxvnniint8,avx512vnnivl,avx512fp16,avxifma,
 		    avx512ifmavl,avxneconvert,avx512bf16vl,vpclmulqdqvl,
 		    avx_noavx512f,avx_noavx512vl,vaes_avx512vl,noapx_nf,avx10_2"
   (const_string "base"))
@@ -967,6 +967,7 @@
 	 (eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL")
 	 (eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL")
 	 (eq_attr "isa" "avxvnni") (symbol_ref "TARGET_AVXVNNI")
+	 (eq_attr "isa" "avxvnniint8") (symbol_ref "TARGET_AVXVNNIINT8")
 	 (eq_attr "isa" "avx512vnnivl")
 	   (symbol_ref "TARGET_AVX512VNNI && TARGET_AVX512VL")
 	 (eq_attr "isa" "avx512fp16")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb150f6b5f9..3425baea3b6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -32624,15 +32624,17 @@
 })
 
 (define_insn "vpdp<vpdotprodtype>_<mode>"
-  [(set (match_operand:VI4_AVX 0 "register_operand" "=v")
+  [(set (match_operand:VI4_AVX 0 "register_operand" "=x,v")
 	(unspec:VI4_AVX
-	  [(match_operand:VI4_AVX 1 "register_operand" "0")
-	   (match_operand:VI4_AVX 2 "register_operand" "v")
-	   (match_operand:VI4_AVX 3 "nonimmediate_operand" "vm")]
+	  [(match_operand:VI4_AVX 1 "register_operand" "0,0")
+	   (match_operand:VI4_AVX 2 "register_operand" "x,v")
+	   (match_operand:VI4_AVX 3 "nonimmediate_operand" "xjm,vm")]
 	  VPDOTPROD))]
   "TARGET_AVXVNNIINT8 || TARGET_AVX10_2"
   "vpdp<vpdotprodtype>\t{%3, %2, %0|%0, %2, %3}"
-   [(set_attr "prefix" "maybe_evex")])
+   [(set_attr "prefix" "maybe_evex")
+    (set_attr "addr" "gpr16,*")
+    (set_attr "isa" "avxvnniint8,avx10_2")])
 
 (define_insn "vpdp<vpdotprodtype>_v16si"
   [(set (match_operand:V16SI 0 "register_operand" "=v")
diff --git a/gcc/testsuite/gcc.target/i386/pr125949.c b/gcc/testsuite/gcc.target/i386/pr125949.c
new file mode 100644
index 00000000000..4254e5ae9ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr125949.c
@@ -0,0 +1,38 @@
+/* PR target/125949 */
+/* On a target with AVX-VNNI-INT8 the vpdpb{ss,su,uu}d[s] instructions only
+   have a VEX encoding.  Even under register pressure the compiler must stay
+   within xmm/ymm0-15 for these insns and must not allocate xmm/ymm16-31 and
+   emit an EVEX-encoded vpdpb* using a high register.  -mavx512vl exposes the
+   xmm/ymm16-31 register file so the allocator would otherwise be tempted to
+   use it.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavxvnniint8 -mavx512vl" } */
+/* { dg-final { scan-assembler-not "vpdpb\[a-z\]+\[ \\t\]+\[^\n\]*%\[xy\]mm(1\[6-9\]|2\[0-9\]|3\[01\])" } } */
+
+#include <immintrin.h>
+
+#define NACC 32
+
+void
+foo (unsigned int *restrict out, const unsigned char *restrict a,
+     const unsigned char *restrict b, int n)
+{
+  __m256i acc[NACC];
+
+  for (int k = 0; k < NACC; k++)
+    acc[k] = _mm256_setzero_si256 ();
+
+  for (int i = 0; i < n; i++)
+    {
+      __m256i vb = _mm256_loadu_si256 ((const __m256i *) (b + 32 * (i & 7)));
+      for (int k = 0; k < NACC; k++)
+	{
+	  __m256i va
+	    = _mm256_loadu_si256 ((const __m256i *) (a + 32 * ((i + k) & 15)));
+	  acc[k] = _mm256_dpbuud_epi32 (acc[k], va, vb);
+	}
+    }
+
+  for (int k = 0; k < NACC; k++)
+    _mm256_storeu_si256 ((__m256i *) (out + 8 * k), acc[k]);
+}