sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx,
avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl,
- avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16,avxifma,
+ avx512vl,noavx512vl,avxvnni,avxvnniint8,avx512vnnivl,avx512fp16,avxifma,
avx512ifmavl,avxneconvert,avx512bf16vl,vpclmulqdqvl,
avx_noavx512f,avx_noavx512vl,vaes_avx512vl,noapx_nf,avx10_2"
(const_string "base"))
(eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL")
(eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL")
(eq_attr "isa" "avxvnni") (symbol_ref "TARGET_AVXVNNI")
+ (eq_attr "isa" "avxvnniint8") (symbol_ref "TARGET_AVXVNNIINT8")
(eq_attr "isa" "avx512vnnivl")
(symbol_ref "TARGET_AVX512VNNI && TARGET_AVX512VL")
(eq_attr "isa" "avx512fp16")
})
(define_insn "vpdp<vpdotprodtype>_<mode>"
- [(set (match_operand:VI4_AVX 0 "register_operand" "=v")
+ [(set (match_operand:VI4_AVX 0 "register_operand" "=x,v")
(unspec:VI4_AVX
- [(match_operand:VI4_AVX 1 "register_operand" "0")
- (match_operand:VI4_AVX 2 "register_operand" "v")
- (match_operand:VI4_AVX 3 "nonimmediate_operand" "vm")]
+ [(match_operand:VI4_AVX 1 "register_operand" "0,0")
+ (match_operand:VI4_AVX 2 "register_operand" "x,v")
+ (match_operand:VI4_AVX 3 "nonimmediate_operand" "xjm,vm")]
VPDOTPROD))]
"TARGET_AVXVNNIINT8 || TARGET_AVX10_2"
"vpdp<vpdotprodtype>\t{%3, %2, %0|%0, %2, %3}"
- [(set_attr "prefix" "maybe_evex")])
+ [(set_attr "prefix" "maybe_evex")
+ (set_attr "addr" "gpr16,*")
+ (set_attr "isa" "avxvnniint8,avx10_2")])
(define_insn "vpdp<vpdotprodtype>_v16si"
[(set (match_operand:V16SI 0 "register_operand" "=v")
--- /dev/null
+/* PR target/125949 */
+/* On a target with AVX-VNNI-INT8 the vpdpb{ss,su,uu}d[s] instructions only
+ have a VEX encoding. Even under register pressure the compiler must stay
+ within xmm/ymm0-15 for these insns and must not allocate xmm/ymm16-31 and
+ emit an EVEX-encoded vpdpb* using a high register. -mavx512vl exposes the
+ xmm/ymm16-31 register file so the allocator would otherwise be tempted to
+ use it. */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavxvnniint8 -mavx512vl" } */
+/* { dg-final { scan-assembler-not "vpdpb\[a-z\]+\[ \\t\]+\[^\n\]*%\[xy\]mm(1\[6-9\]|2\[0-9\]|3\[01\])" } } */
+
+#include <immintrin.h>
+
+#define NACC 32
+
+void
+foo (unsigned int *restrict out, const unsigned char *restrict a,
+ const unsigned char *restrict b, int n)
+{
+ __m256i acc[NACC];
+
+ for (int k = 0; k < NACC; k++)
+ acc[k] = _mm256_setzero_si256 ();
+
+ for (int i = 0; i < n; i++)
+ {
+ __m256i vb = _mm256_loadu_si256 ((const __m256i *) (b + 32 * (i & 7)));
+ for (int k = 0; k < NACC; k++)
+ {
+ __m256i va
+ = _mm256_loadu_si256 ((const __m256i *) (a + 32 * ((i + k) & 15)));
+ acc[k] = _mm256_dpbuud_epi32 (acc[k], va, vb);
+ }
+ }
+
+ for (int k = 0; k < NACC; k++)
+ _mm256_storeu_si256 ((__m256i *) (out + 8 * k), acc[k]);
+}