Zen5 tuning part 4: update reassociation width

author Jan Hubicka <jh@suse.cz>

Tue, 3 Sep 2024 16:20:34 +0000 (18:20 +0200)

committer Jan Hubicka <jh@suse.cz>

Tue, 7 Jan 2025 11:43:11 +0000 (12:43 +0100)
author Jan Hubicka <jh@suse.cz>
Tue, 3 Sep 2024 16:20:34 +0000 (18:20 +0200)
committer Jan Hubicka <jh@suse.cz>
Tue, 7 Jan 2025 11:43:11 +0000 (12:43 +0100)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc

index 2087f8633eb8066bd829b6190091854efc6bf632..ea25e56ad6447f6a1d630089e228a1b42625fbd5 100644 (file)
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22923,13 +22923,17 @@ ix86_reassociation_width (unsigned int op, machine_mode mode)
        if (width == 1)
         return 1;
  
-      /* Integer vector instructions execute in FP unit
+      /* Znver1-4 integer vector instructions execute in FP unit
          and can execute 3 additions and one multiplication per cycle.  */
        if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
-          || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
-          || ix86_tune == PROCESSOR_ZNVER5)
+          || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
           && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
         return 1;
+      /* Znver5 can do 2 integer vector multiplications per cycle with
+        latency of 3.  */
+      if (ix86_tune == PROCESSOR_ZNVER5
+         && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
+       width = 6;
  
        /* Account for targets that splits wide vectors into multiple parts.  */
        if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h

index b8e7ab9372ea79aba38372e616ab74cf0ff72f75..0f2308bb079c492de29559e91cddc1021a929f86 100644 (file)
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2068,16 +2068,19 @@ struct processor_costs znver5_cost = {
    COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
    COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
    COSTS_N_INSNS (20),                  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle.  FP operations
-     take 3 cycles and it can execute 2 integer additions and 2
-     multiplications thus reassociation may make sense up to with of 6.
-     SPEC2k6 bencharks suggests
-     that 4 works better than 6 probably due to register pressure.
-
-     Integer vector operations are taken by FP unit and execute 3 vector
-     plus/minus operations per cycle but only one multiply.  This is adjusted
-     in ix86_reassociation_width.  */
-  4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  /* Zen5 can execute:
+      - integer ops: 6 per cycle, at most 3 multiplications.
+       latency 1 for additions, 3 for multiplications (pipelined)
+
+       Setting width of 9 for multiplication is probably excessive
+       for register pressure.
+      - fp ops: 2 additions per cycle, latency 2-3
+               2 multiplications per cycle, latency 3
+      - vector integer ops: 4 additions, latency 1
+                          2 multiplications, latency 4
+       We increase width to 6 for multiplications
+       in ix86_reassociation_width.  */
+  6, 6, 4, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
    znver2_memcpy,
    znver2_memset,
    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
author	Jan Hubicka <jh@suse.cz>
	Tue, 3 Sep 2024 16:20:34 +0000 (18:20 +0200)
committer	Jan Hubicka <jh@suse.cz>
	Tue, 7 Jan 2025 11:43:11 +0000 (12:43 +0100)
gcc/config/i386/i386.cc		patch \| blob \| blame \| history
gcc/config/i386/x86-tune-costs.h		patch \| blob \| blame \| history