x86: Make the divisor in setting `non_temporal_threshold` cpu specific
author    Noah Goldstein <goldstein.w.n@gmail.com>
          Wed, 7 Jun 2023 18:18:03 +0000 (13:18 -0500)
committer Noah Goldstein <goldstein.w.n@gmail.com>
          Mon, 12 Jun 2023 16:33:39 +0000 (11:33 -0500)
Different systems prefer different divisors.

From benchmarks [1] run so far, the following divisors have been found:
    ICX     : 2
    SKX     : 2
    BWD     : 8

For Intel, we are generalizing that BWD and older prefer 8 as a
divisor, and SKL and newer prefer 2. These numbers can be further
tuned as more benchmarks are run.

[1]: https://github.com/goldsteinn/memcpy-nt-benchmarks
Reviewed-by: DJ Delorie <dj@redhat.com>
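
As a rough illustration of what these divisors mean in practice, here is a
minimal standalone sketch of the threshold arithmetic; the 32 MiB
shared-cache size below is an assumed example value, not something measured
in [1]:

#include <stdio.h>

int main (void)
{
  /* Assumed 32 MiB shared L3 cache; the real value comes from CPUID.  */
  unsigned long int shared = 32UL * 1024 * 1024;
  /* SKX/ICX-style, generic default, BWD-style.  */
  unsigned long int divisors[] = { 2, 4, 8 };

  for (int i = 0; i < 3; i++)
    printf ("divisor %lu -> non_temporal_threshold = %lu MiB\n",
            divisors[i], shared / divisors[i] / (1024 * 1024));
  return 0;
}

On such a part, BWD-style tuning (divisor 8) would switch to non-temporal
stores at 4 MiB, while SKX/ICX-style tuning (divisor 2) would wait until
16 MiB.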
sysdeps/x86/cpu-features.c
sysdeps/x86/dl-cacheinfo.h
sysdeps/x86/dl-diagnostics-cpu.c
sysdeps/x86/include/cpu-features.h

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index d52a718e921e7f0e16aafedf22a5d60668b66aa6..525828f59c827ead8b44d26a7d4a5b8418cc951b 100644
@@ -636,6 +636,7 @@ init_cpu_features (struct cpu_features *cpu_features)
   unsigned int stepping = 0;
   enum cpu_features_kind kind;
 
+  cpu_features->cachesize_non_temporal_divisor = 4;
 #if !HAS_CPUID
   if (__get_cpuid_max (0, 0) == 0)
     {
@@ -716,13 +717,13 @@ init_cpu_features (struct cpu_features *cpu_features)
 
              /* Bigcore/Default Tuning.  */
            default:
+           default_tuning:
              /* Unknown family 0x06 processors.  Assuming this is one
                 of Core i3/i5/i7 processors if AVX is available.  */
              if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
                break;
-             /* Fall through.  */
-           case INTEL_BIGCORE_NEHALEM:
-           case INTEL_BIGCORE_WESTMERE:
+
+           enable_modern_features:
              /* Rep string instructions, unaligned load, unaligned copy,
                 and pminub are fast on Intel Core i3, i5 and i7.  */
              cpu_features->preferred[index_arch_Fast_Rep_String]
@@ -732,12 +733,23 @@ init_cpu_features (struct cpu_features *cpu_features)
                      | bit_arch_Prefer_PMINUB_for_stringop);
              break;
 
-          /*
-           Default tuned Bigcore microarch.
+           case INTEL_BIGCORE_NEHALEM:
+           case INTEL_BIGCORE_WESTMERE:
+             /* Older CPUs prefer non-temporal stores at a lower threshold.  */
+             cpu_features->cachesize_non_temporal_divisor = 8;
+             goto enable_modern_features;
+
+             /* Older Bigcore microarch (smaller non-temporal store
+                threshold).  */
            case INTEL_BIGCORE_SANDYBRIDGE:
            case INTEL_BIGCORE_IVYBRIDGE:
            case INTEL_BIGCORE_HASWELL:
            case INTEL_BIGCORE_BROADWELL:
+             cpu_features->cachesize_non_temporal_divisor = 8;
+             goto default_tuning;
+
+             /* Newer Bigcore microarch (larger non-temporal store
+                threshold).  */
            case INTEL_BIGCORE_SKYLAKE:
            case INTEL_BIGCORE_KABYLAKE:
            case INTEL_BIGCORE_COMETLAKE:
@@ -753,13 +765,14 @@ init_cpu_features (struct cpu_features *cpu_features)
            case INTEL_BIGCORE_SAPPHIRERAPIDS:
            case INTEL_BIGCORE_EMERALDRAPIDS:
            case INTEL_BIGCORE_GRANITERAPIDS:
-           */
+             cpu_features->cachesize_non_temporal_divisor = 2;
+             goto default_tuning;
 
-          /*
-           Default tuned Mixed (bigcore + atom SOC).
+             /* Default tuned Mixed (bigcore + atom SOC). */
            case INTEL_MIXED_LAKEFIELD:
            case INTEL_MIXED_ALDERLAKE:
-           */
+             cpu_features->cachesize_non_temporal_divisor = 2;
+             goto default_tuning;
            }
 
              /* Disable TSX on some processors to avoid TSX on kernels that
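
The hunks above seed a generic divisor of 4 before dispatch and then let
each microarch family override it before jumping to a shared tuning path.
A condensed, standalone sketch of that label-and-goto pattern (the enum,
function, and mapping below are illustrative stand-ins, not the glibc
source; in the real code Nehalem/Westmere jump to enable_modern_features
rather than default_tuning):

#include <stdio.h>

enum microarch { NEHALEM, BROADWELL, SKYLAKE, ALDERLAKE, OTHER };

static unsigned long int
pick_divisor (enum microarch m)
{
  /* Generic default, set before dispatch just as the patch does.  */
  unsigned long int divisor = 4;

  switch (m)
    {
    default:
    default_tuning:
      /* Shared tuning path; keep whatever divisor was chosen.  */
      break;

    case NEHALEM:               /* Older bigcore: lower threshold.  */
    case BROADWELL:
      divisor = 8;
      goto default_tuning;

    case SKYLAKE:               /* Newer bigcore and mixed SOCs.  */
    case ALDERLAKE:
      divisor = 2;
      goto default_tuning;
    }
  return divisor;
}

int main (void)
{
  printf ("SKYLAKE: %lu, BROADWELL: %lu, OTHER: %lu\n",
          pick_divisor (SKYLAKE), pick_divisor (BROADWELL),
          pick_divisor (OTHER));
  return 0;
}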
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 3bd3b3ec1b70244802b487af678bb4593f82b224..fb1a6cf4a975c2669bbc964eca081d136930c417 100644
@@ -738,19 +738,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   cpu_features->level3_cache_linesize = level3_cache_linesize;
   cpu_features->level4_cache_size = level4_cache_size;
 
-  /* The default setting for the non_temporal threshold is 1/4 of size
-     of the chip's cache. For most Intel and AMD processors with an
-     initial release date between 2017 and 2023, a thread's typical
-     share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
-     estimate the point where non-temporal stores begin out-competing
-     REP MOVSB. As well the point where the fact that non-temporal
-     stores are forced back to main memory would already occurred to the
-     majority of the lines in the copy. Note, concerns about the
-     entire L3 cache being evicted by the copy are mostly alleviated
-     by the fact that modern HW detects streaming patterns and
-     provides proper LRU hints so that the maximum thrashing
-     capped at 1/associativity. */
-  unsigned long int non_temporal_threshold = shared / 4;
+  unsigned long int cachesize_non_temporal_divisor
+      = cpu_features->cachesize_non_temporal_divisor;
+  if (cachesize_non_temporal_divisor <= 0)
+    cachesize_non_temporal_divisor = 4;
+
+  /* The default setting for the non_temporal threshold is [1/8, 1/2] of the
+     size of the chip's cache (depending on `cachesize_non_temporal_divisor`,
+     which is microarch specific; the default is 1/4). For most Intel and AMD
+     processors with an initial release date between 2017 and 2023, a thread's
+     typical share of the cache is from 18-64MB. Using a reasonable size
+     fraction of L3 is meant to estimate the point where non-temporal stores
+     begin out-competing REP MOVSB, as well as the point where the write-back
+     of non-temporal stores to main memory would have already occurred for the
+     majority of the lines in the copy. Note, concerns about the entire
+     L3 cache being evicted by the copy are mostly alleviated by the fact that
+     modern HW detects streaming patterns and provides proper LRU hints so that
+     the maximum thrashing is capped at 1/associativity. */
+  unsigned long int non_temporal_threshold
+      = shared / cachesize_non_temporal_divisor;
   /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
      a higher risk of actually thrashing the cache as they don't have a HW LRU
      hint. As well, their performance in highly parallel situations is
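
Stripped of the surrounding cache detection, the fallback-and-divide logic
in this hunk amounts to the following sketch (the function name and the
zero-means-unset check are paraphrases for illustration, not glibc API):

/* An unset (zero) divisor falls back to the historical 1/4.  */
static unsigned long int
nt_threshold (unsigned long int shared, unsigned long int divisor)
{
  if (divisor == 0)
    divisor = 4;
  return shared / divisor;
}

For example, with a 36 MiB shared cache a divisor of 2 yields an 18 MiB
threshold and a divisor of 8 yields 4.5 MiB, both within the 18-64MB
per-thread cache share the comment cites.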
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index a1578e466545a9f12d5369f82e8af222fd3802c2..5aab63e53211062abf9e15533918792862e9b5a6 100644
@@ -113,8 +113,11 @@ _dl_diagnostics_cpu (void)
                             cpu_features->level3_cache_linesize);
   print_cpu_features_value ("level4_cache_size",
                             cpu_features->level4_cache_size);
-  _Static_assert (offsetof (struct cpu_features, level4_cache_size)
-                  + sizeof (cpu_features->level4_cache_size)
-                  == sizeof (*cpu_features),
-                  "last cpu_features field has been printed");
+  print_cpu_features_value ("cachesize_non_temporal_divisor",
+                           cpu_features->cachesize_non_temporal_divisor);
+  _Static_assert (
+      offsetof (struct cpu_features, cachesize_non_temporal_divisor)
+             + sizeof (cpu_features->cachesize_non_temporal_divisor)
+         == sizeof (*cpu_features),
+      "last cpu_features field has been printed");
 }
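
The _Static_assert above acts as an exhaustiveness guard: it only compiles
while the member named in it is the final field of struct cpu_features, so
appending a new field (as this commit does) forces whoever adds it to also
update the diagnostics printer. A minimal standalone sketch of the pattern,
with hypothetical struct and field names:

#include <stddef.h>

struct features
{
  unsigned long int level4_size;
  unsigned long int divisor;    /* Must remain the last member.  */
};

/* Fails to compile if `divisor` is no longer the last field, e.g.
   when someone appends a member without updating the print code.
   (Relies on there being no trailing padding after `divisor`.)  */
_Static_assert (offsetof (struct features, divisor)
                + sizeof (((struct features *) 0)->divisor)
                == sizeof (struct features),
                "last field has been printed");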
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 40b8129d6a4d631c3f1e8fade38be68151c82dbb..c740e1a5fc7da45353b5cf3fc3dbd3a4a8daa79b 100644
@@ -945,6 +945,9 @@ struct cpu_features
   unsigned long int level3_cache_linesize;
   /* /_SC_LEVEL4_CACHE_SIZE.  */
   unsigned long int level4_cache_size;
+  /* When no user non_temporal_threshold is specified, we default to
+     cachesize / cachesize_non_temporal_divisor.  */
+  unsigned long int cachesize_non_temporal_divisor;
 };
 
 /* Get a pointer to the CPU features structure.  */