x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 / 4`

author Noah Goldstein <goldstein.w.n@gmail.com>

Thu, 10 Aug 2023 17:13:26 +0000 (12:13 -0500)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Tue, 12 Sep 2023 03:48:43 +0000 (22:48 -0500)
author Noah Goldstein <goldstein.w.n@gmail.com>
Thu, 10 Aug 2023 17:13:26 +0000 (12:13 -0500)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Tue, 12 Sep 2023 03:48:43 +0000 (22:48 -0500)
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c

index b9444ddd52051e05183463340f4dd3cf3e61f4dc..4354c01c71e63d4047246b0bb8e7b7690663c409 100644 (file)
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -494,6 +494,7 @@ init_cacheinfo (void)
    int max_cpuid_ex;
    long int data = -1;
    long int shared = -1;
+  long int shared_per_thread = -1;
    unsigned int level;
    unsigned int threads = 0;
    const struct cpu_features *cpu_features = __get_cpu_features ();
@@ -509,7 +510,7 @@ init_cacheinfo (void)
        /* Try L3 first.  */
        level  = 3;
        shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
-
+      shared_per_thread = shared;
        /* Number of logical processors sharing L2 cache.  */
        int threads_l2;
  
@@ -521,6 +522,7 @@ init_cacheinfo (void)
           /* Try L2 otherwise.  */
           level  = 2;
           shared = core;
+      shared_per_thread = core;
           threads_l2 = 0;
           threads_l3 = -1;
         }
@@ -677,26 +679,25 @@ init_cacheinfo (void)
             }
           else
             {
-intel_bug_no_cache_info:
+           intel_bug_no_cache_info:
               /* Assume that all logical threads share the highest cache
                  level.  */
  
-             threads
-               = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx
-                   >> 16) & 0xff);
-           }
+             threads = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx >> 16)
+                        & 0xff);
  
-         /* Cap usage of highest cache level to the number of supported
-            threads.  */
-         if (shared > 0 && threads > 0)
-           shared /= threads;
+             /* Cap usage of highest cache level to the number of supported
+                threads.  */
+             if (shared_per_thread > 0 && threads > 0)
+               shared_per_thread /= threads;
+           }
         }
  
        /* Account for non-inclusive L2 and L3 caches.  */
        if (!inclusive_cache)
         {
-         if (threads_l2 > 0)
-           core /= threads_l2;
+      if (threads_l2 > 0)
+       shared_per_thread += core / threads_l2;
           shared += core;
         }
      }
@@ -705,13 +706,17 @@ intel_bug_no_cache_info:
        data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
        long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
        shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+      shared_per_thread = shared;
  
        /* Get maximum extended function. */
        __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
  
        if (shared <= 0)
-       /* No shared L3 cache.  All we have is the L2 cache.  */
-       shared = core;
+       {
+         /* No shared L3 cache.  All we have is the L2 cache.  */
+         shared = core;
+         shared_per_thread = core;
+       }
        else
         {
           /* Figure out the number of logical threads that share L3.  */
@@ -735,10 +740,11 @@ intel_bug_no_cache_info:
           /* Cap usage of highest cache level to the number of
              supported threads.  */
           if (threads > 0)
-           shared /= threads;
+           shared_per_thread /= threads;
  
           /* Account for exclusive L2 and L3 caches.  */
           shared += core;
+         shared_per_thread += core;
         }
  
  #ifndef DISABLE_PREFETCHW
@@ -766,26 +772,42 @@ intel_bug_no_cache_info:
      }
  
    if (cpu_features->shared_cache_size != 0)
-    shared = cpu_features->shared_cache_size;
+    shared_per_thread = cpu_features->shared_cache_size;
  
-  if (shared > 0)
+  if (shared_per_thread > 0)
      {
-      __x86_raw_shared_cache_size_half = shared / 2;
-      __x86_raw_shared_cache_size = shared;
+      __x86_raw_shared_cache_size_half = shared_per_thread / 2;
+      __x86_raw_shared_cache_size = shared_per_thread;
        /* Round shared cache size to multiple of 256 bytes.  */
-      shared = shared & ~255L;
-      __x86_shared_cache_size_half = shared / 2;
-      __x86_shared_cache_size = shared;
+      shared_per_thread = shared_per_thread & ~255L;
+      __x86_shared_cache_size_half = shared_per_thread / 2;
+      __x86_shared_cache_size = shared_per_thread;
      }
  
-  /* The large memcpy micro benchmark in glibc shows that 6 times of
-     shared cache size is the approximate value above which non-temporal
-     store becomes faster on a 8-core processor.  This is the 3/4 of the
-     total shared cache size.  */
+  /* The default setting for the non_temporal threshold is [1/8, 1/2] of size
+     of the chip's cache (depending on `cachesize_non_temporal_divisor` which
+     is microarch specific. The default is 1/4). For most Intel processors
+     with an initial release date between 2017 and 2023, a thread's
+     typical share of the cache is from 18-64MB. Using a reasonable size
+     fraction of L3 is meant to estimate the point where non-temporal stores
+     begin out-competing REP MOVSB, as well as the point where the fact that
+     non-temporal stores are forced back to main memory would already have
+     occurred to the majority of the lines in the copy. Note, concerns about
+     the entire L3 cache being evicted by the copy are mostly alleviated by the
+     fact that modern HW detects streaming patterns and provides proper LRU
+     hints so that the maximum thrashing is capped at 1/associativity. */
+  unsigned long int non_temporal_threshold = shared / 4;
+  /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
+     a higher risk of actually thrashing the cache as they don't have a HW LRU
+     hint. As well, their performance in highly parallel situations is
+     noticeably worse.  */
+  if (!CPU_FEATURES_CPU_P (cpu_features, ERMS))
+    non_temporal_threshold = shared_per_thread * 3 / 4;
+
    __x86_shared_non_temporal_threshold
      = (cpu_features->non_temporal_threshold != 0
         ? cpu_features->non_temporal_threshold
-       : __x86_shared_cache_size * threads * 3 / 4);
+       : non_temporal_threshold);
  }
  
  #endif
author	Noah Goldstein <goldstein.w.n@gmail.com>
	Thu, 10 Aug 2023 17:13:26 +0000 (12:13 -0500)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Tue, 12 Sep 2023 03:48:43 +0000 (22:48 -0500)