/* Initialize x86 cache info.
   Copyright (C) 2020-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

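/* Cache data for Intel CPUs whose CPUID leaf 2 reports descriptor bytes.
   Each entry maps one descriptor byte (idx) to the associativity, line
   size, cache level (rel_name, stored relative to _SC_LEVEL1_ICACHE_SIZE)
   and total size it stands for.  The table is sorted by idx so it can be
   searched with bsearch below.  */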
static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),     8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),    24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),   3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),   4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),   6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}

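/* Decode one register returned by CPUID leaf 2 for the cache parameter
   NAME.  Each of the four descriptor bytes is looked up in
   intel_02_known; the special byte 0x40 means there is no L2 (or, with
   an L2 present, no L3), 0xff means the descriptors are unavailable and
   CPUID leaf 4 must be enumerated instead, and 0x49 is disambiguated by
   family/model.  Returns the requested value, or 0 if this register does
   not describe it.  */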
static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3,
                  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no other cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = cpu_features->basic.family;
              unsigned int model = cpu_features->basic.model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}

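/* Look up the cache parameter NAME for an Intel CPU.  CPUID leaf 2 is
   executed as many times as its low EAX byte requests and every register
   is passed to intel_check_word.  Returns -1 for CPUs that predate leaf 2
   and for L2/L3 requests when descriptor 0x40 reported that the cache
   does not exist.  */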
static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  At least one, the one we are already
         doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

      /* Process the individual registers' values.  */
      result = intel_check_word (name, eax, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}

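/* Look up the cache parameter NAME for an AMD CPU using CPUID leaf
   0x8000001D (cache properties).  The subleaf is chosen from the
   requested cache level; a zero ECX after the query is treated as "no
   information" and yields 0.  */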
static long int __attribute__ ((noinline))
handle_amd (int name, const struct cpu_features *cpu_features)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  unsigned int count = 0x1;

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  if (name >= _SC_LEVEL3_CACHE_SIZE)
    count = 0x3;
  else if (name >= _SC_LEVEL2_CACHE_SIZE)
    count = 0x2;
  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
    count = 0x0;

  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);

  switch (name)
    {
    case _SC_LEVEL1_ICACHE_ASSOC:
    case _SC_LEVEL1_DCACHE_ASSOC:
    case _SC_LEVEL2_CACHE_ASSOC:
    case _SC_LEVEL3_CACHE_ASSOC:
      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_LINESIZE:
    case _SC_LEVEL1_DCACHE_LINESIZE:
    case _SC_LEVEL2_CACHE_LINESIZE:
    case _SC_LEVEL3_CACHE_LINESIZE:
      return ecx ? (ebx & 0xfff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_SIZE:
    case _SC_LEVEL1_DCACHE_SIZE:
    case _SC_LEVEL2_CACHE_SIZE:
    case _SC_LEVEL3_CACHE_SIZE:
      return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1)
                   * (ecx + 1) : 0;
    default:
      assert (! "cannot happen");
    }
  return -1;
}

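/* Look up the cache parameter NAME for a Zhaoxin CPU by enumerating
   CPUID leaf 4 subleaves, the same decoding scheme the Intel 0xff
   descriptor path above uses.  */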
static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}

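/* Given the per-chip L2 (CORE) and L3 (*SHARED_PTR) sizes, work out how
   many logical threads share the highest cache level and scale
   *SHARED_PTR down to one thread's share.  When L3 is not inclusive of
   L2, one thread's share of L2 is added back in.  *THREADS_PTR receives
   the thread count that was used.  */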
static void
get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
                       long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level = 2;
      shared = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (HAS_CPU_FEATURE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
          int check = 0x1 | (threads_l3 == 0) << 1;
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0)
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                default:
                  break;
                case 2:
                  if ((check & 0x1))
                    {
                      /* Get maximum number of logical processors
                         sharing L2 cache.  */
                      threads_l2 = (eax >> 14) & 0x3ff;
                      check &= ~0x1;
                    }
                  break;
                case 3:
                  if ((check & (0x1 << 1)))
                    {
                      /* Get maximum number of logical processors
                         sharing L3 cache.  */
                      threads_l3 = (eax >> 14) & 0x3ff;

                      /* Check if L2 and L3 caches are inclusive.  */
                      inclusive_cache = (edx & 0x2) != 0;
                      check &= ~(0x1 << 1);
                    }
                  break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                      case 0x37:
                      case 0x4a:
                      case 0x4d:
                      case 0x5a:
                      case 0x5d:
                        /* Silvermont has L2 cache shared by 2 cores.  */
                        threads = 2;
                        break;
                      default:
                        break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
        intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads
            = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
               & 0xff);
        }

      /* Cap usage of highest cache level to the number of supported
         threads.  */
      if (shared > 0 && threads > 0)
        shared /= threads;
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      if (threads_l2 > 0)
        core /= threads_l2;
      shared += core;
    }

  *shared_ptr = shared;
  *threads_ptr = threads;
}

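/* Top-level initialization: query the vendor-specific handler for the
   L1/L2/L3 parameters, derive the memcpy/memset thresholds from them,
   and reconcile the results with the x86_* tunables (TUNABLE_GET /
   TUNABLE_SET_WITH_BOUNDS) before storing everything in CPU_FEATURES.  */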
static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
  /* Find out what brand of processor.  */
  long int data = -1;
  long int shared = -1;
  long int core = -1;
  unsigned int threads = 0;
  unsigned long int level1_icache_size = -1;
  unsigned long int level1_icache_linesize = -1;
  unsigned long int level1_dcache_size = -1;
  unsigned long int level1_dcache_assoc = -1;
  unsigned long int level1_dcache_linesize = -1;
  unsigned long int level2_cache_size = -1;
  unsigned long int level2_cache_assoc = -1;
  unsigned long int level2_cache_linesize = -1;
  unsigned long int level3_cache_size = -1;
  unsigned long int level3_cache_assoc = -1;
  unsigned long int level3_cache_linesize = -1;
  unsigned long int level4_cache_size = -1;

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);

      level1_icache_size
        = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
        = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
        = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
        = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc
        = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
        = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc
        = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
        = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
      level4_cache_size
        = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);

      level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features);

      level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
        = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
        = handle_amd (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
        = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
        = handle_amd (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
        = handle_amd (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);

      if (shared <= 0)
        /* No shared L3 cache.  All we have is the L2 cache.  */
        shared = core;
    }

  cpu_features->level1_icache_size = level1_icache_size;
  cpu_features->level1_icache_linesize = level1_icache_linesize;
  cpu_features->level1_dcache_size = level1_dcache_size;
  cpu_features->level1_dcache_assoc = level1_dcache_assoc;
  cpu_features->level1_dcache_linesize = level1_dcache_linesize;
  cpu_features->level2_cache_size = level2_cache_size;
  cpu_features->level2_cache_assoc = level2_cache_assoc;
  cpu_features->level2_cache_linesize = level2_cache_linesize;
  cpu_features->level3_cache_size = level3_cache_size;
  cpu_features->level3_cache_assoc = level3_cache_assoc;
  cpu_features->level3_cache_linesize = level3_cache_linesize;
  cpu_features->level4_cache_size = level4_cache_size;

  /* The default setting for the non_temporal threshold is 3/4 of one
     thread's share of the chip's cache.  For most Intel and AMD processors
     with an initial release date between 2017 and 2020, a thread's typical
     share of the cache is from 500 KBytes to 2 MBytes.  Using the 3/4
     threshold leaves 125 KBytes to 500 KBytes of the thread's data
     in cache after a maximum temporal copy, which will maintain
     in cache a reasonable portion of the thread's stack and other
     active data.  If the threshold is set higher than one thread's
     share of the cache, it has a substantial risk of negatively
     impacting the performance of other threads running on the chip.  */
  unsigned long int non_temporal_threshold = shared * 3 / 4;
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
     if that operation cannot overflow.  Minimum of 0x4040 (16448) because the
     L(large_memset_4x) loops need 64 bytes to cache align and enough space
     for at least one iteration of the 4x PAGE_SIZE unrolled loop.  Both
     values are reflected in the manual.  */
  unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
  unsigned long int minimum_non_temporal_threshold = 0x4040;
  if (non_temporal_threshold < minimum_non_temporal_threshold)
    non_temporal_threshold = minimum_non_temporal_threshold;
  else if (non_temporal_threshold > maximum_non_temporal_threshold)
    non_temporal_threshold = maximum_non_temporal_threshold;
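  /* Example (illustrative): on a part whose shared level works out to a
     2 MiB per-thread share, the default non_temporal_threshold is
     3/4 * 2 MiB = 1.5 MiB, leaving 512 KiB of the share untouched; the
     value is then clamped to the range [0x4040, SIZE_MAX >> 4].  */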

  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
     threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 4096 * (64 / 16);
      minimum_rep_movsb_threshold = 64 * 8;
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 4096 * (32 / 16);
      minimum_rep_movsb_threshold = 32 * 8;
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
      minimum_rep_movsb_threshold = 16 * 8;
    }
  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
     short REP MOVSB (FSRM).  */
  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
    rep_movsb_threshold = 2112;
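  /* In concrete terms the defaults above are 16384 bytes with usable
     AVX512F, 8192 bytes with AVX_Fast_Unaligned_Load, and 2048 bytes
     otherwise, before the FSRM override of 2112 bytes.  */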

  /* The default threshold to use Enhanced REP STOSB.  */
  unsigned long int rep_stosb_threshold = 2048;

  long int tunable_size;

  tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    data = tunable_size;

  tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    shared = tunable_size;

  tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  if (tunable_size > minimum_non_temporal_threshold
      && tunable_size <= maximum_non_temporal_threshold)
    non_temporal_threshold = tunable_size;

  tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
  if (tunable_size > minimum_rep_movsb_threshold)
    rep_movsb_threshold = tunable_size;

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
                                     long int, NULL);

  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
                           minimum_non_temporal_threshold,
                           maximum_non_temporal_threshold);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
                           minimum_rep_movsb_threshold, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
                           SIZE_MAX);
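  /* The TUNABLE_SET_WITH_BOUNDS calls above record the final (possibly
     user-overridden, possibly clamped) values back into the tunable
     framework, so a user override such as the assumed spelling
     GLIBC_TUNABLES=glibc.cpu.x86_non_temporal_threshold=1048576 and the
     defaults computed here end up visible through the same mechanism.  */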

  unsigned long int rep_movsb_stop_threshold;
  /* The ERMS feature is implemented starting with the AMD Zen3
     architecture and performs poorly for data above the L2 cache size.
     Hence add an upper bound to limit the use of Enhanced REP MOVSB
     operations, set to the L2 cache size.  */
  if (cpu_features->basic.kind == arch_kind_amd)
    rep_movsb_stop_threshold = core;
  /* For architectures other than AMD, set the ERMS upper bound to the
     computed value of the non-temporal threshold.  */
  else
    rep_movsb_stop_threshold = non_temporal_threshold;

  cpu_features->data_cache_size = data;
  cpu_features->shared_cache_size = shared;
  cpu_features->non_temporal_threshold = non_temporal_threshold;
  cpu_features->rep_movsb_threshold = rep_movsb_threshold;
  cpu_features->rep_stosb_threshold = rep_stosb_threshold;
  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}