/* Initialize x86 cache info.
   Copyright (C) 2020-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

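/* Cache data for Intel CPUs whose CPUID leaf 2 reports descriptor bytes.
   Each entry maps one descriptor byte (idx) to the associativity, line
   size, cache level (rel_name, stored relative to _SC_LEVEL1_ICACHE_SIZE)
   and total size it stands for.  The table is sorted by idx so it can be
   searched with bsearch below.  */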
static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),     8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),    24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),   3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),   4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),   6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}

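/* Decode one register returned by CPUID leaf 2 for the cache parameter
   NAME.  Each of the four descriptor bytes is looked up in
   intel_02_known; the special byte 0x40 means there is no L2 (or, with
   an L2 present, no L3), 0xff means the descriptors are unavailable and
   CPUID leaf 4 must be enumerated instead, and 0x49 is disambiguated by
   family/model.  Returns the requested value, or 0 if this register does
   not describe it.  */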
static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3,
                  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no other cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = cpu_features->basic.family;
              unsigned int model = cpu_features->basic.model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}

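/* Look up the cache parameter NAME for an Intel CPU.  CPUID leaf 2 is
   executed as many times as its low EAX byte requests and every register
   is passed to intel_check_word.  Returns -1 for CPUs that predate leaf 2
   and for L2/L3 requests when descriptor 0x40 reported that the cache
   does not exist.  */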
static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  At least one, the one we are already
         doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

      /* Process the individual registers' values.  */
      result = intel_check_word (name, eax, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}

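/* Look up the cache parameter NAME for an AMD CPU using CPUID leaf
   0x8000001D (cache properties).  The subleaf is chosen from the
   requested cache level; a zero ECX after the query is treated as "no
   information" and yields 0.  */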
static long int __attribute__ ((noinline))
handle_amd (int name, const struct cpu_features *cpu_features)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  unsigned int count = 0x1;

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  if (name >= _SC_LEVEL3_CACHE_SIZE)
    count = 0x3;
  else if (name >= _SC_LEVEL2_CACHE_SIZE)
    count = 0x2;
  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
    count = 0x0;

  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);

  switch (name)
    {
    case _SC_LEVEL1_ICACHE_ASSOC:
    case _SC_LEVEL1_DCACHE_ASSOC:
    case _SC_LEVEL2_CACHE_ASSOC:
    case _SC_LEVEL3_CACHE_ASSOC:
      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_LINESIZE:
    case _SC_LEVEL1_DCACHE_LINESIZE:
    case _SC_LEVEL2_CACHE_LINESIZE:
    case _SC_LEVEL3_CACHE_LINESIZE:
      return ecx ? (ebx & 0xfff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_SIZE:
    case _SC_LEVEL1_DCACHE_SIZE:
    case _SC_LEVEL2_CACHE_SIZE:
    case _SC_LEVEL3_CACHE_SIZE:
      return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1)
                   * (ecx + 1) : 0;
    default:
      assert (! "cannot happen");
    }
  return -1;
}

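/* Look up the cache parameter NAME for a Zhaoxin CPU by enumerating
   CPUID leaf 4 subleaves, the same decoding scheme the Intel 0xff
   descriptor path above uses.  */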
static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}

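/* Given the per-chip L2 (CORE) and L3 (*SHARED_PTR) sizes, work out how
   many logical threads share the highest cache level and scale
   *SHARED_PTR down to one thread's share.  When L3 is not inclusive of
   L2, one thread's share of L2 is added back in.  *THREADS_PTR receives
   the thread count that was used.  */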
static void
get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
                       long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level = 2;
      shared = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (HAS_CPU_FEATURE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
          int check = 0x1 | (threads_l3 == 0) << 1;
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0)
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                default:
                  break;
                case 2:
                  if ((check & 0x1))
                    {
                      /* Get maximum number of logical processors
                         sharing L2 cache.  */
                      threads_l2 = (eax >> 14) & 0x3ff;
                      check &= ~0x1;
                    }
                  break;
                case 3:
                  if ((check & (0x1 << 1)))
                    {
                      /* Get maximum number of logical processors
                         sharing L3 cache.  */
                      threads_l3 = (eax >> 14) & 0x3ff;

                      /* Check if L2 and L3 caches are inclusive.  */
                      inclusive_cache = (edx & 0x2) != 0;
                      check &= ~(0x1 << 1);
                    }
                  break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                      case 0x37:
                      case 0x4a:
                      case 0x4d:
                      case 0x5a:
                      case 0x5d:
                        /* Silvermont has L2 cache shared by 2 cores.  */
                        threads = 2;
                        break;
                      default:
                        break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
        intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads
            = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
               & 0xff);
        }

      /* Cap usage of highest cache level to the number of supported
         threads.  */
      if (shared > 0 && threads > 0)
        shared /= threads;
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      if (threads_l2 > 0)
        core /= threads_l2;
      shared += core;
    }

  *shared_ptr = shared;
  *threads_ptr = threads;
}

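/* Top-level initialization: query the vendor-specific handler for the
   L1/L2/L3 parameters, derive the memcpy/memset thresholds from them,
   and reconcile the results with the x86_* tunables (TUNABLE_GET /
   TUNABLE_SET_WITH_BOUNDS) before storing everything in CPU_FEATURES.  */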
static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
  /* Find out what brand of processor.  */
  long int data = -1;
  long int shared = -1;
  long int core = -1;
  unsigned int threads = 0;
  unsigned long int level1_icache_size = -1;
  unsigned long int level1_icache_linesize = -1;
  unsigned long int level1_dcache_size = -1;
  unsigned long int level1_dcache_assoc = -1;
  unsigned long int level1_dcache_linesize = -1;
  unsigned long int level2_cache_size = -1;
  unsigned long int level2_cache_assoc = -1;
  unsigned long int level2_cache_linesize = -1;
  unsigned long int level3_cache_size = -1;
  unsigned long int level3_cache_assoc = -1;
  unsigned long int level3_cache_linesize = -1;
  unsigned long int level4_cache_size = -1;

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);

      level1_icache_size
        = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
        = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
        = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
        = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc
        = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
        = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc
        = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
        = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
      level4_cache_size
        = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);

      level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features);

      level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
        = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
        = handle_amd (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
        = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
        = handle_amd (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
        = handle_amd (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);

      if (shared <= 0)
        /* No shared L3 cache.  All we have is the L2 cache.  */
        shared = core;
    }

  cpu_features->level1_icache_size = level1_icache_size;
  cpu_features->level1_icache_linesize = level1_icache_linesize;
  cpu_features->level1_dcache_size = level1_dcache_size;
  cpu_features->level1_dcache_assoc = level1_dcache_assoc;
  cpu_features->level1_dcache_linesize = level1_dcache_linesize;
  cpu_features->level2_cache_size = level2_cache_size;
  cpu_features->level2_cache_assoc = level2_cache_assoc;
  cpu_features->level2_cache_linesize = level2_cache_linesize;
  cpu_features->level3_cache_size = level3_cache_size;
  cpu_features->level3_cache_assoc = level3_cache_assoc;
  cpu_features->level3_cache_linesize = level3_cache_linesize;
  cpu_features->level4_cache_size = level4_cache_size;

  /* The default setting for the non_temporal threshold is 3/4 of one
     thread's share of the chip's cache.  For most Intel and AMD processors
     with an initial release date between 2017 and 2020, a thread's typical
     share of the cache is from 500 KBytes to 2 MBytes.  Using the 3/4
     threshold leaves 125 KBytes to 500 KBytes of the thread's data
     in cache after a maximum temporal copy, which will maintain
     in cache a reasonable portion of the thread's stack and other
     active data.  If the threshold is set higher than one thread's
     share of the cache, it has a substantial risk of negatively
     impacting the performance of other threads running on the chip.  */
  unsigned long int non_temporal_threshold = shared * 3 / 4;
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
     if that operation cannot overflow.  Minimum of 0x4040 (16448) because the
     L(large_memset_4x) loops need 64 bytes to cache align and enough space
     for at least one iteration of the 4x PAGE_SIZE unrolled loop.  Both
     values are reflected in the manual.  */
  unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
  unsigned long int minimum_non_temporal_threshold = 0x4040;
  if (non_temporal_threshold < minimum_non_temporal_threshold)
    non_temporal_threshold = minimum_non_temporal_threshold;
  else if (non_temporal_threshold > maximum_non_temporal_threshold)
    non_temporal_threshold = maximum_non_temporal_threshold;
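  /* Example (illustrative): on a part whose shared level works out to a
     2 MiB per-thread share, the default non_temporal_threshold is
     3/4 * 2 MiB = 1.5 MiB, leaving 512 KiB of the share untouched; the
     value is then clamped to the range [0x4040, SIZE_MAX >> 4].  */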

  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
     threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 4096 * (64 / 16);
      minimum_rep_movsb_threshold = 64 * 8;
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 4096 * (32 / 16);
      minimum_rep_movsb_threshold = 32 * 8;
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
      minimum_rep_movsb_threshold = 16 * 8;
    }
  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
     short REP MOVSB (FSRM).  */
  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
    rep_movsb_threshold = 2112;
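  /* In concrete terms the defaults above are 16384 bytes with usable
     AVX512F, 8192 bytes with AVX_Fast_Unaligned_Load, and 2048 bytes
     otherwise, before the FSRM override of 2112 bytes.  */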

  /* The default threshold to use Enhanced REP STOSB.  */
  unsigned long int rep_stosb_threshold = 2048;

  long int tunable_size;

  tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    data = tunable_size;

  tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    shared = tunable_size;

  tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  if (tunable_size > minimum_non_temporal_threshold
      && tunable_size <= maximum_non_temporal_threshold)
    non_temporal_threshold = tunable_size;

  tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
  if (tunable_size > minimum_rep_movsb_threshold)
    rep_movsb_threshold = tunable_size;

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
                                     long int, NULL);

  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
                           minimum_non_temporal_threshold,
                           maximum_non_temporal_threshold);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
                           minimum_rep_movsb_threshold, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
                           SIZE_MAX);
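  /* The TUNABLE_SET_WITH_BOUNDS calls above record the final (possibly
     user-overridden, possibly clamped) values back into the tunable
     framework, so a user override such as the assumed spelling
     GLIBC_TUNABLES=glibc.cpu.x86_non_temporal_threshold=1048576 and the
     defaults computed here end up visible through the same mechanism.  */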

  unsigned long int rep_movsb_stop_threshold;
  /* The ERMS feature is implemented starting with the AMD Zen3
     architecture and performs poorly for data above the L2 cache size.
     Hence add an upper bound to limit the use of Enhanced REP MOVSB
     operations, set to the L2 cache size.  */
  if (cpu_features->basic.kind == arch_kind_amd)
    rep_movsb_stop_threshold = core;
  /* For architectures other than AMD, set the ERMS upper bound to the
     computed value of the non-temporal threshold.  */
  else
    rep_movsb_stop_threshold = non_temporal_threshold;

  cpu_features->data_cache_size = data;
  cpu_features->shared_cache_size = shared;
  cpu_features->non_temporal_threshold = non_temporal_threshold;
  cpu_features->rep_movsb_threshold = rep_movsb_threshold;
  cpu_features->rep_stosb_threshold = rep_stosb_threshold;
  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}