/* sysdeps/x86/cacheinfo.c (glibc).
   This copy includes the change "x86: Increase `non_temporal_threshold`
   to roughly `sizeof_L3 / 4`".  */
6d59823c 1/* x86_64 cache info.
04277e02 2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
bfe6f5fa
UD
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
bfe6f5fa 18
9c450f6f
L
19#if IS_IN (libc)
20
bfe6f5fa
UD
21#include <assert.h>
22#include <stdbool.h>
23#include <stdlib.h>
24#include <unistd.h>
6f6f1215 25#include <cpuid.h>
1ae6c72d 26#include <init-arch.h>
6f6f1215 27
bfe6f5fa
UD
/* Table of cache descriptors returned by CPUID leaf 0x2 on Intel CPUs.
   Each descriptor byte maps to one cache level's size, associativity and
   line size.  REL_NAME stores the _SC_* name relative to
   _SC_LEVEL1_ICACHE_SIZE (via the M() macro) so it fits in a byte.
   NOTE: the table must stay sorted by IDX — it is searched with bsearch
   (see intel_check_word).  */
static const struct intel_02_cache_info
{
  unsigned char idx;		/* Descriptor byte from CPUID leaf 0x2.  */
  unsigned char assoc;		/* Ways of associativity.  */
  unsigned char linesize;	/* Cache line size in bytes.  */
  unsigned char rel_name;	/* M(_SC_LEVELn_*CACHE_SIZE) of this entry.  */
  unsigned int size;		/* Total cache size in bytes.  */
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

/* Number of entries in the descriptor table.  */
#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
109
110static int
111intel_02_known_compare (const void *p1, const void *p2)
112{
113 const struct intel_02_cache_info *i1;
114 const struct intel_02_cache_info *i2;
115
116 i1 = (const struct intel_02_cache_info *) p1;
117 i2 = (const struct intel_02_cache_info *) p2;
118
119 if (i1->idx == i2->idx)
120 return 0;
121
122 return i1->idx < i2->idx ? -1 : 1;
123}
124
125
/* Decode one 32-bit register VALUE returned by CPUID leaf 0x2 and
   return the cache parameter requested by NAME (an _SC_LEVELn_* value),
   or 0 if this register does not describe it.  Each of the register's
   four bytes is an independent cache descriptor.  Side effects:
   *HAS_LEVEL_2 is set when any descriptor mentions an L2 cache,
   *NO_LEVEL_2_OR_3 when descriptor 0x40 ("no L2/L3") is seen.  */
static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
		  bool *no_level_2_or_3,
		  const struct cpu_features *cpu_features)
{
  /* Bit 31 set means the whole register carries no descriptors.  */
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE, so rounding down to a multiple of 3 yields the
     SIZE name of the same cache level.  */
  int folded_rel_name = (M(name) / 3) * 3;

  /* Consume one descriptor byte per iteration, low byte first.  */
  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
	{
	  /* Descriptor 0x40: no L2 cache (or no L3 if an L2 exists).  */
	  *no_level_2_or_3 = true;

	  if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
	    /* No need to look further.  */
	    break;
	}
      else if (byte == 0xff)
	{
	  /* CPUID leaf 0x4 contains all the information.  We need to
	     iterate over it.  */
	  unsigned int eax;
	  unsigned int ebx;
	  unsigned int ecx;
	  unsigned int edx;

	  unsigned int round = 0;
	  while (1)
	    {
	      __cpuid_count (4, round, eax, ebx, ecx, edx);

	      /* EAX[4:0] is the cache type; 0 terminates enumeration.  */
	      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
	      if (type == null)
		/* That was the end.  */
		break;

	      unsigned int level = (eax >> 5) & 0x7;

	      /* Match the enumerated cache level/type against the level
		 NAME asks about (unified caches match both L1 names).  */
	      if ((level == 1 && type == data
		   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
		  || (level == 1 && type == inst
		      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
		  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
		  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
		  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
		{
		  /* OFFSET selects SIZE (0), ASSOC (1) or LINESIZE (2).  */
		  unsigned int offset = M(name) - folded_rel_name;

		  if (offset == 0)
		    /* Cache size = ways * partitions * line size * sets,
		       each field stored minus one.  */
		    return (((ebx >> 22) + 1)
			    * (((ebx >> 12) & 0x3ff) + 1)
			    * ((ebx & 0xfff) + 1)
			    * (ecx + 1));
		  if (offset == 1)
		    return (ebx >> 22) + 1;

		  assert (offset == 2);
		  return (ebx & 0xfff) + 1;
		}

	      ++round;
	    }
	  /* There is no other cache information anywhere else.  */
	  break;
	}
      else
	{
	  if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
	    {
	      /* Intel reused this value.  For family 15, model 6 it
		 specifies the 3rd level cache.  Otherwise the 2nd
		 level cache.  */
	      unsigned int family = cpu_features->basic.family;
	      unsigned int model = cpu_features->basic.model;

	      if (family == 15 && model == 6)
		{
		  /* The level 3 cache is encoded for this model like
		     the level 2 cache is for other models.  Pretend
		     the caller asked for the level 2 cache.  */
		  name = (_SC_LEVEL2_CACHE_SIZE
			  + (name - _SC_LEVEL3_CACHE_SIZE));
		  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
		}
	    }

	  /* Look the descriptor byte up in the sorted table.  */
	  struct intel_02_cache_info *found;
	  struct intel_02_cache_info search;

	  search.idx = byte;
	  found = bsearch (&search, intel_02_known, nintel_02_known,
			   sizeof (intel_02_known[0]), intel_02_known_compare);
	  if (found != NULL)
	    {
	      if (found->rel_name == folded_rel_name)
		{
		  unsigned int offset = M(name) - folded_rel_name;

		  if (offset == 0)
		    /* Cache size.  */
		    return found->size;
		  if (offset == 1)
		    return found->assoc;

		  assert (offset == 2);
		  return found->linesize;
		}

	      /* Remember that an L2 cache exists even if NAME asked
		 about a different level.  */
	      if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
		*has_level_2 = true;
	    }
	}

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}
256
257
258static long int __attribute__ ((noinline))
48e7bc7a 259handle_intel (int name, const struct cpu_features *cpu_features)
bfe6f5fa 260{
c22e4c2a 261 unsigned int maxidx = cpu_features->basic.max_cpuid;
48e7bc7a 262
6a824767
L
263 /* Return -1 for older CPUs. */
264 if (maxidx < 2)
265 return -1;
bfe6f5fa
UD
266
267 /* OK, we can use the CPUID instruction to get all info about the
268 caches. */
269 unsigned int cnt = 0;
270 unsigned int max = 1;
271 long int result = 0;
272 bool no_level_2_or_3 = false;
273 bool has_level_2 = false;
6d59823c 274
bfe6f5fa
UD
275 while (cnt++ < max)
276 {
277 unsigned int eax;
278 unsigned int ebx;
279 unsigned int ecx;
280 unsigned int edx;
6f6f1215 281 __cpuid (2, eax, ebx, ecx, edx);
bfe6f5fa
UD
282
283 /* The low byte of EAX in the first round contain the number of
284 rounds we have to make. At least one, the one we are already
285 doing. */
286 if (cnt == 1)
287 {
288 max = eax & 0xff;
289 eax &= 0xffffff00;
290 }
291
292 /* Process the individual registers' value. */
48e7bc7a
L
293 result = intel_check_word (name, eax, &has_level_2,
294 &no_level_2_or_3, cpu_features);
bfe6f5fa
UD
295 if (result != 0)
296 return result;
297
48e7bc7a
L
298 result = intel_check_word (name, ebx, &has_level_2,
299 &no_level_2_or_3, cpu_features);
bfe6f5fa
UD
300 if (result != 0)
301 return result;
302
48e7bc7a
L
303 result = intel_check_word (name, ecx, &has_level_2,
304 &no_level_2_or_3, cpu_features);
bfe6f5fa
UD
305 if (result != 0)
306 return result;
307
48e7bc7a
L
308 result = intel_check_word (name, edx, &has_level_2,
309 &no_level_2_or_3, cpu_features);
bfe6f5fa
UD
310 if (result != 0)
311 return result;
312 }
313
314 if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
315 && no_level_2_or_3)
316 return -1;
317
318 return 0;
319}
320
321
/* Answer the cache query NAME (an _SC_LEVELn_* constant) on an AMD CPU
   using the extended CPUID leaves 0x80000005 (L1) and 0x80000006
   (L2/L3).  Returns the requested value, or 0 when the information is
   unavailable.  */
static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  /* EAX of leaf 0x80000000 gives the highest extended leaf.  */
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  /* L1 queries use leaf 0x80000005, L2/L3 queries leaf 0x80000006.  */
  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      /* L1 instruction cache query: remap NAME to the corresponding
	 dcache name and move the icache descriptor register (EDX) into
	 ECX so the dcache cases below can be shared.  Relies on the
	 _SC_ names being ordered ICACHE before DCACHE.  */
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
      /* Size in KiB lives in bits 24-31; this extracts it already
	 scaled by 1024 ((ecx >> 24) << 10).  */
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
	/* Fully associative: report the number of lines
	   (size / linesize would equal it).  */
	return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
      /* Associativity field 0 (bits 12-15) means the L2 is disabled.
	 Size in KiB is in bits 16-31.  */
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      /* The 4-bit field is an encoding, not a direct way count.  */
      switch ((ecx >> 12) & 0xf)
	{
	case 0:
	case 1:
	case 2:
	case 4:
	  return (ecx >> 12) & 0xf;
	case 6:
	  return 8;
	case 8:
	  return 16;
	case 10:
	  return 32;
	case 11:
	  return 48;
	case 12:
	  return 64;
	case 13:
	  return 96;
	case 14:
	  return 128;
	case 15:
	  /* Fully associative: size / linesize.  */
	  return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
	default:
	  return 0;
	}
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
      /* L3 data is in EDX; size field is in units of 512 KiB.  */
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      /* Same associativity encoding as L2, taken from EDX.  */
      switch ((edx >> 12) & 0xf)
	{
	case 0:
	case 1:
	case 2:
	case 4:
	  return (edx >> 12) & 0xf;
	case 6:
	  return 8;
	case 8:
	  return 16;
	case 10:
	  return 32;
	case 11:
	  return 48;
	case 12:
	  return 64;
	case 13:
	  return 96;
	case 14:
	  return 128;
	case 15:
	  /* Fully associative: size / linesize.  */
	  return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
	default:
	  return 0;
	}
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}
437
438
439/* Get the value of the system variable NAME. */
440long int
441attribute_hidden
442__cache_sysconf (int name)
443{
7c1d7225
L
444 const struct cpu_features *cpu_features = __get_cpu_features ();
445
c22e4c2a 446 if (cpu_features->basic.kind == arch_kind_intel)
48e7bc7a 447 return handle_intel (name, cpu_features);
bfe6f5fa 448
c22e4c2a 449 if (cpu_features->basic.kind == arch_kind_amd)
bfe6f5fa
UD
450 return handle_amd (name);
451
452 // XXX Fill in more vendors.
453
454 /* CPU not known, we have no information. */
455 return 0;
456}
457
458
/* Data cache size for use in memory and string routines, typically
   L1 size, rounded to multiple of 256 bytes.  The 32 KiB defaults
   below are overwritten by the init_cacheinfo constructor when the
   real sizes can be determined.  */
long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
long int __x86_data_cache_size attribute_hidden = 32 * 1024;
/* Similar to __x86_data_cache_size_half, but not rounded.  */
long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
/* Similar to __x86_data_cache_size, but not rounded.  */
long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024;
/* Shared cache size for use in memory and string routines, typically
   L2 or L3 size, rounded to multiple of 256 bytes.  Defaults to
   1 MiB until init_cacheinfo runs.  */
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
/* Similar to __x86_shared_cache_size_half, but not rounded.  */
long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded.  */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;

/* Threshold to use non temporal store.  Computed by init_cacheinfo
   (zero-initialized until then).  */
long int __x86_shared_non_temporal_threshold attribute_hidden;

#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines.
   Set to -1 by init_cacheinfo when the CPU supports PREFETCHW.  */
int __x86_prefetchw attribute_hidden;
#endif
bfe6f5fa
UD
483
484
485static void
486__attribute__((constructor))
487init_cacheinfo (void)
488{
489 /* Find out what brand of processor. */
490 unsigned int eax;
491 unsigned int ebx;
492 unsigned int ecx;
493 unsigned int edx;
bfe6f5fa 494 int max_cpuid_ex;
0435403c 495 long int data = -1;
bfe6f5fa 496 long int shared = -1;
d4386d34 497 long int shared_per_thread = -1;
bfe6f5fa
UD
498 unsigned int level;
499 unsigned int threads = 0;
7c1d7225 500 const struct cpu_features *cpu_features = __get_cpu_features ();
c22e4c2a 501 int max_cpuid = cpu_features->basic.max_cpuid;
bfe6f5fa 502
c22e4c2a 503 if (cpu_features->basic.kind == arch_kind_intel)
bfe6f5fa 504 {
48e7bc7a 505 data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
6d59823c 506
48e7bc7a 507 long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
9e4ec3e8
L
508 bool inclusive_cache = true;
509
0435403c 510 /* Try L3 first. */
bfe6f5fa 511 level = 3;
48e7bc7a 512 shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
d4386d34 513 shared_per_thread = shared;
d6af2388
L
514 /* Number of logical processors sharing L2 cache. */
515 int threads_l2;
516
517 /* Number of logical processors sharing L3 cache. */
518 int threads_l3;
519
bfe6f5fa 520 if (shared <= 0)
7e4ba49c 521 {
0435403c 522 /* Try L2 otherwise. */
7e4ba49c 523 level = 2;
9e4ec3e8 524 shared = core;
d4386d34 525 shared_per_thread = core;
d6af2388
L
526 threads_l2 = 0;
527 threads_l3 = -1;
528 }
529 else
530 {
531 threads_l2 = 0;
532 threads_l3 = 0;
bfe6f5fa 533 }
6d59823c 534
7c08d791
L
535 /* A value of 0 for the HTT bit indicates there is only a single
536 logical processor. */
537 if (HAS_CPU_FEATURE (HTT))
7e4ba49c 538 {
7c08d791
L
539 /* Figure out the number of logical threads that share the
540 highest cache level. */
541 if (max_cpuid >= 4)
542 {
c22e4c2a
L
543 unsigned int family = cpu_features->basic.family;
544 unsigned int model = cpu_features->basic.model;
e2e4f560 545
7c08d791 546 int i = 0;
6d59823c 547
d6af2388
L
548 /* Query until cache level 2 and 3 are enumerated. */
549 int check = 0x1 | (threads_l3 == 0) << 1;
7c08d791
L
550 do
551 {
552 __cpuid_count (4, i++, eax, ebx, ecx, edx);
553
554 /* There seems to be a bug in at least some Pentium Ds
555 which sometimes fail to iterate all cache parameters.
556 Do not loop indefinitely here, stop in this case and
557 assume there is no such information. */
558 if ((eax & 0x1f) == 0)
559 goto intel_bug_no_cache_info;
6d59823c 560
d6af2388
L
561 switch ((eax >> 5) & 0x7)
562 {
563 default:
564 break;
565 case 2:
566 if ((check & 0x1))
567 {
568 /* Get maximum number of logical processors
569 sharing L2 cache. */
570 threads_l2 = (eax >> 14) & 0x3ff;
571 check &= ~0x1;
572 }
573 break;
574 case 3:
575 if ((check & (0x1 << 1)))
576 {
577 /* Get maximum number of logical processors
578 sharing L3 cache. */
579 threads_l3 = (eax >> 14) & 0x3ff;
9e4ec3e8 580
d6af2388
L
581 /* Check if L2 and L3 caches are inclusive. */
582 inclusive_cache = (edx & 0x2) != 0;
583 check &= ~(0x1 << 1);
584 }
585 break;
586 }
587 }
588 while (check);
3aa2588d 589
d6af2388
L
590 /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
591 numbers of addressable IDs for logical processors sharing
592 the cache, instead of the maximum number of threads
7c08d791 593 sharing the cache. */
d6af2388 594 if (max_cpuid >= 11)
a546baa9 595 {
7c08d791
L
596 /* Find the number of logical processors shipped in
597 one core and apply count mask. */
598 i = 0;
d6af2388
L
599
600 /* Count SMT only if there is L3 cache. Always count
601 core if there is no L3 cache. */
602 int count = ((threads_l2 > 0 && level == 3)
603 | ((threads_l3 > 0
604 || (threads_l2 > 0 && level == 2)) << 1));
605
606 while (count)
a546baa9 607 {
7c08d791
L
608 __cpuid_count (11, i++, eax, ebx, ecx, edx);
609
610 int shipped = ebx & 0xff;
de71e042 611 int type = ecx & 0xff00;
7c08d791
L
612 if (shipped == 0 || type == 0)
613 break;
d6af2388
L
614 else if (type == 0x100)
615 {
616 /* Count SMT. */
617 if ((count & 0x1))
618 {
619 int count_mask;
620
621 /* Compute count mask. */
622 asm ("bsr %1, %0"
623 : "=r" (count_mask) : "g" (threads_l2));
624 count_mask = ~(-1 << (count_mask + 1));
625 threads_l2 = (shipped - 1) & count_mask;
626 count &= ~0x1;
627 }
628 }
7c08d791
L
629 else if (type == 0x200)
630 {
d6af2388
L
631 /* Count core. */
632 if ((count & (0x1 << 1)))
633 {
634 int count_mask;
635 int threads_core
636 = (level == 2 ? threads_l2 : threads_l3);
637
638 /* Compute count mask. */
639 asm ("bsr %1, %0"
640 : "=r" (count_mask) : "g" (threads_core));
641 count_mask = ~(-1 << (count_mask + 1));
642 threads_core = (shipped - 1) & count_mask;
643 if (level == 2)
644 threads_l2 = threads_core;
645 else
646 threads_l3 = threads_core;
647 count &= ~(0x1 << 1);
648 }
7c08d791
L
649 }
650 }
651 }
d6af2388
L
652 if (threads_l2 > 0)
653 threads_l2 += 1;
654 if (threads_l3 > 0)
655 threads_l3 += 1;
656 if (level == 2)
7c08d791 657 {
d6af2388 658 if (threads_l2)
7c08d791 659 {
d6af2388
L
660 threads = threads_l2;
661 if (threads > 2 && family == 6)
662 switch (model)
663 {
664 case 0x37:
665 case 0x4a:
666 case 0x4d:
667 case 0x5a:
668 case 0x5d:
669 /* Silvermont has L2 cache shared by 2 cores. */
670 threads = 2;
671 break;
672 default:
673 break;
674 }
a546baa9
L
675 }
676 }
d6af2388
L
677 else if (threads_l3)
678 threads = threads_l3;
a546baa9 679 }
7c08d791 680 else
a3d9ab50 681 {
d4386d34 682 intel_bug_no_cache_info:
7c08d791
L
683 /* Assume that all logical threads share the highest cache
684 level. */
685
d4386d34
NG
686 threads = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx >> 16)
687 & 0xff);
6d59823c 688
d4386d34
NG
689 /* Cap usage of highest cache level to the number of supported
690 threads. */
691 if (shared_per_thread > 0 && threads > 0)
692 shared_per_thread /= threads;
693 }
bfe6f5fa 694 }
6d59823c 695
9e4ec3e8 696 /* Account for non-inclusive L2 and L3 caches. */
d6af2388
L
697 if (!inclusive_cache)
698 {
d4386d34
NG
699 if (threads_l2 > 0)
700 shared_per_thread += core / threads_l2;
d6af2388
L
701 shared += core;
702 }
bfe6f5fa 703 }
c22e4c2a 704 else if (cpu_features->basic.kind == arch_kind_amd)
bfe6f5fa 705 {
0435403c
UD
706 data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
707 long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
708 shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
d4386d34 709 shared_per_thread = shared;
6d59823c 710
0435403c 711 /* Get maximum extended function. */
6f6f1215 712 __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
bfe6f5fa 713
0435403c 714 if (shared <= 0)
d4386d34
NG
715 {
716 /* No shared L3 cache. All we have is the L2 cache. */
717 shared = core;
718 shared_per_thread = core;
719 }
0435403c
UD
720 else
721 {
722 /* Figure out the number of logical threads that share L3. */
723 if (max_cpuid_ex >= 0x80000008)
724 {
725 /* Get width of APIC ID. */
6f6f1215 726 __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx);
0435403c
UD
727 threads = 1 << ((ecx >> 12) & 0x0f);
728 }
729
730 if (threads == 0)
731 {
732 /* If APIC ID width is not available, use logical
733 processor count. */
6f6f1215 734 __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx);
0435403c
UD
735
736 if ((edx & (1 << 28)) != 0)
737 threads = (ebx >> 16) & 0xff;
738 }
739
740 /* Cap usage of highest cache level to the number of
741 supported threads. */
742 if (threads > 0)
d4386d34 743 shared_per_thread /= threads;
0435403c
UD
744
745 /* Account for exclusive L2 and L3 caches. */
746 shared += core;
d4386d34 747 shared_per_thread += core;
0435403c
UD
748 }
749
6f6f1215 750#ifndef DISABLE_PREFETCHW
bfe6f5fa
UD
751 if (max_cpuid_ex >= 0x80000001)
752 {
6f6f1215 753 __cpuid (0x80000001, eax, ebx, ecx, edx);
0435403c 754 /* PREFETCHW || 3DNow! */
bfe6f5fa 755 if ((ecx & 0x100) || (edx & 0x80000000))
afec409a 756 __x86_prefetchw = -1;
bfe6f5fa 757 }
6f6f1215 758#endif
bfe6f5fa
UD
759 }
760
905947c3
L
761 if (cpu_features->data_cache_size != 0)
762 data = cpu_features->data_cache_size;
763
0435403c 764 if (data > 0)
3af48cbd 765 {
afec409a
L
766 __x86_raw_data_cache_size_half = data / 2;
767 __x86_raw_data_cache_size = data;
c0dde15b
UD
768 /* Round data cache size to multiple of 256 bytes. */
769 data = data & ~255L;
afec409a
L
770 __x86_data_cache_size_half = data / 2;
771 __x86_data_cache_size = data;
3af48cbd 772 }
bfe6f5fa 773
905947c3 774 if (cpu_features->shared_cache_size != 0)
d4386d34 775 shared_per_thread = cpu_features->shared_cache_size;
905947c3 776
d4386d34 777 if (shared_per_thread > 0)
e2b393bc 778 {
d4386d34
NG
779 __x86_raw_shared_cache_size_half = shared_per_thread / 2;
780 __x86_raw_shared_cache_size = shared_per_thread;
c0dde15b 781 /* Round shared cache size to multiple of 256 bytes. */
d4386d34
NG
782 shared_per_thread = shared_per_thread & ~255L;
783 __x86_shared_cache_size_half = shared_per_thread / 2;
784 __x86_shared_cache_size = shared_per_thread;
e2b393bc 785 }
a057f5f8 786
d4386d34
NG
787 /* The default setting for the non_temporal threshold is [1/8, 1/2] of size
788 of the chip's cache (depending on `cachesize_non_temporal_divisor` which
789 is microarch specific. The default is 1/4). For most Intel processors
790 with an initial release date between 2017 and 2023, a thread's
791 typical share of the cache is from 18-64MB. Using a reasonable size
792 fraction of L3 is meant to estimate the point where non-temporal stores
793 begin out-competing REP MOVSB. As well the point where the fact that
794 non-temporal stores are forced back to main memory would already occurred
795 to the majority of the lines in the copy. Note, concerns about the entire
796 L3 cache being evicted by the copy are mostly alleviated by the fact that
797 modern HW detects streaming patterns and provides proper LRU hints so that
798 the maximum thrashing capped at 1/associativity. */
799 unsigned long int non_temporal_threshold = shared / 4;
800 /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
801 a higher risk of actually thrashing the cache as they don't have a HW LRU
802 hint. As well, their performance in highly parallel situations is
803 noticeably worse. */
804 if (!CPU_FEATURES_CPU_P (cpu_features, ERMS))
805 non_temporal_threshold = shared_per_thread * 3 / 4;
806
808fd9e6 807 __x86_shared_non_temporal_threshold
905947c3
L
808 = (cpu_features->non_temporal_threshold != 0
809 ? cpu_features->non_temporal_threshold
d4386d34 810 : non_temporal_threshold);
bfe6f5fa 811}
9c450f6f
L
812
813#endif