// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 *  Copyright (C) 2008-2014 Christoph Lameter
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/page_owner.h>
#include <linux/sched/isolation.h>

#include "internal.h"

#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
	int item, cpu;

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
		atomic_long_set(&zone->vm_numa_event[item], 0);
		for_each_online_cpu(cpu) {
			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
						= 0;
		}
	}
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
	int item;

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
		atomic_long_set(&vm_numa_event[item], 0);
}

static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int ret, oldval;

	mutex_lock(&vm_numa_stat_lock);
	if (write)
		oldval = sysctl_vm_numa_stat;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret || !write)
		goto out;

	if (oldval == sysctl_vm_numa_stat)
		goto out;
	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
		static_branch_enable(&vm_numa_stat_key);
		pr_info("enable numa statistics\n");
	} else {
		static_branch_disable(&vm_numa_stat_key);
		invalid_numa_statistics();
		pr_info("disable numa statistics, and clear numa counters\n");
	}

out:
	mutex_unlock(&vm_numa_stat_lock);
	return ret;
}
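/*
 * Illustrative usage note (an addition, not part of the original source):
 * this handler backs the /proc/sys/vm/numa_stat sysctl, so from userspace
 * "echo 0 > /proc/sys/vm/numa_stat" disables NUMA hit/miss accounting and
 * clears the counters, while writing 1 re-enables it.
 */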
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	cpus_read_lock();
	sum_vm_events(ret);
	cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);

#ifdef CONFIG_NUMA
static void fold_vm_zone_numa_events(struct zone *zone)
{
	unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
	int cpu;
	enum numa_stat_item item;

	for_each_online_cpu(cpu) {
		struct per_cpu_zonestat *pzstats;

		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
		for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
			zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
	}

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
		zone_numa_event_add(zone_numa_events[item], zone, item);
}

void fold_vm_numa_events(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		fold_vm_zone_numa_events(zone);
}
#endif

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
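/*
 * Worked example for the pressure threshold above (illustrative addition,
 * not part of the original source): if low_wmark - min_wmark is 1024 pages
 * and 16 CPUs are online, the threshold becomes max(1, 1024 / 16) = 64, so
 * even if every CPU holds a full per-cpu diff the combined drift
 * (16 * 64 = 1024 pages) cannot silently push the real free count below
 * the min watermark.
 */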

int calculate_normal_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem)+1
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
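/*
 * Worked example for the scaling above (illustrative addition, not part of
 * the original source): with 16 online CPUs (fls(16) = 5) and a zone
 * managing 4 GB (mem = 4096 MB / 128 MB = 32, so 1 + fls(32) = 7), the
 * threshold is 2 * 5 * 7 = 70, well below the 125 cap.
 */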

/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int cpu;
	int threshold;

	/* Zero current pgdat thresholds */
	for_each_online_pgdat(pgdat) {
		for_each_online_cpu(cpu) {
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
		}
	}

	for_each_populated_zone(zone) {
		struct pglist_data *pgdat = zone->zone_pgdat;
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_normal_threshold(zone);

		for_each_online_cpu(cpu) {
			int pgdat_threshold;

			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
							= threshold;

			/* Base nodestat threshold on the largest populated zone. */
			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
				= max(threshold, pgdat_threshold);
		}

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}
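/*
 * Illustrative example (an addition, not part of the original source): with
 * 64 online CPUs and a threshold of 125, up to 64 * 125 = 8000 free pages
 * can be hiding in per-cpu diffs. If the low-to-min watermark gap is
 * smaller than that, percpu_drift_mark is set to high_wmark + 8000 so that
 * watermark checks made below this mark can fall back to an exact,
 * snapshot-based summation instead of trusting the cheap estimate.
 */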

void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = (*calculate_pressure)(zone);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
							= threshold;
	}
}

/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			   long delta)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	/*
	 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
	 * atomicity is provided by IRQs being disabled -- either explicitly
	 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
	 * CPU migrations and preemption potentially corrupts a counter so
	 * disable preemption.
	 */
	preempt_disable_nested();

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);

	preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);
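/*
 * Worked example of the fold above (illustrative addition, not part of the
 * original source): with stat_threshold = 32 and a per-cpu diff of 30, a
 * delta of +5 makes x = 35, which exceeds the threshold; the whole 35 is
 * pushed into zone->vm_stat[item] and the per-cpu diff is reset to 0.
 * Between folds, readers of the global counter may therefore lag by
 * roughly one threshold per CPU.
 */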

void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
				long delta)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long x;
	long t;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	/* See __mod_zone_page_state */
	preempt_disable_nested();

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		node_page_state_add(x, pgdat, item);
		x = 0;
	}
	__this_cpu_write(*p, x);

	preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);
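/*
 * Illustrative note on byte-sized items (an addition, not part of the
 * original source): for a node counter kept in bytes, such as reclaimable
 * slab, freeing one 4 KiB page arrives here as delta = -4096 and is
 * converted to -1 page before being stored, since the s8 per-cpu diff
 * could never hold byte-sized deltas.
 */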

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}

	preempt_enable_nested();
}

void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v + overstep, pgdat, item);
		__this_cpu_write(*p, -overstep);
	}

	preempt_enable_nested();
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __inc_node_page_state(struct page *page, enum node_stat_item item)
{
	__inc_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__inc_node_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}

	preempt_enable_nested();
}

void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v - overstep, pgdat, item);
		__this_cpu_write(*p, overstep);
	}

	preempt_enable_nested();
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void __dec_node_page_state(struct page *page, enum node_stat_item item)
{
	__dec_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__dec_node_page_state);

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *       0       No overstepping
 *       1       Overstepping half of threshold
 *       -1      Overstepping minus half of threshold
 */
static inline void mod_zone_state(struct zone *zone,
	enum zone_stat_item item, long delta, int overstep_mode)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long o, n, t, z;

	do {
		z = 0;  /* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		zone_page_state_add(z, zone, item);
}
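/*
 * Worked example of the overstep handling above (illustrative addition,
 * not part of the original source): with t = 32, a current per-cpu diff
 * o = 30 and delta = +5, n = 35 exceeds the threshold. For
 * overstep_mode = 1, os = 16, so z = 51 is added to the zone counter and
 * the per-cpu diff becomes -16, leaving extra headroom in the direction
 * the counter is already moving before the next fold is needed.
 */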

void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	mod_zone_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);

static inline void mod_node_state(struct pglist_data *pgdat,
	enum node_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long o, n, t, z;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	do {
		z = 0;  /* overflow to node counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a node.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to node counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		node_page_state_add(z, pgdat, item);
}

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	mod_node_state(pgdat, item, delta, 0);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	mod_node_state(pgdat, item, 1, 1);
}

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_node_page_state);
#else
/*
 * Use interrupt disable to serialize counter updates
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_state);

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_node_page_state(pgdat, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;
	struct pglist_data *pgdat;

	pgdat = page_pgdat(page);
	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_node_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_node_page_state);
#endif

/*
 * Fold a differential into the global counters.
 * Returns the number of counters updated.
 */
static int fold_diff(int *zone_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
		}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
		}
	return changes;
}

/*
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 *
 * The function returns the number of global counters updated.
 */
static int refresh_cpu_vm_stats(bool do_pagesets)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
	int changes = 0;

	for_each_populated_zone(zone) {
		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
#ifdef CONFIG_NUMA
		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
#endif

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				__this_cpu_write(pcp->expire, 3);
#endif
			}
		}
#ifdef CONFIG_NUMA

		if (do_pagesets) {
			cond_resched();
			/*
			 * Deal with draining the remote pageset of this
			 * processor
			 *
			 * Check if there are pages remaining in this pageset
			 * if not then there is nothing to expire.
			 */
			if (!__this_cpu_read(pcp->expire) ||
			       !__this_cpu_read(pcp->count))
				continue;

			/*
			 * We never drain zones local to this processor.
			 */
			if (zone_to_nid(zone) == numa_node_id()) {
				__this_cpu_write(pcp->expire, 0);
				continue;
			}

			if (__this_cpu_dec_return(pcp->expire))
				continue;

			if (__this_cpu_read(pcp->count)) {
				drain_zone_pages(zone, this_cpu_ptr(pcp));
				changes++;
			}
		}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
			if (v) {
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
		}
	}

	changes += fold_diff(global_zone_diff, global_node_diff);
	return changes;
}
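/*
 * Illustrative note on the expiry logic above (an addition, not part of
 * the original source): pcp->expire is re-armed to 3 whenever this CPU
 * folds vmstat deltas for a zone and is decremented once per refresh pass,
 * so with the stat worker running roughly once per second a remote
 * pageset is only drained after about three quiet passes.
 */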

/*
 * Fold the data for an offline cpu into the global array.
 * There cannot be any access by the offline cpu and therefore
 * synchronization is simplified.
 */
void cpu_vm_stats_fold(int cpu)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_zonestat *pzstats;

		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			if (pzstats->vm_stat_diff[i]) {
				int v;

				v = pzstats->vm_stat_diff[i];
				pzstats->vm_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
			}
		}
#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
			if (pzstats->vm_numa_event[i]) {
				unsigned long v;

				v = pzstats->vm_numa_event[i];
				pzstats->vm_numa_event[i] = 0;
				zone_numa_event_add(v, zone, i);
			}
		}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat *p;

		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
			if (p->vm_node_stat_diff[i]) {
				int v;

				v = p->vm_node_stat_diff[i];
				p->vm_node_stat_diff[i] = 0;
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
	}

	fold_diff(global_zone_diff, global_node_diff);
}

/*
 * this is only called if !populated_zone(zone), which implies no other users of
 * pset->vm_stat_diff[] exist.
 */
void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
{
	unsigned long v;
	int i;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
		if (pzstats->vm_stat_diff[i]) {
			v = pzstats->vm_stat_diff[i];
			pzstats->vm_stat_diff[i] = 0;
			zone_page_state_add(v, zone, i);
		}
	}

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
		if (pzstats->vm_numa_event[i]) {
			v = pzstats->vm_numa_event[i];
			pzstats->vm_numa_event[i] = 0;
			zone_numa_event_add(v, zone, i);
		}
	}
#endif
}
#endif

#ifdef CONFIG_NUMA
/*
 * Determine the per node value of a stat item. This function
 * is called frequently in a NUMA machine, so try to be as
 * frugal as possible.
 */
unsigned long sum_zone_node_page_state(int node,
				 enum zone_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_page_state(zones + i, item);

	return count;
}

/* Determine the per node value of a numa stat item. */
unsigned long sum_zone_numa_event_state(int node,
				 enum numa_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	unsigned long count = 0;
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_numa_event_state(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a stat item.
 */
unsigned long node_page_state_pages(struct pglist_data *pgdat,
				    enum node_stat_item item)
{
	long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

unsigned long node_page_state(struct pglist_data *pgdat,
			      enum node_stat_item item)
{
	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	return node_page_state_pages(pgdat, item);
}
#endif

#ifdef CONFIG_COMPACTION

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order <= MAX_ORDER; order++) {
		unsigned long blocks;

		/*
		 * Count number of free blocks.
		 *
		 * Access to nr_free is lockless as nr_free is used only for
		 * diagnostic purposes. Use data_race to avoid KCSAN warning.
		 */
		blocks = data_race(zone->free_area[order].nr_free);
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}
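/*
 * Worked example (illustrative addition, not part of the original source):
 * for a zone whose free lists hold 100 order-0 pages and 10 order-3
 * blocks, a call with suitable_order = 2 reports
 * free_pages = 100 + 10 * 8 = 180, free_blocks_total = 110 and
 * free_blocks_suitable = 10 << (3 - 2) = 20, since each order-3 block can
 * serve two order-2 requests.
 */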

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (WARN_ON_ONCE(order > MAX_ORDER))
		return 0;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
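/*
 * Worked example (illustrative addition, not part of the original source):
 * for an order-4 request (16 pages) with 1000 free pages scattered over
 * 1000 order-0 blocks and no suitable block, the index is
 * 1000 - (1000 + 1000 * 1000 / 16) / 1000 = 1000 - 63 = 937, i.e. 0.937:
 * the failure is almost entirely due to fragmentation, so compaction
 * rather than reclaim is the appropriate remedy.
 */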

/*
 * Calculates external fragmentation within a zone wrt the given order.
 * It is defined as the percentage of pages found in blocks of size
 * less than 1 << order. It returns values in range [0, 100].
 */
unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	if (info.free_pages == 0)
		return 0;

	return div_u64((info.free_pages -
			(info.free_blocks_suitable << order)) * 100,
			info.free_pages);
}
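/*
 * Worked example (illustrative addition, not part of the original source):
 * with 1000 free pages of which 10 blocks are order-3 or larger, a query
 * for order 3 yields (1000 - 10 * 8) * 100 / 1000 = 92, i.e. 92% of the
 * free memory sits in blocks too small for an order-3 allocation.
 */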

/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
    defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#ifdef CONFIG_ZONE_DEVICE
#define TEXT_FOR_DEVICE(xx) xx "_device",
#else
#define TEXT_FOR_DEVICE(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
			    TEXT_FOR_HIGHMEM(xx) xx "_movable", \
			    TEXT_FOR_DEVICE(xx)

const char * const vmstat_text[] = {
	/* enum zone_stat_item counters */
	"nr_free_pages",
	"nr_zone_inactive_anon",
	"nr_zone_active_anon",
	"nr_zone_inactive_file",
	"nr_zone_active_file",
	"nr_zone_unevictable",
	"nr_zone_write_pending",
	"nr_mlock",
	"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
	"nr_zspages",
#endif
	"nr_free_cma",
#ifdef CONFIG_UNACCEPTED_MEMORY
	"nr_unaccepted",
#endif

	/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

	/* enum node_stat_item counters */
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_isolated_anon",
	"nr_isolated_file",
	"workingset_nodes",
	"workingset_refault_anon",
	"workingset_refault_file",
	"workingset_activate_anon",
	"workingset_activate_file",
	"workingset_restore_anon",
	"workingset_restore_file",
	"workingset_nodereclaim",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_writeback_temp",
	"nr_shmem",
	"nr_shmem_hugepages",
	"nr_shmem_pmdmapped",
	"nr_file_hugepages",
	"nr_file_pmdmapped",
	"nr_anon_transparent_hugepages",
	"nr_vmscan_write",
	"nr_vmscan_immediate_reclaim",
	"nr_dirtied",
	"nr_written",
	"nr_throttled_written",
	"nr_kernel_misc_reclaimable",
	"nr_foll_pin_acquired",
	"nr_foll_pin_released",
	"nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
	"nr_shadow_call_stack",
#endif
	"nr_page_table_pages",
	"nr_sec_page_table_pages",
#ifdef CONFIG_SWAP
	"nr_swapcached",
#endif
#ifdef CONFIG_NUMA_BALANCING
	"pgpromote_success",
	"pgpromote_candidate",
#endif

	/* enum writeback_stat_item counters */
	"nr_dirty_threshold",
	"nr_dirty_background_threshold",

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
	/* enum vm_event_item counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")
	TEXTS_FOR_ZONES("allocstall")
	TEXTS_FOR_ZONES("pgskip")

	"pgfree",
	"pgactivate",
	"pgdeactivate",
	"pglazyfree",

	"pgfault",
	"pgmajfault",
	"pglazyfreed",

	"pgrefill",
	"pgreuse",
	"pgsteal_kswapd",
	"pgsteal_direct",
	"pgsteal_khugepaged",
	"pgdemote_kswapd",
	"pgdemote_direct",
	"pgdemote_khugepaged",
	"pgscan_kswapd",
	"pgscan_direct",
	"pgscan_khugepaged",
	"pgscan_direct_throttle",
	"pgscan_anon",
	"pgscan_file",
	"pgsteal_anon",
	"pgsteal_file",

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"pageoutrun",

	"pgrotated",

	"drop_pagecache",
	"drop_slab",
	"oom_kill",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_huge_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
	"thp_migration_success",
	"thp_migration_fail",
	"thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
	"compact_daemon_wake",
	"compact_daemon_migrate_scanned",
	"compact_daemon_free_scanned",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_CMA
	"cma_alloc_success",
	"cma_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"thp_fault_alloc",
	"thp_fault_fallback",
	"thp_fault_fallback_charge",
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_file_alloc",
	"thp_file_fallback",
	"thp_file_fallback_charge",
	"thp_file_mapped",
	"thp_split_page",
	"thp_split_page_failed",
	"thp_deferred_split_page",
	"thp_split_pmd",
	"thp_scan_exceed_none_pte",
	"thp_scan_exceed_swap_pte",
	"thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
	"thp_split_pud",
#endif
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
	"thp_swpout",
	"thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
	"balloon_inflate",
	"balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
	"balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
	"nr_tlb_remote_flush",
	"nr_tlb_remote_flush_received",
	"nr_tlb_local_flush_all",
	"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */

#ifdef CONFIG_SWAP
	"swap_ra",
	"swap_ra_hit",
#ifdef CONFIG_KSM
	"ksm_swpin_copy",
#endif
#endif
#ifdef CONFIG_KSM
	"cow_ksm",
#endif
#ifdef CONFIG_ZSWAP
	"zswpin",
	"zswpout",
#endif
#ifdef CONFIG_X86
	"direct_map_level2_splits",
	"direct_map_level3_splits",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
	"vma_lock_success",
	"vma_lock_abort",
	"vma_lock_retry",
	"vma_lock_miss",
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */

#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
     defined(CONFIG_PROC_FS)
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;

	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * Walk zones in a node and print using a callback.
 * If @assert_populated is true, only use callback for zones that are populated.
 */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		bool assert_populated, bool nolock,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (assert_populated && !populated_zone(zone))
			continue;

		if (!nolock)
			spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		if (!nolock)
			spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order <= MAX_ORDER; ++order)
		/*
		 * Access to nr_free is lockless as nr_free is used only for
		 * printing purposes. Use data_race to avoid KCSAN warning.
		 */
		seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, true, false, frag_show_print);
	return 0;
}

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order <= MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;
			bool overflow = false;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype]) {
				/*
				 * Cap the free_list iteration because it might
				 * be really large and we are under a spinlock
				 * so a long time spent here could trigger a
				 * hard lockup detector. Anyway this is a
				 * debugging tool so knowing there is a handful
				 * of pages of this order should be more than
				 * sufficient.
				 */
				if (++freecount >= 100000) {
					overflow = true;
					break;
				}
			}
			seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
			spin_unlock_irq(&zone->lock);
			cond_resched();
			spin_lock_irq(&zone->lock);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
33090af9 1536static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
467c996c
MG
1537{
1538 int order;
1539 pg_data_t *pgdat = (pg_data_t *)arg;
1540
1541 /* Print header */
1542 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
23baf831 1543 for (order = 0; order <= MAX_ORDER; ++order)
467c996c
MG
1544 seq_printf(m, "%6d ", order);
1545 seq_putc(m, '\n');
1546
727c080f 1547 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
467c996c
MG
1548}
1549
1550static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1551 pg_data_t *pgdat, struct zone *zone)
1552{
1553 int mtype;
1554 unsigned long pfn;
1555 unsigned long start_pfn = zone->zone_start_pfn;
108bcc96 1556 unsigned long end_pfn = zone_end_pfn(zone);
467c996c
MG
1557 unsigned long count[MIGRATE_TYPES] = { 0, };
1558
1559 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1560 struct page *page;
1561
d336e94e
MH
1562 page = pfn_to_online_page(pfn);
1563 if (!page)
467c996c
MG
1564 continue;
1565
a91c43c7
JK
1566 if (page_zone(page) != zone)
1567 continue;
1568
467c996c
MG
1569 mtype = get_pageblock_migratetype(page);
1570
e80d6a24
MG
1571 if (mtype < MIGRATE_TYPES)
1572 count[mtype]++;
467c996c
MG
1573 }
1574
1575 /* Print counts */
1576 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1577 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1578 seq_printf(m, "%12lu ", count[mtype]);
1579 seq_putc(m, '\n');
1580}
1581
f113e641 1582/* Print out the number of pageblocks for each migratetype */
33090af9 1583static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
467c996c
MG
1584{
1585 int mtype;
1586 pg_data_t *pgdat = (pg_data_t *)arg;
1587
1588 seq_printf(m, "\n%-23s", "Number of blocks type ");
1589 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1590 seq_printf(m, "%12s ", migratetype_names[mtype]);
1591 seq_putc(m, '\n');
727c080f
VM
1592 walk_zones_in_node(m, pgdat, true, false,
1593 pagetypeinfo_showblockcount_print);
467c996c
MG
1594}
1595
48c96a36
JK
1596/*
1597 * Print out the number of pageblocks for each migratetype that contain pages
1598 * of other types. This gives an indication of how well fallbacks are being
1599 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1600 * to determine what is going on
1601 */
1602static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1603{
1604#ifdef CONFIG_PAGE_OWNER
1605 int mtype;
1606
7dd80b8a 1607 if (!static_branch_unlikely(&page_owner_inited))
48c96a36
JK
1608 return;
1609
1610 drain_all_pages(NULL);
1611
1612 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1613 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1614 seq_printf(m, "%12s ", migratetype_names[mtype]);
1615 seq_putc(m, '\n');
1616
727c080f
VM
1617 walk_zones_in_node(m, pgdat, true, true,
1618 pagetypeinfo_showmixedcount_print);
48c96a36
JK
1619#endif /* CONFIG_PAGE_OWNER */
1620}
1621
467c996c
MG
1622/*
1623 * This prints out statistics in relation to grouping pages by mobility.
1624 * It is expensive to collect so do not constantly read the file.
1625 */
1626static int pagetypeinfo_show(struct seq_file *m, void *arg)
1627{
1628 pg_data_t *pgdat = (pg_data_t *)arg;
1629
41b25a37 1630 /* check memoryless node */
a47b53c5 1631 if (!node_state(pgdat->node_id, N_MEMORY))
41b25a37
KM
1632 return 0;
1633
467c996c
MG
1634 seq_printf(m, "Page block order: %d\n", pageblock_order);
1635 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
1636 seq_putc(m, '\n');
1637 pagetypeinfo_showfree(m, pgdat);
1638 pagetypeinfo_showblockcount(m, pgdat);
48c96a36 1639 pagetypeinfo_showmixedcount(m, pgdat);
467c996c 1640
f6ac2354
CL
1641 return 0;
1642}
1643
8f32f7e5 1644static const struct seq_operations fragmentation_op = {
f6ac2354
CL
1645 .start = frag_start,
1646 .next = frag_next,
1647 .stop = frag_stop,
1648 .show = frag_show,
1649};
1650
74e2e8e8 1651static const struct seq_operations pagetypeinfo_op = {
467c996c
MG
1652 .start = frag_start,
1653 .next = frag_next,
1654 .stop = frag_stop,
1655 .show = pagetypeinfo_show,
1656};
1657
e2ecc8a7
MG
1658static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1659{
1660 int zid;
1661
1662 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1663 struct zone *compare = &pgdat->node_zones[zid];
1664
1665 if (populated_zone(compare))
1666 return zone == compare;
1667 }
1668
e2ecc8a7
MG
1669 return false;
1670}
1671
467c996c
MG
1672static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1673 struct zone *zone)
f6ac2354 1674{
467c996c
MG
1675 int i;
1676 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
e2ecc8a7
MG
1677 if (is_zone_first_populated(pgdat, zone)) {
1678 seq_printf(m, "\n per-node stats");
1679 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
69473e5d
MS
1680 unsigned long pages = node_page_state_pages(pgdat, i);
1681
1682 if (vmstat_item_print_in_thp(i))
1683 pages /= HPAGE_PMD_NR;
9d7ea9a2 1684 seq_printf(m, "\n %-12s %lu", node_stat_name(i),
69473e5d 1685 pages);
e2ecc8a7
MG
1686 }
1687 }
467c996c
MG
1688 seq_printf(m,
1689 "\n pages free %lu"
a6ea8b5b 1690 "\n boost %lu"
467c996c
MG
1691 "\n min %lu"
1692 "\n low %lu"
1693 "\n high %lu"
467c996c 1694 "\n spanned %lu"
9feedc9d 1695 "\n present %lu"
3c381db1
DH
1696 "\n managed %lu"
1697 "\n cma %lu",
88f5acf8 1698 zone_page_state(zone, NR_FREE_PAGES),
a6ea8b5b 1699 zone->watermark_boost,
41858966
MG
1700 min_wmark_pages(zone),
1701 low_wmark_pages(zone),
1702 high_wmark_pages(zone),
467c996c 1703 zone->spanned_pages,
9feedc9d 1704 zone->present_pages,
3c381db1
DH
1705 zone_managed_pages(zone),
1706 zone_cma_pages(zone));
467c996c 1707
467c996c 1708 seq_printf(m,
3484b2de 1709 "\n protection: (%ld",
467c996c
MG
1710 zone->lowmem_reserve[0]);
1711 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
3484b2de 1712 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
7dfb8bf3
DR
1713 seq_putc(m, ')');
1714
a8a4b7ae
BH
1715 /* If unpopulated, no other information is useful */
1716 if (!populated_zone(zone)) {
1717 seq_putc(m, '\n');
1718 return;
1719 }
1720
7dfb8bf3 1721 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
9d7ea9a2
KK
1722 seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
1723 zone_page_state(zone, i));
7dfb8bf3 1724
3a321d2a 1725#ifdef CONFIG_NUMA
f19298b9 1726 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
9d7ea9a2 1727 seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
f19298b9 1728 zone_numa_event_state(zone, i));
3a321d2a
KW
1729#endif
1730
7dfb8bf3 1731 seq_printf(m, "\n pagesets");
467c996c 1732 for_each_online_cpu(i) {
28f836b6
MG
1733 struct per_cpu_pages *pcp;
1734 struct per_cpu_zonestat __maybe_unused *pzstats;
467c996c 1735
28f836b6 1736 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
3dfa5721
CL
1737 seq_printf(m,
1738 "\n cpu: %i"
1739 "\n count: %i"
1740 "\n high: %i"
1741 "\n batch: %i",
1742 i,
28f836b6
MG
1743 pcp->count,
1744 pcp->high,
1745 pcp->batch);
df9ecaba 1746#ifdef CONFIG_SMP
28f836b6 1747 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
467c996c 1748 seq_printf(m, "\n vm stats threshold: %d",
28f836b6 1749 pzstats->stat_threshold);
df9ecaba 1750#endif
f6ac2354 1751 }
467c996c 1752 seq_printf(m,
599d0c95 1753 "\n node_unreclaimable: %u"
3a50d14d 1754 "\n start_pfn: %lu",
c73322d0 1755 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
3a50d14d 1756 zone->zone_start_pfn);
467c996c
MG
1757 seq_putc(m, '\n');
1758}
1759
1760/*
b2bd8598
DR
1761 * Output information about zones in @pgdat. All zones are printed regardless
1762 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1763 * set of all zones and userspace would not be aware of such zones if they are
1764 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
467c996c
MG
1765 */
1766static int zoneinfo_show(struct seq_file *m, void *arg)
1767{
1768 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1769 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
f6ac2354
CL
1770 return 0;
1771}
1772
5c9fe628 1773static const struct seq_operations zoneinfo_op = {
f6ac2354
CL
1774 .start = frag_start, /* iterate over all zones. The same as in
1775 * fragmentation. */
1776 .next = frag_next,
1777 .stop = frag_stop,
1778 .show = zoneinfo_show,
1779};
1780
9d7ea9a2 1781#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
f19298b9 1782 NR_VM_NUMA_EVENT_ITEMS + \
9d7ea9a2
KK
1783 NR_VM_NODE_STAT_ITEMS + \
1784 NR_VM_WRITEBACK_STAT_ITEMS + \
1785 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1786 NR_VM_EVENT_ITEMS : 0))
79da826a 1787
f6ac2354
CL
1788static void *vmstat_start(struct seq_file *m, loff_t *pos)
1789{
2244b95a 1790 unsigned long *v;
9d7ea9a2 1791 int i;
f6ac2354 1792
9d7ea9a2 1793 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354 1794 return NULL;
79da826a 1795
9d7ea9a2 1796 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
f19298b9 1797 fold_vm_numa_events();
9d7ea9a2 1798 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
2244b95a
CL
1799 m->private = v;
1800 if (!v)
f6ac2354 1801 return ERR_PTR(-ENOMEM);
2244b95a 1802 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
c41f012a 1803 v[i] = global_zone_page_state(i);
79da826a
MR
1804 v += NR_VM_ZONE_STAT_ITEMS;
1805
3a321d2a 1806#ifdef CONFIG_NUMA
f19298b9
MG
1807 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1808 v[i] = global_numa_event_state(i);
1809 v += NR_VM_NUMA_EVENT_ITEMS;
3a321d2a
KW
1810#endif
1811
69473e5d 1812 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
ea426c2a 1813 v[i] = global_node_page_state_pages(i);
69473e5d
MS
1814 if (vmstat_item_print_in_thp(i))
1815 v[i] /= HPAGE_PMD_NR;
1816 }
75ef7184
MG
1817 v += NR_VM_NODE_STAT_ITEMS;
1818
79da826a
MR
1819 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1820 v + NR_DIRTY_THRESHOLD);
1821 v += NR_VM_WRITEBACK_STAT_ITEMS;
1822
f8891e5e 1823#ifdef CONFIG_VM_EVENT_COUNTERS
79da826a
MR
1824 all_vm_events(v);
1825 v[PGPGIN] /= 2; /* sectors -> kbytes */
1826 v[PGPGOUT] /= 2;
f8891e5e 1827#endif
ff8b16d7 1828 return (unsigned long *)m->private + *pos;
f6ac2354
CL
1829}
1830
1831static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1832{
1833 (*pos)++;
9d7ea9a2 1834 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354
CL
1835 return NULL;
1836 return (unsigned long *)m->private + *pos;
1837}
1838
1839static int vmstat_show(struct seq_file *m, void *arg)
1840{
1841 unsigned long *l = arg;
1842 unsigned long off = l - (unsigned long *)m->private;
68ba0326
AD
1843
1844 seq_puts(m, vmstat_text[off]);
75ba1d07 1845 seq_put_decimal_ull(m, " ", *l);
68ba0326 1846 seq_putc(m, '\n');
8d92890b
N
1847
1848 if (off == NR_VMSTAT_ITEMS - 1) {
1849 /*
1850 * We've come to the end - add any deprecated counters to avoid
1851 * breaking userspace which might depend on them being present.
1852 */
1853 seq_puts(m, "nr_unstable 0\n");
1854 }
f6ac2354
CL
1855 return 0;
1856}
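/*
 * Each /proc/vmstat line therefore has the form "<name> <value>", for
 * example (values are illustrative):
 *
 *   nr_free_pages 123456
 *   nr_zone_inactive_anon 7890
 *   ...
 *   nr_unstable 0
 *
 * with the deprecated nr_unstable line always emitted last, as above.
 */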
1857
1858static void vmstat_stop(struct seq_file *m, void *arg)
1859{
1860 kfree(m->private);
1861 m->private = NULL;
1862}
1863
b6aa44ab 1864static const struct seq_operations vmstat_op = {
f6ac2354
CL
1865 .start = vmstat_start,
1866 .next = vmstat_next,
1867 .stop = vmstat_stop,
1868 .show = vmstat_show,
1869};
f6ac2354
CL
1870#endif /* CONFIG_PROC_FS */
1871
df9ecaba 1872#ifdef CONFIG_SMP
d1187ed2 1873static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1874int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1875
52b6f46b
HD
1876#ifdef CONFIG_PROC_FS
1877static void refresh_vm_stats(struct work_struct *work)
1878{
1879 refresh_cpu_vm_stats(true);
1880}
1881
1882int vmstat_refresh(struct ctl_table *table, int write,
32927393 1883 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1884{
1885 long val;
1886 int err;
1887 int i;
1888
1889 /*
1890 * The regular update, every sysctl_stat_interval, may come later
1891 * than expected: leaving a significant amount in per_cpu buckets.
1892 * This is particularly misleading when checking a quantity of HUGE
1893 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1894 * which can equally be echo'ed to or cat'ted from (by root),
1895 * can be used to update the stats just before reading them.
1896 *
c41f012a 1897 * Oh, and since global_zone_page_state() etc. are so careful to hide
52b6f46b
HD
1898 * transiently negative values, report an error here if any of
1899 * the stats is negative, so we know to go looking for imbalance.
1900 */
1901 err = schedule_on_each_cpu(refresh_vm_stats);
1902 if (err)
1903 return err;
1904 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75083aae
HD
1905 /*
1906 * Skip checking stats known to go negative occasionally.
1907 */
1908 switch (i) {
1909 case NR_ZONE_WRITE_PENDING:
1910 case NR_FREE_CMA_PAGES:
1911 continue;
1912 }
75ef7184 1913 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1914 if (val < 0) {
c822f622 1915 pr_warn("%s: %s %ld\n",
9d7ea9a2 1916 __func__, zone_stat_name(i), val);
52b6f46b
HD
1917 }
1918 }
76d8cc3c 1919 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
75083aae
HD
1920 /*
1921 * Skip checking stats known to go negative occasionally.
1922 */
1923 switch (i) {
1924 case NR_WRITEBACK:
1925 continue;
1926 }
76d8cc3c
HD
1927 val = atomic_long_read(&vm_node_stat[i]);
1928 if (val < 0) {
1929 pr_warn("%s: %s %ld\n",
1930 __func__, node_stat_name(i), val);
76d8cc3c
HD
1931 }
1932 }
52b6f46b
HD
1933 if (write)
1934 *ppos += *lenp;
1935 else
1936 *lenp = 0;
1937 return 0;
1938}
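/*
 * Example usage from userspace (illustrative): flush the per-cpu deltas
 * right before sampling a counter:
 *
 *   # echo 1 > /proc/sys/vm/stat_refresh   (or: cat /proc/sys/vm/stat_refresh)
 *   # grep nr_free_pages /proc/vmstat
 *
 * The value written is ignored; any write (or read) by root triggers the
 * refresh and the negative-value checks above.
 */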
1939#endif /* CONFIG_PROC_FS */
1940
d1187ed2
CL
1941static void vmstat_update(struct work_struct *w)
1942{
0eb77e98 1943 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1944 /*
 1945		 * Counters were updated, so we expect more updates
 1946		 * to occur in the future. Keep the update work item
 1947		 * running.
1948 */
ce612879 1949 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1950 this_cpu_ptr(&vmstat_work),
1951 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1952 }
1953}
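/*
 * If refresh_cpu_vm_stats() found nothing to fold, the work item is simply
 * not re-queued here; vmstat_shepherd() below restarts it once
 * need_update() reports pending diffs for this CPU again.
 */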
1954
1955/*
1956 * Check if the diffs for a certain cpu indicate that
1957 * an update is needed.
1958 */
1959static bool need_update(int cpu)
1960{
2bbd00ae 1961 pg_data_t *last_pgdat = NULL;
7cc36bbd
CL
1962 struct zone *zone;
1963
1964 for_each_populated_zone(zone) {
28f836b6 1965 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bbd00ae 1966 struct per_cpu_nodestat *n;
28f836b6 1967
7cc36bbd
CL
1968 /*
1969 * The fast way of checking if there are any vmstat diffs.
7cc36bbd 1970 */
64632fd3 1971 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
7cc36bbd 1972 return true;
f19298b9 1973
2bbd00ae
JW
1974 if (last_pgdat == zone->zone_pgdat)
1975 continue;
1976 last_pgdat = zone->zone_pgdat;
1977 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
64632fd3
ML
1978 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
1979 return true;
7cc36bbd
CL
1980 }
1981 return false;
1982}
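/*
 * memchr_inv() returns NULL only when every byte in the range equals the
 * given byte, so a non-NULL result above means at least one counter has a
 * non-zero per-cpu delta. The last_pgdat check ensures the node-level
 * deltas are inspected only once per node even though several zones share
 * the same pgdat.
 */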
1983
7b8da4c7
CL
1984/*
1985 * Switch off vmstat processing and then fold all the remaining differentials
1986 * until the diffs stay at zero. The function is used by NOHZ and can only be
1987 * invoked when tick processing is not active.
1988 */
f01f17d3
MH
1989void quiet_vmstat(void)
1990{
1991 if (system_state != SYSTEM_RUNNING)
1992 return;
1993
7b8da4c7 1994 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1995 return;
1996
1997 if (!need_update(smp_processor_id()))
1998 return;
1999
2000 /*
 2001	 * Just refresh the counters and do not care about the pending delayed
 2002	 * vmstat_update. It doesn't fire often enough to matter, and canceling
 2003	 * it would be too expensive from this path.
 2004	 * vmstat_shepherd will take care of that for us.
2005 */
2006 refresh_cpu_vm_stats(false);
2007}
2008
7cc36bbd
CL
2009/*
 2010 * Shepherd work item that checks the per-cpu differentials of
 2011 * processors whose vmstat update workers have been disabled
 2012 * because of inactivity, and restarts those workers when
 2013 * updates are pending.
2014 */
2015static void vmstat_shepherd(struct work_struct *w);
2016
0eb77e98 2017static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
2018
2019static void vmstat_shepherd(struct work_struct *w)
2020{
2021 int cpu;
2022
7625eccd 2023 cpus_read_lock();
7cc36bbd 2024 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 2025 for_each_online_cpu(cpu) {
f01f17d3 2026 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 2027
be5e015d
MT
2028 /*
 2029		 * In-kernel users of vmstat counters either require a precise value, in
 2030		 * which case they use the zone_page_state_snapshot interface, or they can
 2031		 * live with some imprecision, as the regular flushing can happen at an
 2032		 * arbitrary time and the cumulative error can grow (see calculate_normal_threshold).
 2033		 *
 2034		 * From that point of view, regular flushing can be postponed for CPUs that
 2035		 * have been isolated from kernel interference without critical
 2036		 * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
 2037		 * for all isolated CPUs to avoid interfering with the isolated workload.
2038 */
2039 if (cpu_is_isolated(cpu))
2040 continue;
2041
7b8da4c7 2042 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 2043 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
fbcc8183
JB
2044
2045 cond_resched();
f01f17d3 2046 }
7625eccd 2047 cpus_read_unlock();
7cc36bbd
CL
2048
2049 schedule_delayed_work(&shepherd,
98f4ebb2 2050 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2051}
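/*
 * In short: every sysctl_stat_interval the shepherd walks the online,
 * non-isolated CPUs, immediately re-queues a CPU's vmstat_work only when
 * that work is idle and need_update() reports unfolded diffs, and then
 * re-arms itself.
 */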
2052
7cc36bbd 2053static void __init start_shepherd_timer(void)
d1187ed2 2054{
7cc36bbd
CL
2055 int cpu;
2056
2057 for_each_possible_cpu(cpu)
ccde8bd4 2058 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
2059 vmstat_update);
2060
7cc36bbd
CL
2061 schedule_delayed_work(&shepherd,
2062 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2063}
2064
03e86dba
TC
2065static void __init init_cpu_node_state(void)
2066{
4c501327 2067 int node;
03e86dba 2068
4c501327 2069 for_each_online_node(node) {
b55032f1 2070 if (!cpumask_empty(cpumask_of_node(node)))
4c501327
SAS
2071 node_set_state(node, N_CPU);
2072 }
03e86dba
TC
2073}
2074
5438da97
SAS
2075static int vmstat_cpu_online(unsigned int cpu)
2076{
2077 refresh_zone_stat_thresholds();
734c1570
OS
2078
2079 if (!node_state(cpu_to_node(cpu), N_CPU)) {
2080 node_set_state(cpu_to_node(cpu), N_CPU);
734c1570
OS
2081 }
2082
5438da97
SAS
2083 return 0;
2084}
2085
2086static int vmstat_cpu_down_prep(unsigned int cpu)
2087{
2088 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2089 return 0;
2090}
2091
2092static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2093{
4c501327 2094 const struct cpumask *node_cpus;
5438da97 2095 int node;
807a1bd2 2096
5438da97
SAS
2097 node = cpu_to_node(cpu);
2098
2099 refresh_zone_stat_thresholds();
4c501327 2100 node_cpus = cpumask_of_node(node);
b55032f1 2101 if (!cpumask_empty(node_cpus))
5438da97 2102 return 0;
807a1bd2
TK
2103
2104 node_clear_state(node, N_CPU);
734c1570 2105
5438da97 2106 return 0;
807a1bd2
TK
2107}
2108
8f32f7e5 2109#endif
df9ecaba 2110
ce612879
MH
2111struct workqueue_struct *mm_percpu_wq;
2112
597b7305 2113void __init init_mm_internals(void)
df9ecaba 2114{
ce612879 2115 int ret __maybe_unused;
5438da97 2116
80d136e1 2117 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2118
2119#ifdef CONFIG_SMP
5438da97
SAS
2120 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2121 NULL, vmstat_cpu_dead);
2122 if (ret < 0)
2123 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2124
2125 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2126 vmstat_cpu_online,
2127 vmstat_cpu_down_prep);
2128 if (ret < 0)
2129 pr_err("vmstat: failed to register 'online' hotplug state\n");
2130
7625eccd 2131 cpus_read_lock();
03e86dba 2132 init_cpu_node_state();
7625eccd 2133 cpus_read_unlock();
d1187ed2 2134
7cc36bbd 2135 start_shepherd_timer();
8f32f7e5
AD
2136#endif
2137#ifdef CONFIG_PROC_FS
fddda2b7 2138 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2139 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2140 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2141 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2142#endif
df9ecaba 2143}
d7a5752c
MG
2144
2145#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2146
2147/*
2148 * Return an index indicating how much of the available free memory is
2149 * unusable for an allocation of the requested size.
2150 */
2151static int unusable_free_index(unsigned int order,
2152 struct contig_page_info *info)
2153{
 2154	/* No free memory at all is interpreted as all free memory being unusable */
2155 if (info->free_pages == 0)
2156 return 1000;
2157
2158 /*
 2159	 * The index is conceptually a value between 0 and 1; it is returned
 2160	 * scaled by 1000, i.e. to 3 decimal places.
2161 *
2162 * 0 => no fragmentation
2163 * 1 => high fragmentation
2164 */
2165 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2166
2167}
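/*
 * Worked example with made-up numbers: a zone with 1000 free pages, of
 * which 50 blocks are large enough for an order-4 (16 page) request, has
 * 50 << 4 = 800 usable pages, so the index is
 * (1000 - 800) * 1000 / 1000 = 200, printed as 0.200 by
 * unusable_show_print() below.
 */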
2168
2169static void unusable_show_print(struct seq_file *m,
2170 pg_data_t *pgdat, struct zone *zone)
2171{
2172 unsigned int order;
2173 int index;
2174 struct contig_page_info info;
2175
2176 seq_printf(m, "Node %d, zone %8s ",
2177 pgdat->node_id,
2178 zone->name);
23baf831 2179 for (order = 0; order <= MAX_ORDER; ++order) {
d7a5752c
MG
2180 fill_contig_page_info(zone, order, &info);
2181 index = unusable_free_index(order, &info);
2182 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2183 }
2184
2185 seq_putc(m, '\n');
2186}
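/*
 * A line in the unusable_index debugfs file thus looks roughly like
 * (values are illustrative):
 *
 *   Node 0, zone   Normal 0.000 0.005 0.010 0.200 ...
 *
 * with one value for each order from 0 to MAX_ORDER inclusive.
 */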
2187
2188/*
2189 * Display unusable free space index
2190 *
2191 * The unusable free space index measures how much of the available free
2192 * memory cannot be used to satisfy an allocation of a given size and is a
 2193 * value between 0 and 1. The higher the value, the more of the free memory
 2194 * is unusable and, by implication, the worse the external fragmentation. This
2195 * can be expressed as a percentage by multiplying by 100.
2196 */
2197static int unusable_show(struct seq_file *m, void *arg)
2198{
2199 pg_data_t *pgdat = (pg_data_t *)arg;
2200
2201 /* check memoryless node */
a47b53c5 2202 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2203 return 0;
2204
727c080f 2205 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2206
2207 return 0;
2208}
2209
01a99560 2210static const struct seq_operations unusable_sops = {
d7a5752c
MG
2211 .start = frag_start,
2212 .next = frag_next,
2213 .stop = frag_stop,
2214 .show = unusable_show,
2215};
2216
01a99560 2217DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2218
f1a5ab12
MG
2219static void extfrag_show_print(struct seq_file *m,
2220 pg_data_t *pgdat, struct zone *zone)
2221{
2222 unsigned int order;
2223 int index;
2224
2225 /* Alloc on stack as interrupts are disabled for zone walk */
2226 struct contig_page_info info;
2227
2228 seq_printf(m, "Node %d, zone %8s ",
2229 pgdat->node_id,
2230 zone->name);
23baf831 2231 for (order = 0; order <= MAX_ORDER; ++order) {
f1a5ab12 2232 fill_contig_page_info(zone, order, &info);
56de7263 2233 index = __fragmentation_index(order, &info);
a9970586 2234 seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
f1a5ab12
MG
2235 }
2236
2237 seq_putc(m, '\n');
2238}
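/*
 * As with unusable_free_index() above, the value printed per order is the
 * index from __fragmentation_index() scaled by 1000: 0.000 suggests an
 * allocation of that order would fail purely for lack of memory, values
 * approaching 1.000 point at external fragmentation, and negative output
 * (typically -1.000) marks orders at which a request would currently
 * succeed.
 */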
2239
2240/*
 2241 * Display the fragmentation index for each order at which an allocation would fail
2242 */
2243static int extfrag_show(struct seq_file *m, void *arg)
2244{
2245 pg_data_t *pgdat = (pg_data_t *)arg;
2246
727c080f 2247 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2248
2249 return 0;
2250}
2251
01a99560 2252static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2253 .start = frag_start,
2254 .next = frag_next,
2255 .stop = frag_stop,
2256 .show = extfrag_show,
2257};
2258
01a99560 2259DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2260
d7a5752c
MG
2261static int __init extfrag_debug_init(void)
2262{
bde8bd8a
S
2263 struct dentry *extfrag_debug_root;
2264
d7a5752c 2265 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2266
d9f7979c 2267 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2268 &unusable_fops);
d7a5752c 2269
d9f7979c 2270 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2271 &extfrag_fops);
f1a5ab12 2272
d7a5752c
MG
2273 return 0;
2274}
2275
2276module_init(extfrag_debug_init);
2277#endif