]> git.ipfire.org Git - thirdparty/kernel/linux.git/blame - block/blk-iocost.c
blk-iocost: Remove unnecessary advance declaration
[thirdparty/kernel/linux.git] / block / blk-iocost.c
CommitLineData
7caa4715
TH
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
9 * One challenge of controlling IO resources is the lack of trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
18 * time, with a lot of clutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide varienty of devices well enough. Default
5ba1add2 42 * parameters for several different classes of devices are provided and the
7caa4715
TH
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
48 *
49 * 2. Control Strategy
50 *
51 * The device virtual time (vtime) is used as the primary control metric.
52 * The control strategy is composed of the following three parts.
53 *
54 * 2-1. Vtime Distribution
55 *
56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
57 * calculated. Please consider the following hierarchy where the numbers
58 * inside parentheses denote the configured weights.
59 *
60 * root
61 * / \
62 * A (w:100) B (w:300)
63 * / \
64 * A0 (w:100) A1 (w:100)
65 *
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
68 * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
69 * 12.5% each. The distribution mechanism only cares about these flattened
70 * shares. They're called hweights (hierarchical weights) and always add
fe20cdb5 71 * upto 1 (WEIGHT_ONE).
7caa4715
TH
72 *
73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75 * against the device vtime - an IO which takes 10ms on the underlying
76 * device is considered to take 80ms on A0.
77 *
78 * This constitutes the basis of IO capacity distribution. Each cgroup's
79 * vtime is running at a rate determined by its hweight. A cgroup tracks
5ba1add2 80 * the vtime consumed by past IOs and can issue a new IO if doing so
7caa4715
TH
81 * wouldn't outrun the current device vtime. Otherwise, the IO is
82 * suspended until the vtime has progressed enough to cover it.
83 *
84 * 2-2. Vrate Adjustment
85 *
86 * It's unrealistic to expect the cost model to be perfect. There are too
87 * many devices and even on the same device the overall performance
88 * fluctuates depending on numerous factors such as IO mixture and device
89 * internal garbage collection. The controller needs to adapt dynamically.
90 *
91 * This is achieved by adjusting the overall IO rate according to how busy
92 * the device is. If the device becomes overloaded, we're sending down too
93 * many IOs and should generally slow down. If there are waiting issuers
94 * but the device isn't saturated, we're issuing too few and should
95 * generally speed up.
96 *
97 * To slow down, we lower the vrate - the rate at which the device vtime
98 * passes compared to the wall clock. For example, if the vtime is running
99 * at the vrate of 75%, all cgroups added up would only be able to issue
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
101 *
102 * Device business is determined using two criteria - rq wait and
103 * completion latencies.
104 *
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
108 * indication that the device is saturated and we lower the vrate. This
109 * saturation signal is fairly conservative as it only triggers when both
110 * hardware and software queues are filled up, and is used as the default
111 * busy signal.
112 *
113 * As devices can have deep queues and be unfair in how the queued commands
114 * are executed, soley depending on rq wait may not result in satisfactory
115 * control quality. For a better control quality, completion latency QoS
116 * parameters can be configured so that the device is considered saturated
117 * if N'th percentile completion latency rises above the set point.
118 *
119 * The completion latency requirements are a function of both the
120 * underlying device characteristics and the desired IO latency quality of
121 * service. There is an inherent trade-off - the tighter the latency QoS,
122 * the higher the bandwidth lossage. Latency QoS is disabled by default
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
124 *
125 * 2-3. Work Conservation
126 *
127 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
128 * periodically while B is sending out enough parallel IOs to saturate the
129 * device on its own. Let's say A's usage amounts to 100ms worth of IO
130 * cost per second, i.e., 10% of the device capacity. The naive
131 * distribution of half and half would lead to 60% utilization of the
132 * device, a significant reduction in the total amount of work done
133 * compared to free-for-all competition. This is too high a cost to pay
134 * for IO control.
135 *
136 * To conserve the total amount of work done, we keep track of how much
137 * each active cgroup is actually using and yield part of its weight if
138 * there are other cgroups which can make use of it. In the above case,
139 * A's weight will be lowered so that it hovers above the actual usage and
140 * B would be able to use the rest.
141 *
142 * As we don't want to penalize a cgroup for donating its weight, the
143 * surplus weight adjustment factors in a margin and has an immediate
144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
145 *
146 * Note that adjusting down surplus weights has the same effects as
147 * accelerating vtime for other cgroups and work conservation can also be
148 * implemented by adjusting vrate dynamically. However, squaring who can
149 * donate and should take back how much requires hweight propagations
150 * anyway making it easier to implement and understand as a separate
151 * mechanism.
6954ff18
TH
152 *
153 * 3. Monitoring
154 *
155 * Instead of debugfs or other clumsy monitoring mechanisms, this
156 * controller uses a drgn based monitoring script -
157 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
5ba1add2 158 * https://github.com/osandov/drgn. The output looks like the following.
6954ff18
TH
159 *
160 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
7c1ee704
TH
161 * active weight hweight% inflt% dbt delay usages%
162 * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
163 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
6954ff18
TH
164 *
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
7caa4715
TH
173 */
174
175#include <linux/kernel.h>
176#include <linux/module.h>
177#include <linux/timer.h>
178#include <linux/time64.h>
179#include <linux/parser.h>
180#include <linux/sched/signal.h>
181#include <linux/blk-cgroup.h>
5e124f74
TH
182#include <asm/local.h>
183#include <asm/local64.h>
7caa4715
TH
184#include "blk-rq-qos.h"
185#include "blk-stat.h"
186#include "blk-wbt.h"
187
188#ifdef CONFIG_TRACEPOINTS
189
190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191#define TRACE_IOCG_PATH_LEN 1024
192static DEFINE_SPINLOCK(trace_iocg_path_lock);
193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194
195#define TRACE_IOCG_PATH(type, iocg, ...) \
196 do { \
197 unsigned long flags; \
198 if (trace_iocost_##type##_enabled()) { \
199 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
201 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
202 trace_iocost_##type(iocg, trace_iocg_path, \
203 ##__VA_ARGS__); \
204 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
205 } \
206 } while (0)
207
208#else /* CONFIG_TRACE_POINTS */
209#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
210#endif /* CONFIG_TRACE_POINTS */
211
212enum {
213 MILLION = 1000000,
214
215 /* timer period is calculated from latency requirements, bound it */
216 MIN_PERIOD = USEC_PER_MSEC,
217 MAX_PERIOD = USEC_PER_SEC,
218
219 /*
f1de2439 220 * iocg->vtime is targeted at 50% behind the device vtime, which
7caa4715
TH
221 * serves as its IO credit buffer. Surplus weight adjustment is
222 * immediately canceled if the vtime margin runs below 10%.
223 */
7ca5b2e6 224 MARGIN_MIN_PCT = 10,
f1de2439
TH
225 MARGIN_LOW_PCT = 20,
226 MARGIN_TARGET_PCT = 50,
7caa4715 227
b0853ab4
TH
228 INUSE_ADJ_STEP_PCT = 25,
229
7ca5b2e6
TH
230 /* Have some play in timer operations */
231 TIMER_SLACK_PCT = 1,
7caa4715 232
7caa4715 233 /* 1/64k is granular enough and can easily be handled w/ u32 */
fe20cdb5 234 WEIGHT_ONE = 1 << 16,
7caa4715
TH
235
236 /*
237 * As vtime is used to calculate the cost of each IO, it needs to
238 * be fairly high precision. For example, it should be able to
239 * represent the cost of a single page worth of discard with
240 * suffificient accuracy. At the same time, it should be able to
241 * represent reasonably long enough durations to be useful and
242 * convenient during operation.
243 *
244 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
245 * granularity and days of wrap-around time even at extreme vrates.
246 */
247 VTIME_PER_SEC_SHIFT = 37,
248 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
249 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
cd006509 250 VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
7caa4715
TH
251
252 /* bound vrate adjustments within two orders of magnitude */
253 VRATE_MIN_PPM = 10000, /* 1% */
254 VRATE_MAX_PPM = 100000000, /* 10000% */
255
256 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
257 VRATE_CLAMP_ADJ_PCT = 4,
258
259 /* if IOs end up waiting for requests, issue less */
260 RQ_WAIT_BUSY_PCT = 5,
261
262 /* unbusy hysterisis */
263 UNBUSY_THR_PCT = 75,
264
5160a5a5
TH
265 /*
266 * The effect of delay is indirect and non-linear and a huge amount of
267 * future debt can accumulate abruptly while unthrottled. Linearly scale
268 * up delay as debt is going up and then let it decay exponentially.
269 * This gives us quick ramp ups while delay is accumulating and long
270 * tails which can help reducing the frequency of debt explosions on
271 * unthrottle. The parameters are experimentally determined.
272 *
273 * The delay mechanism provides adequate protection and behavior in many
274 * cases. However, this is far from ideal and falls shorts on both
275 * fronts. The debtors are often throttled too harshly costing a
276 * significant level of fairness and possibly total work while the
277 * protection against their impacts on the system can be choppy and
278 * unreliable.
279 *
280 * The shortcoming primarily stems from the fact that, unlike for page
281 * cache, the kernel doesn't have well-defined back-pressure propagation
282 * mechanism and policies for anonymous memory. Fully addressing this
283 * issue will likely require substantial improvements in the area.
284 */
285 MIN_DELAY_THR_PCT = 500,
286 MAX_DELAY_THR_PCT = 25000,
287 MIN_DELAY = 250,
288 MAX_DELAY = 250 * USEC_PER_MSEC,
289
c7af2a00
TH
290 /* halve debts if avg usage over 100ms is under 50% */
291 DFGV_USAGE_PCT = 50,
292 DFGV_PERIOD = 100 * USEC_PER_MSEC,
dda1315f 293
7caa4715
TH
294 /* don't let cmds which take a very long time pin lagging for too long */
295 MAX_LAGGING_PERIODS = 10,
296
7caa4715
TH
297 /* switch iff the conditions are met for longer than this */
298 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
299
300 /*
301 * Count IO size in 4k pages. The 12bit shift helps keeping
302 * size-proportional components of cost calculation in closer
303 * numbers of digits to per-IO cost components.
304 */
305 IOC_PAGE_SHIFT = 12,
306 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
307 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
308
309 /* if apart further than 16M, consider randio for linear model */
310 LCOEF_RANDIO_PAGES = 4096,
311};
312
313enum ioc_running {
314 IOC_IDLE,
315 IOC_RUNNING,
316 IOC_STOP,
317};
318
319/* io.cost.qos controls including per-dev enable of the whole controller */
320enum {
321 QOS_ENABLE,
322 QOS_CTRL,
323 NR_QOS_CTRL_PARAMS,
324};
325
326/* io.cost.qos params */
327enum {
328 QOS_RPPM,
329 QOS_RLAT,
330 QOS_WPPM,
331 QOS_WLAT,
332 QOS_MIN,
333 QOS_MAX,
334 NR_QOS_PARAMS,
335};
336
337/* io.cost.model controls */
338enum {
339 COST_CTRL,
340 COST_MODEL,
341 NR_COST_CTRL_PARAMS,
342};
343
344/* builtin linear cost model coefficients */
345enum {
346 I_LCOEF_RBPS,
347 I_LCOEF_RSEQIOPS,
348 I_LCOEF_RRANDIOPS,
349 I_LCOEF_WBPS,
350 I_LCOEF_WSEQIOPS,
351 I_LCOEF_WRANDIOPS,
352 NR_I_LCOEFS,
353};
354
355enum {
356 LCOEF_RPAGE,
357 LCOEF_RSEQIO,
358 LCOEF_RRANDIO,
359 LCOEF_WPAGE,
360 LCOEF_WSEQIO,
361 LCOEF_WRANDIO,
362 NR_LCOEFS,
363};
364
365enum {
366 AUTOP_INVALID,
367 AUTOP_HDD,
368 AUTOP_SSD_QD1,
369 AUTOP_SSD_DFL,
370 AUTOP_SSD_FAST,
371};
372
7caa4715
TH
373struct ioc_params {
374 u32 qos[NR_QOS_PARAMS];
375 u64 i_lcoefs[NR_I_LCOEFS];
376 u64 lcoefs[NR_LCOEFS];
377 u32 too_fast_vrate_pct;
378 u32 too_slow_vrate_pct;
379};
380
7ca5b2e6
TH
381struct ioc_margins {
382 s64 min;
f1de2439
TH
383 s64 low;
384 s64 target;
7ca5b2e6
TH
385};
386
7caa4715 387struct ioc_missed {
5e124f74
TH
388 local_t nr_met;
389 local_t nr_missed;
7caa4715
TH
390 u32 last_met;
391 u32 last_missed;
392};
393
394struct ioc_pcpu_stat {
395 struct ioc_missed missed[2];
396
5e124f74 397 local64_t rq_wait_ns;
7caa4715
TH
398 u64 last_rq_wait_ns;
399};
400
401/* per device */
402struct ioc {
403 struct rq_qos rqos;
404
405 bool enabled;
406
407 struct ioc_params params;
7ca5b2e6 408 struct ioc_margins margins;
7caa4715 409 u32 period_us;
7ca5b2e6 410 u32 timer_slack_ns;
7caa4715
TH
411 u64 vrate_min;
412 u64 vrate_max;
413
414 spinlock_t lock;
415 struct timer_list timer;
416 struct list_head active_iocgs; /* active cgroups */
417 struct ioc_pcpu_stat __percpu *pcpu_stat;
418
419 enum ioc_running running;
420 atomic64_t vtime_rate;
ac33e91e
TH
421 u64 vtime_base_rate;
422 s64 vtime_err;
7caa4715 423
67b7b641 424 seqcount_spinlock_t period_seqcount;
ce95570a 425 u64 period_at; /* wallclock starttime */
7caa4715
TH
426 u64 period_at_vtime; /* vtime starttime */
427
428 atomic64_t cur_period; /* inc'd each period */
429 int busy_level; /* saturation history */
430
7caa4715
TH
431 bool weights_updated;
432 atomic_t hweight_gen; /* for lazy hweights */
433
c7af2a00
TH
434 /* debt forgivness */
435 u64 dfgv_period_at;
436 u64 dfgv_period_rem;
437 u64 dfgv_usage_us_sum;
dda1315f 438
7caa4715
TH
439 u64 autop_too_fast_at;
440 u64 autop_too_slow_at;
441 int autop_idx;
442 bool user_qos_params:1;
443 bool user_cost_model:1;
444};
445
97eb1975
TH
446struct iocg_pcpu_stat {
447 local64_t abs_vusage;
448};
449
450struct iocg_stat {
451 u64 usage_us;
f0bf84a5
TH
452 u64 wait_us;
453 u64 indebt_us;
454 u64 indelay_us;
97eb1975
TH
455};
456
7caa4715
TH
457/* per device-cgroup pair */
458struct ioc_gq {
459 struct blkg_policy_data pd;
460 struct ioc *ioc;
461
462 /*
463 * A iocg can get its weight from two sources - an explicit
464 * per-device-cgroup configuration or the default weight of the
465 * cgroup. `cfg_weight` is the explicit per-device-cgroup
466 * configuration. `weight` is the effective considering both
467 * sources.
468 *
469 * When an idle cgroup becomes active its `active` goes from 0 to
470 * `weight`. `inuse` is the surplus adjusted active weight.
471 * `active` and `inuse` are used to calculate `hweight_active` and
472 * `hweight_inuse`.
473 *
474 * `last_inuse` remembers `inuse` while an iocg is idle to persist
475 * surplus adjustments.
b0853ab4
TH
476 *
477 * `inuse` may be adjusted dynamically during period. `saved_*` are used
478 * to determine and track adjustments.
7caa4715
TH
479 */
480 u32 cfg_weight;
481 u32 weight;
482 u32 active;
483 u32 inuse;
b0853ab4 484
7caa4715 485 u32 last_inuse;
b0853ab4 486 s64 saved_margin;
7caa4715
TH
487
488 sector_t cursor; /* to detect randio */
489
490 /*
491 * `vtime` is this iocg's vtime cursor which progresses as IOs are
492 * issued. If lagging behind device vtime, the delta represents
5ba1add2 493 * the currently available IO budget. If running ahead, the
7caa4715
TH
494 * overage.
495 *
496 * `vtime_done` is the same but progressed on completion rather
497 * than issue. The delta behind `vtime` represents the cost of
498 * currently in-flight IOs.
7caa4715
TH
499 */
500 atomic64_t vtime;
501 atomic64_t done_vtime;
0b80f986 502 u64 abs_vdebt;
7caa4715 503
5160a5a5
TH
504 /* current delay in effect and when it started */
505 u64 delay;
506 u64 delay_at;
507
7caa4715
TH
508 /*
509 * The period this iocg was last active in. Used for deactivation
510 * and invalidating `vtime`.
511 */
512 atomic64_t active_period;
513 struct list_head active_list;
514
00410f1b 515 /* see __propagate_weights() and current_hweight() for details */
7caa4715
TH
516 u64 child_active_sum;
517 u64 child_inuse_sum;
e08d02aa 518 u64 child_adjusted_sum;
7caa4715
TH
519 int hweight_gen;
520 u32 hweight_active;
521 u32 hweight_inuse;
e08d02aa 522 u32 hweight_donating;
93f7d2db 523 u32 hweight_after_donation;
7caa4715 524
97eb1975 525 struct list_head walk_list;
8692d2db 526 struct list_head surplus_list;
97eb1975 527
7caa4715
TH
528 struct wait_queue_head waitq;
529 struct hrtimer waitq_timer;
7caa4715 530
1aa50d02
TH
531 /* timestamp at the latest activation */
532 u64 activated_at;
533
97eb1975
TH
534 /* statistics */
535 struct iocg_pcpu_stat __percpu *pcpu_stat;
536 struct iocg_stat local_stat;
537 struct iocg_stat desc_stat;
538 struct iocg_stat last_stat;
539 u64 last_stat_abs_vusage;
f1de2439 540 u64 usage_delta_us;
f0bf84a5
TH
541 u64 wait_since;
542 u64 indebt_since;
543 u64 indelay_since;
7caa4715
TH
544
545 /* this iocg's depth in the hierarchy and ancestors including self */
546 int level;
547 struct ioc_gq *ancestors[];
548};
549
550/* per cgroup */
551struct ioc_cgrp {
552 struct blkcg_policy_data cpd;
553 unsigned int dfl_weight;
554};
555
556struct ioc_now {
557 u64 now_ns;
ce95570a 558 u64 now;
7caa4715
TH
559 u64 vnow;
560 u64 vrate;
561};
562
563struct iocg_wait {
564 struct wait_queue_entry wait;
565 struct bio *bio;
566 u64 abs_cost;
567 bool committed;
568};
569
570struct iocg_wake_ctx {
571 struct ioc_gq *iocg;
572 u32 hw_inuse;
573 s64 vbudget;
574};
575
576static const struct ioc_params autop[] = {
577 [AUTOP_HDD] = {
578 .qos = {
7afcccaf
TH
579 [QOS_RLAT] = 250000, /* 250ms */
580 [QOS_WLAT] = 250000,
7caa4715
TH
581 [QOS_MIN] = VRATE_MIN_PPM,
582 [QOS_MAX] = VRATE_MAX_PPM,
583 },
584 .i_lcoefs = {
585 [I_LCOEF_RBPS] = 174019176,
586 [I_LCOEF_RSEQIOPS] = 41708,
587 [I_LCOEF_RRANDIOPS] = 370,
588 [I_LCOEF_WBPS] = 178075866,
589 [I_LCOEF_WSEQIOPS] = 42705,
590 [I_LCOEF_WRANDIOPS] = 378,
591 },
592 },
593 [AUTOP_SSD_QD1] = {
594 .qos = {
595 [QOS_RLAT] = 25000, /* 25ms */
596 [QOS_WLAT] = 25000,
597 [QOS_MIN] = VRATE_MIN_PPM,
598 [QOS_MAX] = VRATE_MAX_PPM,
599 },
600 .i_lcoefs = {
601 [I_LCOEF_RBPS] = 245855193,
602 [I_LCOEF_RSEQIOPS] = 61575,
603 [I_LCOEF_RRANDIOPS] = 6946,
604 [I_LCOEF_WBPS] = 141365009,
605 [I_LCOEF_WSEQIOPS] = 33716,
606 [I_LCOEF_WRANDIOPS] = 26796,
607 },
608 },
609 [AUTOP_SSD_DFL] = {
610 .qos = {
611 [QOS_RLAT] = 25000, /* 25ms */
612 [QOS_WLAT] = 25000,
613 [QOS_MIN] = VRATE_MIN_PPM,
614 [QOS_MAX] = VRATE_MAX_PPM,
615 },
616 .i_lcoefs = {
617 [I_LCOEF_RBPS] = 488636629,
618 [I_LCOEF_RSEQIOPS] = 8932,
619 [I_LCOEF_RRANDIOPS] = 8518,
620 [I_LCOEF_WBPS] = 427891549,
621 [I_LCOEF_WSEQIOPS] = 28755,
622 [I_LCOEF_WRANDIOPS] = 21940,
623 },
624 .too_fast_vrate_pct = 500,
625 },
626 [AUTOP_SSD_FAST] = {
627 .qos = {
628 [QOS_RLAT] = 5000, /* 5ms */
629 [QOS_WLAT] = 5000,
630 [QOS_MIN] = VRATE_MIN_PPM,
631 [QOS_MAX] = VRATE_MAX_PPM,
632 },
633 .i_lcoefs = {
634 [I_LCOEF_RBPS] = 3102524156LLU,
635 [I_LCOEF_RSEQIOPS] = 724816,
636 [I_LCOEF_RRANDIOPS] = 778122,
637 [I_LCOEF_WBPS] = 1742780862LLU,
638 [I_LCOEF_WSEQIOPS] = 425702,
639 [I_LCOEF_WRANDIOPS] = 443193,
640 },
641 .too_slow_vrate_pct = 10,
642 },
643};
644
645/*
646 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
647 * vtime credit shortage and down on device saturation.
648 */
649static u32 vrate_adj_pct[] =
650 { 0, 0, 0, 0,
651 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
652 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
653 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
654
655static struct blkcg_policy blkcg_policy_iocost;
656
657/* accessors and helpers */
658static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
659{
660 return container_of(rqos, struct ioc, rqos);
661}
662
663static struct ioc *q_to_ioc(struct request_queue *q)
664{
665 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
666}
667
668static const char *q_name(struct request_queue *q)
669{
75e6c00f 670 if (blk_queue_registered(q))
7caa4715
TH
671 return kobject_name(q->kobj.parent);
672 else
673 return "<unknown>";
674}
675
676static const char __maybe_unused *ioc_name(struct ioc *ioc)
677{
678 return q_name(ioc->rqos.q);
679}
680
681static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
682{
683 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
684}
685
686static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
687{
688 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
689}
690
691static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
692{
693 return pd_to_blkg(&iocg->pd);
694}
695
696static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
697{
698 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
699 struct ioc_cgrp, cpd);
700}
701
702/*
703 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
36a52481 704 * weight, the more expensive each IO. Must round up.
7caa4715
TH
705 */
706static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
707{
fe20cdb5 708 return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
7caa4715
TH
709}
710
36a52481
TH
711/*
712 * The inverse of abs_cost_to_cost(). Must round up.
713 */
714static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
715{
fe20cdb5 716 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
36a52481
TH
717}
718
97eb1975
TH
719static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
720 u64 abs_cost, u64 cost)
7caa4715 721{
97eb1975
TH
722 struct iocg_pcpu_stat *gcs;
723
7caa4715
TH
724 bio->bi_iocost_cost = cost;
725 atomic64_add(cost, &iocg->vtime);
97eb1975
TH
726
727 gcs = get_cpu_ptr(iocg->pcpu_stat);
728 local64_add(abs_cost, &gcs->abs_vusage);
729 put_cpu_ptr(gcs);
7caa4715
TH
730}
731
da437b95
TH
732static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
733{
734 if (lock_ioc) {
735 spin_lock_irqsave(&iocg->ioc->lock, *flags);
736 spin_lock(&iocg->waitq.lock);
737 } else {
738 spin_lock_irqsave(&iocg->waitq.lock, *flags);
739 }
740}
741
742static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
743{
744 if (unlock_ioc) {
745 spin_unlock(&iocg->waitq.lock);
746 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
747 } else {
748 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
749 }
750}
751
7caa4715
TH
752#define CREATE_TRACE_POINTS
753#include <trace/events/iocost.h>
754
7ca5b2e6
TH
755static void ioc_refresh_margins(struct ioc *ioc)
756{
757 struct ioc_margins *margins = &ioc->margins;
758 u32 period_us = ioc->period_us;
ac33e91e 759 u64 vrate = ioc->vtime_base_rate;
7ca5b2e6
TH
760
761 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
f1de2439
TH
762 margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
763 margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
7ca5b2e6
TH
764}
765
7caa4715
TH
766/* latency Qos params changed, update period_us and all the dependent params */
767static void ioc_refresh_period_us(struct ioc *ioc)
768{
769 u32 ppm, lat, multi, period_us;
770
771 lockdep_assert_held(&ioc->lock);
772
773 /* pick the higher latency target */
774 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
775 ppm = ioc->params.qos[QOS_RPPM];
776 lat = ioc->params.qos[QOS_RLAT];
777 } else {
778 ppm = ioc->params.qos[QOS_WPPM];
779 lat = ioc->params.qos[QOS_WLAT];
780 }
781
782 /*
783 * We want the period to be long enough to contain a healthy number
784 * of IOs while short enough for granular control. Define it as a
785 * multiple of the latency target. Ideally, the multiplier should
786 * be scaled according to the percentile so that it would nominally
787 * contain a certain number of requests. Let's be simpler and
788 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
789 */
790 if (ppm)
791 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
792 else
793 multi = 2;
794 period_us = multi * lat;
795 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
796
797 /* calculate dependent params */
798 ioc->period_us = period_us;
7ca5b2e6
TH
799 ioc->timer_slack_ns = div64_u64(
800 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
801 100);
802 ioc_refresh_margins(ioc);
7caa4715
TH
803}
804
805static int ioc_autop_idx(struct ioc *ioc)
806{
807 int idx = ioc->autop_idx;
808 const struct ioc_params *p = &autop[idx];
809 u32 vrate_pct;
810 u64 now_ns;
811
812 /* rotational? */
813 if (!blk_queue_nonrot(ioc->rqos.q))
814 return AUTOP_HDD;
815
816 /* handle SATA SSDs w/ broken NCQ */
817 if (blk_queue_depth(ioc->rqos.q) == 1)
818 return AUTOP_SSD_QD1;
819
820 /* use one of the normal ssd sets */
821 if (idx < AUTOP_SSD_DFL)
822 return AUTOP_SSD_DFL;
823
824 /* if user is overriding anything, maintain what was there */
825 if (ioc->user_qos_params || ioc->user_cost_model)
826 return idx;
827
828 /* step up/down based on the vrate */
ac33e91e 829 vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
7caa4715
TH
830 now_ns = ktime_get_ns();
831
832 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
833 if (!ioc->autop_too_fast_at)
834 ioc->autop_too_fast_at = now_ns;
835 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
836 return idx + 1;
837 } else {
838 ioc->autop_too_fast_at = 0;
839 }
840
841 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
842 if (!ioc->autop_too_slow_at)
843 ioc->autop_too_slow_at = now_ns;
844 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
845 return idx - 1;
846 } else {
847 ioc->autop_too_slow_at = 0;
848 }
849
850 return idx;
851}
852
853/*
854 * Take the followings as input
855 *
856 * @bps maximum sequential throughput
857 * @seqiops maximum sequential 4k iops
858 * @randiops maximum random 4k iops
859 *
860 * and calculate the linear model cost coefficients.
861 *
862 * *@page per-page cost 1s / (@bps / 4096)
863 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
864 * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0)
865 */
866static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
867 u64 *page, u64 *seqio, u64 *randio)
868{
869 u64 v;
870
871 *page = *seqio = *randio = 0;
872
873 if (bps)
874 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
875 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
876
877 if (seqiops) {
878 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
879 if (v > *page)
880 *seqio = v - *page;
881 }
882
883 if (randiops) {
884 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
885 if (v > *page)
886 *randio = v - *page;
887 }
888}
889
890static void ioc_refresh_lcoefs(struct ioc *ioc)
891{
892 u64 *u = ioc->params.i_lcoefs;
893 u64 *c = ioc->params.lcoefs;
894
895 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
896 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
897 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
898 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
899}
900
901static bool ioc_refresh_params(struct ioc *ioc, bool force)
902{
903 const struct ioc_params *p;
904 int idx;
905
906 lockdep_assert_held(&ioc->lock);
907
908 idx = ioc_autop_idx(ioc);
909 p = &autop[idx];
910
911 if (idx == ioc->autop_idx && !force)
912 return false;
913
914 if (idx != ioc->autop_idx)
915 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
916
917 ioc->autop_idx = idx;
918 ioc->autop_too_fast_at = 0;
919 ioc->autop_too_slow_at = 0;
920
921 if (!ioc->user_qos_params)
922 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
923 if (!ioc->user_cost_model)
924 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
925
926 ioc_refresh_period_us(ioc);
927 ioc_refresh_lcoefs(ioc);
928
929 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
930 VTIME_PER_USEC, MILLION);
931 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
932 VTIME_PER_USEC, MILLION);
933
934 return true;
935}
936
ac33e91e
TH
937/*
938 * When an iocg accumulates too much vtime or gets deactivated, we throw away
939 * some vtime, which lowers the overall device utilization. As the exact amount
940 * which is being thrown away is known, we can compensate by accelerating the
941 * vrate accordingly so that the extra vtime generated in the current period
942 * matches what got lost.
943 */
944static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
945{
946 s64 pleft = ioc->period_at + ioc->period_us - now->now;
947 s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
948 s64 vcomp, vcomp_min, vcomp_max;
949
950 lockdep_assert_held(&ioc->lock);
951
952 /* we need some time left in this period */
953 if (pleft <= 0)
954 goto done;
955
956 /*
957 * Calculate how much vrate should be adjusted to offset the error.
958 * Limit the amount of adjustment and deduct the adjusted amount from
959 * the error.
960 */
961 vcomp = -div64_s64(ioc->vtime_err, pleft);
962 vcomp_min = -(ioc->vtime_base_rate >> 1);
963 vcomp_max = ioc->vtime_base_rate;
964 vcomp = clamp(vcomp, vcomp_min, vcomp_max);
965
966 ioc->vtime_err += vcomp * pleft;
967
968 atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
969done:
970 /* bound how much error can accumulate */
971 ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
972}
973
7caa4715
TH
974/* take a snapshot of the current [v]time and vrate */
975static void ioc_now(struct ioc *ioc, struct ioc_now *now)
976{
977 unsigned seq;
978
979 now->now_ns = ktime_get();
980 now->now = ktime_to_us(now->now_ns);
981 now->vrate = atomic64_read(&ioc->vtime_rate);
982
983 /*
984 * The current vtime is
985 *
986 * vtime at period start + (wallclock time since the start) * vrate
987 *
988 * As a consistent snapshot of `period_at_vtime` and `period_at` is
989 * needed, they're seqcount protected.
990 */
991 do {
992 seq = read_seqcount_begin(&ioc->period_seqcount);
993 now->vnow = ioc->period_at_vtime +
994 (now->now - ioc->period_at) * now->vrate;
995 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
996}
997
998static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
999{
7caa4715
TH
1000 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1001
1002 write_seqcount_begin(&ioc->period_seqcount);
1003 ioc->period_at = now->now;
1004 ioc->period_at_vtime = now->vnow;
1005 write_seqcount_end(&ioc->period_seqcount);
1006
1007 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1008 add_timer(&ioc->timer);
1009}
1010
1011/*
1012 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
b0853ab4
TH
1013 * weight sums and propagate upwards accordingly. If @save, the current margin
1014 * is saved to be used as reference for later inuse in-period adjustments.
7caa4715 1015 */
b0853ab4
TH
1016static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1017 bool save, struct ioc_now *now)
7caa4715
TH
1018{
1019 struct ioc *ioc = iocg->ioc;
1020 int lvl;
1021
1022 lockdep_assert_held(&ioc->lock);
1023
db84a72a
TH
1024 inuse = clamp_t(u32, inuse, 1, active);
1025
b0853ab4
TH
1026 iocg->last_inuse = iocg->inuse;
1027 if (save)
1028 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1029
db84a72a
TH
1030 if (active == iocg->active && inuse == iocg->inuse)
1031 return;
7caa4715
TH
1032
1033 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1034 struct ioc_gq *parent = iocg->ancestors[lvl];
1035 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1036 u32 parent_active = 0, parent_inuse = 0;
1037
1038 /* update the level sums */
1039 parent->child_active_sum += (s32)(active - child->active);
1040 parent->child_inuse_sum += (s32)(inuse - child->inuse);
1041 /* apply the udpates */
1042 child->active = active;
1043 child->inuse = inuse;
1044
1045 /*
1046 * The delta between inuse and active sums indicates that
5ba1add2 1047 * much of weight is being given away. Parent's inuse
7caa4715
TH
1048 * and active should reflect the ratio.
1049 */
1050 if (parent->child_active_sum) {
1051 parent_active = parent->weight;
1052 parent_inuse = DIV64_U64_ROUND_UP(
1053 parent_active * parent->child_inuse_sum,
1054 parent->child_active_sum);
1055 }
1056
1057 /* do we need to keep walking up? */
1058 if (parent_active == parent->active &&
1059 parent_inuse == parent->inuse)
1060 break;
1061
1062 active = parent_active;
1063 inuse = parent_inuse;
1064 }
1065
1066 ioc->weights_updated = true;
1067}
1068
00410f1b 1069static void commit_weights(struct ioc *ioc)
7caa4715
TH
1070{
1071 lockdep_assert_held(&ioc->lock);
1072
1073 if (ioc->weights_updated) {
1074 /* paired with rmb in current_hweight(), see there */
1075 smp_wmb();
1076 atomic_inc(&ioc->hweight_gen);
1077 ioc->weights_updated = false;
1078 }
1079}
1080
b0853ab4
TH
1081static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1082 bool save, struct ioc_now *now)
7caa4715 1083{
b0853ab4 1084 __propagate_weights(iocg, active, inuse, save, now);
00410f1b 1085 commit_weights(iocg->ioc);
7caa4715
TH
1086}
1087
1088static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1089{
1090 struct ioc *ioc = iocg->ioc;
1091 int lvl;
1092 u32 hwa, hwi;
1093 int ioc_gen;
1094
1095 /* hot path - if uptodate, use cached */
1096 ioc_gen = atomic_read(&ioc->hweight_gen);
1097 if (ioc_gen == iocg->hweight_gen)
1098 goto out;
1099
1100 /*
00410f1b
TH
1101 * Paired with wmb in commit_weights(). If we saw the updated
1102 * hweight_gen, all the weight updates from __propagate_weights() are
1103 * visible too.
7caa4715
TH
1104 *
1105 * We can race with weight updates during calculation and get it
1106 * wrong. However, hweight_gen would have changed and a future
1107 * reader will recalculate and we're guaranteed to discard the
1108 * wrong result soon.
1109 */
1110 smp_rmb();
1111
fe20cdb5 1112 hwa = hwi = WEIGHT_ONE;
7caa4715
TH
1113 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1114 struct ioc_gq *parent = iocg->ancestors[lvl];
1115 struct ioc_gq *child = iocg->ancestors[lvl + 1];
bd0adb91
TH
1116 u64 active_sum = READ_ONCE(parent->child_active_sum);
1117 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
7caa4715
TH
1118 u32 active = READ_ONCE(child->active);
1119 u32 inuse = READ_ONCE(child->inuse);
1120
1121 /* we can race with deactivations and either may read as zero */
1122 if (!active_sum || !inuse_sum)
1123 continue;
1124
bd0adb91
TH
1125 active_sum = max_t(u64, active, active_sum);
1126 hwa = div64_u64((u64)hwa * active, active_sum);
7caa4715 1127
bd0adb91
TH
1128 inuse_sum = max_t(u64, inuse, inuse_sum);
1129 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
7caa4715
TH
1130 }
1131
1132 iocg->hweight_active = max_t(u32, hwa, 1);
1133 iocg->hweight_inuse = max_t(u32, hwi, 1);
1134 iocg->hweight_gen = ioc_gen;
1135out:
1136 if (hw_activep)
1137 *hw_activep = iocg->hweight_active;
1138 if (hw_inusep)
1139 *hw_inusep = iocg->hweight_inuse;
1140}
1141
93f7d2db
TH
1142/*
1143 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1144 * other weights stay unchanged.
1145 */
1146static u32 current_hweight_max(struct ioc_gq *iocg)
1147{
1148 u32 hwm = WEIGHT_ONE;
1149 u32 inuse = iocg->active;
1150 u64 child_inuse_sum;
1151 int lvl;
1152
1153 lockdep_assert_held(&iocg->ioc->lock);
1154
1155 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1156 struct ioc_gq *parent = iocg->ancestors[lvl];
1157 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1158
1159 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1160 hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1161 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1162 parent->child_active_sum);
1163 }
1164
1165 return max_t(u32, hwm, 1);
1166}
1167
b0853ab4 1168static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
7caa4715
TH
1169{
1170 struct ioc *ioc = iocg->ioc;
1171 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1172 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1173 u32 weight;
1174
1175 lockdep_assert_held(&ioc->lock);
1176
1177 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1178 if (weight != iocg->weight && iocg->active)
b0853ab4 1179 propagate_weights(iocg, weight, iocg->inuse, true, now);
7caa4715
TH
1180 iocg->weight = weight;
1181}
1182
1183static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1184{
1185 struct ioc *ioc = iocg->ioc;
ac33e91e
TH
1186 u64 last_period, cur_period;
1187 u64 vtime, vtarget;
7caa4715
TH
1188 int i;
1189
1190 /*
1191 * If seem to be already active, just update the stamp to tell the
1192 * timer that we're still active. We don't mind occassional races.
1193 */
1194 if (!list_empty(&iocg->active_list)) {
1195 ioc_now(ioc, now);
1196 cur_period = atomic64_read(&ioc->cur_period);
1197 if (atomic64_read(&iocg->active_period) != cur_period)
1198 atomic64_set(&iocg->active_period, cur_period);
1199 return true;
1200 }
1201
1202 /* racy check on internal node IOs, treat as root level IOs */
1203 if (iocg->child_active_sum)
1204 return false;
1205
1206 spin_lock_irq(&ioc->lock);
1207
1208 ioc_now(ioc, now);
1209
1210 /* update period */
1211 cur_period = atomic64_read(&ioc->cur_period);
1212 last_period = atomic64_read(&iocg->active_period);
1213 atomic64_set(&iocg->active_period, cur_period);
1214
1215 /* already activated or breaking leaf-only constraint? */
8b37bc27
JX
1216 if (!list_empty(&iocg->active_list))
1217 goto succeed_unlock;
1218 for (i = iocg->level - 1; i > 0; i--)
1219 if (!list_empty(&iocg->ancestors[i]->active_list))
7caa4715 1220 goto fail_unlock;
8b37bc27 1221
7caa4715
TH
1222 if (iocg->child_active_sum)
1223 goto fail_unlock;
1224
1225 /*
ac33e91e
TH
1226 * Always start with the target budget. On deactivation, we throw away
1227 * anything above it.
7caa4715 1228 */
ac33e91e 1229 vtarget = now->vnow - ioc->margins.target;
7caa4715 1230 vtime = atomic64_read(&iocg->vtime);
7caa4715 1231
ac33e91e
TH
1232 atomic64_add(vtarget - vtime, &iocg->vtime);
1233 atomic64_add(vtarget - vtime, &iocg->done_vtime);
1234 vtime = vtarget;
7caa4715
TH
1235
1236 /*
1237 * Activate, propagate weight and start period timer if not
1238 * running. Reset hweight_gen to avoid accidental match from
1239 * wrapping.
1240 */
1241 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1242 list_add(&iocg->active_list, &ioc->active_iocgs);
b0853ab4 1243
00410f1b 1244 propagate_weights(iocg, iocg->weight,
b0853ab4 1245 iocg->last_inuse ?: iocg->weight, true, now);
7caa4715
TH
1246
1247 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1248 last_period, cur_period, vtime);
1249
1aa50d02 1250 iocg->activated_at = now->now;
7caa4715
TH
1251
1252 if (ioc->running == IOC_IDLE) {
1253 ioc->running = IOC_RUNNING;
c7af2a00
TH
1254 ioc->dfgv_period_at = now->now;
1255 ioc->dfgv_period_rem = 0;
7caa4715
TH
1256 ioc_start_period(ioc, now);
1257 }
1258
8b37bc27 1259succeed_unlock:
7caa4715
TH
1260 spin_unlock_irq(&ioc->lock);
1261 return true;
1262
1263fail_unlock:
1264 spin_unlock_irq(&ioc->lock);
1265 return false;
1266}
1267
6ef20f78
TH
1268static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1269{
1270 struct ioc *ioc = iocg->ioc;
1271 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
5160a5a5
TH
1272 u64 tdelta, delay, new_delay;
1273 s64 vover, vover_pct;
c421a3eb 1274 u32 hwa;
6ef20f78
TH
1275
1276 lockdep_assert_held(&iocg->waitq.lock);
1277
5160a5a5
TH
1278 /* calculate the current delay in effect - 1/2 every second */
1279 tdelta = now->now - iocg->delay_at;
1280 if (iocg->delay)
1281 delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1282 else
1283 delay = 0;
1284
1285 /* calculate the new delay from the debt amount */
c421a3eb 1286 current_hweight(iocg, &hwa, NULL);
5160a5a5
TH
1287 vover = atomic64_read(&iocg->vtime) +
1288 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
ac33e91e
TH
1289 vover_pct = div64_s64(100 * vover,
1290 ioc->period_us * ioc->vtime_base_rate);
5160a5a5
TH
1291
1292 if (vover_pct <= MIN_DELAY_THR_PCT)
1293 new_delay = 0;
1294 else if (vover_pct >= MAX_DELAY_THR_PCT)
1295 new_delay = MAX_DELAY;
1296 else
1297 new_delay = MIN_DELAY +
1298 div_u64((MAX_DELAY - MIN_DELAY) *
1299 (vover_pct - MIN_DELAY_THR_PCT),
1300 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1301
1302 /* pick the higher one and apply */
1303 if (new_delay > delay) {
1304 iocg->delay = new_delay;
1305 iocg->delay_at = now->now;
1306 delay = new_delay;
1307 }
6ef20f78 1308
5160a5a5 1309 if (delay >= MIN_DELAY) {
f0bf84a5
TH
1310 if (!iocg->indelay_since)
1311 iocg->indelay_since = now->now;
5160a5a5
TH
1312 blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1313 return true;
1314 } else {
f0bf84a5
TH
1315 if (iocg->indelay_since) {
1316 iocg->local_stat.indelay_us += now->now - iocg->indelay_since;
1317 iocg->indelay_since = 0;
1318 }
5160a5a5 1319 iocg->delay = 0;
6ef20f78
TH
1320 blkcg_clear_delay(blkg);
1321 return false;
1322 }
6ef20f78
TH
1323}
1324
c421a3eb
TH
1325static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1326 struct ioc_now *now)
1327{
1328 struct iocg_pcpu_stat *gcs;
1329
1330 lockdep_assert_held(&iocg->ioc->lock);
1331 lockdep_assert_held(&iocg->waitq.lock);
1332 WARN_ON_ONCE(list_empty(&iocg->active_list));
1333
1334 /*
1335 * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1336 * inuse donating all of it share to others until its debt is paid off.
1337 */
f0bf84a5
TH
1338 if (!iocg->abs_vdebt && abs_cost) {
1339 iocg->indebt_since = now->now;
c421a3eb 1340 propagate_weights(iocg, iocg->active, 0, false, now);
f0bf84a5 1341 }
c421a3eb
TH
1342
1343 iocg->abs_vdebt += abs_cost;
1344
1345 gcs = get_cpu_ptr(iocg->pcpu_stat);
1346 local64_add(abs_cost, &gcs->abs_vusage);
1347 put_cpu_ptr(gcs);
1348}
1349
1350static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1351 struct ioc_now *now)
1352{
1353 lockdep_assert_held(&iocg->ioc->lock);
1354 lockdep_assert_held(&iocg->waitq.lock);
1355
1356 /* make sure that nobody messed with @iocg */
1357 WARN_ON_ONCE(list_empty(&iocg->active_list));
1358 WARN_ON_ONCE(iocg->inuse > 1);
1359
1360 iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1361
1362 /* if debt is paid in full, restore inuse */
f0bf84a5
TH
1363 if (!iocg->abs_vdebt) {
1364 iocg->local_stat.indebt_us += now->now - iocg->indebt_since;
1365 iocg->indebt_since = 0;
1366
c421a3eb
TH
1367 propagate_weights(iocg, iocg->active, iocg->last_inuse,
1368 false, now);
f0bf84a5 1369 }
c421a3eb
TH
1370}
1371
7caa4715
TH
1372static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1373 int flags, void *key)
1374{
1375 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1376 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1377 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1378
1379 ctx->vbudget -= cost;
1380
1381 if (ctx->vbudget < 0)
1382 return -1;
1383
97eb1975 1384 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
7caa4715
TH
1385
1386 /*
1387 * autoremove_wake_function() removes the wait entry only when it
1388 * actually changed the task state. We want the wait always
1389 * removed. Remove explicitly and use default_wake_function().
1390 */
1391 list_del_init(&wq_entry->entry);
1392 wait->committed = true;
1393
1394 default_wake_function(wq_entry, mode, flags, key);
1395 return 0;
1396}
1397
da437b95
TH
1398/*
1399 * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1400 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1401 * addition to iocg->waitq.lock.
1402 */
1403static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1404 struct ioc_now *now)
7caa4715
TH
1405{
1406 struct ioc *ioc = iocg->ioc;
1407 struct iocg_wake_ctx ctx = { .iocg = iocg };
da437b95 1408 u64 vshortage, expires, oexpires;
36a52481 1409 s64 vbudget;
c421a3eb 1410 u32 hwa;
7caa4715
TH
1411
1412 lockdep_assert_held(&iocg->waitq.lock);
1413
c421a3eb 1414 current_hweight(iocg, &hwa, NULL);
36a52481
TH
1415 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1416
1417 /* pay off debt */
da437b95 1418 if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
c421a3eb
TH
1419 u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1420 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1421 u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
36a52481 1422
da437b95
TH
1423 lockdep_assert_held(&ioc->lock);
1424
c421a3eb
TH
1425 atomic64_add(vpay, &iocg->vtime);
1426 atomic64_add(vpay, &iocg->done_vtime);
1427 iocg_pay_debt(iocg, abs_vpay, now);
1428 vbudget -= vpay;
5160a5a5 1429 }
7b84b49e 1430
5160a5a5 1431 if (iocg->abs_vdebt || iocg->delay)
7b84b49e 1432 iocg_kick_delay(iocg, now);
36a52481 1433
da437b95
TH
1434 /*
1435 * Debt can still be outstanding if we haven't paid all yet or the
1436 * caller raced and called without @pay_debt. Shouldn't wake up waiters
1437 * under debt. Make sure @vbudget reflects the outstanding amount and is
1438 * not positive.
1439 */
1440 if (iocg->abs_vdebt) {
c421a3eb 1441 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
da437b95
TH
1442 vbudget = min_t(s64, 0, vbudget - vdebt);
1443 }
1444
7caa4715 1445 /*
c421a3eb
TH
1446 * Wake up the ones which are due and see how much vtime we'll need for
1447 * the next one. As paying off debt restores hw_inuse, it must be read
1448 * after the above debt payment.
7caa4715 1449 */
da437b95 1450 ctx.vbudget = vbudget;
c421a3eb
TH
1451 current_hweight(iocg, NULL, &ctx.hw_inuse);
1452
7caa4715 1453 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
c421a3eb 1454
f0bf84a5
TH
1455 if (!waitqueue_active(&iocg->waitq)) {
1456 if (iocg->wait_since) {
1457 iocg->local_stat.wait_us += now->now - iocg->wait_since;
1458 iocg->wait_since = 0;
1459 }
7caa4715 1460 return;
f0bf84a5
TH
1461 }
1462
1463 if (!iocg->wait_since)
1464 iocg->wait_since = now->now;
1465
7caa4715
TH
1466 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1467 return;
1468
7ca5b2e6 1469 /* determine next wakeup, add a timer margin to guarantee chunking */
7caa4715
TH
1470 vshortage = -ctx.vbudget;
1471 expires = now->now_ns +
ac33e91e
TH
1472 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1473 NSEC_PER_USEC;
7ca5b2e6 1474 expires += ioc->timer_slack_ns;
7caa4715
TH
1475
1476 /* if already active and close enough, don't bother */
1477 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1478 if (hrtimer_is_queued(&iocg->waitq_timer) &&
7ca5b2e6 1479 abs(oexpires - expires) <= ioc->timer_slack_ns)
7caa4715
TH
1480 return;
1481
1482 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
7ca5b2e6 1483 ioc->timer_slack_ns, HRTIMER_MODE_ABS);
7caa4715
TH
1484}
1485
1486static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1487{
1488 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
da437b95 1489 bool pay_debt = READ_ONCE(iocg->abs_vdebt);
7caa4715
TH
1490 struct ioc_now now;
1491 unsigned long flags;
1492
1493 ioc_now(iocg->ioc, &now);
1494
da437b95
TH
1495 iocg_lock(iocg, pay_debt, &flags);
1496 iocg_kick_waitq(iocg, pay_debt, &now);
1497 iocg_unlock(iocg, pay_debt, &flags);
7caa4715
TH
1498
1499 return HRTIMER_NORESTART;
1500}
1501
7caa4715
TH
1502static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1503{
1504 u32 nr_met[2] = { };
1505 u32 nr_missed[2] = { };
1506 u64 rq_wait_ns = 0;
1507 int cpu, rw;
1508
1509 for_each_online_cpu(cpu) {
1510 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1511 u64 this_rq_wait_ns;
1512
1513 for (rw = READ; rw <= WRITE; rw++) {
5e124f74
TH
1514 u32 this_met = local_read(&stat->missed[rw].nr_met);
1515 u32 this_missed = local_read(&stat->missed[rw].nr_missed);
7caa4715
TH
1516
1517 nr_met[rw] += this_met - stat->missed[rw].last_met;
1518 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1519 stat->missed[rw].last_met = this_met;
1520 stat->missed[rw].last_missed = this_missed;
1521 }
1522
5e124f74 1523 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
7caa4715
TH
1524 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1525 stat->last_rq_wait_ns = this_rq_wait_ns;
1526 }
1527
1528 for (rw = READ; rw <= WRITE; rw++) {
1529 if (nr_met[rw] + nr_missed[rw])
1530 missed_ppm_ar[rw] =
1531 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1532 nr_met[rw] + nr_missed[rw]);
1533 else
1534 missed_ppm_ar[rw] = 0;
1535 }
1536
1537 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1538 ioc->period_us * NSEC_PER_USEC);
1539}
1540
1541/* was iocg idle this period? */
1542static bool iocg_is_idle(struct ioc_gq *iocg)
1543{
1544 struct ioc *ioc = iocg->ioc;
1545
1546 /* did something get issued this period? */
1547 if (atomic64_read(&iocg->active_period) ==
1548 atomic64_read(&ioc->cur_period))
1549 return false;
1550
1551 /* is something in flight? */
dcd6589b 1552 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
7caa4715
TH
1553 return false;
1554
1555 return true;
1556}
1557
97eb1975
TH
1558/*
1559 * Call this function on the target leaf @iocg's to build pre-order traversal
1560 * list of all the ancestors in @inner_walk. The inner nodes are linked through
1561 * ->walk_list and the caller is responsible for dissolving the list after use.
1562 */
1563static void iocg_build_inner_walk(struct ioc_gq *iocg,
1564 struct list_head *inner_walk)
1565{
1566 int lvl;
1567
1568 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1569
1570 /* find the first ancestor which hasn't been visited yet */
1571 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1572 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1573 break;
1574 }
1575
1576 /* walk down and visit the inner nodes to get pre-order traversal */
1577 while (++lvl <= iocg->level - 1) {
1578 struct ioc_gq *inner = iocg->ancestors[lvl];
1579
1580 /* record traversal order */
1581 list_add_tail(&inner->walk_list, inner_walk);
1582 }
1583}
1584
1585/* collect per-cpu counters and propagate the deltas to the parent */
1586static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1587{
ac33e91e 1588 struct ioc *ioc = iocg->ioc;
97eb1975
TH
1589 struct iocg_stat new_stat;
1590 u64 abs_vusage = 0;
1591 u64 vusage_delta;
1592 int cpu;
1593
1594 lockdep_assert_held(&iocg->ioc->lock);
1595
1596 /* collect per-cpu counters */
1597 for_each_possible_cpu(cpu) {
1598 abs_vusage += local64_read(
1599 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1600 }
1601 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1602 iocg->last_stat_abs_vusage = abs_vusage;
1603
ac33e91e 1604 iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1aa50d02 1605 iocg->local_stat.usage_us += iocg->usage_delta_us;
97eb1975 1606
f0bf84a5 1607 /* propagate upwards */
97eb1975
TH
1608 new_stat.usage_us =
1609 iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
f0bf84a5
TH
1610 new_stat.wait_us =
1611 iocg->local_stat.wait_us + iocg->desc_stat.wait_us;
1612 new_stat.indebt_us =
1613 iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us;
1614 new_stat.indelay_us =
1615 iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us;
97eb1975
TH
1616
1617 /* propagate the deltas to the parent */
1618 if (iocg->level > 0) {
1619 struct iocg_stat *parent_stat =
1620 &iocg->ancestors[iocg->level - 1]->desc_stat;
1621
1622 parent_stat->usage_us +=
1623 new_stat.usage_us - iocg->last_stat.usage_us;
f0bf84a5
TH
1624 parent_stat->wait_us +=
1625 new_stat.wait_us - iocg->last_stat.wait_us;
1626 parent_stat->indebt_us +=
1627 new_stat.indebt_us - iocg->last_stat.indebt_us;
1628 parent_stat->indelay_us +=
1629 new_stat.indelay_us - iocg->last_stat.indelay_us;
97eb1975
TH
1630 }
1631
1632 iocg->last_stat = new_stat;
1633}
1634
1635/* get stat counters ready for reading on all active iocgs */
1636static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1637{
1638 LIST_HEAD(inner_walk);
1639 struct ioc_gq *iocg, *tiocg;
1640
1641 /* flush leaves and build inner node walk list */
1642 list_for_each_entry(iocg, target_iocgs, active_list) {
1643 iocg_flush_stat_one(iocg, now);
1644 iocg_build_inner_walk(iocg, &inner_walk);
1645 }
1646
1647 /* keep flushing upwards by walking the inner list backwards */
1648 list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1649 iocg_flush_stat_one(iocg, now);
1650 list_del_init(&iocg->walk_list);
1651 }
1652}
1653
93f7d2db
TH
1654/*
1655 * Determine what @iocg's hweight_inuse should be after donating unused
1656 * capacity. @hwm is the upper bound and used to signal no donation. This
1657 * function also throws away @iocg's excess budget.
1658 */
ac33e91e
TH
1659static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1660 u32 usage, struct ioc_now *now)
7caa4715 1661{
93f7d2db
TH
1662 struct ioc *ioc = iocg->ioc;
1663 u64 vtime = atomic64_read(&iocg->vtime);
f1de2439 1664 s64 excess, delta, target, new_hwi;
93f7d2db 1665
c421a3eb
TH
1666 /* debt handling owns inuse for debtors */
1667 if (iocg->abs_vdebt)
1668 return 1;
1669
93f7d2db
TH
1670 /* see whether minimum margin requirement is met */
1671 if (waitqueue_active(&iocg->waitq) ||
1672 time_after64(vtime, now->vnow - ioc->margins.min))
1673 return hwm;
1674
ac33e91e
TH
1675 /* throw away excess above target */
1676 excess = now->vnow - vtime - ioc->margins.target;
93f7d2db
TH
1677 if (excess > 0) {
1678 atomic64_add(excess, &iocg->vtime);
1679 atomic64_add(excess, &iocg->done_vtime);
1680 vtime += excess;
ac33e91e 1681 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
93f7d2db
TH
1682 }
1683
f1de2439
TH
1684 /*
1685 * Let's say the distance between iocg's and device's vtimes as a
1686 * fraction of period duration is delta. Assuming that the iocg will
1687 * consume the usage determined above, we want to determine new_hwi so
1688 * that delta equals MARGIN_TARGET at the end of the next period.
1689 *
1690 * We need to execute usage worth of IOs while spending the sum of the
1691 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1692 * (delta):
1693 *
1694 * usage = (1 - MARGIN_TARGET + delta) * new_hwi
1695 *
1696 * Therefore, the new_hwi is:
1697 *
1698 * new_hwi = usage / (1 - MARGIN_TARGET + delta)
1699 */
1700 delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1701 now->vnow - ioc->period_at_vtime);
1702 target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1703 new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
7caa4715 1704
f1de2439 1705 return clamp_t(s64, new_hwi, 1, hwm);
7caa4715
TH
1706}
1707
e08d02aa
TH
1708/*
1709 * For work-conservation, an iocg which isn't using all of its share should
1710 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1711 * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1712 *
1713 * #1 is mathematically simpler but has the drawback of requiring synchronous
1714 * global hweight_inuse updates when idle iocg's get activated or inuse weights
1715 * change due to donation snapbacks as it has the possibility of grossly
1716 * overshooting what's allowed by the model and vrate.
1717 *
1718 * #2 is inherently safe with local operations. The donating iocg can easily
1719 * snap back to higher weights when needed without worrying about impacts on
1720 * other nodes as the impacts will be inherently correct. This also makes idle
1721 * iocg activations safe. The only effect activations have is decreasing
1722 * hweight_inuse of others, the right solution to which is for those iocgs to
1723 * snap back to higher weights.
1724 *
1725 * So, we go with #2. The challenge is calculating how each donating iocg's
1726 * inuse should be adjusted to achieve the target donation amounts. This is done
1727 * using Andy's method described in the following pdf.
1728 *
1729 * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1730 *
1731 * Given the weights and target after-donation hweight_inuse values, Andy's
1732 * method determines how the proportional distribution should look like at each
1733 * sibling level to maintain the relative relationship between all non-donating
1734 * pairs. To roughly summarize, it divides the tree into donating and
1735 * non-donating parts, calculates global donation rate which is used to
1736 * determine the target hweight_inuse for each node, and then derives per-level
1737 * proportions.
1738 *
1739 * The following pdf shows that global distribution calculated this way can be
1740 * achieved by scaling inuse weights of donating leaves and propagating the
1741 * adjustments upwards proportionally.
1742 *
1743 * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1744 *
1745 * Combining the above two, we can determine how each leaf iocg's inuse should
1746 * be adjusted to achieve the target donation.
1747 *
1748 * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1749 *
1750 * The inline comments use symbols from the last pdf.
1751 *
1752 * b is the sum of the absolute budgets in the subtree. 1 for the root node.
1753 * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1754 * t is the sum of the absolute budgets of donating nodes in the subtree.
1755 * w is the weight of the node. w = w_f + w_t
1756 * w_f is the non-donating portion of w. w_f = w * f / b
1757 * w_b is the donating portion of w. w_t = w * t / b
1758 * s is the sum of all sibling weights. s = Sum(w) for siblings
1759 * s_f and s_t are the non-donating and donating portions of s.
1760 *
1761 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1762 * w_pt is the donating portion of the parent's weight and w'_pt the same value
1763 * after adjustments. Subscript r denotes the root node's values.
1764 */
93f7d2db
TH
1765static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1766{
e08d02aa
TH
1767 LIST_HEAD(over_hwa);
1768 LIST_HEAD(inner_walk);
1769 struct ioc_gq *iocg, *tiocg, *root_iocg;
1770 u32 after_sum, over_sum, over_target, gamma;
93f7d2db 1771
e08d02aa
TH
1772 /*
1773 * It's pretty unlikely but possible for the total sum of
1774 * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1775 * confuse the following calculations. If such condition is detected,
1776 * scale down everyone over its full share equally to keep the sum below
1777 * WEIGHT_ONE.
1778 */
1779 after_sum = 0;
1780 over_sum = 0;
93f7d2db 1781 list_for_each_entry(iocg, surpluses, surplus_list) {
e08d02aa 1782 u32 hwa;
93f7d2db 1783
e08d02aa
TH
1784 current_hweight(iocg, &hwa, NULL);
1785 after_sum += iocg->hweight_after_donation;
93f7d2db 1786
e08d02aa
TH
1787 if (iocg->hweight_after_donation > hwa) {
1788 over_sum += iocg->hweight_after_donation;
1789 list_add(&iocg->walk_list, &over_hwa);
1790 }
93f7d2db 1791 }
e08d02aa
TH
1792
1793 if (after_sum >= WEIGHT_ONE) {
1794 /*
1795 * The delta should be deducted from the over_sum, calculate
1796 * target over_sum value.
1797 */
1798 u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1799 WARN_ON_ONCE(over_sum <= over_delta);
1800 over_target = over_sum - over_delta;
1801 } else {
1802 over_target = 0;
1803 }
1804
1805 list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1806 if (over_target)
1807 iocg->hweight_after_donation =
1808 div_u64((u64)iocg->hweight_after_donation *
1809 over_target, over_sum);
1810 list_del_init(&iocg->walk_list);
1811 }
1812
1813 /*
1814 * Build pre-order inner node walk list and prepare for donation
1815 * adjustment calculations.
1816 */
1817 list_for_each_entry(iocg, surpluses, surplus_list) {
1818 iocg_build_inner_walk(iocg, &inner_walk);
1819 }
1820
1821 root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1822 WARN_ON_ONCE(root_iocg->level > 0);
1823
1824 list_for_each_entry(iocg, &inner_walk, walk_list) {
1825 iocg->child_adjusted_sum = 0;
1826 iocg->hweight_donating = 0;
1827 iocg->hweight_after_donation = 0;
1828 }
1829
1830 /*
1831 * Propagate the donating budget (b_t) and after donation budget (b'_t)
1832 * up the hierarchy.
1833 */
1834 list_for_each_entry(iocg, surpluses, surplus_list) {
1835 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1836
1837 parent->hweight_donating += iocg->hweight_donating;
1838 parent->hweight_after_donation += iocg->hweight_after_donation;
1839 }
1840
1841 list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1842 if (iocg->level > 0) {
1843 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1844
1845 parent->hweight_donating += iocg->hweight_donating;
1846 parent->hweight_after_donation += iocg->hweight_after_donation;
1847 }
1848 }
1849
1850 /*
1851 * Calculate inner hwa's (b) and make sure the donation values are
1852 * within the accepted ranges as we're doing low res calculations with
1853 * roundups.
1854 */
1855 list_for_each_entry(iocg, &inner_walk, walk_list) {
1856 if (iocg->level) {
1857 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1858
1859 iocg->hweight_active = DIV64_U64_ROUND_UP(
1860 (u64)parent->hweight_active * iocg->active,
1861 parent->child_active_sum);
1862
1863 }
1864
1865 iocg->hweight_donating = min(iocg->hweight_donating,
1866 iocg->hweight_active);
1867 iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1868 iocg->hweight_donating - 1);
1869 if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1870 iocg->hweight_donating <= 1 ||
1871 iocg->hweight_after_donation == 0)) {
1872 pr_warn("iocg: invalid donation weights in ");
1873 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1874 pr_cont(": active=%u donating=%u after=%u\n",
1875 iocg->hweight_active, iocg->hweight_donating,
1876 iocg->hweight_after_donation);
1877 }
1878 }
1879
1880 /*
1881 * Calculate the global donation rate (gamma) - the rate to adjust
769b628d
TH
1882 * non-donating budgets by.
1883 *
1884 * No need to use 64bit multiplication here as the first operand is
1885 * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1886 *
1887 * We know that there are beneficiary nodes and the sum of the donating
1888 * hweights can't be whole; however, due to the round-ups during hweight
1889 * calculations, root_iocg->hweight_donating might still end up equal to
1890 * or greater than whole. Limit the range when calculating the divider.
e08d02aa
TH
1891 *
1892 * gamma = (1 - t_r') / (1 - t_r)
1893 */
1894 gamma = DIV_ROUND_UP(
1895 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
769b628d 1896 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
e08d02aa
TH
1897
1898 /*
1899 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1900 * nodes.
1901 */
1902 list_for_each_entry(iocg, &inner_walk, walk_list) {
1903 struct ioc_gq *parent;
1904 u32 inuse, wpt, wptp;
1905 u64 st, sf;
1906
1907 if (iocg->level == 0) {
1908 /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1909 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1910 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1911 WEIGHT_ONE - iocg->hweight_after_donation);
1912 continue;
1913 }
1914
1915 parent = iocg->ancestors[iocg->level - 1];
1916
1917 /* b' = gamma * b_f + b_t' */
1918 iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1919 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1920 WEIGHT_ONE) + iocg->hweight_after_donation;
1921
1922 /* w' = s' * b' / b'_p */
1923 inuse = DIV64_U64_ROUND_UP(
1924 (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1925 parent->hweight_inuse);
1926
1927 /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
1928 st = DIV64_U64_ROUND_UP(
1929 iocg->child_active_sum * iocg->hweight_donating,
1930 iocg->hweight_active);
1931 sf = iocg->child_active_sum - st;
1932 wpt = DIV64_U64_ROUND_UP(
1933 (u64)iocg->active * iocg->hweight_donating,
1934 iocg->hweight_active);
1935 wptp = DIV64_U64_ROUND_UP(
1936 (u64)inuse * iocg->hweight_after_donation,
1937 iocg->hweight_inuse);
1938
1939 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
1940 }
1941
1942 /*
1943 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
1944 * we can finally determine leaf adjustments.
1945 */
1946 list_for_each_entry(iocg, surpluses, surplus_list) {
1947 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1948 u32 inuse;
1949
c421a3eb
TH
1950 /*
1951 * In-debt iocgs participated in the donation calculation with
1952 * the minimum target hweight_inuse. Configuring inuse
1953 * accordingly would work fine but debt handling expects
1954 * @iocg->inuse stay at the minimum and we don't wanna
1955 * interfere.
1956 */
1957 if (iocg->abs_vdebt) {
1958 WARN_ON_ONCE(iocg->inuse > 1);
1959 continue;
1960 }
1961
e08d02aa
TH
1962 /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
1963 inuse = DIV64_U64_ROUND_UP(
1964 parent->child_adjusted_sum * iocg->hweight_after_donation,
1965 parent->hweight_inuse);
04603755
TH
1966
1967 TRACE_IOCG_PATH(inuse_transfer, iocg, now,
1968 iocg->inuse, inuse,
1969 iocg->hweight_inuse,
1970 iocg->hweight_after_donation);
1971
b0853ab4 1972 __propagate_weights(iocg, iocg->active, inuse, true, now);
e08d02aa
TH
1973 }
1974
1975 /* walk list should be dissolved after use */
1976 list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
1977 list_del_init(&iocg->walk_list);
93f7d2db
TH
1978}
1979
ab8df828
TH
1980/*
1981 * A low weight iocg can amass a large amount of debt, for example, when
1982 * anonymous memory gets reclaimed aggressively. If the system has a lot of
1983 * memory paired with a slow IO device, the debt can span multiple seconds or
1984 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
1985 * up blocked paying its debt while the IO device is idle.
1986 *
1987 * The following protects against such cases. If the device has been
d9517841
TH
1988 * sufficiently idle for a while, the debts are halved and delays are
1989 * recalculated.
ab8df828
TH
1990 */
1991static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
33a1fe6d 1992 struct ioc_now *now)
ab8df828 1993{
c7af2a00
TH
1994 struct ioc_gq *iocg;
1995 u64 dur, usage_pct, nr_cycles;
1996
1997 /* if no debtor, reset the cycle */
1998 if (!nr_debtors) {
1999 ioc->dfgv_period_at = now->now;
2000 ioc->dfgv_period_rem = 0;
2001 ioc->dfgv_usage_us_sum = 0;
2002 return;
2003 }
2004
2005 /*
2006 * Debtors can pass through a lot of writes choking the device and we
2007 * don't want to be forgiving debts while the device is struggling from
2008 * write bursts. If we're missing latency targets, consider the device
2009 * fully utilized.
2010 */
2011 if (ioc->busy_level > 0)
2012 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2013
2014 ioc->dfgv_usage_us_sum += usage_us_sum;
2015 if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2016 return;
2017
2018 /*
2019 * At least DFGV_PERIOD has passed since the last period. Calculate the
2020 * average usage and reset the period counters.
2021 */
2022 dur = now->now - ioc->dfgv_period_at;
2023 usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2024
2025 ioc->dfgv_period_at = now->now;
2026 ioc->dfgv_usage_us_sum = 0;
2027
2028 /* if was too busy, reset everything */
2029 if (usage_pct > DFGV_USAGE_PCT) {
2030 ioc->dfgv_period_rem = 0;
2031 return;
2032 }
2033
2034 /*
2035 * Usage is lower than threshold. Let's forgive some debts. Debt
2036 * forgiveness runs off of the usual ioc timer but its period usually
2037 * doesn't match ioc's. Compensate the difference by performing the
2038 * reduction as many times as would fit in the duration since the last
2039 * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2040 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
2041 * reductions is doubled.
2042 */
2043 nr_cycles = dur + ioc->dfgv_period_rem;
2044 ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
2045
2046 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
c5a6561b
TH
2047 u64 __maybe_unused old_debt, __maybe_unused old_delay;
2048
bec02dbb 2049 if (!iocg->abs_vdebt && !iocg->delay)
c7af2a00 2050 continue;
c5a6561b 2051
c7af2a00 2052 spin_lock(&iocg->waitq.lock);
c5a6561b
TH
2053
2054 old_debt = iocg->abs_vdebt;
2055 old_delay = iocg->delay;
2056
bec02dbb
TH
2057 if (iocg->abs_vdebt)
2058 iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2059 if (iocg->delay)
2060 iocg->delay = iocg->delay >> nr_cycles ?: 1;
2061
c7af2a00 2062 iocg_kick_waitq(iocg, true, now);
c5a6561b
TH
2063
2064 TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2065 old_debt, iocg->abs_vdebt,
2066 old_delay, iocg->delay);
2067
c7af2a00 2068 spin_unlock(&iocg->waitq.lock);
ab8df828
TH
2069 }
2070}
2071
7caa4715
TH
2072static void ioc_timer_fn(struct timer_list *timer)
2073{
2074 struct ioc *ioc = container_of(timer, struct ioc, timer);
2075 struct ioc_gq *iocg, *tiocg;
2076 struct ioc_now now;
8692d2db 2077 LIST_HEAD(surpluses);
dda1315f
TH
2078 int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0;
2079 u64 usage_us_sum = 0;
7caa4715
TH
2080 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2081 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2082 u32 missed_ppm[2], rq_wait_pct;
2083 u64 period_vtime;
f1de2439 2084 int prev_busy_level;
7caa4715
TH
2085
2086 /* how were the latencies during the period? */
2087 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2088
2089 /* take care of active iocgs */
2090 spin_lock_irq(&ioc->lock);
2091
2092 ioc_now(ioc, &now);
2093
2094 period_vtime = now.vnow - ioc->period_at_vtime;
2095 if (WARN_ON_ONCE(!period_vtime)) {
2096 spin_unlock_irq(&ioc->lock);
2097 return;
2098 }
2099
2100 /*
2101 * Waiters determine the sleep durations based on the vrate they
2102 * saw at the time of sleep. If vrate has increased, some waiters
2103 * could be sleeping for too long. Wake up tardy waiters which
2104 * should have woken up in the last period and expire idle iocgs.
2105 */
2106 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
d9012a59 2107 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
5160a5a5 2108 !iocg->delay && !iocg_is_idle(iocg))
7caa4715
TH
2109 continue;
2110
2111 spin_lock(&iocg->waitq.lock);
2112
f0bf84a5
TH
2113 /* flush wait and indebt stat deltas */
2114 if (iocg->wait_since) {
2115 iocg->local_stat.wait_us += now.now - iocg->wait_since;
2116 iocg->wait_since = now.now;
2117 }
2118 if (iocg->indebt_since) {
2119 iocg->local_stat.indebt_us +=
2120 now.now - iocg->indebt_since;
2121 iocg->indebt_since = now.now;
2122 }
2123 if (iocg->indelay_since) {
2124 iocg->local_stat.indelay_us +=
2125 now.now - iocg->indelay_since;
2126 iocg->indelay_since = now.now;
2127 }
2128
5160a5a5
TH
2129 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2130 iocg->delay) {
7caa4715 2131 /* might be oversleeping vtime / hweight changes, kick */
da437b95 2132 iocg_kick_waitq(iocg, true, &now);
bec02dbb 2133 if (iocg->abs_vdebt || iocg->delay)
dda1315f 2134 nr_debtors++;
7caa4715
TH
2135 } else if (iocg_is_idle(iocg)) {
2136 /* no waiter and idle, deactivate */
ac33e91e
TH
2137 u64 vtime = atomic64_read(&iocg->vtime);
2138 s64 excess;
2139
2140 /*
2141 * @iocg has been inactive for a full duration and will
2142 * have a high budget. Account anything above target as
2143 * error and throw away. On reactivation, it'll start
2144 * with the target budget.
2145 */
2146 excess = now.vnow - vtime - ioc->margins.target;
2147 if (excess > 0) {
2148 u32 old_hwi;
2149
2150 current_hweight(iocg, NULL, &old_hwi);
2151 ioc->vtime_err -= div64_u64(excess * old_hwi,
2152 WEIGHT_ONE);
2153 }
2154
b0853ab4 2155 __propagate_weights(iocg, 0, 0, false, &now);
7caa4715
TH
2156 list_del_init(&iocg->active_list);
2157 }
2158
2159 spin_unlock(&iocg->waitq.lock);
2160 }
00410f1b 2161 commit_weights(ioc);
7caa4715 2162
f0bf84a5
TH
2163 /*
2164 * Wait and indebt stat are flushed above and the donation calculation
2165 * below needs updated usage stat. Let's bring stat up-to-date.
2166 */
2167 iocg_flush_stat(&ioc->active_iocgs, &now);
2168
f1de2439 2169 /* calc usage and see whether some weights need to be moved around */
7caa4715 2170 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
f1de2439
TH
2171 u64 vdone, vtime, usage_us, usage_dur;
2172 u32 usage, hw_active, hw_inuse;
7caa4715
TH
2173
2174 /*
2175 * Collect unused and wind vtime closer to vnow to prevent
2176 * iocgs from accumulating a large amount of budget.
2177 */
2178 vdone = atomic64_read(&iocg->done_vtime);
2179 vtime = atomic64_read(&iocg->vtime);
2180 current_hweight(iocg, &hw_active, &hw_inuse);
2181
2182 /*
2183 * Latency QoS detection doesn't account for IOs which are
2184 * in-flight for longer than a period. Detect them by
2185 * comparing vdone against period start. If lagging behind
2186 * IOs from past periods, don't increase vrate.
2187 */
7cd806a9
TH
2188 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2189 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
7caa4715
TH
2190 time_after64(vtime, vdone) &&
2191 time_after64(vtime, now.vnow -
2192 MAX_LAGGING_PERIODS * period_vtime) &&
2193 time_before64(vdone, now.vnow - period_vtime))
2194 nr_lagging++;
2195
7caa4715 2196 /*
f1de2439
TH
2197 * Determine absolute usage factoring in in-flight IOs to avoid
2198 * high-latency completions appearing as idle.
7caa4715 2199 */
1aa50d02 2200 usage_us = iocg->usage_delta_us;
dda1315f 2201 usage_us_sum += usage_us;
f1de2439 2202
1aa50d02
TH
2203 if (vdone != vtime) {
2204 u64 inflight_us = DIV64_U64_ROUND_UP(
2205 cost_to_abs_cost(vtime - vdone, hw_inuse),
ac33e91e 2206 ioc->vtime_base_rate);
1aa50d02
TH
2207 usage_us = max(usage_us, inflight_us);
2208 }
2209
f1de2439
TH
2210 /* convert to hweight based usage ratio */
2211 if (time_after64(iocg->activated_at, ioc->period_at))
2212 usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2213 else
2214 usage_dur = max_t(u64, now.now - ioc->period_at, 1);
93f7d2db 2215
f1de2439
TH
2216 usage = clamp_t(u32,
2217 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2218 usage_dur),
1aa50d02 2219 1, WEIGHT_ONE);
7caa4715
TH
2220
2221 /* see whether there's surplus vtime */
8692d2db 2222 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
93f7d2db
TH
2223 if (hw_inuse < hw_active ||
2224 (!waitqueue_active(&iocg->waitq) &&
f1de2439 2225 time_before64(vtime, now.vnow - ioc->margins.low))) {
ac33e91e 2226 u32 hwa, old_hwi, hwm, new_hwi;
93f7d2db
TH
2227
2228 /*
2229 * Already donating or accumulated enough to start.
2230 * Determine the donation amount.
2231 */
ac33e91e 2232 current_hweight(iocg, &hwa, &old_hwi);
93f7d2db 2233 hwm = current_hweight_max(iocg);
ac33e91e
TH
2234 new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2235 usage, &now);
93f7d2db 2236 if (new_hwi < hwm) {
e08d02aa 2237 iocg->hweight_donating = hwa;
93f7d2db 2238 iocg->hweight_after_donation = new_hwi;
8692d2db 2239 list_add(&iocg->surplus_list, &surpluses);
7caa4715 2240 } else {
04603755
TH
2241 TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2242 iocg->inuse, iocg->active,
2243 iocg->hweight_inuse, new_hwi);
2244
93f7d2db 2245 __propagate_weights(iocg, iocg->active,
b0853ab4 2246 iocg->active, true, &now);
93f7d2db 2247 nr_shortages++;
7caa4715
TH
2248 }
2249 } else {
93f7d2db 2250 /* genuinely short on vtime */
7caa4715
TH
2251 nr_shortages++;
2252 }
2253 }
2254
93f7d2db
TH
2255 if (!list_empty(&surpluses) && nr_shortages)
2256 transfer_surpluses(&surpluses, &now);
7caa4715 2257
00410f1b 2258 commit_weights(ioc);
7caa4715 2259
8692d2db
TH
2260 /* surplus list should be dissolved after use */
2261 list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2262 list_del_init(&iocg->surplus_list);
2263
7caa4715
TH
2264 /*
2265 * If q is getting clogged or we're missing too much, we're issuing
2266 * too much IO and should lower vtime rate. If we're not missing
2267 * and experiencing shortages but not surpluses, we're too stingy
2268 * and should increase vtime rate.
2269 */
25d41e4a 2270 prev_busy_level = ioc->busy_level;
7caa4715
TH
2271 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2272 missed_ppm[READ] > ppm_rthr ||
2273 missed_ppm[WRITE] > ppm_wthr) {
81ca627a 2274 /* clearly missing QoS targets, slow down vrate */
7caa4715
TH
2275 ioc->busy_level = max(ioc->busy_level, 0);
2276 ioc->busy_level++;
7cd806a9 2277 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
7caa4715
TH
2278 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2279 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
81ca627a
TH
2280 /* QoS targets are being met with >25% margin */
2281 if (nr_shortages) {
2282 /*
2283 * We're throttling while the device has spare
2284 * capacity. If vrate was being slowed down, stop.
2285 */
7cd806a9 2286 ioc->busy_level = min(ioc->busy_level, 0);
81ca627a
TH
2287
2288 /*
2289 * If there are IOs spanning multiple periods, wait
065655c8 2290 * them out before pushing the device harder.
81ca627a 2291 */
065655c8 2292 if (!nr_lagging)
7cd806a9 2293 ioc->busy_level--;
81ca627a
TH
2294 } else {
2295 /*
2296 * Nobody is being throttled and the users aren't
2297 * issuing enough IOs to saturate the device. We
2298 * simply don't know how close the device is to
2299 * saturation. Coast.
2300 */
2301 ioc->busy_level = 0;
7cd806a9 2302 }
7caa4715 2303 } else {
81ca627a 2304 /* inside the hysterisis margin, we're good */
7caa4715
TH
2305 ioc->busy_level = 0;
2306 }
2307
2308 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2309
7cd806a9 2310 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
ac33e91e 2311 u64 vrate = ioc->vtime_base_rate;
7caa4715
TH
2312 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
2313
2314 /* rq_wait signal is always reliable, ignore user vrate_min */
2315 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
2316 vrate_min = VRATE_MIN;
2317
2318 /*
2319 * If vrate is out of bounds, apply clamp gradually as the
2320 * bounds can change abruptly. Otherwise, apply busy_level
2321 * based adjustment.
2322 */
2323 if (vrate < vrate_min) {
2324 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
2325 100);
2326 vrate = min(vrate, vrate_min);
2327 } else if (vrate > vrate_max) {
2328 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
2329 100);
2330 vrate = max(vrate, vrate_max);
2331 } else {
2332 int idx = min_t(int, abs(ioc->busy_level),
2333 ARRAY_SIZE(vrate_adj_pct) - 1);
2334 u32 adj_pct = vrate_adj_pct[idx];
2335
2336 if (ioc->busy_level > 0)
2337 adj_pct = 100 - adj_pct;
2338 else
2339 adj_pct = 100 + adj_pct;
2340
2341 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
2342 vrate_min, vrate_max);
2343 }
2344
d6c8e949 2345 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
065655c8 2346 nr_lagging, nr_shortages);
7caa4715 2347
ac33e91e 2348 ioc->vtime_base_rate = vrate;
7ca5b2e6 2349 ioc_refresh_margins(ioc);
25d41e4a
TH
2350 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
2351 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
d6c8e949 2352 missed_ppm, rq_wait_pct, nr_lagging,
065655c8 2353 nr_shortages);
7caa4715
TH
2354 }
2355
2356 ioc_refresh_params(ioc, false);
2357
33a1fe6d
TH
2358 ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2359
7caa4715
TH
2360 /*
2361 * This period is done. Move onto the next one. If nothing's
2362 * going on with the device, stop the timer.
2363 */
2364 atomic64_inc(&ioc->cur_period);
2365
2366 if (ioc->running != IOC_STOP) {
2367 if (!list_empty(&ioc->active_iocgs)) {
2368 ioc_start_period(ioc, &now);
2369 } else {
2370 ioc->busy_level = 0;
ac33e91e 2371 ioc->vtime_err = 0;
7caa4715
TH
2372 ioc->running = IOC_IDLE;
2373 }
ac33e91e
TH
2374
2375 ioc_refresh_vrate(ioc, &now);
7caa4715
TH
2376 }
2377
2378 spin_unlock_irq(&ioc->lock);
2379}
2380
b0853ab4
TH
2381static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2382 u64 abs_cost, struct ioc_now *now)
2383{
2384 struct ioc *ioc = iocg->ioc;
2385 struct ioc_margins *margins = &ioc->margins;
04603755 2386 u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
aa67db24 2387 u32 hwi, adj_step;
b0853ab4
TH
2388 s64 margin;
2389 u64 cost, new_inuse;
2390
2391 current_hweight(iocg, NULL, &hwi);
04603755 2392 old_hwi = hwi;
b0853ab4
TH
2393 cost = abs_cost_to_cost(abs_cost, hwi);
2394 margin = now->vnow - vtime - cost;
2395
c421a3eb
TH
2396 /* debt handling owns inuse for debtors */
2397 if (iocg->abs_vdebt)
2398 return cost;
2399
b0853ab4 2400 /*
5ba1add2 2401 * We only increase inuse during period and do so if the margin has
b0853ab4
TH
2402 * deteriorated since the previous adjustment.
2403 */
2404 if (margin >= iocg->saved_margin || margin >= margins->low ||
2405 iocg->inuse == iocg->active)
2406 return cost;
2407
2408 spin_lock_irq(&ioc->lock);
2409
2410 /* we own inuse only when @iocg is in the normal active state */
c421a3eb 2411 if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
b0853ab4
TH
2412 spin_unlock_irq(&ioc->lock);
2413 return cost;
2414 }
2415
aa67db24
TH
2416 /*
2417 * Bump up inuse till @abs_cost fits in the existing budget.
2418 * adj_step must be determined after acquiring ioc->lock - we might
2419 * have raced and lost to another thread for activation and could
2420 * be reading 0 iocg->active before ioc->lock which will lead to
2421 * infinite loop.
2422 */
b0853ab4 2423 new_inuse = iocg->inuse;
aa67db24 2424 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
b0853ab4
TH
2425 do {
2426 new_inuse = new_inuse + adj_step;
2427 propagate_weights(iocg, iocg->active, new_inuse, true, now);
2428 current_hweight(iocg, NULL, &hwi);
2429 cost = abs_cost_to_cost(abs_cost, hwi);
2430 } while (time_after64(vtime + cost, now->vnow) &&
2431 iocg->inuse != iocg->active);
2432
2433 spin_unlock_irq(&ioc->lock);
04603755
TH
2434
2435 TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2436 old_inuse, iocg->inuse, old_hwi, hwi);
2437
b0853ab4
TH
2438 return cost;
2439}
2440
7caa4715
TH
2441static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2442 bool is_merge, u64 *costp)
2443{
2444 struct ioc *ioc = iocg->ioc;
2445 u64 coef_seqio, coef_randio, coef_page;
2446 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2447 u64 seek_pages = 0;
2448 u64 cost = 0;
2449
2450 switch (bio_op(bio)) {
2451 case REQ_OP_READ:
2452 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
2453 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
2454 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
2455 break;
2456 case REQ_OP_WRITE:
2457 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
2458 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
2459 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
2460 break;
2461 default:
2462 goto out;
2463 }
2464
2465 if (iocg->cursor) {
2466 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2467 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2468 }
2469
2470 if (!is_merge) {
2471 if (seek_pages > LCOEF_RANDIO_PAGES) {
2472 cost += coef_randio;
2473 } else {
2474 cost += coef_seqio;
2475 }
2476 }
2477 cost += pages * coef_page;
2478out:
2479 *costp = cost;
2480}
2481
2482static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2483{
2484 u64 cost;
2485
2486 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2487 return cost;
2488}
2489
cd006509
TH
2490static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2491 u64 *costp)
2492{
2493 unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2494
2495 switch (req_op(rq)) {
2496 case REQ_OP_READ:
2497 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2498 break;
2499 case REQ_OP_WRITE:
2500 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2501 break;
2502 default:
2503 *costp = 0;
2504 }
2505}
2506
2507static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2508{
2509 u64 cost;
2510
2511 calc_size_vtime_cost_builtin(rq, ioc, &cost);
2512 return cost;
2513}
2514
7caa4715
TH
2515static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2516{
2517 struct blkcg_gq *blkg = bio->bi_blkg;
2518 struct ioc *ioc = rqos_to_ioc(rqos);
2519 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2520 struct ioc_now now;
2521 struct iocg_wait wait;
7caa4715 2522 u64 abs_cost, cost, vtime;
da437b95
TH
2523 bool use_debt, ioc_locked;
2524 unsigned long flags;
7caa4715
TH
2525
2526 /* bypass IOs if disabled or for root cgroup */
2527 if (!ioc->enabled || !iocg->level)
2528 return;
2529
7caa4715
TH
2530 /* calculate the absolute vtime cost */
2531 abs_cost = calc_vtime_cost(bio, iocg, false);
2532 if (!abs_cost)
2533 return;
2534
f1de2439
TH
2535 if (!iocg_activate(iocg, &now))
2536 return;
2537
7caa4715 2538 iocg->cursor = bio_end_sector(bio);
7caa4715 2539 vtime = atomic64_read(&iocg->vtime);
b0853ab4 2540 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
7caa4715
TH
2541
2542 /*
2543 * If no one's waiting and within budget, issue right away. The
2544 * tests are racy but the races aren't systemic - we only miss once
2545 * in a while which is fine.
2546 */
0b80f986 2547 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
7caa4715 2548 time_before_eq64(vtime + cost, now.vnow)) {
97eb1975 2549 iocg_commit_bio(iocg, bio, abs_cost, cost);
7caa4715
TH
2550 return;
2551 }
2552
36a52481 2553 /*
da437b95
TH
2554 * We're over budget. This can be handled in two ways. IOs which may
2555 * cause priority inversions are punted to @ioc->aux_iocg and charged as
2556 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2557 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2558 * whether debt handling is needed and acquire locks accordingly.
0b80f986 2559 */
da437b95
TH
2560 use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2561 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
b0853ab4 2562retry_lock:
da437b95
TH
2563 iocg_lock(iocg, ioc_locked, &flags);
2564
2565 /*
2566 * @iocg must stay activated for debt and waitq handling. Deactivation
2567 * is synchronized against both ioc->lock and waitq.lock and we won't
2568 * get deactivated as long as we're waiting or has debt, so we're good
2569 * if we're activated here. In the unlikely cases that we aren't, just
2570 * issue the IO.
2571 */
0b80f986 2572 if (unlikely(list_empty(&iocg->active_list))) {
da437b95 2573 iocg_unlock(iocg, ioc_locked, &flags);
97eb1975 2574 iocg_commit_bio(iocg, bio, abs_cost, cost);
0b80f986
TH
2575 return;
2576 }
2577
2578 /*
2579 * We're over budget. If @bio has to be issued regardless, remember
2580 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2581 * off the debt before waking more IOs.
2582 *
36a52481 2583 * This way, the debt is continuously paid off each period with the
0b80f986
TH
2584 * actual budget available to the cgroup. If we just wound vtime, we
2585 * would incorrectly use the current hw_inuse for the entire amount
2586 * which, for example, can lead to the cgroup staying blocked for a
2587 * long time even with substantially raised hw_inuse.
2588 *
2589 * An iocg with vdebt should stay online so that the timer can keep
2590 * deducting its vdebt and [de]activate use_delay mechanism
2591 * accordingly. We don't want to race against the timer trying to
2592 * clear them and leave @iocg inactive w/ dangling use_delay heavily
2593 * penalizing the cgroup and its descendants.
36a52481 2594 */
da437b95 2595 if (use_debt) {
c421a3eb 2596 iocg_incur_debt(iocg, abs_cost, &now);
54c52e10 2597 if (iocg_kick_delay(iocg, &now))
d7bd15a1
TH
2598 blkcg_schedule_throttle(rqos->q,
2599 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
da437b95 2600 iocg_unlock(iocg, ioc_locked, &flags);
7caa4715
TH
2601 return;
2602 }
2603
b0853ab4 2604 /* guarantee that iocgs w/ waiters have maximum inuse */
c421a3eb 2605 if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
b0853ab4
TH
2606 if (!ioc_locked) {
2607 iocg_unlock(iocg, false, &flags);
2608 ioc_locked = true;
2609 goto retry_lock;
2610 }
2611 propagate_weights(iocg, iocg->active, iocg->active, true,
2612 &now);
2613 }
2614
7caa4715
TH
2615 /*
2616 * Append self to the waitq and schedule the wakeup timer if we're
2617 * the first waiter. The timer duration is calculated based on the
2618 * current vrate. vtime and hweight changes can make it too short
2619 * or too long. Each wait entry records the absolute cost it's
2620 * waiting for to allow re-evaluation using a custom wait entry.
2621 *
2622 * If too short, the timer simply reschedules itself. If too long,
2623 * the period timer will notice and trigger wakeups.
2624 *
2625 * All waiters are on iocg->waitq and the wait states are
2626 * synchronized using waitq.lock.
2627 */
7caa4715
TH
2628 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2629 wait.wait.private = current;
2630 wait.bio = bio;
2631 wait.abs_cost = abs_cost;
2632 wait.committed = false; /* will be set true by waker */
2633
2634 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
da437b95 2635 iocg_kick_waitq(iocg, ioc_locked, &now);
7caa4715 2636
da437b95 2637 iocg_unlock(iocg, ioc_locked, &flags);
7caa4715
TH
2638
2639 while (true) {
2640 set_current_state(TASK_UNINTERRUPTIBLE);
2641 if (wait.committed)
2642 break;
2643 io_schedule();
2644 }
2645
2646 /* waker already committed us, proceed */
2647 finish_wait(&iocg->waitq, &wait.wait);
2648}
2649
2650static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2651 struct bio *bio)
2652{
2653 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
e1518f63 2654 struct ioc *ioc = iocg->ioc;
7caa4715 2655 sector_t bio_end = bio_end_sector(bio);
e1518f63 2656 struct ioc_now now;
b0853ab4 2657 u64 vtime, abs_cost, cost;
0b80f986 2658 unsigned long flags;
7caa4715 2659
e1518f63
TH
2660 /* bypass if disabled or for root cgroup */
2661 if (!ioc->enabled || !iocg->level)
7caa4715
TH
2662 return;
2663
2664 abs_cost = calc_vtime_cost(bio, iocg, true);
2665 if (!abs_cost)
2666 return;
2667
e1518f63 2668 ioc_now(ioc, &now);
b0853ab4
TH
2669
2670 vtime = atomic64_read(&iocg->vtime);
2671 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
e1518f63 2672
7caa4715
TH
2673 /* update cursor if backmerging into the request at the cursor */
2674 if (blk_rq_pos(rq) < bio_end &&
2675 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2676 iocg->cursor = bio_end;
2677
e1518f63 2678 /*
0b80f986
TH
2679 * Charge if there's enough vtime budget and the existing request has
2680 * cost assigned.
e1518f63
TH
2681 */
2682 if (rq->bio && rq->bio->bi_iocost_cost &&
0b80f986 2683 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
97eb1975 2684 iocg_commit_bio(iocg, bio, abs_cost, cost);
0b80f986
TH
2685 return;
2686 }
2687
2688 /*
2689 * Otherwise, account it as debt if @iocg is online, which it should
2690 * be for the vast majority of cases. See debt handling in
2691 * ioc_rqos_throttle() for details.
2692 */
c421a3eb
TH
2693 spin_lock_irqsave(&ioc->lock, flags);
2694 spin_lock(&iocg->waitq.lock);
2695
0b80f986 2696 if (likely(!list_empty(&iocg->active_list))) {
c421a3eb
TH
2697 iocg_incur_debt(iocg, abs_cost, &now);
2698 if (iocg_kick_delay(iocg, &now))
2699 blkcg_schedule_throttle(rqos->q,
2700 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
0b80f986 2701 } else {
97eb1975 2702 iocg_commit_bio(iocg, bio, abs_cost, cost);
0b80f986 2703 }
c421a3eb
TH
2704
2705 spin_unlock(&iocg->waitq.lock);
2706 spin_unlock_irqrestore(&ioc->lock, flags);
7caa4715
TH
2707}
2708
2709static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2710{
2711 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2712
2713 if (iocg && bio->bi_iocost_cost)
2714 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2715}
2716
2717static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2718{
2719 struct ioc *ioc = rqos_to_ioc(rqos);
5e124f74 2720 struct ioc_pcpu_stat *ccs;
cd006509 2721 u64 on_q_ns, rq_wait_ns, size_nsec;
7caa4715
TH
2722 int pidx, rw;
2723
2724 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2725 return;
2726
2727 switch (req_op(rq) & REQ_OP_MASK) {
2728 case REQ_OP_READ:
2729 pidx = QOS_RLAT;
2730 rw = READ;
2731 break;
2732 case REQ_OP_WRITE:
2733 pidx = QOS_WLAT;
2734 rw = WRITE;
2735 break;
2736 default:
2737 return;
2738 }
2739
2740 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2741 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
cd006509 2742 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
7caa4715 2743
5e124f74
TH
2744 ccs = get_cpu_ptr(ioc->pcpu_stat);
2745
cd006509
TH
2746 if (on_q_ns <= size_nsec ||
2747 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
5e124f74 2748 local_inc(&ccs->missed[rw].nr_met);
7caa4715 2749 else
5e124f74
TH
2750 local_inc(&ccs->missed[rw].nr_missed);
2751
2752 local64_add(rq_wait_ns, &ccs->rq_wait_ns);
7caa4715 2753
5e124f74 2754 put_cpu_ptr(ccs);
7caa4715
TH
2755}
2756
2757static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2758{
2759 struct ioc *ioc = rqos_to_ioc(rqos);
2760
2761 spin_lock_irq(&ioc->lock);
2762 ioc_refresh_params(ioc, false);
2763 spin_unlock_irq(&ioc->lock);
2764}
2765
2766static void ioc_rqos_exit(struct rq_qos *rqos)
2767{
2768 struct ioc *ioc = rqos_to_ioc(rqos);
2769
2770 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2771
2772 spin_lock_irq(&ioc->lock);
2773 ioc->running = IOC_STOP;
2774 spin_unlock_irq(&ioc->lock);
2775
2776 del_timer_sync(&ioc->timer);
2777 free_percpu(ioc->pcpu_stat);
2778 kfree(ioc);
2779}
2780
2781static struct rq_qos_ops ioc_rqos_ops = {
2782 .throttle = ioc_rqos_throttle,
2783 .merge = ioc_rqos_merge,
2784 .done_bio = ioc_rqos_done_bio,
2785 .done = ioc_rqos_done,
2786 .queue_depth_changed = ioc_rqos_queue_depth_changed,
2787 .exit = ioc_rqos_exit,
2788};
2789
2790static int blk_iocost_init(struct request_queue *q)
2791{
2792 struct ioc *ioc;
2793 struct rq_qos *rqos;
5e124f74 2794 int i, cpu, ret;
7caa4715
TH
2795
2796 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2797 if (!ioc)
2798 return -ENOMEM;
2799
2800 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2801 if (!ioc->pcpu_stat) {
2802 kfree(ioc);
2803 return -ENOMEM;
2804 }
2805
5e124f74
TH
2806 for_each_possible_cpu(cpu) {
2807 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2808
2809 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2810 local_set(&ccs->missed[i].nr_met, 0);
2811 local_set(&ccs->missed[i].nr_missed, 0);
2812 }
2813 local64_set(&ccs->rq_wait_ns, 0);
2814 }
2815
7caa4715
TH
2816 rqos = &ioc->rqos;
2817 rqos->id = RQ_QOS_COST;
2818 rqos->ops = &ioc_rqos_ops;
2819 rqos->q = q;
2820
2821 spin_lock_init(&ioc->lock);
2822 timer_setup(&ioc->timer, ioc_timer_fn, 0);
2823 INIT_LIST_HEAD(&ioc->active_iocgs);
2824
2825 ioc->running = IOC_IDLE;
ac33e91e 2826 ioc->vtime_base_rate = VTIME_PER_USEC;
7caa4715 2827 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
67b7b641 2828 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
7caa4715
TH
2829 ioc->period_at = ktime_to_us(ktime_get());
2830 atomic64_set(&ioc->cur_period, 0);
2831 atomic_set(&ioc->hweight_gen, 0);
2832
2833 spin_lock_irq(&ioc->lock);
2834 ioc->autop_idx = AUTOP_INVALID;
2835 ioc_refresh_params(ioc, true);
2836 spin_unlock_irq(&ioc->lock);
2837
2838 rq_qos_add(q, rqos);
2839 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2840 if (ret) {
2841 rq_qos_del(q, rqos);
3532e722 2842 free_percpu(ioc->pcpu_stat);
7caa4715
TH
2843 kfree(ioc);
2844 return ret;
2845 }
2846 return 0;
2847}
2848
2849static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2850{
2851 struct ioc_cgrp *iocc;
2852
2853 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
e916ad29
TH
2854 if (!iocc)
2855 return NULL;
7caa4715 2856
bd0adb91 2857 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
7caa4715
TH
2858 return &iocc->cpd;
2859}
2860
2861static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2862{
2863 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2864}
2865
2866static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2867 struct blkcg *blkcg)
2868{
2869 int levels = blkcg->css.cgroup->level + 1;
2870 struct ioc_gq *iocg;
2871
f61d6e25 2872 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
7caa4715
TH
2873 if (!iocg)
2874 return NULL;
2875
97eb1975
TH
2876 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2877 if (!iocg->pcpu_stat) {
2878 kfree(iocg);
2879 return NULL;
2880 }
2881
7caa4715
TH
2882 return &iocg->pd;
2883}
2884
2885static void ioc_pd_init(struct blkg_policy_data *pd)
2886{
2887 struct ioc_gq *iocg = pd_to_iocg(pd);
2888 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2889 struct ioc *ioc = q_to_ioc(blkg->q);
2890 struct ioc_now now;
2891 struct blkcg_gq *tblkg;
2892 unsigned long flags;
2893
2894 ioc_now(ioc, &now);
2895
2896 iocg->ioc = ioc;
2897 atomic64_set(&iocg->vtime, now.vnow);
2898 atomic64_set(&iocg->done_vtime, now.vnow);
2899 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2900 INIT_LIST_HEAD(&iocg->active_list);
97eb1975 2901 INIT_LIST_HEAD(&iocg->walk_list);
8692d2db 2902 INIT_LIST_HEAD(&iocg->surplus_list);
fe20cdb5
TH
2903 iocg->hweight_active = WEIGHT_ONE;
2904 iocg->hweight_inuse = WEIGHT_ONE;
7caa4715
TH
2905
2906 init_waitqueue_head(&iocg->waitq);
2907 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2908 iocg->waitq_timer.function = iocg_waitq_timer_fn;
7caa4715
TH
2909
2910 iocg->level = blkg->blkcg->css.cgroup->level;
2911
2912 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2913 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2914 iocg->ancestors[tiocg->level] = tiocg;
2915 }
2916
2917 spin_lock_irqsave(&ioc->lock, flags);
b0853ab4 2918 weight_updated(iocg, &now);
7caa4715
TH
2919 spin_unlock_irqrestore(&ioc->lock, flags);
2920}
2921
2922static void ioc_pd_free(struct blkg_policy_data *pd)
2923{
2924 struct ioc_gq *iocg = pd_to_iocg(pd);
2925 struct ioc *ioc = iocg->ioc;
5aeac7c4 2926 unsigned long flags;
7caa4715
TH
2927
2928 if (ioc) {
5aeac7c4 2929 spin_lock_irqsave(&ioc->lock, flags);
97eb1975 2930
7caa4715 2931 if (!list_empty(&iocg->active_list)) {
b0853ab4
TH
2932 struct ioc_now now;
2933
2934 ioc_now(ioc, &now);
2935 propagate_weights(iocg, 0, 0, false, &now);
7caa4715
TH
2936 list_del_init(&iocg->active_list);
2937 }
97eb1975
TH
2938
2939 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
8692d2db 2940 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
97eb1975 2941
5aeac7c4 2942 spin_unlock_irqrestore(&ioc->lock, flags);
e036c4ca
TH
2943
2944 hrtimer_cancel(&iocg->waitq_timer);
7caa4715 2945 }
97eb1975 2946 free_percpu(iocg->pcpu_stat);
7caa4715
TH
2947 kfree(iocg);
2948}
2949
97eb1975
TH
2950static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
2951{
2952 struct ioc_gq *iocg = pd_to_iocg(pd);
2953 struct ioc *ioc = iocg->ioc;
2954 size_t pos = 0;
2955
2956 if (!ioc->enabled)
2957 return 0;
2958
2959 if (iocg->level == 0) {
2960 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
ac33e91e 2961 ioc->vtime_base_rate * 10000,
97eb1975
TH
2962 VTIME_PER_USEC);
2963 pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
2964 vp10k / 100, vp10k % 100);
2965 }
2966
2967 pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
2968 iocg->last_stat.usage_us);
2969
f0bf84a5
TH
2970 if (blkcg_debug_stats)
2971 pos += scnprintf(buf + pos, size - pos,
2972 " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
2973 iocg->last_stat.wait_us,
2974 iocg->last_stat.indebt_us,
2975 iocg->last_stat.indelay_us);
2976
97eb1975
TH
2977 return pos;
2978}
2979
7caa4715
TH
2980static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2981 int off)
2982{
2983 const char *dname = blkg_dev_name(pd->blkg);
2984 struct ioc_gq *iocg = pd_to_iocg(pd);
2985
2986 if (dname && iocg->cfg_weight)
bd0adb91 2987 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
7caa4715
TH
2988 return 0;
2989}
2990
2991
2992static int ioc_weight_show(struct seq_file *sf, void *v)
2993{
2994 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2995 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2996
bd0adb91 2997 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
7caa4715
TH
2998 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2999 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3000 return 0;
3001}
3002
3003static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3004 size_t nbytes, loff_t off)
3005{
3006 struct blkcg *blkcg = css_to_blkcg(of_css(of));
3007 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3008 struct blkg_conf_ctx ctx;
b0853ab4 3009 struct ioc_now now;
7caa4715
TH
3010 struct ioc_gq *iocg;
3011 u32 v;
3012 int ret;
3013
3014 if (!strchr(buf, ':')) {
3015 struct blkcg_gq *blkg;
3016
3017 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3018 return -EINVAL;
3019
3020 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3021 return -EINVAL;
3022
3023 spin_lock(&blkcg->lock);
bd0adb91 3024 iocc->dfl_weight = v * WEIGHT_ONE;
7caa4715
TH
3025 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3026 struct ioc_gq *iocg = blkg_to_iocg(blkg);
3027
3028 if (iocg) {
3029 spin_lock_irq(&iocg->ioc->lock);
b0853ab4
TH
3030 ioc_now(iocg->ioc, &now);
3031 weight_updated(iocg, &now);
7caa4715
TH
3032 spin_unlock_irq(&iocg->ioc->lock);
3033 }
3034 }
3035 spin_unlock(&blkcg->lock);
3036
3037 return nbytes;
3038 }
3039
3040 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
3041 if (ret)
3042 return ret;
3043
3044 iocg = blkg_to_iocg(ctx.blkg);
3045
3046 if (!strncmp(ctx.body, "default", 7)) {
3047 v = 0;
3048 } else {
3049 if (!sscanf(ctx.body, "%u", &v))
3050 goto einval;
3051 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3052 goto einval;
3053 }
3054
41591a51 3055 spin_lock(&iocg->ioc->lock);
bd0adb91 3056 iocg->cfg_weight = v * WEIGHT_ONE;
b0853ab4
TH
3057 ioc_now(iocg->ioc, &now);
3058 weight_updated(iocg, &now);
41591a51 3059 spin_unlock(&iocg->ioc->lock);
7caa4715
TH
3060
3061 blkg_conf_finish(&ctx);
3062 return nbytes;
3063
3064einval:
3065 blkg_conf_finish(&ctx);
3066 return -EINVAL;
3067}
3068
3069static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3070 int off)
3071{
3072 const char *dname = blkg_dev_name(pd->blkg);
3073 struct ioc *ioc = pd_to_iocg(pd)->ioc;
3074
3075 if (!dname)
3076 return 0;
3077
3078 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3079 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3080 ioc->params.qos[QOS_RPPM] / 10000,
3081 ioc->params.qos[QOS_RPPM] % 10000 / 100,
3082 ioc->params.qos[QOS_RLAT],
3083 ioc->params.qos[QOS_WPPM] / 10000,
3084 ioc->params.qos[QOS_WPPM] % 10000 / 100,
3085 ioc->params.qos[QOS_WLAT],
3086 ioc->params.qos[QOS_MIN] / 10000,
3087 ioc->params.qos[QOS_MIN] % 10000 / 100,
3088 ioc->params.qos[QOS_MAX] / 10000,
3089 ioc->params.qos[QOS_MAX] % 10000 / 100);
3090 return 0;
3091}
3092
3093static int ioc_qos_show(struct seq_file *sf, void *v)
3094{
3095 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3096
3097 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3098 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3099 return 0;
3100}
3101
3102static const match_table_t qos_ctrl_tokens = {
3103 { QOS_ENABLE, "enable=%u" },
3104 { QOS_CTRL, "ctrl=%s" },
3105 { NR_QOS_CTRL_PARAMS, NULL },
3106};
3107
3108static const match_table_t qos_tokens = {
3109 { QOS_RPPM, "rpct=%s" },
3110 { QOS_RLAT, "rlat=%u" },
3111 { QOS_WPPM, "wpct=%s" },
3112 { QOS_WLAT, "wlat=%u" },
3113 { QOS_MIN, "min=%s" },
3114 { QOS_MAX, "max=%s" },
3115 { NR_QOS_PARAMS, NULL },
3116};
3117
3118static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3119 size_t nbytes, loff_t off)
3120{
22ae8ce8 3121 struct block_device *bdev;
7caa4715
TH
3122 struct ioc *ioc;
3123 u32 qos[NR_QOS_PARAMS];
3124 bool enable, user;
3125 char *p;
3126 int ret;
3127
22ae8ce8
CH
3128 bdev = blkcg_conf_open_bdev(&input);
3129 if (IS_ERR(bdev))
3130 return PTR_ERR(bdev);
7caa4715 3131
22ae8ce8 3132 ioc = q_to_ioc(bdev->bd_disk->queue);
7caa4715 3133 if (!ioc) {
22ae8ce8 3134 ret = blk_iocost_init(bdev->bd_disk->queue);
7caa4715
TH
3135 if (ret)
3136 goto err;
22ae8ce8 3137 ioc = q_to_ioc(bdev->bd_disk->queue);
7caa4715
TH
3138 }
3139
3140 spin_lock_irq(&ioc->lock);
3141 memcpy(qos, ioc->params.qos, sizeof(qos));
3142 enable = ioc->enabled;
3143 user = ioc->user_qos_params;
3144 spin_unlock_irq(&ioc->lock);
3145
3146 while ((p = strsep(&input, " \t\n"))) {
3147 substring_t args[MAX_OPT_ARGS];
3148 char buf[32];
3149 int tok;
3150 s64 v;
3151
3152 if (!*p)
3153 continue;
3154
3155 switch (match_token(p, qos_ctrl_tokens, args)) {
3156 case QOS_ENABLE:
3157 match_u64(&args[0], &v);
3158 enable = v;
3159 continue;
3160 case QOS_CTRL:
3161 match_strlcpy(buf, &args[0], sizeof(buf));
3162 if (!strcmp(buf, "auto"))
3163 user = false;
3164 else if (!strcmp(buf, "user"))
3165 user = true;
3166 else
3167 goto einval;
3168 continue;
3169 }
3170
3171 tok = match_token(p, qos_tokens, args);
3172 switch (tok) {
3173 case QOS_RPPM:
3174 case QOS_WPPM:
3175 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3176 sizeof(buf))
3177 goto einval;
3178 if (cgroup_parse_float(buf, 2, &v))
3179 goto einval;
3180 if (v < 0 || v > 10000)
3181 goto einval;
3182 qos[tok] = v * 100;
3183 break;
3184 case QOS_RLAT:
3185 case QOS_WLAT:
3186 if (match_u64(&args[0], &v))
3187 goto einval;
3188 qos[tok] = v;
3189 break;
3190 case QOS_MIN:
3191 case QOS_MAX:
3192 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3193 sizeof(buf))
3194 goto einval;
3195 if (cgroup_parse_float(buf, 2, &v))
3196 goto einval;
3197 if (v < 0)
3198 goto einval;
3199 qos[tok] = clamp_t(s64, v * 100,
3200 VRATE_MIN_PPM, VRATE_MAX_PPM);
3201 break;
3202 default:
3203 goto einval;
3204 }
3205 user = true;
3206 }
3207
3208 if (qos[QOS_MIN] > qos[QOS_MAX])
3209 goto einval;
3210
3211 spin_lock_irq(&ioc->lock);
3212
3213 if (enable) {
cd006509 3214 blk_stat_enable_accounting(ioc->rqos.q);
7caa4715
TH
3215 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3216 ioc->enabled = true;
3217 } else {
3218 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3219 ioc->enabled = false;
3220 }
3221
3222 if (user) {
3223 memcpy(ioc->params.qos, qos, sizeof(qos));
3224 ioc->user_qos_params = true;
3225 } else {
3226 ioc->user_qos_params = false;
3227 }
3228
3229 ioc_refresh_params(ioc, true);
3230 spin_unlock_irq(&ioc->lock);
3231
22ae8ce8 3232 blkdev_put_no_open(bdev);
7caa4715
TH
3233 return nbytes;
3234einval:
3235 ret = -EINVAL;
3236err:
22ae8ce8 3237 blkdev_put_no_open(bdev);
7caa4715
TH
3238 return ret;
3239}
3240
3241static u64 ioc_cost_model_prfill(struct seq_file *sf,
3242 struct blkg_policy_data *pd, int off)
3243{
3244 const char *dname = blkg_dev_name(pd->blkg);
3245 struct ioc *ioc = pd_to_iocg(pd)->ioc;
3246 u64 *u = ioc->params.i_lcoefs;
3247
3248 if (!dname)
3249 return 0;
3250
3251 seq_printf(sf, "%s ctrl=%s model=linear "
3252 "rbps=%llu rseqiops=%llu rrandiops=%llu "
3253 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3254 dname, ioc->user_cost_model ? "user" : "auto",
3255 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3256 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3257 return 0;
3258}
3259
3260static int ioc_cost_model_show(struct seq_file *sf, void *v)
3261{
3262 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3263
3264 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3265 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3266 return 0;
3267}
3268
3269static const match_table_t cost_ctrl_tokens = {
3270 { COST_CTRL, "ctrl=%s" },
3271 { COST_MODEL, "model=%s" },
3272 { NR_COST_CTRL_PARAMS, NULL },
3273};
3274
3275static const match_table_t i_lcoef_tokens = {
3276 { I_LCOEF_RBPS, "rbps=%u" },
3277 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
3278 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
3279 { I_LCOEF_WBPS, "wbps=%u" },
3280 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
3281 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
3282 { NR_I_LCOEFS, NULL },
3283};
3284
3285static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3286 size_t nbytes, loff_t off)
3287{
22ae8ce8 3288 struct block_device *bdev;
7caa4715
TH
3289 struct ioc *ioc;
3290 u64 u[NR_I_LCOEFS];
3291 bool user;
3292 char *p;
3293 int ret;
3294
22ae8ce8
CH
3295 bdev = blkcg_conf_open_bdev(&input);
3296 if (IS_ERR(bdev))
3297 return PTR_ERR(bdev);
7caa4715 3298
22ae8ce8 3299 ioc = q_to_ioc(bdev->bd_disk->queue);
7caa4715 3300 if (!ioc) {
22ae8ce8 3301 ret = blk_iocost_init(bdev->bd_disk->queue);
7caa4715
TH
3302 if (ret)
3303 goto err;
22ae8ce8 3304 ioc = q_to_ioc(bdev->bd_disk->queue);
7caa4715
TH
3305 }
3306
3307 spin_lock_irq(&ioc->lock);
3308 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3309 user = ioc->user_cost_model;
3310 spin_unlock_irq(&ioc->lock);
3311
3312 while ((p = strsep(&input, " \t\n"))) {
3313 substring_t args[MAX_OPT_ARGS];
3314 char buf[32];
3315 int tok;
3316 u64 v;
3317
3318 if (!*p)
3319 continue;
3320
3321 switch (match_token(p, cost_ctrl_tokens, args)) {
3322 case COST_CTRL:
3323 match_strlcpy(buf, &args[0], sizeof(buf));
3324 if (!strcmp(buf, "auto"))
3325 user = false;
3326 else if (!strcmp(buf, "user"))
3327 user = true;
3328 else
3329 goto einval;
3330 continue;
3331 case COST_MODEL:
3332 match_strlcpy(buf, &args[0], sizeof(buf));
3333 if (strcmp(buf, "linear"))
3334 goto einval;
3335 continue;
3336 }
3337
3338 tok = match_token(p, i_lcoef_tokens, args);
3339 if (tok == NR_I_LCOEFS)
3340 goto einval;
3341 if (match_u64(&args[0], &v))
3342 goto einval;
3343 u[tok] = v;
3344 user = true;
3345 }
3346
3347 spin_lock_irq(&ioc->lock);
3348 if (user) {
3349 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3350 ioc->user_cost_model = true;
3351 } else {
3352 ioc->user_cost_model = false;
3353 }
3354 ioc_refresh_params(ioc, true);
3355 spin_unlock_irq(&ioc->lock);
3356
22ae8ce8 3357 blkdev_put_no_open(bdev);
7caa4715
TH
3358 return nbytes;
3359
3360einval:
3361 ret = -EINVAL;
3362err:
22ae8ce8 3363 blkdev_put_no_open(bdev);
7caa4715
TH
3364 return ret;
3365}
3366
3367static struct cftype ioc_files[] = {
3368 {
3369 .name = "weight",
3370 .flags = CFTYPE_NOT_ON_ROOT,
3371 .seq_show = ioc_weight_show,
3372 .write = ioc_weight_write,
3373 },
3374 {
3375 .name = "cost.qos",
3376 .flags = CFTYPE_ONLY_ON_ROOT,
3377 .seq_show = ioc_qos_show,
3378 .write = ioc_qos_write,
3379 },
3380 {
3381 .name = "cost.model",
3382 .flags = CFTYPE_ONLY_ON_ROOT,
3383 .seq_show = ioc_cost_model_show,
3384 .write = ioc_cost_model_write,
3385 },
3386 {}
3387};
3388
3389static struct blkcg_policy blkcg_policy_iocost = {
3390 .dfl_cftypes = ioc_files,
3391 .cpd_alloc_fn = ioc_cpd_alloc,
3392 .cpd_free_fn = ioc_cpd_free,
3393 .pd_alloc_fn = ioc_pd_alloc,
3394 .pd_init_fn = ioc_pd_init,
3395 .pd_free_fn = ioc_pd_free,
97eb1975 3396 .pd_stat_fn = ioc_pd_stat,
7caa4715
TH
3397};
3398
3399static int __init ioc_init(void)
3400{
3401 return blkcg_policy_register(&blkcg_policy_iocost);
3402}
3403
3404static void __exit ioc_exit(void)
3405{
fa1c3eaf 3406 blkcg_policy_unregister(&blkcg_policy_iocost);
7caa4715
TH
3407}
3408
3409module_init(ioc_init);
3410module_exit(ioc_exit);