]> git.ipfire.org Git - thirdparty/kernel/linux.git/blame - block/blk-iocost.c
blk-iocost: fix divide by 0 error in calc_lcoefs()
[thirdparty/kernel/linux.git] / block / blk-iocost.c
CommitLineData
7caa4715
TH
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
9 * One challenge of controlling IO resources is the lack of trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
18 * time, with a lot of clutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide varienty of devices well enough. Default
5ba1add2 42 * parameters for several different classes of devices are provided and the
7caa4715
TH
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
48 *
49 * 2. Control Strategy
50 *
51 * The device virtual time (vtime) is used as the primary control metric.
52 * The control strategy is composed of the following three parts.
53 *
54 * 2-1. Vtime Distribution
55 *
56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
57 * calculated. Please consider the following hierarchy where the numbers
58 * inside parentheses denote the configured weights.
59 *
60 * root
61 * / \
62 * A (w:100) B (w:300)
63 * / \
64 * A0 (w:100) A1 (w:100)
65 *
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
68 * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
69 * 12.5% each. The distribution mechanism only cares about these flattened
70 * shares. They're called hweights (hierarchical weights) and always add
fe20cdb5 71 * upto 1 (WEIGHT_ONE).
7caa4715
TH
72 *
73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75 * against the device vtime - an IO which takes 10ms on the underlying
76 * device is considered to take 80ms on A0.
77 *
78 * This constitutes the basis of IO capacity distribution. Each cgroup's
79 * vtime is running at a rate determined by its hweight. A cgroup tracks
5ba1add2 80 * the vtime consumed by past IOs and can issue a new IO if doing so
7caa4715
TH
81 * wouldn't outrun the current device vtime. Otherwise, the IO is
82 * suspended until the vtime has progressed enough to cover it.
83 *
84 * 2-2. Vrate Adjustment
85 *
86 * It's unrealistic to expect the cost model to be perfect. There are too
87 * many devices and even on the same device the overall performance
88 * fluctuates depending on numerous factors such as IO mixture and device
89 * internal garbage collection. The controller needs to adapt dynamically.
90 *
91 * This is achieved by adjusting the overall IO rate according to how busy
92 * the device is. If the device becomes overloaded, we're sending down too
93 * many IOs and should generally slow down. If there are waiting issuers
94 * but the device isn't saturated, we're issuing too few and should
95 * generally speed up.
96 *
97 * To slow down, we lower the vrate - the rate at which the device vtime
98 * passes compared to the wall clock. For example, if the vtime is running
99 * at the vrate of 75%, all cgroups added up would only be able to issue
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
101 *
102 * Device business is determined using two criteria - rq wait and
103 * completion latencies.
104 *
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
108 * indication that the device is saturated and we lower the vrate. This
109 * saturation signal is fairly conservative as it only triggers when both
110 * hardware and software queues are filled up, and is used as the default
111 * busy signal.
112 *
113 * As devices can have deep queues and be unfair in how the queued commands
ecaaaabe 114 * are executed, solely depending on rq wait may not result in satisfactory
7caa4715
TH
115 * control quality. For a better control quality, completion latency QoS
116 * parameters can be configured so that the device is considered saturated
117 * if N'th percentile completion latency rises above the set point.
118 *
119 * The completion latency requirements are a function of both the
120 * underlying device characteristics and the desired IO latency quality of
121 * service. There is an inherent trade-off - the tighter the latency QoS,
122 * the higher the bandwidth lossage. Latency QoS is disabled by default
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
124 *
125 * 2-3. Work Conservation
126 *
127 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
128 * periodically while B is sending out enough parallel IOs to saturate the
129 * device on its own. Let's say A's usage amounts to 100ms worth of IO
130 * cost per second, i.e., 10% of the device capacity. The naive
131 * distribution of half and half would lead to 60% utilization of the
132 * device, a significant reduction in the total amount of work done
133 * compared to free-for-all competition. This is too high a cost to pay
134 * for IO control.
135 *
136 * To conserve the total amount of work done, we keep track of how much
137 * each active cgroup is actually using and yield part of its weight if
138 * there are other cgroups which can make use of it. In the above case,
139 * A's weight will be lowered so that it hovers above the actual usage and
140 * B would be able to use the rest.
141 *
142 * As we don't want to penalize a cgroup for donating its weight, the
143 * surplus weight adjustment factors in a margin and has an immediate
144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
145 *
146 * Note that adjusting down surplus weights has the same effects as
147 * accelerating vtime for other cgroups and work conservation can also be
148 * implemented by adjusting vrate dynamically. However, squaring who can
149 * donate and should take back how much requires hweight propagations
150 * anyway making it easier to implement and understand as a separate
151 * mechanism.
6954ff18
TH
152 *
153 * 3. Monitoring
154 *
155 * Instead of debugfs or other clumsy monitoring mechanisms, this
156 * controller uses a drgn based monitoring script -
157 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
5ba1add2 158 * https://github.com/osandov/drgn. The output looks like the following.
6954ff18
TH
159 *
160 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
7c1ee704
TH
161 * active weight hweight% inflt% dbt delay usages%
162 * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
163 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
6954ff18
TH
164 *
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
7caa4715
TH
173 */
174
175#include <linux/kernel.h>
176#include <linux/module.h>
177#include <linux/timer.h>
178#include <linux/time64.h>
179#include <linux/parser.h>
180#include <linux/sched/signal.h>
5e124f74
TH
181#include <asm/local.h>
182#include <asm/local64.h>
7caa4715
TH
183#include "blk-rq-qos.h"
184#include "blk-stat.h"
185#include "blk-wbt.h"
672fdcf0 186#include "blk-cgroup.h"
7caa4715
TH
187
188#ifdef CONFIG_TRACEPOINTS
189
190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191#define TRACE_IOCG_PATH_LEN 1024
192static DEFINE_SPINLOCK(trace_iocg_path_lock);
193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194
195#define TRACE_IOCG_PATH(type, iocg, ...) \
196 do { \
197 unsigned long flags; \
198 if (trace_iocost_##type##_enabled()) { \
199 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
201 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
202 trace_iocost_##type(iocg, trace_iocg_path, \
203 ##__VA_ARGS__); \
204 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
205 } \
206 } while (0)
207
208#else /* CONFIG_TRACE_POINTS */
209#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
210#endif /* CONFIG_TRACE_POINTS */
211
212enum {
213 MILLION = 1000000,
214
215 /* timer period is calculated from latency requirements, bound it */
216 MIN_PERIOD = USEC_PER_MSEC,
217 MAX_PERIOD = USEC_PER_SEC,
218
219 /*
f1de2439 220 * iocg->vtime is targeted at 50% behind the device vtime, which
7caa4715
TH
221 * serves as its IO credit buffer. Surplus weight adjustment is
222 * immediately canceled if the vtime margin runs below 10%.
223 */
7ca5b2e6 224 MARGIN_MIN_PCT = 10,
f1de2439
TH
225 MARGIN_LOW_PCT = 20,
226 MARGIN_TARGET_PCT = 50,
7caa4715 227
b0853ab4
TH
228 INUSE_ADJ_STEP_PCT = 25,
229
7ca5b2e6
TH
230 /* Have some play in timer operations */
231 TIMER_SLACK_PCT = 1,
7caa4715 232
7caa4715 233 /* 1/64k is granular enough and can easily be handled w/ u32 */
fe20cdb5 234 WEIGHT_ONE = 1 << 16,
ff1cc97b 235};
7caa4715 236
ff1cc97b 237enum {
7caa4715
TH
238 /*
239 * As vtime is used to calculate the cost of each IO, it needs to
240 * be fairly high precision. For example, it should be able to
241 * represent the cost of a single page worth of discard with
242 * suffificient accuracy. At the same time, it should be able to
243 * represent reasonably long enough durations to be useful and
244 * convenient during operation.
245 *
246 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
247 * granularity and days of wrap-around time even at extreme vrates.
248 */
249 VTIME_PER_SEC_SHIFT = 37,
250 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
251 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
cd006509 252 VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
7caa4715
TH
253
254 /* bound vrate adjustments within two orders of magnitude */
255 VRATE_MIN_PPM = 10000, /* 1% */
256 VRATE_MAX_PPM = 100000000, /* 10000% */
257
258 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
259 VRATE_CLAMP_ADJ_PCT = 4,
260
5f2779df
AB
261 /* switch iff the conditions are met for longer than this */
262 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
263};
264
265enum {
7caa4715
TH
266 /* if IOs end up waiting for requests, issue less */
267 RQ_WAIT_BUSY_PCT = 5,
268
269 /* unbusy hysterisis */
270 UNBUSY_THR_PCT = 75,
271
5160a5a5
TH
272 /*
273 * The effect of delay is indirect and non-linear and a huge amount of
274 * future debt can accumulate abruptly while unthrottled. Linearly scale
275 * up delay as debt is going up and then let it decay exponentially.
276 * This gives us quick ramp ups while delay is accumulating and long
277 * tails which can help reducing the frequency of debt explosions on
278 * unthrottle. The parameters are experimentally determined.
279 *
280 * The delay mechanism provides adequate protection and behavior in many
281 * cases. However, this is far from ideal and falls shorts on both
282 * fronts. The debtors are often throttled too harshly costing a
283 * significant level of fairness and possibly total work while the
284 * protection against their impacts on the system can be choppy and
285 * unreliable.
286 *
287 * The shortcoming primarily stems from the fact that, unlike for page
288 * cache, the kernel doesn't have well-defined back-pressure propagation
289 * mechanism and policies for anonymous memory. Fully addressing this
290 * issue will likely require substantial improvements in the area.
291 */
292 MIN_DELAY_THR_PCT = 500,
293 MAX_DELAY_THR_PCT = 25000,
294 MIN_DELAY = 250,
295 MAX_DELAY = 250 * USEC_PER_MSEC,
296
c7af2a00
TH
297 /* halve debts if avg usage over 100ms is under 50% */
298 DFGV_USAGE_PCT = 50,
299 DFGV_PERIOD = 100 * USEC_PER_MSEC,
dda1315f 300
7caa4715
TH
301 /* don't let cmds which take a very long time pin lagging for too long */
302 MAX_LAGGING_PERIODS = 10,
303
7caa4715
TH
304 /*
305 * Count IO size in 4k pages. The 12bit shift helps keeping
306 * size-proportional components of cost calculation in closer
307 * numbers of digits to per-IO cost components.
308 */
309 IOC_PAGE_SHIFT = 12,
310 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
311 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
312
313 /* if apart further than 16M, consider randio for linear model */
314 LCOEF_RANDIO_PAGES = 4096,
315};
316
317enum ioc_running {
318 IOC_IDLE,
319 IOC_RUNNING,
320 IOC_STOP,
321};
322
323/* io.cost.qos controls including per-dev enable of the whole controller */
324enum {
325 QOS_ENABLE,
326 QOS_CTRL,
327 NR_QOS_CTRL_PARAMS,
328};
329
330/* io.cost.qos params */
331enum {
332 QOS_RPPM,
333 QOS_RLAT,
334 QOS_WPPM,
335 QOS_WLAT,
336 QOS_MIN,
337 QOS_MAX,
338 NR_QOS_PARAMS,
339};
340
341/* io.cost.model controls */
342enum {
343 COST_CTRL,
344 COST_MODEL,
345 NR_COST_CTRL_PARAMS,
346};
347
348/* builtin linear cost model coefficients */
349enum {
350 I_LCOEF_RBPS,
351 I_LCOEF_RSEQIOPS,
352 I_LCOEF_RRANDIOPS,
353 I_LCOEF_WBPS,
354 I_LCOEF_WSEQIOPS,
355 I_LCOEF_WRANDIOPS,
356 NR_I_LCOEFS,
357};
358
359enum {
360 LCOEF_RPAGE,
361 LCOEF_RSEQIO,
362 LCOEF_RRANDIO,
363 LCOEF_WPAGE,
364 LCOEF_WSEQIO,
365 LCOEF_WRANDIO,
366 NR_LCOEFS,
367};
368
369enum {
370 AUTOP_INVALID,
371 AUTOP_HDD,
372 AUTOP_SSD_QD1,
373 AUTOP_SSD_DFL,
374 AUTOP_SSD_FAST,
375};
376
7caa4715
TH
377struct ioc_params {
378 u32 qos[NR_QOS_PARAMS];
379 u64 i_lcoefs[NR_I_LCOEFS];
380 u64 lcoefs[NR_LCOEFS];
381 u32 too_fast_vrate_pct;
382 u32 too_slow_vrate_pct;
383};
384
7ca5b2e6
TH
385struct ioc_margins {
386 s64 min;
f1de2439
TH
387 s64 low;
388 s64 target;
7ca5b2e6
TH
389};
390
7caa4715 391struct ioc_missed {
5e124f74
TH
392 local_t nr_met;
393 local_t nr_missed;
7caa4715
TH
394 u32 last_met;
395 u32 last_missed;
396};
397
398struct ioc_pcpu_stat {
399 struct ioc_missed missed[2];
400
5e124f74 401 local64_t rq_wait_ns;
7caa4715
TH
402 u64 last_rq_wait_ns;
403};
404
405/* per device */
406struct ioc {
407 struct rq_qos rqos;
408
409 bool enabled;
410
411 struct ioc_params params;
7ca5b2e6 412 struct ioc_margins margins;
7caa4715 413 u32 period_us;
7ca5b2e6 414 u32 timer_slack_ns;
7caa4715
TH
415 u64 vrate_min;
416 u64 vrate_max;
417
418 spinlock_t lock;
419 struct timer_list timer;
420 struct list_head active_iocgs; /* active cgroups */
421 struct ioc_pcpu_stat __percpu *pcpu_stat;
422
423 enum ioc_running running;
424 atomic64_t vtime_rate;
ac33e91e
TH
425 u64 vtime_base_rate;
426 s64 vtime_err;
7caa4715 427
67b7b641 428 seqcount_spinlock_t period_seqcount;
ce95570a 429 u64 period_at; /* wallclock starttime */
7caa4715
TH
430 u64 period_at_vtime; /* vtime starttime */
431
432 atomic64_t cur_period; /* inc'd each period */
433 int busy_level; /* saturation history */
434
7caa4715
TH
435 bool weights_updated;
436 atomic_t hweight_gen; /* for lazy hweights */
437
c7af2a00
TH
438 /* debt forgivness */
439 u64 dfgv_period_at;
440 u64 dfgv_period_rem;
441 u64 dfgv_usage_us_sum;
dda1315f 442
7caa4715
TH
443 u64 autop_too_fast_at;
444 u64 autop_too_slow_at;
445 int autop_idx;
446 bool user_qos_params:1;
447 bool user_cost_model:1;
448};
449
97eb1975
TH
450struct iocg_pcpu_stat {
451 local64_t abs_vusage;
452};
453
454struct iocg_stat {
455 u64 usage_us;
f0bf84a5
TH
456 u64 wait_us;
457 u64 indebt_us;
458 u64 indelay_us;
97eb1975
TH
459};
460
7caa4715
TH
461/* per device-cgroup pair */
462struct ioc_gq {
463 struct blkg_policy_data pd;
464 struct ioc *ioc;
465
466 /*
467 * A iocg can get its weight from two sources - an explicit
468 * per-device-cgroup configuration or the default weight of the
469 * cgroup. `cfg_weight` is the explicit per-device-cgroup
470 * configuration. `weight` is the effective considering both
471 * sources.
472 *
473 * When an idle cgroup becomes active its `active` goes from 0 to
474 * `weight`. `inuse` is the surplus adjusted active weight.
475 * `active` and `inuse` are used to calculate `hweight_active` and
476 * `hweight_inuse`.
477 *
478 * `last_inuse` remembers `inuse` while an iocg is idle to persist
479 * surplus adjustments.
b0853ab4
TH
480 *
481 * `inuse` may be adjusted dynamically during period. `saved_*` are used
482 * to determine and track adjustments.
7caa4715
TH
483 */
484 u32 cfg_weight;
485 u32 weight;
486 u32 active;
487 u32 inuse;
b0853ab4 488
7caa4715 489 u32 last_inuse;
b0853ab4 490 s64 saved_margin;
7caa4715
TH
491
492 sector_t cursor; /* to detect randio */
493
494 /*
495 * `vtime` is this iocg's vtime cursor which progresses as IOs are
496 * issued. If lagging behind device vtime, the delta represents
5ba1add2 497 * the currently available IO budget. If running ahead, the
7caa4715
TH
498 * overage.
499 *
500 * `vtime_done` is the same but progressed on completion rather
501 * than issue. The delta behind `vtime` represents the cost of
502 * currently in-flight IOs.
7caa4715
TH
503 */
504 atomic64_t vtime;
505 atomic64_t done_vtime;
0b80f986 506 u64 abs_vdebt;
7caa4715 507
5160a5a5
TH
508 /* current delay in effect and when it started */
509 u64 delay;
510 u64 delay_at;
511
7caa4715
TH
512 /*
513 * The period this iocg was last active in. Used for deactivation
514 * and invalidating `vtime`.
515 */
516 atomic64_t active_period;
517 struct list_head active_list;
518
00410f1b 519 /* see __propagate_weights() and current_hweight() for details */
7caa4715
TH
520 u64 child_active_sum;
521 u64 child_inuse_sum;
e08d02aa 522 u64 child_adjusted_sum;
7caa4715
TH
523 int hweight_gen;
524 u32 hweight_active;
525 u32 hweight_inuse;
e08d02aa 526 u32 hweight_donating;
93f7d2db 527 u32 hweight_after_donation;
7caa4715 528
97eb1975 529 struct list_head walk_list;
8692d2db 530 struct list_head surplus_list;
97eb1975 531
7caa4715
TH
532 struct wait_queue_head waitq;
533 struct hrtimer waitq_timer;
7caa4715 534
1aa50d02
TH
535 /* timestamp at the latest activation */
536 u64 activated_at;
537
97eb1975
TH
538 /* statistics */
539 struct iocg_pcpu_stat __percpu *pcpu_stat;
2a371f7d 540 struct iocg_stat stat;
97eb1975
TH
541 struct iocg_stat last_stat;
542 u64 last_stat_abs_vusage;
f1de2439 543 u64 usage_delta_us;
f0bf84a5
TH
544 u64 wait_since;
545 u64 indebt_since;
546 u64 indelay_since;
7caa4715
TH
547
548 /* this iocg's depth in the hierarchy and ancestors including self */
549 int level;
550 struct ioc_gq *ancestors[];
551};
552
553/* per cgroup */
554struct ioc_cgrp {
555 struct blkcg_policy_data cpd;
556 unsigned int dfl_weight;
557};
558
559struct ioc_now {
560 u64 now_ns;
ce95570a 561 u64 now;
7caa4715 562 u64 vnow;
7caa4715
TH
563};
564
565struct iocg_wait {
566 struct wait_queue_entry wait;
567 struct bio *bio;
568 u64 abs_cost;
569 bool committed;
570};
571
572struct iocg_wake_ctx {
573 struct ioc_gq *iocg;
574 u32 hw_inuse;
575 s64 vbudget;
576};
577
578static const struct ioc_params autop[] = {
579 [AUTOP_HDD] = {
580 .qos = {
7afcccaf
TH
581 [QOS_RLAT] = 250000, /* 250ms */
582 [QOS_WLAT] = 250000,
7caa4715
TH
583 [QOS_MIN] = VRATE_MIN_PPM,
584 [QOS_MAX] = VRATE_MAX_PPM,
585 },
586 .i_lcoefs = {
587 [I_LCOEF_RBPS] = 174019176,
588 [I_LCOEF_RSEQIOPS] = 41708,
589 [I_LCOEF_RRANDIOPS] = 370,
590 [I_LCOEF_WBPS] = 178075866,
591 [I_LCOEF_WSEQIOPS] = 42705,
592 [I_LCOEF_WRANDIOPS] = 378,
593 },
594 },
595 [AUTOP_SSD_QD1] = {
596 .qos = {
597 [QOS_RLAT] = 25000, /* 25ms */
598 [QOS_WLAT] = 25000,
599 [QOS_MIN] = VRATE_MIN_PPM,
600 [QOS_MAX] = VRATE_MAX_PPM,
601 },
602 .i_lcoefs = {
603 [I_LCOEF_RBPS] = 245855193,
604 [I_LCOEF_RSEQIOPS] = 61575,
605 [I_LCOEF_RRANDIOPS] = 6946,
606 [I_LCOEF_WBPS] = 141365009,
607 [I_LCOEF_WSEQIOPS] = 33716,
608 [I_LCOEF_WRANDIOPS] = 26796,
609 },
610 },
611 [AUTOP_SSD_DFL] = {
612 .qos = {
613 [QOS_RLAT] = 25000, /* 25ms */
614 [QOS_WLAT] = 25000,
615 [QOS_MIN] = VRATE_MIN_PPM,
616 [QOS_MAX] = VRATE_MAX_PPM,
617 },
618 .i_lcoefs = {
619 [I_LCOEF_RBPS] = 488636629,
620 [I_LCOEF_RSEQIOPS] = 8932,
621 [I_LCOEF_RRANDIOPS] = 8518,
622 [I_LCOEF_WBPS] = 427891549,
623 [I_LCOEF_WSEQIOPS] = 28755,
624 [I_LCOEF_WRANDIOPS] = 21940,
625 },
626 .too_fast_vrate_pct = 500,
627 },
628 [AUTOP_SSD_FAST] = {
629 .qos = {
630 [QOS_RLAT] = 5000, /* 5ms */
631 [QOS_WLAT] = 5000,
632 [QOS_MIN] = VRATE_MIN_PPM,
633 [QOS_MAX] = VRATE_MAX_PPM,
634 },
635 .i_lcoefs = {
636 [I_LCOEF_RBPS] = 3102524156LLU,
637 [I_LCOEF_RSEQIOPS] = 724816,
638 [I_LCOEF_RRANDIOPS] = 778122,
639 [I_LCOEF_WBPS] = 1742780862LLU,
640 [I_LCOEF_WSEQIOPS] = 425702,
641 [I_LCOEF_WRANDIOPS] = 443193,
642 },
643 .too_slow_vrate_pct = 10,
644 },
645};
646
647/*
648 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
649 * vtime credit shortage and down on device saturation.
650 */
651static u32 vrate_adj_pct[] =
652 { 0, 0, 0, 0,
653 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
654 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
655 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
656
657static struct blkcg_policy blkcg_policy_iocost;
658
659/* accessors and helpers */
660static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
661{
662 return container_of(rqos, struct ioc, rqos);
663}
664
665static struct ioc *q_to_ioc(struct request_queue *q)
666{
667 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
668}
669
7caa4715
TH
670static const char __maybe_unused *ioc_name(struct ioc *ioc)
671{
9df3e651
CH
672 struct gendisk *disk = ioc->rqos.q->disk;
673
674 if (!disk)
675 return "<unknown>";
676 return disk->disk_name;
7caa4715
TH
677}
678
679static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
680{
681 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
682}
683
684static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
685{
686 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
687}
688
689static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
690{
691 return pd_to_blkg(&iocg->pd);
692}
693
694static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
695{
696 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
697 struct ioc_cgrp, cpd);
698}
699
700/*
701 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
36a52481 702 * weight, the more expensive each IO. Must round up.
7caa4715
TH
703 */
704static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
705{
fe20cdb5 706 return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
7caa4715
TH
707}
708
36a52481
TH
709/*
710 * The inverse of abs_cost_to_cost(). Must round up.
711 */
712static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
713{
fe20cdb5 714 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
36a52481
TH
715}
716
97eb1975
TH
717static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
718 u64 abs_cost, u64 cost)
7caa4715 719{
97eb1975
TH
720 struct iocg_pcpu_stat *gcs;
721
7caa4715
TH
722 bio->bi_iocost_cost = cost;
723 atomic64_add(cost, &iocg->vtime);
97eb1975
TH
724
725 gcs = get_cpu_ptr(iocg->pcpu_stat);
726 local64_add(abs_cost, &gcs->abs_vusage);
727 put_cpu_ptr(gcs);
7caa4715
TH
728}
729
da437b95
TH
730static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
731{
732 if (lock_ioc) {
733 spin_lock_irqsave(&iocg->ioc->lock, *flags);
734 spin_lock(&iocg->waitq.lock);
735 } else {
736 spin_lock_irqsave(&iocg->waitq.lock, *flags);
737 }
738}
739
740static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
741{
742 if (unlock_ioc) {
743 spin_unlock(&iocg->waitq.lock);
744 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
745 } else {
746 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
747 }
748}
749
7caa4715
TH
750#define CREATE_TRACE_POINTS
751#include <trace/events/iocost.h>
752
7ca5b2e6
TH
753static void ioc_refresh_margins(struct ioc *ioc)
754{
755 struct ioc_margins *margins = &ioc->margins;
756 u32 period_us = ioc->period_us;
ac33e91e 757 u64 vrate = ioc->vtime_base_rate;
7ca5b2e6
TH
758
759 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
f1de2439
TH
760 margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
761 margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
7ca5b2e6
TH
762}
763
7caa4715
TH
764/* latency Qos params changed, update period_us and all the dependent params */
765static void ioc_refresh_period_us(struct ioc *ioc)
766{
767 u32 ppm, lat, multi, period_us;
768
769 lockdep_assert_held(&ioc->lock);
770
771 /* pick the higher latency target */
772 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
773 ppm = ioc->params.qos[QOS_RPPM];
774 lat = ioc->params.qos[QOS_RLAT];
775 } else {
776 ppm = ioc->params.qos[QOS_WPPM];
777 lat = ioc->params.qos[QOS_WLAT];
778 }
779
780 /*
781 * We want the period to be long enough to contain a healthy number
782 * of IOs while short enough for granular control. Define it as a
783 * multiple of the latency target. Ideally, the multiplier should
784 * be scaled according to the percentile so that it would nominally
785 * contain a certain number of requests. Let's be simpler and
786 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
787 */
788 if (ppm)
789 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
790 else
791 multi = 2;
792 period_us = multi * lat;
793 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
794
795 /* calculate dependent params */
796 ioc->period_us = period_us;
7ca5b2e6
TH
797 ioc->timer_slack_ns = div64_u64(
798 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
799 100);
800 ioc_refresh_margins(ioc);
7caa4715
TH
801}
802
803static int ioc_autop_idx(struct ioc *ioc)
804{
805 int idx = ioc->autop_idx;
806 const struct ioc_params *p = &autop[idx];
807 u32 vrate_pct;
808 u64 now_ns;
809
810 /* rotational? */
811 if (!blk_queue_nonrot(ioc->rqos.q))
812 return AUTOP_HDD;
813
814 /* handle SATA SSDs w/ broken NCQ */
815 if (blk_queue_depth(ioc->rqos.q) == 1)
816 return AUTOP_SSD_QD1;
817
818 /* use one of the normal ssd sets */
819 if (idx < AUTOP_SSD_DFL)
820 return AUTOP_SSD_DFL;
821
822 /* if user is overriding anything, maintain what was there */
823 if (ioc->user_qos_params || ioc->user_cost_model)
824 return idx;
825
826 /* step up/down based on the vrate */
ac33e91e 827 vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
7caa4715
TH
828 now_ns = ktime_get_ns();
829
830 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
831 if (!ioc->autop_too_fast_at)
832 ioc->autop_too_fast_at = now_ns;
833 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
834 return idx + 1;
835 } else {
836 ioc->autop_too_fast_at = 0;
837 }
838
839 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
840 if (!ioc->autop_too_slow_at)
841 ioc->autop_too_slow_at = now_ns;
842 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
843 return idx - 1;
844 } else {
845 ioc->autop_too_slow_at = 0;
846 }
847
848 return idx;
849}
850
851/*
852 * Take the followings as input
853 *
854 * @bps maximum sequential throughput
855 * @seqiops maximum sequential 4k iops
856 * @randiops maximum random 4k iops
857 *
858 * and calculate the linear model cost coefficients.
859 *
860 * *@page per-page cost 1s / (@bps / 4096)
861 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
862 * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0)
863 */
864static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
865 u64 *page, u64 *seqio, u64 *randio)
866{
867 u64 v;
868
869 *page = *seqio = *randio = 0;
870
984af1e6
LN
871 if (bps) {
872 u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
873
874 if (bps_pages)
875 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
876 else
877 *page = 1;
878 }
7caa4715
TH
879
880 if (seqiops) {
881 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
882 if (v > *page)
883 *seqio = v - *page;
884 }
885
886 if (randiops) {
887 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
888 if (v > *page)
889 *randio = v - *page;
890 }
891}
892
893static void ioc_refresh_lcoefs(struct ioc *ioc)
894{
895 u64 *u = ioc->params.i_lcoefs;
896 u64 *c = ioc->params.lcoefs;
897
898 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
899 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
900 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
901 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
902}
903
904static bool ioc_refresh_params(struct ioc *ioc, bool force)
905{
906 const struct ioc_params *p;
907 int idx;
908
909 lockdep_assert_held(&ioc->lock);
910
911 idx = ioc_autop_idx(ioc);
912 p = &autop[idx];
913
914 if (idx == ioc->autop_idx && !force)
915 return false;
916
c6d2efdd 917 if (idx != ioc->autop_idx) {
7caa4715 918 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
c6d2efdd
KS
919 ioc->vtime_base_rate = VTIME_PER_USEC;
920 }
7caa4715
TH
921
922 ioc->autop_idx = idx;
923 ioc->autop_too_fast_at = 0;
924 ioc->autop_too_slow_at = 0;
925
926 if (!ioc->user_qos_params)
927 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
928 if (!ioc->user_cost_model)
929 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
930
931 ioc_refresh_period_us(ioc);
932 ioc_refresh_lcoefs(ioc);
933
934 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
935 VTIME_PER_USEC, MILLION);
936 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
937 VTIME_PER_USEC, MILLION);
938
939 return true;
940}
941
ac33e91e
TH
942/*
943 * When an iocg accumulates too much vtime or gets deactivated, we throw away
944 * some vtime, which lowers the overall device utilization. As the exact amount
945 * which is being thrown away is known, we can compensate by accelerating the
946 * vrate accordingly so that the extra vtime generated in the current period
947 * matches what got lost.
948 */
949static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
950{
951 s64 pleft = ioc->period_at + ioc->period_us - now->now;
952 s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
953 s64 vcomp, vcomp_min, vcomp_max;
954
955 lockdep_assert_held(&ioc->lock);
956
957 /* we need some time left in this period */
958 if (pleft <= 0)
959 goto done;
960
961 /*
962 * Calculate how much vrate should be adjusted to offset the error.
963 * Limit the amount of adjustment and deduct the adjusted amount from
964 * the error.
965 */
966 vcomp = -div64_s64(ioc->vtime_err, pleft);
967 vcomp_min = -(ioc->vtime_base_rate >> 1);
968 vcomp_max = ioc->vtime_base_rate;
969 vcomp = clamp(vcomp, vcomp_min, vcomp_max);
970
971 ioc->vtime_err += vcomp * pleft;
972
973 atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
974done:
975 /* bound how much error can accumulate */
976 ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
977}
978
926f75f6
BW
979static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
980 int nr_lagging, int nr_shortages,
981 int prev_busy_level, u32 *missed_ppm)
982{
983 u64 vrate = ioc->vtime_base_rate;
984 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
985
986 if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
987 if (ioc->busy_level != prev_busy_level || nr_lagging)
63c9eac4 988 trace_iocost_ioc_vrate_adj(ioc, vrate,
926f75f6
BW
989 missed_ppm, rq_wait_pct,
990 nr_lagging, nr_shortages);
991
992 return;
993 }
994
926f75f6
BW
995 /*
996 * If vrate is out of bounds, apply clamp gradually as the
997 * bounds can change abruptly. Otherwise, apply busy_level
998 * based adjustment.
999 */
1000 if (vrate < vrate_min) {
1001 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
1002 vrate = min(vrate, vrate_min);
1003 } else if (vrate > vrate_max) {
1004 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
1005 vrate = max(vrate, vrate_max);
1006 } else {
1007 int idx = min_t(int, abs(ioc->busy_level),
1008 ARRAY_SIZE(vrate_adj_pct) - 1);
1009 u32 adj_pct = vrate_adj_pct[idx];
1010
1011 if (ioc->busy_level > 0)
1012 adj_pct = 100 - adj_pct;
1013 else
1014 adj_pct = 100 + adj_pct;
1015
1016 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1017 vrate_min, vrate_max);
1018 }
1019
1020 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1021 nr_lagging, nr_shortages);
1022
1023 ioc->vtime_base_rate = vrate;
1024 ioc_refresh_margins(ioc);
1025}
1026
7caa4715
TH
1027/* take a snapshot of the current [v]time and vrate */
1028static void ioc_now(struct ioc *ioc, struct ioc_now *now)
1029{
1030 unsigned seq;
6c31be32 1031 u64 vrate;
7caa4715
TH
1032
1033 now->now_ns = ktime_get();
1034 now->now = ktime_to_us(now->now_ns);
6c31be32 1035 vrate = atomic64_read(&ioc->vtime_rate);
7caa4715
TH
1036
1037 /*
1038 * The current vtime is
1039 *
1040 * vtime at period start + (wallclock time since the start) * vrate
1041 *
1042 * As a consistent snapshot of `period_at_vtime` and `period_at` is
1043 * needed, they're seqcount protected.
1044 */
1045 do {
1046 seq = read_seqcount_begin(&ioc->period_seqcount);
1047 now->vnow = ioc->period_at_vtime +
6c31be32 1048 (now->now - ioc->period_at) * vrate;
7caa4715
TH
1049 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
1050}
1051
1052static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1053{
7caa4715
TH
1054 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1055
1056 write_seqcount_begin(&ioc->period_seqcount);
1057 ioc->period_at = now->now;
1058 ioc->period_at_vtime = now->vnow;
1059 write_seqcount_end(&ioc->period_seqcount);
1060
1061 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1062 add_timer(&ioc->timer);
1063}
1064
1065/*
1066 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
b0853ab4
TH
1067 * weight sums and propagate upwards accordingly. If @save, the current margin
1068 * is saved to be used as reference for later inuse in-period adjustments.
7caa4715 1069 */
b0853ab4
TH
1070static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1071 bool save, struct ioc_now *now)
7caa4715
TH
1072{
1073 struct ioc *ioc = iocg->ioc;
1074 int lvl;
1075
1076 lockdep_assert_held(&ioc->lock);
1077
e9f4eee9
TH
1078 /*
1079 * For an active leaf node, its inuse shouldn't be zero or exceed
1080 * @active. An active internal node's inuse is solely determined by the
1081 * inuse to active ratio of its children regardless of @inuse.
1082 */
1083 if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
1084 inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
1085 iocg->child_active_sum);
1086 } else {
1087 inuse = clamp_t(u32, inuse, 1, active);
1088 }
db84a72a 1089
b0853ab4
TH
1090 iocg->last_inuse = iocg->inuse;
1091 if (save)
1092 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1093
db84a72a
TH
1094 if (active == iocg->active && inuse == iocg->inuse)
1095 return;
7caa4715
TH
1096
1097 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1098 struct ioc_gq *parent = iocg->ancestors[lvl];
1099 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1100 u32 parent_active = 0, parent_inuse = 0;
1101
1102 /* update the level sums */
1103 parent->child_active_sum += (s32)(active - child->active);
1104 parent->child_inuse_sum += (s32)(inuse - child->inuse);
e9f4eee9 1105 /* apply the updates */
7caa4715
TH
1106 child->active = active;
1107 child->inuse = inuse;
1108
1109 /*
1110 * The delta between inuse and active sums indicates that
5ba1add2 1111 * much of weight is being given away. Parent's inuse
7caa4715
TH
1112 * and active should reflect the ratio.
1113 */
1114 if (parent->child_active_sum) {
1115 parent_active = parent->weight;
1116 parent_inuse = DIV64_U64_ROUND_UP(
1117 parent_active * parent->child_inuse_sum,
1118 parent->child_active_sum);
1119 }
1120
1121 /* do we need to keep walking up? */
1122 if (parent_active == parent->active &&
1123 parent_inuse == parent->inuse)
1124 break;
1125
1126 active = parent_active;
1127 inuse = parent_inuse;
1128 }
1129
1130 ioc->weights_updated = true;
1131}
1132
00410f1b 1133static void commit_weights(struct ioc *ioc)
7caa4715
TH
1134{
1135 lockdep_assert_held(&ioc->lock);
1136
1137 if (ioc->weights_updated) {
1138 /* paired with rmb in current_hweight(), see there */
1139 smp_wmb();
1140 atomic_inc(&ioc->hweight_gen);
1141 ioc->weights_updated = false;
1142 }
1143}
1144
b0853ab4
TH
1145static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1146 bool save, struct ioc_now *now)
7caa4715 1147{
b0853ab4 1148 __propagate_weights(iocg, active, inuse, save, now);
00410f1b 1149 commit_weights(iocg->ioc);
7caa4715
TH
1150}
1151
1152static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1153{
1154 struct ioc *ioc = iocg->ioc;
1155 int lvl;
1156 u32 hwa, hwi;
1157 int ioc_gen;
1158
1159 /* hot path - if uptodate, use cached */
1160 ioc_gen = atomic_read(&ioc->hweight_gen);
1161 if (ioc_gen == iocg->hweight_gen)
1162 goto out;
1163
1164 /*
00410f1b
TH
1165 * Paired with wmb in commit_weights(). If we saw the updated
1166 * hweight_gen, all the weight updates from __propagate_weights() are
1167 * visible too.
7caa4715
TH
1168 *
1169 * We can race with weight updates during calculation and get it
1170 * wrong. However, hweight_gen would have changed and a future
1171 * reader will recalculate and we're guaranteed to discard the
1172 * wrong result soon.
1173 */
1174 smp_rmb();
1175
fe20cdb5 1176 hwa = hwi = WEIGHT_ONE;
7caa4715
TH
1177 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1178 struct ioc_gq *parent = iocg->ancestors[lvl];
1179 struct ioc_gq *child = iocg->ancestors[lvl + 1];
bd0adb91
TH
1180 u64 active_sum = READ_ONCE(parent->child_active_sum);
1181 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
7caa4715
TH
1182 u32 active = READ_ONCE(child->active);
1183 u32 inuse = READ_ONCE(child->inuse);
1184
1185 /* we can race with deactivations and either may read as zero */
1186 if (!active_sum || !inuse_sum)
1187 continue;
1188
bd0adb91
TH
1189 active_sum = max_t(u64, active, active_sum);
1190 hwa = div64_u64((u64)hwa * active, active_sum);
7caa4715 1191
bd0adb91
TH
1192 inuse_sum = max_t(u64, inuse, inuse_sum);
1193 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
7caa4715
TH
1194 }
1195
1196 iocg->hweight_active = max_t(u32, hwa, 1);
1197 iocg->hweight_inuse = max_t(u32, hwi, 1);
1198 iocg->hweight_gen = ioc_gen;
1199out:
1200 if (hw_activep)
1201 *hw_activep = iocg->hweight_active;
1202 if (hw_inusep)
1203 *hw_inusep = iocg->hweight_inuse;
1204}
1205
93f7d2db
TH
1206/*
1207 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1208 * other weights stay unchanged.
1209 */
1210static u32 current_hweight_max(struct ioc_gq *iocg)
1211{
1212 u32 hwm = WEIGHT_ONE;
1213 u32 inuse = iocg->active;
1214 u64 child_inuse_sum;
1215 int lvl;
1216
1217 lockdep_assert_held(&iocg->ioc->lock);
1218
1219 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1220 struct ioc_gq *parent = iocg->ancestors[lvl];
1221 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1222
1223 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1224 hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1225 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1226 parent->child_active_sum);
1227 }
1228
1229 return max_t(u32, hwm, 1);
1230}
1231
b0853ab4 1232static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
7caa4715
TH
1233{
1234 struct ioc *ioc = iocg->ioc;
1235 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1236 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1237 u32 weight;
1238
1239 lockdep_assert_held(&ioc->lock);
1240
1241 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1242 if (weight != iocg->weight && iocg->active)
b0853ab4 1243 propagate_weights(iocg, weight, iocg->inuse, true, now);
7caa4715
TH
1244 iocg->weight = weight;
1245}
1246
1247static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1248{
1249 struct ioc *ioc = iocg->ioc;
ac33e91e
TH
1250 u64 last_period, cur_period;
1251 u64 vtime, vtarget;
7caa4715
TH
1252 int i;
1253
1254 /*
1255 * If seem to be already active, just update the stamp to tell the
1256 * timer that we're still active. We don't mind occassional races.
1257 */
1258 if (!list_empty(&iocg->active_list)) {
1259 ioc_now(ioc, now);
1260 cur_period = atomic64_read(&ioc->cur_period);
1261 if (atomic64_read(&iocg->active_period) != cur_period)
1262 atomic64_set(&iocg->active_period, cur_period);
1263 return true;
1264 }
1265
1266 /* racy check on internal node IOs, treat as root level IOs */
1267 if (iocg->child_active_sum)
1268 return false;
1269
1270 spin_lock_irq(&ioc->lock);
1271
1272 ioc_now(ioc, now);
1273
1274 /* update period */
1275 cur_period = atomic64_read(&ioc->cur_period);
1276 last_period = atomic64_read(&iocg->active_period);
1277 atomic64_set(&iocg->active_period, cur_period);
1278
1279 /* already activated or breaking leaf-only constraint? */
8b37bc27
JX
1280 if (!list_empty(&iocg->active_list))
1281 goto succeed_unlock;
1282 for (i = iocg->level - 1; i > 0; i--)
1283 if (!list_empty(&iocg->ancestors[i]->active_list))
7caa4715 1284 goto fail_unlock;
8b37bc27 1285
7caa4715
TH
1286 if (iocg->child_active_sum)
1287 goto fail_unlock;
1288
1289 /*
ac33e91e
TH
1290 * Always start with the target budget. On deactivation, we throw away
1291 * anything above it.
7caa4715 1292 */
ac33e91e 1293 vtarget = now->vnow - ioc->margins.target;
7caa4715 1294 vtime = atomic64_read(&iocg->vtime);
7caa4715 1295
ac33e91e
TH
1296 atomic64_add(vtarget - vtime, &iocg->vtime);
1297 atomic64_add(vtarget - vtime, &iocg->done_vtime);
1298 vtime = vtarget;
7caa4715
TH
1299
1300 /*
1301 * Activate, propagate weight and start period timer if not
1302 * running. Reset hweight_gen to avoid accidental match from
1303 * wrapping.
1304 */
1305 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1306 list_add(&iocg->active_list, &ioc->active_iocgs);
b0853ab4 1307
00410f1b 1308 propagate_weights(iocg, iocg->weight,
b0853ab4 1309 iocg->last_inuse ?: iocg->weight, true, now);
7caa4715
TH
1310
1311 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1312 last_period, cur_period, vtime);
1313
1aa50d02 1314 iocg->activated_at = now->now;
7caa4715
TH
1315
1316 if (ioc->running == IOC_IDLE) {
1317 ioc->running = IOC_RUNNING;
c7af2a00
TH
1318 ioc->dfgv_period_at = now->now;
1319 ioc->dfgv_period_rem = 0;
7caa4715
TH
1320 ioc_start_period(ioc, now);
1321 }
1322
8b37bc27 1323succeed_unlock:
7caa4715
TH
1324 spin_unlock_irq(&ioc->lock);
1325 return true;
1326
1327fail_unlock:
1328 spin_unlock_irq(&ioc->lock);
1329 return false;
1330}
1331
6ef20f78
TH
1332static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1333{
1334 struct ioc *ioc = iocg->ioc;
1335 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
5160a5a5
TH
1336 u64 tdelta, delay, new_delay;
1337 s64 vover, vover_pct;
c421a3eb 1338 u32 hwa;
6ef20f78
TH
1339
1340 lockdep_assert_held(&iocg->waitq.lock);
1341
5160a5a5
TH
1342 /* calculate the current delay in effect - 1/2 every second */
1343 tdelta = now->now - iocg->delay_at;
1344 if (iocg->delay)
1345 delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1346 else
1347 delay = 0;
1348
1349 /* calculate the new delay from the debt amount */
c421a3eb 1350 current_hweight(iocg, &hwa, NULL);
5160a5a5
TH
1351 vover = atomic64_read(&iocg->vtime) +
1352 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
ac33e91e
TH
1353 vover_pct = div64_s64(100 * vover,
1354 ioc->period_us * ioc->vtime_base_rate);
5160a5a5
TH
1355
1356 if (vover_pct <= MIN_DELAY_THR_PCT)
1357 new_delay = 0;
1358 else if (vover_pct >= MAX_DELAY_THR_PCT)
1359 new_delay = MAX_DELAY;
1360 else
1361 new_delay = MIN_DELAY +
1362 div_u64((MAX_DELAY - MIN_DELAY) *
1363 (vover_pct - MIN_DELAY_THR_PCT),
1364 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1365
1366 /* pick the higher one and apply */
1367 if (new_delay > delay) {
1368 iocg->delay = new_delay;
1369 iocg->delay_at = now->now;
1370 delay = new_delay;
1371 }
6ef20f78 1372
5160a5a5 1373 if (delay >= MIN_DELAY) {
f0bf84a5
TH
1374 if (!iocg->indelay_since)
1375 iocg->indelay_since = now->now;
5160a5a5
TH
1376 blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1377 return true;
1378 } else {
f0bf84a5 1379 if (iocg->indelay_since) {
2a371f7d 1380 iocg->stat.indelay_us += now->now - iocg->indelay_since;
f0bf84a5
TH
1381 iocg->indelay_since = 0;
1382 }
5160a5a5 1383 iocg->delay = 0;
6ef20f78
TH
1384 blkcg_clear_delay(blkg);
1385 return false;
1386 }
6ef20f78
TH
1387}
1388
c421a3eb
TH
1389static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1390 struct ioc_now *now)
1391{
1392 struct iocg_pcpu_stat *gcs;
1393
1394 lockdep_assert_held(&iocg->ioc->lock);
1395 lockdep_assert_held(&iocg->waitq.lock);
1396 WARN_ON_ONCE(list_empty(&iocg->active_list));
1397
1398 /*
1399 * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1400 * inuse donating all of it share to others until its debt is paid off.
1401 */
f0bf84a5
TH
1402 if (!iocg->abs_vdebt && abs_cost) {
1403 iocg->indebt_since = now->now;
c421a3eb 1404 propagate_weights(iocg, iocg->active, 0, false, now);
f0bf84a5 1405 }
c421a3eb
TH
1406
1407 iocg->abs_vdebt += abs_cost;
1408
1409 gcs = get_cpu_ptr(iocg->pcpu_stat);
1410 local64_add(abs_cost, &gcs->abs_vusage);
1411 put_cpu_ptr(gcs);
1412}
1413
1414static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1415 struct ioc_now *now)
1416{
1417 lockdep_assert_held(&iocg->ioc->lock);
1418 lockdep_assert_held(&iocg->waitq.lock);
1419
1420 /* make sure that nobody messed with @iocg */
1421 WARN_ON_ONCE(list_empty(&iocg->active_list));
1422 WARN_ON_ONCE(iocg->inuse > 1);
1423
1424 iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1425
1426 /* if debt is paid in full, restore inuse */
f0bf84a5 1427 if (!iocg->abs_vdebt) {
2a371f7d 1428 iocg->stat.indebt_us += now->now - iocg->indebt_since;
f0bf84a5
TH
1429 iocg->indebt_since = 0;
1430
c421a3eb
TH
1431 propagate_weights(iocg, iocg->active, iocg->last_inuse,
1432 false, now);
f0bf84a5 1433 }
c421a3eb
TH
1434}
1435
7caa4715
TH
1436static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1437 int flags, void *key)
1438{
1439 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
a7609c68 1440 struct iocg_wake_ctx *ctx = key;
7caa4715
TH
1441 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1442
1443 ctx->vbudget -= cost;
1444
1445 if (ctx->vbudget < 0)
1446 return -1;
1447
97eb1975 1448 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
5ab189cf 1449 wait->committed = true;
7caa4715
TH
1450
1451 /*
1452 * autoremove_wake_function() removes the wait entry only when it
5ab189cf
TH
1453 * actually changed the task state. We want the wait always removed.
1454 * Remove explicitly and use default_wake_function(). Note that the
1455 * order of operations is important as finish_wait() tests whether
1456 * @wq_entry is removed without grabbing the lock.
7caa4715 1457 */
7caa4715 1458 default_wake_function(wq_entry, mode, flags, key);
5ab189cf 1459 list_del_init_careful(&wq_entry->entry);
7caa4715
TH
1460 return 0;
1461}
1462
da437b95
TH
1463/*
1464 * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1465 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1466 * addition to iocg->waitq.lock.
1467 */
1468static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1469 struct ioc_now *now)
7caa4715
TH
1470{
1471 struct ioc *ioc = iocg->ioc;
1472 struct iocg_wake_ctx ctx = { .iocg = iocg };
da437b95 1473 u64 vshortage, expires, oexpires;
36a52481 1474 s64 vbudget;
c421a3eb 1475 u32 hwa;
7caa4715
TH
1476
1477 lockdep_assert_held(&iocg->waitq.lock);
1478
c421a3eb 1479 current_hweight(iocg, &hwa, NULL);
36a52481
TH
1480 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1481
1482 /* pay off debt */
da437b95 1483 if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
c421a3eb
TH
1484 u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1485 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1486 u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
36a52481 1487
da437b95
TH
1488 lockdep_assert_held(&ioc->lock);
1489
c421a3eb
TH
1490 atomic64_add(vpay, &iocg->vtime);
1491 atomic64_add(vpay, &iocg->done_vtime);
1492 iocg_pay_debt(iocg, abs_vpay, now);
1493 vbudget -= vpay;
5160a5a5 1494 }
7b84b49e 1495
5160a5a5 1496 if (iocg->abs_vdebt || iocg->delay)
7b84b49e 1497 iocg_kick_delay(iocg, now);
36a52481 1498
da437b95
TH
1499 /*
1500 * Debt can still be outstanding if we haven't paid all yet or the
1501 * caller raced and called without @pay_debt. Shouldn't wake up waiters
1502 * under debt. Make sure @vbudget reflects the outstanding amount and is
1503 * not positive.
1504 */
1505 if (iocg->abs_vdebt) {
c421a3eb 1506 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
da437b95
TH
1507 vbudget = min_t(s64, 0, vbudget - vdebt);
1508 }
1509
7caa4715 1510 /*
c421a3eb
TH
1511 * Wake up the ones which are due and see how much vtime we'll need for
1512 * the next one. As paying off debt restores hw_inuse, it must be read
1513 * after the above debt payment.
7caa4715 1514 */
da437b95 1515 ctx.vbudget = vbudget;
c421a3eb
TH
1516 current_hweight(iocg, NULL, &ctx.hw_inuse);
1517
7caa4715 1518 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
c421a3eb 1519
f0bf84a5
TH
1520 if (!waitqueue_active(&iocg->waitq)) {
1521 if (iocg->wait_since) {
2a371f7d 1522 iocg->stat.wait_us += now->now - iocg->wait_since;
f0bf84a5
TH
1523 iocg->wait_since = 0;
1524 }
7caa4715 1525 return;
f0bf84a5
TH
1526 }
1527
1528 if (!iocg->wait_since)
1529 iocg->wait_since = now->now;
1530
7caa4715
TH
1531 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1532 return;
1533
7ca5b2e6 1534 /* determine next wakeup, add a timer margin to guarantee chunking */
7caa4715
TH
1535 vshortage = -ctx.vbudget;
1536 expires = now->now_ns +
ac33e91e
TH
1537 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1538 NSEC_PER_USEC;
7ca5b2e6 1539 expires += ioc->timer_slack_ns;
7caa4715
TH
1540
1541 /* if already active and close enough, don't bother */
1542 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1543 if (hrtimer_is_queued(&iocg->waitq_timer) &&
7ca5b2e6 1544 abs(oexpires - expires) <= ioc->timer_slack_ns)
7caa4715
TH
1545 return;
1546
1547 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
7ca5b2e6 1548 ioc->timer_slack_ns, HRTIMER_MODE_ABS);
7caa4715
TH
1549}
1550
1551static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1552{
1553 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
da437b95 1554 bool pay_debt = READ_ONCE(iocg->abs_vdebt);
7caa4715
TH
1555 struct ioc_now now;
1556 unsigned long flags;
1557
1558 ioc_now(iocg->ioc, &now);
1559
da437b95
TH
1560 iocg_lock(iocg, pay_debt, &flags);
1561 iocg_kick_waitq(iocg, pay_debt, &now);
1562 iocg_unlock(iocg, pay_debt, &flags);
7caa4715
TH
1563
1564 return HRTIMER_NORESTART;
1565}
1566
7caa4715
TH
1567static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1568{
1569 u32 nr_met[2] = { };
1570 u32 nr_missed[2] = { };
1571 u64 rq_wait_ns = 0;
1572 int cpu, rw;
1573
1574 for_each_online_cpu(cpu) {
1575 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1576 u64 this_rq_wait_ns;
1577
1578 for (rw = READ; rw <= WRITE; rw++) {
5e124f74
TH
1579 u32 this_met = local_read(&stat->missed[rw].nr_met);
1580 u32 this_missed = local_read(&stat->missed[rw].nr_missed);
7caa4715
TH
1581
1582 nr_met[rw] += this_met - stat->missed[rw].last_met;
1583 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1584 stat->missed[rw].last_met = this_met;
1585 stat->missed[rw].last_missed = this_missed;
1586 }
1587
5e124f74 1588 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
7caa4715
TH
1589 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1590 stat->last_rq_wait_ns = this_rq_wait_ns;
1591 }
1592
1593 for (rw = READ; rw <= WRITE; rw++) {
1594 if (nr_met[rw] + nr_missed[rw])
1595 missed_ppm_ar[rw] =
1596 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1597 nr_met[rw] + nr_missed[rw]);
1598 else
1599 missed_ppm_ar[rw] = 0;
1600 }
1601
1602 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1603 ioc->period_us * NSEC_PER_USEC);
1604}
1605
1606/* was iocg idle this period? */
1607static bool iocg_is_idle(struct ioc_gq *iocg)
1608{
1609 struct ioc *ioc = iocg->ioc;
1610
1611 /* did something get issued this period? */
1612 if (atomic64_read(&iocg->active_period) ==
1613 atomic64_read(&ioc->cur_period))
1614 return false;
1615
1616 /* is something in flight? */
dcd6589b 1617 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
7caa4715
TH
1618 return false;
1619
1620 return true;
1621}
1622
97eb1975
TH
1623/*
1624 * Call this function on the target leaf @iocg's to build pre-order traversal
1625 * list of all the ancestors in @inner_walk. The inner nodes are linked through
1626 * ->walk_list and the caller is responsible for dissolving the list after use.
1627 */
1628static void iocg_build_inner_walk(struct ioc_gq *iocg,
1629 struct list_head *inner_walk)
1630{
1631 int lvl;
1632
1633 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1634
1635 /* find the first ancestor which hasn't been visited yet */
1636 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1637 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1638 break;
1639 }
1640
1641 /* walk down and visit the inner nodes to get pre-order traversal */
1642 while (++lvl <= iocg->level - 1) {
1643 struct ioc_gq *inner = iocg->ancestors[lvl];
1644
1645 /* record traversal order */
1646 list_add_tail(&inner->walk_list, inner_walk);
1647 }
1648}
1649
2a371f7d
CZ
1650/* propagate the deltas to the parent */
1651static void iocg_flush_stat_upward(struct ioc_gq *iocg)
1652{
1653 if (iocg->level > 0) {
1654 struct iocg_stat *parent_stat =
1655 &iocg->ancestors[iocg->level - 1]->stat;
1656
1657 parent_stat->usage_us +=
1658 iocg->stat.usage_us - iocg->last_stat.usage_us;
1659 parent_stat->wait_us +=
1660 iocg->stat.wait_us - iocg->last_stat.wait_us;
1661 parent_stat->indebt_us +=
1662 iocg->stat.indebt_us - iocg->last_stat.indebt_us;
1663 parent_stat->indelay_us +=
1664 iocg->stat.indelay_us - iocg->last_stat.indelay_us;
1665 }
1666
1667 iocg->last_stat = iocg->stat;
1668}
1669
97eb1975 1670/* collect per-cpu counters and propagate the deltas to the parent */
2a371f7d 1671static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
97eb1975 1672{
ac33e91e 1673 struct ioc *ioc = iocg->ioc;
97eb1975
TH
1674 u64 abs_vusage = 0;
1675 u64 vusage_delta;
1676 int cpu;
1677
1678 lockdep_assert_held(&iocg->ioc->lock);
1679
1680 /* collect per-cpu counters */
1681 for_each_possible_cpu(cpu) {
1682 abs_vusage += local64_read(
1683 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1684 }
1685 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1686 iocg->last_stat_abs_vusage = abs_vusage;
1687
ac33e91e 1688 iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
2a371f7d 1689 iocg->stat.usage_us += iocg->usage_delta_us;
97eb1975 1690
2a371f7d 1691 iocg_flush_stat_upward(iocg);
97eb1975
TH
1692}
1693
1694/* get stat counters ready for reading on all active iocgs */
1695static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1696{
1697 LIST_HEAD(inner_walk);
1698 struct ioc_gq *iocg, *tiocg;
1699
1700 /* flush leaves and build inner node walk list */
1701 list_for_each_entry(iocg, target_iocgs, active_list) {
2a371f7d 1702 iocg_flush_stat_leaf(iocg, now);
97eb1975
TH
1703 iocg_build_inner_walk(iocg, &inner_walk);
1704 }
1705
1706 /* keep flushing upwards by walking the inner list backwards */
1707 list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
2a371f7d 1708 iocg_flush_stat_upward(iocg);
97eb1975
TH
1709 list_del_init(&iocg->walk_list);
1710 }
1711}
1712
93f7d2db
TH
1713/*
1714 * Determine what @iocg's hweight_inuse should be after donating unused
1715 * capacity. @hwm is the upper bound and used to signal no donation. This
1716 * function also throws away @iocg's excess budget.
1717 */
ac33e91e
TH
1718static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1719 u32 usage, struct ioc_now *now)
7caa4715 1720{
93f7d2db
TH
1721 struct ioc *ioc = iocg->ioc;
1722 u64 vtime = atomic64_read(&iocg->vtime);
f1de2439 1723 s64 excess, delta, target, new_hwi;
93f7d2db 1724
c421a3eb
TH
1725 /* debt handling owns inuse for debtors */
1726 if (iocg->abs_vdebt)
1727 return 1;
1728
93f7d2db
TH
1729 /* see whether minimum margin requirement is met */
1730 if (waitqueue_active(&iocg->waitq) ||
1731 time_after64(vtime, now->vnow - ioc->margins.min))
1732 return hwm;
1733
ac33e91e
TH
1734 /* throw away excess above target */
1735 excess = now->vnow - vtime - ioc->margins.target;
93f7d2db
TH
1736 if (excess > 0) {
1737 atomic64_add(excess, &iocg->vtime);
1738 atomic64_add(excess, &iocg->done_vtime);
1739 vtime += excess;
ac33e91e 1740 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
93f7d2db
TH
1741 }
1742
f1de2439
TH
1743 /*
1744 * Let's say the distance between iocg's and device's vtimes as a
1745 * fraction of period duration is delta. Assuming that the iocg will
1746 * consume the usage determined above, we want to determine new_hwi so
1747 * that delta equals MARGIN_TARGET at the end of the next period.
1748 *
1749 * We need to execute usage worth of IOs while spending the sum of the
1750 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1751 * (delta):
1752 *
1753 * usage = (1 - MARGIN_TARGET + delta) * new_hwi
1754 *
1755 * Therefore, the new_hwi is:
1756 *
1757 * new_hwi = usage / (1 - MARGIN_TARGET + delta)
1758 */
1759 delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1760 now->vnow - ioc->period_at_vtime);
1761 target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1762 new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
7caa4715 1763
f1de2439 1764 return clamp_t(s64, new_hwi, 1, hwm);
7caa4715
TH
1765}
1766
e08d02aa
TH
1767/*
1768 * For work-conservation, an iocg which isn't using all of its share should
1769 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1770 * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1771 *
1772 * #1 is mathematically simpler but has the drawback of requiring synchronous
1773 * global hweight_inuse updates when idle iocg's get activated or inuse weights
1774 * change due to donation snapbacks as it has the possibility of grossly
1775 * overshooting what's allowed by the model and vrate.
1776 *
1777 * #2 is inherently safe with local operations. The donating iocg can easily
1778 * snap back to higher weights when needed without worrying about impacts on
1779 * other nodes as the impacts will be inherently correct. This also makes idle
1780 * iocg activations safe. The only effect activations have is decreasing
1781 * hweight_inuse of others, the right solution to which is for those iocgs to
1782 * snap back to higher weights.
1783 *
1784 * So, we go with #2. The challenge is calculating how each donating iocg's
1785 * inuse should be adjusted to achieve the target donation amounts. This is done
1786 * using Andy's method described in the following pdf.
1787 *
1788 * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1789 *
1790 * Given the weights and target after-donation hweight_inuse values, Andy's
1791 * method determines how the proportional distribution should look like at each
1792 * sibling level to maintain the relative relationship between all non-donating
1793 * pairs. To roughly summarize, it divides the tree into donating and
1794 * non-donating parts, calculates global donation rate which is used to
1795 * determine the target hweight_inuse for each node, and then derives per-level
1796 * proportions.
1797 *
1798 * The following pdf shows that global distribution calculated this way can be
1799 * achieved by scaling inuse weights of donating leaves and propagating the
1800 * adjustments upwards proportionally.
1801 *
1802 * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1803 *
1804 * Combining the above two, we can determine how each leaf iocg's inuse should
1805 * be adjusted to achieve the target donation.
1806 *
1807 * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1808 *
1809 * The inline comments use symbols from the last pdf.
1810 *
1811 * b is the sum of the absolute budgets in the subtree. 1 for the root node.
1812 * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1813 * t is the sum of the absolute budgets of donating nodes in the subtree.
1814 * w is the weight of the node. w = w_f + w_t
1815 * w_f is the non-donating portion of w. w_f = w * f / b
1816 * w_b is the donating portion of w. w_t = w * t / b
1817 * s is the sum of all sibling weights. s = Sum(w) for siblings
1818 * s_f and s_t are the non-donating and donating portions of s.
1819 *
1820 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1821 * w_pt is the donating portion of the parent's weight and w'_pt the same value
1822 * after adjustments. Subscript r denotes the root node's values.
1823 */
93f7d2db
TH
1824static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1825{
e08d02aa
TH
1826 LIST_HEAD(over_hwa);
1827 LIST_HEAD(inner_walk);
1828 struct ioc_gq *iocg, *tiocg, *root_iocg;
1829 u32 after_sum, over_sum, over_target, gamma;
93f7d2db 1830
e08d02aa
TH
1831 /*
1832 * It's pretty unlikely but possible for the total sum of
1833 * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1834 * confuse the following calculations. If such condition is detected,
1835 * scale down everyone over its full share equally to keep the sum below
1836 * WEIGHT_ONE.
1837 */
1838 after_sum = 0;
1839 over_sum = 0;
93f7d2db 1840 list_for_each_entry(iocg, surpluses, surplus_list) {
e08d02aa 1841 u32 hwa;
93f7d2db 1842
e08d02aa
TH
1843 current_hweight(iocg, &hwa, NULL);
1844 after_sum += iocg->hweight_after_donation;
93f7d2db 1845
e08d02aa
TH
1846 if (iocg->hweight_after_donation > hwa) {
1847 over_sum += iocg->hweight_after_donation;
1848 list_add(&iocg->walk_list, &over_hwa);
1849 }
93f7d2db 1850 }
e08d02aa
TH
1851
1852 if (after_sum >= WEIGHT_ONE) {
1853 /*
1854 * The delta should be deducted from the over_sum, calculate
1855 * target over_sum value.
1856 */
1857 u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1858 WARN_ON_ONCE(over_sum <= over_delta);
1859 over_target = over_sum - over_delta;
1860 } else {
1861 over_target = 0;
1862 }
1863
1864 list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1865 if (over_target)
1866 iocg->hweight_after_donation =
1867 div_u64((u64)iocg->hweight_after_donation *
1868 over_target, over_sum);
1869 list_del_init(&iocg->walk_list);
1870 }
1871
1872 /*
1873 * Build pre-order inner node walk list and prepare for donation
1874 * adjustment calculations.
1875 */
1876 list_for_each_entry(iocg, surpluses, surplus_list) {
1877 iocg_build_inner_walk(iocg, &inner_walk);
1878 }
1879
1880 root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1881 WARN_ON_ONCE(root_iocg->level > 0);
1882
1883 list_for_each_entry(iocg, &inner_walk, walk_list) {
1884 iocg->child_adjusted_sum = 0;
1885 iocg->hweight_donating = 0;
1886 iocg->hweight_after_donation = 0;
1887 }
1888
1889 /*
1890 * Propagate the donating budget (b_t) and after donation budget (b'_t)
1891 * up the hierarchy.
1892 */
1893 list_for_each_entry(iocg, surpluses, surplus_list) {
1894 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1895
1896 parent->hweight_donating += iocg->hweight_donating;
1897 parent->hweight_after_donation += iocg->hweight_after_donation;
1898 }
1899
1900 list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1901 if (iocg->level > 0) {
1902 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1903
1904 parent->hweight_donating += iocg->hweight_donating;
1905 parent->hweight_after_donation += iocg->hweight_after_donation;
1906 }
1907 }
1908
1909 /*
1910 * Calculate inner hwa's (b) and make sure the donation values are
1911 * within the accepted ranges as we're doing low res calculations with
1912 * roundups.
1913 */
1914 list_for_each_entry(iocg, &inner_walk, walk_list) {
1915 if (iocg->level) {
1916 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1917
1918 iocg->hweight_active = DIV64_U64_ROUND_UP(
1919 (u64)parent->hweight_active * iocg->active,
1920 parent->child_active_sum);
1921
1922 }
1923
1924 iocg->hweight_donating = min(iocg->hweight_donating,
1925 iocg->hweight_active);
1926 iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1927 iocg->hweight_donating - 1);
1928 if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1929 iocg->hweight_donating <= 1 ||
1930 iocg->hweight_after_donation == 0)) {
1931 pr_warn("iocg: invalid donation weights in ");
1932 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1933 pr_cont(": active=%u donating=%u after=%u\n",
1934 iocg->hweight_active, iocg->hweight_donating,
1935 iocg->hweight_after_donation);
1936 }
1937 }
1938
1939 /*
1940 * Calculate the global donation rate (gamma) - the rate to adjust
769b628d
TH
1941 * non-donating budgets by.
1942 *
1943 * No need to use 64bit multiplication here as the first operand is
1944 * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1945 *
1946 * We know that there are beneficiary nodes and the sum of the donating
1947 * hweights can't be whole; however, due to the round-ups during hweight
1948 * calculations, root_iocg->hweight_donating might still end up equal to
1949 * or greater than whole. Limit the range when calculating the divider.
e08d02aa
TH
1950 *
1951 * gamma = (1 - t_r') / (1 - t_r)
1952 */
1953 gamma = DIV_ROUND_UP(
1954 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
769b628d 1955 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
e08d02aa
TH
1956
1957 /*
1958 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1959 * nodes.
1960 */
1961 list_for_each_entry(iocg, &inner_walk, walk_list) {
1962 struct ioc_gq *parent;
1963 u32 inuse, wpt, wptp;
1964 u64 st, sf;
1965
1966 if (iocg->level == 0) {
1967 /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1968 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1969 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1970 WEIGHT_ONE - iocg->hweight_after_donation);
1971 continue;
1972 }
1973
1974 parent = iocg->ancestors[iocg->level - 1];
1975
1976 /* b' = gamma * b_f + b_t' */
1977 iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1978 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1979 WEIGHT_ONE) + iocg->hweight_after_donation;
1980
1981 /* w' = s' * b' / b'_p */
1982 inuse = DIV64_U64_ROUND_UP(
1983 (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1984 parent->hweight_inuse);
1985
1986 /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
1987 st = DIV64_U64_ROUND_UP(
1988 iocg->child_active_sum * iocg->hweight_donating,
1989 iocg->hweight_active);
1990 sf = iocg->child_active_sum - st;
1991 wpt = DIV64_U64_ROUND_UP(
1992 (u64)iocg->active * iocg->hweight_donating,
1993 iocg->hweight_active);
1994 wptp = DIV64_U64_ROUND_UP(
1995 (u64)inuse * iocg->hweight_after_donation,
1996 iocg->hweight_inuse);
1997
1998 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
1999 }
2000
2001 /*
2002 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
2003 * we can finally determine leaf adjustments.
2004 */
2005 list_for_each_entry(iocg, surpluses, surplus_list) {
2006 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
2007 u32 inuse;
2008
c421a3eb
TH
2009 /*
2010 * In-debt iocgs participated in the donation calculation with
2011 * the minimum target hweight_inuse. Configuring inuse
2012 * accordingly would work fine but debt handling expects
2013 * @iocg->inuse stay at the minimum and we don't wanna
2014 * interfere.
2015 */
2016 if (iocg->abs_vdebt) {
2017 WARN_ON_ONCE(iocg->inuse > 1);
2018 continue;
2019 }
2020
e08d02aa
TH
2021 /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
2022 inuse = DIV64_U64_ROUND_UP(
2023 parent->child_adjusted_sum * iocg->hweight_after_donation,
2024 parent->hweight_inuse);
04603755
TH
2025
2026 TRACE_IOCG_PATH(inuse_transfer, iocg, now,
2027 iocg->inuse, inuse,
2028 iocg->hweight_inuse,
2029 iocg->hweight_after_donation);
2030
b0853ab4 2031 __propagate_weights(iocg, iocg->active, inuse, true, now);
e08d02aa
TH
2032 }
2033
2034 /* walk list should be dissolved after use */
2035 list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2036 list_del_init(&iocg->walk_list);
93f7d2db
TH
2037}
2038
ab8df828
TH
2039/*
2040 * A low weight iocg can amass a large amount of debt, for example, when
2041 * anonymous memory gets reclaimed aggressively. If the system has a lot of
2042 * memory paired with a slow IO device, the debt can span multiple seconds or
2043 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2044 * up blocked paying its debt while the IO device is idle.
2045 *
2046 * The following protects against such cases. If the device has been
d9517841
TH
2047 * sufficiently idle for a while, the debts are halved and delays are
2048 * recalculated.
ab8df828
TH
2049 */
2050static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
33a1fe6d 2051 struct ioc_now *now)
ab8df828 2052{
c7af2a00
TH
2053 struct ioc_gq *iocg;
2054 u64 dur, usage_pct, nr_cycles;
2055
2056 /* if no debtor, reset the cycle */
2057 if (!nr_debtors) {
2058 ioc->dfgv_period_at = now->now;
2059 ioc->dfgv_period_rem = 0;
2060 ioc->dfgv_usage_us_sum = 0;
2061 return;
2062 }
2063
2064 /*
2065 * Debtors can pass through a lot of writes choking the device and we
2066 * don't want to be forgiving debts while the device is struggling from
2067 * write bursts. If we're missing latency targets, consider the device
2068 * fully utilized.
2069 */
2070 if (ioc->busy_level > 0)
2071 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2072
2073 ioc->dfgv_usage_us_sum += usage_us_sum;
2074 if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2075 return;
2076
2077 /*
2078 * At least DFGV_PERIOD has passed since the last period. Calculate the
2079 * average usage and reset the period counters.
2080 */
2081 dur = now->now - ioc->dfgv_period_at;
2082 usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2083
2084 ioc->dfgv_period_at = now->now;
2085 ioc->dfgv_usage_us_sum = 0;
2086
2087 /* if was too busy, reset everything */
2088 if (usage_pct > DFGV_USAGE_PCT) {
2089 ioc->dfgv_period_rem = 0;
2090 return;
2091 }
2092
2093 /*
2094 * Usage is lower than threshold. Let's forgive some debts. Debt
2095 * forgiveness runs off of the usual ioc timer but its period usually
2096 * doesn't match ioc's. Compensate the difference by performing the
2097 * reduction as many times as would fit in the duration since the last
2098 * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2099 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
2100 * reductions is doubled.
2101 */
2102 nr_cycles = dur + ioc->dfgv_period_rem;
2103 ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
2104
2105 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
c5a6561b
TH
2106 u64 __maybe_unused old_debt, __maybe_unused old_delay;
2107
bec02dbb 2108 if (!iocg->abs_vdebt && !iocg->delay)
c7af2a00 2109 continue;
c5a6561b 2110
c7af2a00 2111 spin_lock(&iocg->waitq.lock);
c5a6561b
TH
2112
2113 old_debt = iocg->abs_vdebt;
2114 old_delay = iocg->delay;
2115
bec02dbb
TH
2116 if (iocg->abs_vdebt)
2117 iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2118 if (iocg->delay)
2119 iocg->delay = iocg->delay >> nr_cycles ?: 1;
2120
c7af2a00 2121 iocg_kick_waitq(iocg, true, now);
c5a6561b
TH
2122
2123 TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2124 old_debt, iocg->abs_vdebt,
2125 old_delay, iocg->delay);
2126
c7af2a00 2127 spin_unlock(&iocg->waitq.lock);
ab8df828
TH
2128 }
2129}
2130
2474787a
BW
2131/*
2132 * Check the active iocgs' state to avoid oversleeping and deactive
2133 * idle iocgs.
2134 *
2135 * Since waiters determine the sleep durations based on the vrate
2136 * they saw at the time of sleep, if vrate has increased, some
2137 * waiters could be sleeping for too long. Wake up tardy waiters
2138 * which should have woken up in the last period and expire idle
2139 * iocgs.
2140 */
2141static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
7caa4715 2142{
2474787a 2143 int nr_debtors = 0;
7caa4715 2144 struct ioc_gq *iocg, *tiocg;
7caa4715 2145
7caa4715 2146 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
d9012a59 2147 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
5160a5a5 2148 !iocg->delay && !iocg_is_idle(iocg))
7caa4715
TH
2149 continue;
2150
2151 spin_lock(&iocg->waitq.lock);
2152
f0bf84a5
TH
2153 /* flush wait and indebt stat deltas */
2154 if (iocg->wait_since) {
2a371f7d 2155 iocg->stat.wait_us += now->now - iocg->wait_since;
2474787a 2156 iocg->wait_since = now->now;
f0bf84a5
TH
2157 }
2158 if (iocg->indebt_since) {
2a371f7d 2159 iocg->stat.indebt_us +=
2474787a
BW
2160 now->now - iocg->indebt_since;
2161 iocg->indebt_since = now->now;
f0bf84a5
TH
2162 }
2163 if (iocg->indelay_since) {
2a371f7d 2164 iocg->stat.indelay_us +=
2474787a
BW
2165 now->now - iocg->indelay_since;
2166 iocg->indelay_since = now->now;
f0bf84a5
TH
2167 }
2168
5160a5a5
TH
2169 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2170 iocg->delay) {
7caa4715 2171 /* might be oversleeping vtime / hweight changes, kick */
2474787a 2172 iocg_kick_waitq(iocg, true, now);
bec02dbb 2173 if (iocg->abs_vdebt || iocg->delay)
dda1315f 2174 nr_debtors++;
7caa4715
TH
2175 } else if (iocg_is_idle(iocg)) {
2176 /* no waiter and idle, deactivate */
ac33e91e
TH
2177 u64 vtime = atomic64_read(&iocg->vtime);
2178 s64 excess;
2179
2180 /*
2181 * @iocg has been inactive for a full duration and will
2182 * have a high budget. Account anything above target as
2183 * error and throw away. On reactivation, it'll start
2184 * with the target budget.
2185 */
2474787a 2186 excess = now->vnow - vtime - ioc->margins.target;
ac33e91e
TH
2187 if (excess > 0) {
2188 u32 old_hwi;
2189
2190 current_hweight(iocg, NULL, &old_hwi);
2191 ioc->vtime_err -= div64_u64(excess * old_hwi,
2192 WEIGHT_ONE);
2193 }
2194
76efc1c7
BW
2195 TRACE_IOCG_PATH(iocg_idle, iocg, now,
2196 atomic64_read(&iocg->active_period),
2197 atomic64_read(&ioc->cur_period), vtime);
2474787a 2198 __propagate_weights(iocg, 0, 0, false, now);
7caa4715
TH
2199 list_del_init(&iocg->active_list);
2200 }
2201
2202 spin_unlock(&iocg->waitq.lock);
2203 }
2474787a 2204
00410f1b 2205 commit_weights(ioc);
2474787a
BW
2206 return nr_debtors;
2207}
2208
2209static void ioc_timer_fn(struct timer_list *timer)
2210{
2211 struct ioc *ioc = container_of(timer, struct ioc, timer);
2212 struct ioc_gq *iocg, *tiocg;
2213 struct ioc_now now;
2214 LIST_HEAD(surpluses);
2215 int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2216 u64 usage_us_sum = 0;
074501bc
YK
2217 u32 ppm_rthr;
2218 u32 ppm_wthr;
2474787a
BW
2219 u32 missed_ppm[2], rq_wait_pct;
2220 u64 period_vtime;
2221 int prev_busy_level;
2222
2223 /* how were the latencies during the period? */
2224 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2225
2226 /* take care of active iocgs */
2227 spin_lock_irq(&ioc->lock);
2228
074501bc
YK
2229 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2230 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2474787a
BW
2231 ioc_now(ioc, &now);
2232
2233 period_vtime = now.vnow - ioc->period_at_vtime;
2234 if (WARN_ON_ONCE(!period_vtime)) {
2235 spin_unlock_irq(&ioc->lock);
2236 return;
2237 }
2238
2239 nr_debtors = ioc_check_iocgs(ioc, &now);
7caa4715 2240
f0bf84a5
TH
2241 /*
2242 * Wait and indebt stat are flushed above and the donation calculation
2243 * below needs updated usage stat. Let's bring stat up-to-date.
2244 */
2245 iocg_flush_stat(&ioc->active_iocgs, &now);
2246
f1de2439 2247 /* calc usage and see whether some weights need to be moved around */
7caa4715 2248 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
c09245f6
BW
2249 u64 vdone, vtime, usage_us;
2250 u32 hw_active, hw_inuse;
7caa4715
TH
2251
2252 /*
2253 * Collect unused and wind vtime closer to vnow to prevent
2254 * iocgs from accumulating a large amount of budget.
2255 */
2256 vdone = atomic64_read(&iocg->done_vtime);
2257 vtime = atomic64_read(&iocg->vtime);
2258 current_hweight(iocg, &hw_active, &hw_inuse);
2259
2260 /*
2261 * Latency QoS detection doesn't account for IOs which are
2262 * in-flight for longer than a period. Detect them by
2263 * comparing vdone against period start. If lagging behind
2264 * IOs from past periods, don't increase vrate.
2265 */
7cd806a9
TH
2266 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2267 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
7caa4715
TH
2268 time_after64(vtime, vdone) &&
2269 time_after64(vtime, now.vnow -
2270 MAX_LAGGING_PERIODS * period_vtime) &&
2271 time_before64(vdone, now.vnow - period_vtime))
2272 nr_lagging++;
2273
7caa4715 2274 /*
f1de2439
TH
2275 * Determine absolute usage factoring in in-flight IOs to avoid
2276 * high-latency completions appearing as idle.
7caa4715 2277 */
1aa50d02 2278 usage_us = iocg->usage_delta_us;
dda1315f 2279 usage_us_sum += usage_us;
f1de2439 2280
7caa4715 2281 /* see whether there's surplus vtime */
8692d2db 2282 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
93f7d2db
TH
2283 if (hw_inuse < hw_active ||
2284 (!waitqueue_active(&iocg->waitq) &&
f1de2439 2285 time_before64(vtime, now.vnow - ioc->margins.low))) {
c09245f6
BW
2286 u32 hwa, old_hwi, hwm, new_hwi, usage;
2287 u64 usage_dur;
2288
2289 if (vdone != vtime) {
2290 u64 inflight_us = DIV64_U64_ROUND_UP(
2291 cost_to_abs_cost(vtime - vdone, hw_inuse),
2292 ioc->vtime_base_rate);
2293
2294 usage_us = max(usage_us, inflight_us);
2295 }
2296
2297 /* convert to hweight based usage ratio */
2298 if (time_after64(iocg->activated_at, ioc->period_at))
2299 usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2300 else
2301 usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2302
2303 usage = clamp_t(u32,
2304 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2305 usage_dur),
2306 1, WEIGHT_ONE);
93f7d2db
TH
2307
2308 /*
2309 * Already donating or accumulated enough to start.
2310 * Determine the donation amount.
2311 */
ac33e91e 2312 current_hweight(iocg, &hwa, &old_hwi);
93f7d2db 2313 hwm = current_hweight_max(iocg);
ac33e91e
TH
2314 new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2315 usage, &now);
edaa2633
TH
2316 /*
2317 * Donation calculation assumes hweight_after_donation
2318 * to be positive, a condition that a donor w/ hwa < 2
2319 * can't meet. Don't bother with donation if hwa is
2320 * below 2. It's not gonna make a meaningful difference
2321 * anyway.
2322 */
2323 if (new_hwi < hwm && hwa >= 2) {
e08d02aa 2324 iocg->hweight_donating = hwa;
93f7d2db 2325 iocg->hweight_after_donation = new_hwi;
8692d2db 2326 list_add(&iocg->surplus_list, &surpluses);
8c936f9e
TH
2327 } else if (!iocg->abs_vdebt) {
2328 /*
2329 * @iocg doesn't have enough to donate. Reset
2330 * its inuse to active.
2331 *
2332 * Don't reset debtors as their inuse's are
2333 * owned by debt handling. This shouldn't affect
2334 * donation calculuation in any meaningful way
2335 * as @iocg doesn't have a meaningful amount of
2336 * share anyway.
2337 */
04603755
TH
2338 TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2339 iocg->inuse, iocg->active,
2340 iocg->hweight_inuse, new_hwi);
2341
93f7d2db 2342 __propagate_weights(iocg, iocg->active,
b0853ab4 2343 iocg->active, true, &now);
93f7d2db 2344 nr_shortages++;
7caa4715
TH
2345 }
2346 } else {
93f7d2db 2347 /* genuinely short on vtime */
7caa4715
TH
2348 nr_shortages++;
2349 }
2350 }
2351
93f7d2db
TH
2352 if (!list_empty(&surpluses) && nr_shortages)
2353 transfer_surpluses(&surpluses, &now);
7caa4715 2354
00410f1b 2355 commit_weights(ioc);
7caa4715 2356
8692d2db
TH
2357 /* surplus list should be dissolved after use */
2358 list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2359 list_del_init(&iocg->surplus_list);
2360
7caa4715
TH
2361 /*
2362 * If q is getting clogged or we're missing too much, we're issuing
2363 * too much IO and should lower vtime rate. If we're not missing
2364 * and experiencing shortages but not surpluses, we're too stingy
2365 * and should increase vtime rate.
2366 */
25d41e4a 2367 prev_busy_level = ioc->busy_level;
7caa4715
TH
2368 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2369 missed_ppm[READ] > ppm_rthr ||
2370 missed_ppm[WRITE] > ppm_wthr) {
81ca627a 2371 /* clearly missing QoS targets, slow down vrate */
7caa4715
TH
2372 ioc->busy_level = max(ioc->busy_level, 0);
2373 ioc->busy_level++;
7cd806a9 2374 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
7caa4715
TH
2375 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2376 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
81ca627a
TH
2377 /* QoS targets are being met with >25% margin */
2378 if (nr_shortages) {
2379 /*
2380 * We're throttling while the device has spare
2381 * capacity. If vrate was being slowed down, stop.
2382 */
7cd806a9 2383 ioc->busy_level = min(ioc->busy_level, 0);
81ca627a
TH
2384
2385 /*
2386 * If there are IOs spanning multiple periods, wait
065655c8 2387 * them out before pushing the device harder.
81ca627a 2388 */
065655c8 2389 if (!nr_lagging)
7cd806a9 2390 ioc->busy_level--;
81ca627a
TH
2391 } else {
2392 /*
2393 * Nobody is being throttled and the users aren't
2394 * issuing enough IOs to saturate the device. We
2395 * simply don't know how close the device is to
2396 * saturation. Coast.
2397 */
2398 ioc->busy_level = 0;
7cd806a9 2399 }
7caa4715 2400 } else {
81ca627a 2401 /* inside the hysterisis margin, we're good */
7caa4715
TH
2402 ioc->busy_level = 0;
2403 }
2404
2405 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2406
926f75f6
BW
2407 ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2408 prev_busy_level, missed_ppm);
7caa4715
TH
2409
2410 ioc_refresh_params(ioc, false);
2411
33a1fe6d
TH
2412 ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2413
7caa4715
TH
2414 /*
2415 * This period is done. Move onto the next one. If nothing's
2416 * going on with the device, stop the timer.
2417 */
2418 atomic64_inc(&ioc->cur_period);
2419
2420 if (ioc->running != IOC_STOP) {
2421 if (!list_empty(&ioc->active_iocgs)) {
2422 ioc_start_period(ioc, &now);
2423 } else {
2424 ioc->busy_level = 0;
ac33e91e 2425 ioc->vtime_err = 0;
7caa4715
TH
2426 ioc->running = IOC_IDLE;
2427 }
ac33e91e
TH
2428
2429 ioc_refresh_vrate(ioc, &now);
7caa4715
TH
2430 }
2431
2432 spin_unlock_irq(&ioc->lock);
2433}
2434
b0853ab4
TH
2435static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2436 u64 abs_cost, struct ioc_now *now)
2437{
2438 struct ioc *ioc = iocg->ioc;
2439 struct ioc_margins *margins = &ioc->margins;
04603755 2440 u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
aa67db24 2441 u32 hwi, adj_step;
b0853ab4
TH
2442 s64 margin;
2443 u64 cost, new_inuse;
2444
2445 current_hweight(iocg, NULL, &hwi);
04603755 2446 old_hwi = hwi;
b0853ab4
TH
2447 cost = abs_cost_to_cost(abs_cost, hwi);
2448 margin = now->vnow - vtime - cost;
2449
c421a3eb
TH
2450 /* debt handling owns inuse for debtors */
2451 if (iocg->abs_vdebt)
2452 return cost;
2453
b0853ab4 2454 /*
5ba1add2 2455 * We only increase inuse during period and do so if the margin has
b0853ab4
TH
2456 * deteriorated since the previous adjustment.
2457 */
2458 if (margin >= iocg->saved_margin || margin >= margins->low ||
2459 iocg->inuse == iocg->active)
2460 return cost;
2461
2462 spin_lock_irq(&ioc->lock);
2463
2464 /* we own inuse only when @iocg is in the normal active state */
c421a3eb 2465 if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
b0853ab4
TH
2466 spin_unlock_irq(&ioc->lock);
2467 return cost;
2468 }
2469
aa67db24
TH
2470 /*
2471 * Bump up inuse till @abs_cost fits in the existing budget.
2472 * adj_step must be determined after acquiring ioc->lock - we might
2473 * have raced and lost to another thread for activation and could
2474 * be reading 0 iocg->active before ioc->lock which will lead to
2475 * infinite loop.
2476 */
b0853ab4 2477 new_inuse = iocg->inuse;
aa67db24 2478 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
b0853ab4
TH
2479 do {
2480 new_inuse = new_inuse + adj_step;
2481 propagate_weights(iocg, iocg->active, new_inuse, true, now);
2482 current_hweight(iocg, NULL, &hwi);
2483 cost = abs_cost_to_cost(abs_cost, hwi);
2484 } while (time_after64(vtime + cost, now->vnow) &&
2485 iocg->inuse != iocg->active);
2486
2487 spin_unlock_irq(&ioc->lock);
04603755
TH
2488
2489 TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2490 old_inuse, iocg->inuse, old_hwi, hwi);
2491
b0853ab4
TH
2492 return cost;
2493}
2494
7caa4715
TH
2495static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2496 bool is_merge, u64 *costp)
2497{
2498 struct ioc *ioc = iocg->ioc;
2499 u64 coef_seqio, coef_randio, coef_page;
2500 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2501 u64 seek_pages = 0;
2502 u64 cost = 0;
2503
2504 switch (bio_op(bio)) {
2505 case REQ_OP_READ:
2506 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
2507 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
2508 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
2509 break;
2510 case REQ_OP_WRITE:
2511 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
2512 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
2513 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
2514 break;
2515 default:
2516 goto out;
2517 }
2518
2519 if (iocg->cursor) {
2520 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2521 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2522 }
2523
2524 if (!is_merge) {
2525 if (seek_pages > LCOEF_RANDIO_PAGES) {
2526 cost += coef_randio;
2527 } else {
2528 cost += coef_seqio;
2529 }
2530 }
2531 cost += pages * coef_page;
2532out:
2533 *costp = cost;
2534}
2535
2536static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2537{
2538 u64 cost;
2539
2540 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2541 return cost;
2542}
2543
cd006509
TH
2544static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2545 u64 *costp)
2546{
2547 unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2548
2549 switch (req_op(rq)) {
2550 case REQ_OP_READ:
2551 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2552 break;
2553 case REQ_OP_WRITE:
2554 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2555 break;
2556 default:
2557 *costp = 0;
2558 }
2559}
2560
2561static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2562{
2563 u64 cost;
2564
2565 calc_size_vtime_cost_builtin(rq, ioc, &cost);
2566 return cost;
2567}
2568
7caa4715
TH
2569static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2570{
2571 struct blkcg_gq *blkg = bio->bi_blkg;
2572 struct ioc *ioc = rqos_to_ioc(rqos);
2573 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2574 struct ioc_now now;
2575 struct iocg_wait wait;
7caa4715 2576 u64 abs_cost, cost, vtime;
da437b95
TH
2577 bool use_debt, ioc_locked;
2578 unsigned long flags;
7caa4715 2579
d16baa3f
TH
2580 /* bypass IOs if disabled, still initializing, or for root cgroup */
2581 if (!ioc->enabled || !iocg || !iocg->level)
7caa4715
TH
2582 return;
2583
7caa4715
TH
2584 /* calculate the absolute vtime cost */
2585 abs_cost = calc_vtime_cost(bio, iocg, false);
2586 if (!abs_cost)
2587 return;
2588
f1de2439
TH
2589 if (!iocg_activate(iocg, &now))
2590 return;
2591
7caa4715 2592 iocg->cursor = bio_end_sector(bio);
7caa4715 2593 vtime = atomic64_read(&iocg->vtime);
b0853ab4 2594 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
7caa4715
TH
2595
2596 /*
2597 * If no one's waiting and within budget, issue right away. The
2598 * tests are racy but the races aren't systemic - we only miss once
2599 * in a while which is fine.
2600 */
0b80f986 2601 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
7caa4715 2602 time_before_eq64(vtime + cost, now.vnow)) {
97eb1975 2603 iocg_commit_bio(iocg, bio, abs_cost, cost);
7caa4715
TH
2604 return;
2605 }
2606
36a52481 2607 /*
da437b95
TH
2608 * We're over budget. This can be handled in two ways. IOs which may
2609 * cause priority inversions are punted to @ioc->aux_iocg and charged as
2610 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2611 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2612 * whether debt handling is needed and acquire locks accordingly.
0b80f986 2613 */
da437b95
TH
2614 use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2615 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
b0853ab4 2616retry_lock:
da437b95
TH
2617 iocg_lock(iocg, ioc_locked, &flags);
2618
2619 /*
2620 * @iocg must stay activated for debt and waitq handling. Deactivation
2621 * is synchronized against both ioc->lock and waitq.lock and we won't
2622 * get deactivated as long as we're waiting or has debt, so we're good
2623 * if we're activated here. In the unlikely cases that we aren't, just
2624 * issue the IO.
2625 */
0b80f986 2626 if (unlikely(list_empty(&iocg->active_list))) {
da437b95 2627 iocg_unlock(iocg, ioc_locked, &flags);
97eb1975 2628 iocg_commit_bio(iocg, bio, abs_cost, cost);
0b80f986
TH
2629 return;
2630 }
2631
2632 /*
2633 * We're over budget. If @bio has to be issued regardless, remember
2634 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2635 * off the debt before waking more IOs.
2636 *
36a52481 2637 * This way, the debt is continuously paid off each period with the
0b80f986
TH
2638 * actual budget available to the cgroup. If we just wound vtime, we
2639 * would incorrectly use the current hw_inuse for the entire amount
2640 * which, for example, can lead to the cgroup staying blocked for a
2641 * long time even with substantially raised hw_inuse.
2642 *
2643 * An iocg with vdebt should stay online so that the timer can keep
2644 * deducting its vdebt and [de]activate use_delay mechanism
2645 * accordingly. We don't want to race against the timer trying to
2646 * clear them and leave @iocg inactive w/ dangling use_delay heavily
2647 * penalizing the cgroup and its descendants.
36a52481 2648 */
da437b95 2649 if (use_debt) {
c421a3eb 2650 iocg_incur_debt(iocg, abs_cost, &now);
54c52e10 2651 if (iocg_kick_delay(iocg, &now))
de185b56 2652 blkcg_schedule_throttle(rqos->q->disk,
d7bd15a1 2653 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
da437b95 2654 iocg_unlock(iocg, ioc_locked, &flags);
7caa4715
TH
2655 return;
2656 }
2657
b0853ab4 2658 /* guarantee that iocgs w/ waiters have maximum inuse */
c421a3eb 2659 if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
b0853ab4
TH
2660 if (!ioc_locked) {
2661 iocg_unlock(iocg, false, &flags);
2662 ioc_locked = true;
2663 goto retry_lock;
2664 }
2665 propagate_weights(iocg, iocg->active, iocg->active, true,
2666 &now);
2667 }
2668
7caa4715
TH
2669 /*
2670 * Append self to the waitq and schedule the wakeup timer if we're
2671 * the first waiter. The timer duration is calculated based on the
2672 * current vrate. vtime and hweight changes can make it too short
2673 * or too long. Each wait entry records the absolute cost it's
2674 * waiting for to allow re-evaluation using a custom wait entry.
2675 *
2676 * If too short, the timer simply reschedules itself. If too long,
2677 * the period timer will notice and trigger wakeups.
2678 *
2679 * All waiters are on iocg->waitq and the wait states are
2680 * synchronized using waitq.lock.
2681 */
7caa4715
TH
2682 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2683 wait.wait.private = current;
2684 wait.bio = bio;
2685 wait.abs_cost = abs_cost;
2686 wait.committed = false; /* will be set true by waker */
2687
2688 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
da437b95 2689 iocg_kick_waitq(iocg, ioc_locked, &now);
7caa4715 2690
da437b95 2691 iocg_unlock(iocg, ioc_locked, &flags);
7caa4715
TH
2692
2693 while (true) {
2694 set_current_state(TASK_UNINTERRUPTIBLE);
2695 if (wait.committed)
2696 break;
2697 io_schedule();
2698 }
2699
2700 /* waker already committed us, proceed */
2701 finish_wait(&iocg->waitq, &wait.wait);
2702}
2703
2704static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2705 struct bio *bio)
2706{
2707 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
d16baa3f 2708 struct ioc *ioc = rqos_to_ioc(rqos);
7caa4715 2709 sector_t bio_end = bio_end_sector(bio);
e1518f63 2710 struct ioc_now now;
b0853ab4 2711 u64 vtime, abs_cost, cost;
0b80f986 2712 unsigned long flags;
7caa4715 2713
d16baa3f
TH
2714 /* bypass if disabled, still initializing, or for root cgroup */
2715 if (!ioc->enabled || !iocg || !iocg->level)
7caa4715
TH
2716 return;
2717
2718 abs_cost = calc_vtime_cost(bio, iocg, true);
2719 if (!abs_cost)
2720 return;
2721
e1518f63 2722 ioc_now(ioc, &now);
b0853ab4
TH
2723
2724 vtime = atomic64_read(&iocg->vtime);
2725 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
e1518f63 2726
7caa4715
TH
2727 /* update cursor if backmerging into the request at the cursor */
2728 if (blk_rq_pos(rq) < bio_end &&
2729 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2730 iocg->cursor = bio_end;
2731
e1518f63 2732 /*
0b80f986
TH
2733 * Charge if there's enough vtime budget and the existing request has
2734 * cost assigned.
e1518f63
TH
2735 */
2736 if (rq->bio && rq->bio->bi_iocost_cost &&
0b80f986 2737 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
97eb1975 2738 iocg_commit_bio(iocg, bio, abs_cost, cost);
0b80f986
TH
2739 return;
2740 }
2741
2742 /*
2743 * Otherwise, account it as debt if @iocg is online, which it should
2744 * be for the vast majority of cases. See debt handling in
2745 * ioc_rqos_throttle() for details.
2746 */
c421a3eb
TH
2747 spin_lock_irqsave(&ioc->lock, flags);
2748 spin_lock(&iocg->waitq.lock);
2749
0b80f986 2750 if (likely(!list_empty(&iocg->active_list))) {
c421a3eb
TH
2751 iocg_incur_debt(iocg, abs_cost, &now);
2752 if (iocg_kick_delay(iocg, &now))
de185b56 2753 blkcg_schedule_throttle(rqos->q->disk,
c421a3eb 2754 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
0b80f986 2755 } else {
97eb1975 2756 iocg_commit_bio(iocg, bio, abs_cost, cost);
0b80f986 2757 }
c421a3eb
TH
2758
2759 spin_unlock(&iocg->waitq.lock);
2760 spin_unlock_irqrestore(&ioc->lock, flags);
7caa4715
TH
2761}
2762
2763static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2764{
2765 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2766
2767 if (iocg && bio->bi_iocost_cost)
2768 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2769}
2770
2771static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2772{
2773 struct ioc *ioc = rqos_to_ioc(rqos);
5e124f74 2774 struct ioc_pcpu_stat *ccs;
cd006509 2775 u64 on_q_ns, rq_wait_ns, size_nsec;
7caa4715
TH
2776 int pidx, rw;
2777
2778 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2779 return;
2780
62c159a0 2781 switch (req_op(rq)) {
7caa4715
TH
2782 case REQ_OP_READ:
2783 pidx = QOS_RLAT;
2784 rw = READ;
2785 break;
2786 case REQ_OP_WRITE:
2787 pidx = QOS_WLAT;
2788 rw = WRITE;
2789 break;
2790 default:
2791 return;
2792 }
2793
2794 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2795 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
cd006509 2796 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
7caa4715 2797
5e124f74
TH
2798 ccs = get_cpu_ptr(ioc->pcpu_stat);
2799
cd006509
TH
2800 if (on_q_ns <= size_nsec ||
2801 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
5e124f74 2802 local_inc(&ccs->missed[rw].nr_met);
7caa4715 2803 else
5e124f74
TH
2804 local_inc(&ccs->missed[rw].nr_missed);
2805
2806 local64_add(rq_wait_ns, &ccs->rq_wait_ns);
7caa4715 2807
5e124f74 2808 put_cpu_ptr(ccs);
7caa4715
TH
2809}
2810
2811static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2812{
2813 struct ioc *ioc = rqos_to_ioc(rqos);
2814
2815 spin_lock_irq(&ioc->lock);
2816 ioc_refresh_params(ioc, false);
2817 spin_unlock_irq(&ioc->lock);
2818}
2819
2820static void ioc_rqos_exit(struct rq_qos *rqos)
2821{
2822 struct ioc *ioc = rqos_to_ioc(rqos);
2823
2824 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2825
2826 spin_lock_irq(&ioc->lock);
2827 ioc->running = IOC_STOP;
2828 spin_unlock_irq(&ioc->lock);
2829
292a089d 2830 timer_shutdown_sync(&ioc->timer);
7caa4715
TH
2831 free_percpu(ioc->pcpu_stat);
2832 kfree(ioc);
2833}
2834
2835static struct rq_qos_ops ioc_rqos_ops = {
2836 .throttle = ioc_rqos_throttle,
2837 .merge = ioc_rqos_merge,
2838 .done_bio = ioc_rqos_done_bio,
2839 .done = ioc_rqos_done,
2840 .queue_depth_changed = ioc_rqos_queue_depth_changed,
2841 .exit = ioc_rqos_exit,
2842};
2843
57b64554 2844static int blk_iocost_init(struct gendisk *disk)
7caa4715 2845{
57b64554 2846 struct request_queue *q = disk->queue;
7caa4715
TH
2847 struct ioc *ioc;
2848 struct rq_qos *rqos;
5e124f74 2849 int i, cpu, ret;
7caa4715
TH
2850
2851 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2852 if (!ioc)
2853 return -ENOMEM;
2854
2855 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2856 if (!ioc->pcpu_stat) {
2857 kfree(ioc);
2858 return -ENOMEM;
2859 }
2860
5e124f74
TH
2861 for_each_possible_cpu(cpu) {
2862 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2863
2864 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2865 local_set(&ccs->missed[i].nr_met, 0);
2866 local_set(&ccs->missed[i].nr_missed, 0);
2867 }
2868 local64_set(&ccs->rq_wait_ns, 0);
2869 }
2870
7caa4715
TH
2871 rqos = &ioc->rqos;
2872 rqos->id = RQ_QOS_COST;
2873 rqos->ops = &ioc_rqos_ops;
2874 rqos->q = q;
2875
2876 spin_lock_init(&ioc->lock);
2877 timer_setup(&ioc->timer, ioc_timer_fn, 0);
2878 INIT_LIST_HEAD(&ioc->active_iocgs);
2879
2880 ioc->running = IOC_IDLE;
ac33e91e 2881 ioc->vtime_base_rate = VTIME_PER_USEC;
7caa4715 2882 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
67b7b641 2883 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
7caa4715
TH
2884 ioc->period_at = ktime_to_us(ktime_get());
2885 atomic64_set(&ioc->cur_period, 0);
2886 atomic_set(&ioc->hweight_gen, 0);
2887
2888 spin_lock_irq(&ioc->lock);
2889 ioc->autop_idx = AUTOP_INVALID;
2890 ioc_refresh_params(ioc, true);
2891 spin_unlock_irq(&ioc->lock);
2892
d16baa3f 2893 /*
7a88b1a8 2894 * rqos must be added before activation to allow ioc_pd_init() to
d16baa3f
TH
2895 * lookup the ioc from q. This means that the rqos methods may get
2896 * called before policy activation completion, can't assume that the
2897 * target bio has an iocg associated and need to test for NULL iocg.
2898 */
14a6e2eb
JH
2899 ret = rq_qos_add(q, rqos);
2900 if (ret)
2901 goto err_free_ioc;
2902
7caa4715 2903 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
14a6e2eb
JH
2904 if (ret)
2905 goto err_del_qos;
7caa4715 2906 return 0;
14a6e2eb
JH
2907
2908err_del_qos:
2909 rq_qos_del(q, rqos);
2910err_free_ioc:
2911 free_percpu(ioc->pcpu_stat);
2912 kfree(ioc);
2913 return ret;
7caa4715
TH
2914}
2915
2916static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2917{
2918 struct ioc_cgrp *iocc;
2919
2920 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
e916ad29
TH
2921 if (!iocc)
2922 return NULL;
7caa4715 2923
bd0adb91 2924 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
7caa4715
TH
2925 return &iocc->cpd;
2926}
2927
2928static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2929{
2930 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2931}
2932
2933static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2934 struct blkcg *blkcg)
2935{
2936 int levels = blkcg->css.cgroup->level + 1;
2937 struct ioc_gq *iocg;
2938
f61d6e25 2939 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
7caa4715
TH
2940 if (!iocg)
2941 return NULL;
2942
97eb1975
TH
2943 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2944 if (!iocg->pcpu_stat) {
2945 kfree(iocg);
2946 return NULL;
2947 }
2948
7caa4715
TH
2949 return &iocg->pd;
2950}
2951
2952static void ioc_pd_init(struct blkg_policy_data *pd)
2953{
2954 struct ioc_gq *iocg = pd_to_iocg(pd);
2955 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2956 struct ioc *ioc = q_to_ioc(blkg->q);
2957 struct ioc_now now;
2958 struct blkcg_gq *tblkg;
2959 unsigned long flags;
2960
2961 ioc_now(ioc, &now);
2962
2963 iocg->ioc = ioc;
2964 atomic64_set(&iocg->vtime, now.vnow);
2965 atomic64_set(&iocg->done_vtime, now.vnow);
2966 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2967 INIT_LIST_HEAD(&iocg->active_list);
97eb1975 2968 INIT_LIST_HEAD(&iocg->walk_list);
8692d2db 2969 INIT_LIST_HEAD(&iocg->surplus_list);
fe20cdb5
TH
2970 iocg->hweight_active = WEIGHT_ONE;
2971 iocg->hweight_inuse = WEIGHT_ONE;
7caa4715
TH
2972
2973 init_waitqueue_head(&iocg->waitq);
2974 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2975 iocg->waitq_timer.function = iocg_waitq_timer_fn;
7caa4715
TH
2976
2977 iocg->level = blkg->blkcg->css.cgroup->level;
2978
2979 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2980 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2981 iocg->ancestors[tiocg->level] = tiocg;
2982 }
2983
2984 spin_lock_irqsave(&ioc->lock, flags);
b0853ab4 2985 weight_updated(iocg, &now);
7caa4715
TH
2986 spin_unlock_irqrestore(&ioc->lock, flags);
2987}
2988
2989static void ioc_pd_free(struct blkg_policy_data *pd)
2990{
2991 struct ioc_gq *iocg = pd_to_iocg(pd);
2992 struct ioc *ioc = iocg->ioc;
5aeac7c4 2993 unsigned long flags;
7caa4715
TH
2994
2995 if (ioc) {
5aeac7c4 2996 spin_lock_irqsave(&ioc->lock, flags);
97eb1975 2997
7caa4715 2998 if (!list_empty(&iocg->active_list)) {
b0853ab4
TH
2999 struct ioc_now now;
3000
3001 ioc_now(ioc, &now);
3002 propagate_weights(iocg, 0, 0, false, &now);
7caa4715
TH
3003 list_del_init(&iocg->active_list);
3004 }
97eb1975
TH
3005
3006 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
8692d2db 3007 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
97eb1975 3008
5aeac7c4 3009 spin_unlock_irqrestore(&ioc->lock, flags);
e036c4ca
TH
3010
3011 hrtimer_cancel(&iocg->waitq_timer);
7caa4715 3012 }
97eb1975 3013 free_percpu(iocg->pcpu_stat);
7caa4715
TH
3014 kfree(iocg);
3015}
3016
3607849d 3017static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
97eb1975
TH
3018{
3019 struct ioc_gq *iocg = pd_to_iocg(pd);
3020 struct ioc *ioc = iocg->ioc;
97eb1975
TH
3021
3022 if (!ioc->enabled)
3607849d 3023 return;
97eb1975
TH
3024
3025 if (iocg->level == 0) {
3026 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
ac33e91e 3027 ioc->vtime_base_rate * 10000,
97eb1975 3028 VTIME_PER_USEC);
252c651a 3029 seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
97eb1975
TH
3030 }
3031
252c651a 3032 seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
97eb1975 3033
f0bf84a5 3034 if (blkcg_debug_stats)
252c651a
CH
3035 seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3036 iocg->last_stat.wait_us,
3037 iocg->last_stat.indebt_us,
3038 iocg->last_stat.indelay_us);
97eb1975
TH
3039}
3040
7caa4715
TH
3041static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3042 int off)
3043{
3044 const char *dname = blkg_dev_name(pd->blkg);
3045 struct ioc_gq *iocg = pd_to_iocg(pd);
3046
3047 if (dname && iocg->cfg_weight)
bd0adb91 3048 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
7caa4715
TH
3049 return 0;
3050}
3051
3052
3053static int ioc_weight_show(struct seq_file *sf, void *v)
3054{
3055 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3056 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3057
bd0adb91 3058 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
7caa4715
TH
3059 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3060 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3061 return 0;
3062}
3063
3064static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3065 size_t nbytes, loff_t off)
3066{
3067 struct blkcg *blkcg = css_to_blkcg(of_css(of));
3068 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3069 struct blkg_conf_ctx ctx;
b0853ab4 3070 struct ioc_now now;
7caa4715
TH
3071 struct ioc_gq *iocg;
3072 u32 v;
3073 int ret;
3074
3075 if (!strchr(buf, ':')) {
3076 struct blkcg_gq *blkg;
3077
3078 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3079 return -EINVAL;
3080
3081 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3082 return -EINVAL;
3083
11431e26 3084 spin_lock_irq(&blkcg->lock);
bd0adb91 3085 iocc->dfl_weight = v * WEIGHT_ONE;
7caa4715
TH
3086 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3087 struct ioc_gq *iocg = blkg_to_iocg(blkg);
3088
3089 if (iocg) {
11431e26 3090 spin_lock(&iocg->ioc->lock);
b0853ab4
TH
3091 ioc_now(iocg->ioc, &now);
3092 weight_updated(iocg, &now);
11431e26 3093 spin_unlock(&iocg->ioc->lock);
7caa4715
TH
3094 }
3095 }
11431e26 3096 spin_unlock_irq(&blkcg->lock);
7caa4715
TH
3097
3098 return nbytes;
3099 }
3100
3101 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
3102 if (ret)
3103 return ret;
3104
3105 iocg = blkg_to_iocg(ctx.blkg);
3106
3107 if (!strncmp(ctx.body, "default", 7)) {
3108 v = 0;
3109 } else {
3110 if (!sscanf(ctx.body, "%u", &v))
3111 goto einval;
3112 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3113 goto einval;
3114 }
3115
41591a51 3116 spin_lock(&iocg->ioc->lock);
bd0adb91 3117 iocg->cfg_weight = v * WEIGHT_ONE;
b0853ab4
TH
3118 ioc_now(iocg->ioc, &now);
3119 weight_updated(iocg, &now);
41591a51 3120 spin_unlock(&iocg->ioc->lock);
7caa4715
TH
3121
3122 blkg_conf_finish(&ctx);
3123 return nbytes;
3124
3125einval:
3126 blkg_conf_finish(&ctx);
3127 return -EINVAL;
3128}
3129
3130static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3131 int off)
3132{
3133 const char *dname = blkg_dev_name(pd->blkg);
3134 struct ioc *ioc = pd_to_iocg(pd)->ioc;
3135
3136 if (!dname)
3137 return 0;
3138
35198e32 3139 spin_lock_irq(&ioc->lock);
7caa4715
TH
3140 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3141 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3142 ioc->params.qos[QOS_RPPM] / 10000,
3143 ioc->params.qos[QOS_RPPM] % 10000 / 100,
3144 ioc->params.qos[QOS_RLAT],
3145 ioc->params.qos[QOS_WPPM] / 10000,
3146 ioc->params.qos[QOS_WPPM] % 10000 / 100,
3147 ioc->params.qos[QOS_WLAT],
3148 ioc->params.qos[QOS_MIN] / 10000,
3149 ioc->params.qos[QOS_MIN] % 10000 / 100,
3150 ioc->params.qos[QOS_MAX] / 10000,
3151 ioc->params.qos[QOS_MAX] % 10000 / 100);
35198e32 3152 spin_unlock_irq(&ioc->lock);
7caa4715
TH
3153 return 0;
3154}
3155
3156static int ioc_qos_show(struct seq_file *sf, void *v)
3157{
3158 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3159
3160 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3161 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3162 return 0;
3163}
3164
3165static const match_table_t qos_ctrl_tokens = {
3166 { QOS_ENABLE, "enable=%u" },
3167 { QOS_CTRL, "ctrl=%s" },
3168 { NR_QOS_CTRL_PARAMS, NULL },
3169};
3170
3171static const match_table_t qos_tokens = {
3172 { QOS_RPPM, "rpct=%s" },
3173 { QOS_RLAT, "rlat=%u" },
3174 { QOS_WPPM, "wpct=%s" },
3175 { QOS_WLAT, "wlat=%u" },
3176 { QOS_MIN, "min=%s" },
3177 { QOS_MAX, "max=%s" },
3178 { NR_QOS_PARAMS, NULL },
3179};
3180
3181static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3182 size_t nbytes, loff_t off)
3183{
22ae8ce8 3184 struct block_device *bdev;
3657647e 3185 struct gendisk *disk;
7caa4715
TH
3186 struct ioc *ioc;
3187 u32 qos[NR_QOS_PARAMS];
3188 bool enable, user;
3189 char *p;
3190 int ret;
3191
22ae8ce8
CH
3192 bdev = blkcg_conf_open_bdev(&input);
3193 if (IS_ERR(bdev))
3194 return PTR_ERR(bdev);
7caa4715 3195
3657647e 3196 disk = bdev->bd_disk;
235a5a83
YK
3197 if (!queue_is_mq(disk->queue)) {
3198 ret = -EOPNOTSUPP;
3199 goto err;
3200 }
3201
3657647e 3202 ioc = q_to_ioc(disk->queue);
7caa4715 3203 if (!ioc) {
3657647e 3204 ret = blk_iocost_init(disk);
7caa4715
TH
3205 if (ret)
3206 goto err;
3657647e 3207 ioc = q_to_ioc(disk->queue);
7caa4715
TH
3208 }
3209
2b2da2f6
YK
3210 blk_mq_freeze_queue(disk->queue);
3211 blk_mq_quiesce_queue(disk->queue);
3212
7caa4715
TH
3213 spin_lock_irq(&ioc->lock);
3214 memcpy(qos, ioc->params.qos, sizeof(qos));
3215 enable = ioc->enabled;
3216 user = ioc->user_qos_params;
7caa4715
TH
3217
3218 while ((p = strsep(&input, " \t\n"))) {
3219 substring_t args[MAX_OPT_ARGS];
3220 char buf[32];
3221 int tok;
3222 s64 v;
3223
3224 if (!*p)
3225 continue;
3226
3227 switch (match_token(p, qos_ctrl_tokens, args)) {
3228 case QOS_ENABLE:
7b7c5ae4
YK
3229 if (match_u64(&args[0], &v))
3230 goto einval;
7caa4715
TH
3231 enable = v;
3232 continue;
3233 case QOS_CTRL:
3234 match_strlcpy(buf, &args[0], sizeof(buf));
3235 if (!strcmp(buf, "auto"))
3236 user = false;
3237 else if (!strcmp(buf, "user"))
3238 user = true;
3239 else
3240 goto einval;
3241 continue;
3242 }
3243
3244 tok = match_token(p, qos_tokens, args);
3245 switch (tok) {
3246 case QOS_RPPM:
3247 case QOS_WPPM:
3248 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3249 sizeof(buf))
3250 goto einval;
3251 if (cgroup_parse_float(buf, 2, &v))
3252 goto einval;
3253 if (v < 0 || v > 10000)
3254 goto einval;
3255 qos[tok] = v * 100;
3256 break;
3257 case QOS_RLAT:
3258 case QOS_WLAT:
3259 if (match_u64(&args[0], &v))
3260 goto einval;
3261 qos[tok] = v;
3262 break;
3263 case QOS_MIN:
3264 case QOS_MAX:
3265 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3266 sizeof(buf))
3267 goto einval;
3268 if (cgroup_parse_float(buf, 2, &v))
3269 goto einval;
3270 if (v < 0)
3271 goto einval;
3272 qos[tok] = clamp_t(s64, v * 100,
3273 VRATE_MIN_PPM, VRATE_MAX_PPM);
3274 break;
3275 default:
3276 goto einval;
3277 }
3278 user = true;
3279 }
3280
3281 if (qos[QOS_MIN] > qos[QOS_MAX])
3282 goto einval;
3283
7caa4715 3284 if (enable) {
3657647e
CH
3285 blk_stat_enable_accounting(disk->queue);
3286 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
7caa4715 3287 ioc->enabled = true;
8796acbc 3288 wbt_disable_default(disk->queue);
7caa4715 3289 } else {
3657647e 3290 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
7caa4715 3291 ioc->enabled = false;
8796acbc 3292 wbt_enable_default(disk->queue);
7caa4715
TH
3293 }
3294
3295 if (user) {
3296 memcpy(ioc->params.qos, qos, sizeof(qos));
3297 ioc->user_qos_params = true;
3298 } else {
3299 ioc->user_qos_params = false;
3300 }
3301
3302 ioc_refresh_params(ioc, true);
3303 spin_unlock_irq(&ioc->lock);
3304
2b2da2f6
YK
3305 blk_mq_unquiesce_queue(disk->queue);
3306 blk_mq_unfreeze_queue(disk->queue);
3307
22ae8ce8 3308 blkdev_put_no_open(bdev);
7caa4715
TH
3309 return nbytes;
3310einval:
2c064798 3311 spin_unlock_irq(&ioc->lock);
2b2da2f6
YK
3312
3313 blk_mq_unquiesce_queue(disk->queue);
3314 blk_mq_unfreeze_queue(disk->queue);
3315
7caa4715
TH
3316 ret = -EINVAL;
3317err:
22ae8ce8 3318 blkdev_put_no_open(bdev);
7caa4715
TH
3319 return ret;
3320}
3321
3322static u64 ioc_cost_model_prfill(struct seq_file *sf,
3323 struct blkg_policy_data *pd, int off)
3324{
3325 const char *dname = blkg_dev_name(pd->blkg);
3326 struct ioc *ioc = pd_to_iocg(pd)->ioc;
3327 u64 *u = ioc->params.i_lcoefs;
3328
3329 if (!dname)
3330 return 0;
3331
35198e32 3332 spin_lock_irq(&ioc->lock);
7caa4715
TH
3333 seq_printf(sf, "%s ctrl=%s model=linear "
3334 "rbps=%llu rseqiops=%llu rrandiops=%llu "
3335 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3336 dname, ioc->user_cost_model ? "user" : "auto",
3337 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3338 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
35198e32 3339 spin_unlock_irq(&ioc->lock);
7caa4715
TH
3340 return 0;
3341}
3342
3343static int ioc_cost_model_show(struct seq_file *sf, void *v)
3344{
3345 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3346
3347 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3348 &blkcg_policy_iocost, seq_cft(sf)->private, false);
3349 return 0;
3350}
3351
3352static const match_table_t cost_ctrl_tokens = {
3353 { COST_CTRL, "ctrl=%s" },
3354 { COST_MODEL, "model=%s" },
3355 { NR_COST_CTRL_PARAMS, NULL },
3356};
3357
3358static const match_table_t i_lcoef_tokens = {
3359 { I_LCOEF_RBPS, "rbps=%u" },
3360 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
3361 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
3362 { I_LCOEF_WBPS, "wbps=%u" },
3363 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
3364 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
3365 { NR_I_LCOEFS, NULL },
3366};
3367
3368static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3369 size_t nbytes, loff_t off)
3370{
22ae8ce8 3371 struct block_device *bdev;
2b2da2f6 3372 struct request_queue *q;
7caa4715
TH
3373 struct ioc *ioc;
3374 u64 u[NR_I_LCOEFS];
3375 bool user;
3376 char *p;
3377 int ret;
3378
22ae8ce8
CH
3379 bdev = blkcg_conf_open_bdev(&input);
3380 if (IS_ERR(bdev))
3381 return PTR_ERR(bdev);
7caa4715 3382
2b2da2f6 3383 q = bdev_get_queue(bdev);
235a5a83
YK
3384 if (!queue_is_mq(q)) {
3385 ret = -EOPNOTSUPP;
3386 goto err;
3387 }
3388
2b2da2f6 3389 ioc = q_to_ioc(q);
7caa4715 3390 if (!ioc) {
57b64554 3391 ret = blk_iocost_init(bdev->bd_disk);
7caa4715
TH
3392 if (ret)
3393 goto err;
2b2da2f6 3394 ioc = q_to_ioc(q);
7caa4715
TH
3395 }
3396
2b2da2f6
YK
3397 blk_mq_freeze_queue(q);
3398 blk_mq_quiesce_queue(q);
3399
7caa4715
TH
3400 spin_lock_irq(&ioc->lock);
3401 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3402 user = ioc->user_cost_model;
7caa4715
TH
3403
3404 while ((p = strsep(&input, " \t\n"))) {
3405 substring_t args[MAX_OPT_ARGS];
3406 char buf[32];
3407 int tok;
3408 u64 v;
3409
3410 if (!*p)
3411 continue;
3412
3413 switch (match_token(p, cost_ctrl_tokens, args)) {
3414 case COST_CTRL:
3415 match_strlcpy(buf, &args[0], sizeof(buf));
3416 if (!strcmp(buf, "auto"))
3417 user = false;
3418 else if (!strcmp(buf, "user"))
3419 user = true;
3420 else
3421 goto einval;
3422 continue;
3423 case COST_MODEL:
3424 match_strlcpy(buf, &args[0], sizeof(buf));
3425 if (strcmp(buf, "linear"))
3426 goto einval;
3427 continue;
3428 }
3429
3430 tok = match_token(p, i_lcoef_tokens, args);
3431 if (tok == NR_I_LCOEFS)
3432 goto einval;
3433 if (match_u64(&args[0], &v))
3434 goto einval;
3435 u[tok] = v;
3436 user = true;
3437 }
3438
7caa4715
TH
3439 if (user) {
3440 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3441 ioc->user_cost_model = true;
3442 } else {
3443 ioc->user_cost_model = false;
3444 }
3445 ioc_refresh_params(ioc, true);
3446 spin_unlock_irq(&ioc->lock);
3447
2b2da2f6
YK
3448 blk_mq_unquiesce_queue(q);
3449 blk_mq_unfreeze_queue(q);
3450
22ae8ce8 3451 blkdev_put_no_open(bdev);
7caa4715
TH
3452 return nbytes;
3453
3454einval:
2c064798 3455 spin_unlock_irq(&ioc->lock);
2b2da2f6
YK
3456
3457 blk_mq_unquiesce_queue(q);
3458 blk_mq_unfreeze_queue(q);
3459
7caa4715
TH
3460 ret = -EINVAL;
3461err:
22ae8ce8 3462 blkdev_put_no_open(bdev);
7caa4715
TH
3463 return ret;
3464}
3465
3466static struct cftype ioc_files[] = {
3467 {
3468 .name = "weight",
3469 .flags = CFTYPE_NOT_ON_ROOT,
3470 .seq_show = ioc_weight_show,
3471 .write = ioc_weight_write,
3472 },
3473 {
3474 .name = "cost.qos",
3475 .flags = CFTYPE_ONLY_ON_ROOT,
3476 .seq_show = ioc_qos_show,
3477 .write = ioc_qos_write,
3478 },
3479 {
3480 .name = "cost.model",
3481 .flags = CFTYPE_ONLY_ON_ROOT,
3482 .seq_show = ioc_cost_model_show,
3483 .write = ioc_cost_model_write,
3484 },
3485 {}
3486};
3487
3488static struct blkcg_policy blkcg_policy_iocost = {
3489 .dfl_cftypes = ioc_files,
3490 .cpd_alloc_fn = ioc_cpd_alloc,
3491 .cpd_free_fn = ioc_cpd_free,
3492 .pd_alloc_fn = ioc_pd_alloc,
3493 .pd_init_fn = ioc_pd_init,
3494 .pd_free_fn = ioc_pd_free,
97eb1975 3495 .pd_stat_fn = ioc_pd_stat,
7caa4715
TH
3496};
3497
3498static int __init ioc_init(void)
3499{
3500 return blkcg_policy_register(&blkcg_policy_iocost);
3501}
3502
3503static void __exit ioc_exit(void)
3504{
fa1c3eaf 3505 blkcg_policy_unregister(&blkcg_policy_iocost);
7caa4715
TH
3506}
3507
3508module_init(ioc_init);
3509module_exit(ioc_exit);