// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P > R and B > M. If P is infinite, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for a previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/
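
/*	Worked example of the latency estimate above (illustrative figures,
	not tied to any particular setup): with rate R = 125 Kbytes/sec
	(1 Mbit/s), bucket depth B = 10 Kbytes and backlog limit L = 20 Kbytes,
	the steady-state term gives (L-B)/R = 10000/125000 sec ~= 80 ms; with
	no peak rate configured the second term vanishes, so lat ~= 80 ms.
	A tc configuration roughly matching these numbers (device name and
	values are arbitrary) would be something like:

	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb limit 20kb
*/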

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is :
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}

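/* psched_ns_t2l() example (illustrative figures): at rate_bytes_ps = 1250000
 * (10 Mbit/s) a 1 ms interval corresponds to 1250 bytes; with
 * TC_LINKLAYER_ATM the result is further scaled by 48/53, since each 53-byte
 * ATM cell carries only 48 bytes of payload.
 */
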
/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		skb_mark_not_on_list(segs);
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		len += segs->len;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = qdisc_pkt_len(skb);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) &&
		    skb_gso_validate_mac_len(skb, q->max_size))
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->qstats.backlog += len;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}

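/* Worked example of the token arithmetic in tbf_dequeue() (illustrative
 * figures): with rate = 125000 bytes/sec (1 Mbit/s) and buffer = 80 ms worth
 * of tokens (10 Kbytes at that rate), an 1500-byte head packet costs
 * psched_l2t_ns() ~= 12 ms.  After 100 ms of idle time and no leftover
 * tokens, toks = min(100 ms, 80 ms) - 12 ms = 68 ms >= 0, so the packet is
 * sent and q->tokens becomes ~68 ms.  Had toks gone negative, the watchdog
 * would have been armed for -toks nanoseconds in the future.
 */
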
static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->qstats.backlog = 0;
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]		= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST]		= { .type = NLA_U32 },
	[TCA_TBF_PBURST]	= { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
					  NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB],
					      NULL));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}

		/* child is fifo, no need to check for noop_qdisc */
		qdisc_hash_add(child, true);
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_flush_backlog(q->qdisc);
		qdisc_put(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");