// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic:
 * Fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as fallback, with a 32 bit wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one Round Robin queue (new or old flows).
 *
 * Burst avoidance (aka pacing) capability :
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 *  - lookup one RB tree (out of 1024 or more) to find the flow.
 *    If the flow does not exist, create it and add it to the tree.
 *    Add skb to the per flow list of skbs (fifo).
 *  - Use a special fifo for high prio packets.
 *
 * dequeue() : serves flows in Round Robin.
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it is expected to send additional
 * packets, or the SLAB cache will reuse the socket for another flow).
 */

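/* Example setup (illustrative, not part of the kernel tree; see tc-fq(8)):
 *
 *	tc qdisc replace dev eth0 root fq
 *
 * Per-flow pacing then follows sk->sk_pacing_rate, unless an optional
 * global "maxrate" limit is also configured on the qdisc.
 */
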
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>

struct fq_skb_cb {
	u64	time_to_send;
	u8	band;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in a linear list (head,tail), otherwise they are placed in an rbtree (t_root).
 */
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	union {
		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
		/* Following field is only used for q->internal,
		 * because q->internal is not hashed in fq_root[]
		 */
		u64		stat_fastpath_packets;
	};
	struct sock	*sk;
	u32		socket_hash;	/* sk_hash */
	int		qlen;		/* number of packets in flow queue */

/* Second cache line */
	int		credit;
	int		band;
	struct fq_flow	*next;		/* next pointer in RR lists */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
};

struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_perband_flows {
	struct fq_flow_head new_flows;
	struct fq_flow_head old_flows;
	int		    credit;
	int		    quantum; /* based on band nr : 576KB, 192KB, 64KB */
};

#define FQ_PRIO2BAND_CRUMB_SIZE ((TC_PRIO_MAX + 1) >> 2)

struct fq_sched_data {
/* Read mostly cache line */

	u32		quantum;
	u32		initial_quantum;
	u32		flow_refill_delay;
	u32		flow_plimit;	/* max packets per flow */
	unsigned long	flow_max_rate;	/* optional max rate per flow */
	u64		ce_threshold;
	u64		horizon;	/* horizon in ns */
	u32		orphan_mask;	/* mask for orphaned skb */
	u32		low_rate_threshold;
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;
	u8		horizon_drop;
	u8		prio2band[FQ_PRIO2BAND_CRUMB_SIZE];
	u32		timer_slack; /* hrtimer slack in ns */

/* Read/Write fields. */

	unsigned int	band_nr; /* band being serviced in fq_dequeue() */

	struct fq_perband_flows band_flows[FQ_BANDS];

	struct fq_flow	internal;	/* fastpath queue. */
	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;
	unsigned long	unthrottle_latency_ns;

	u32		band_pkt_count[FQ_BANDS];
	u32		flows;
	u32		inactive_flows; /* Flows with no packet to send. */
	u32		throttled_flows;

	u64		stat_throttled;
	struct qdisc_watchdog watchdog;
	u64		stat_gc_flows;

/* Seldom used fields. */

	u64		stat_band_drops[FQ_BANDS];
	u64		stat_ce_mark;
	u64		stat_horizon_drops;
	u64		stat_horizon_caps;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;
};

/* return the i-th 2-bit value ("crumb") */
static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
{
	return (READ_ONCE(prio2band[prio / 4]) >> (2 * (prio & 0x3))) & 0x3;
}
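/* Example: if prio2band[0] == 0x24 (crumbs 00,10,01,00 from high to low bits),
 * fq_prio2band() maps priorities 0,1,2,3 to bands 0,1,2,0 respectively.
 */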

/*
 * f->tail and f->age share the same location.
 * We can use the low order bit to differentiate if this location points
 * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
 */
static void fq_flow_set_detached(struct fq_flow *f)
{
	f->age = jiffies | 1UL;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return !!(f->age & 1UL);
}
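/* Example: a flow emptied at jiffies == 1000 stores age == 1001 (odd),
 * while a valid f->tail pointer always has its low bit clear, so the
 * two states cannot be confused.
 */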

/* special value to mark a throttled flow (not on old/new list) */
static struct fq_flow throttled;

static bool fq_flow_is_throttled(const struct fq_flow *f)
{
	return f->next == &throttled;
}

enum new_flow {
	NEW_FLOW,
	OLD_FLOW
};

static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
			     enum new_flow list_sel)
{
	struct fq_perband_flows *pband = &q->band_flows[flow->band];
	struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
					&pband->new_flows :
					&pband->old_flows;

	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}

static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	rb_erase(&f->rate_node, &q->delayed);
	q->throttled_flows--;
	fq_flow_add_tail(q, f, OLD_FLOW);
}

static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = rb_entry(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}


static struct kmem_cache *fq_flow_cachep __read_mostly;


/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}
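/* With the constants above, a detached flow becomes eligible for gc
 * 3 seconds (FQ_GC_AGE) after it was emptied, and fq_gc() frees at
 * most FQ_GC_MAX (8) such flows per invocation.
 */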

static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct rb_node **p, *parent;
	void *tofree[FQ_GC_MAX];
	struct fq_flow *f;
	int i, fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	if (!fcnt)
		return;

	for (i = fcnt; i > 0; ) {
		f = tofree[--i];
		rb_erase(&f->fq_node, root);
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;

	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}

/* Fast path can be used if :
 * 1) Packet tstamp is in the past.
 * 2) FQ qlen == 0 OR
 *    (no flow is currently eligible for transmit,
 *     AND fast path queue has less than 8 packets)
 * 3) No SO_MAX_PACING_RATE on the socket (if any).
 * 4) No @maxrate attribute on this qdisc.
 *
 * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
 */
static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
			      u64 now)
{
	const struct fq_sched_data *q = qdisc_priv(sch);
	const struct sock *sk;

	if (fq_skb_cb(skb)->time_to_send > now)
		return false;

	if (sch->q.qlen != 0) {
		/* Even if some packets are stored in this qdisc,
		 * we can still enable fast path if all of them are
		 * scheduled in the future (ie no flows are eligible)
		 * or in the fast path queue.
		 */
		if (q->flows != q->inactive_flows + q->throttled_flows)
			return false;

		/* Do not allow fast path queue to explode, we want Fair Queue mode
		 * under pressure.
		 */
		if (q->internal.qlen >= 8)
			return false;
	}

	sk = skb->sk;
	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
	    sk->sk_max_pacing_rate != ~0UL)
		return false;

	if (q->flow_max_rate != ~0UL)
		return false;

	return true;
}

static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
				   u64 now)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 */
	if (!sk || sk_listener(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are non connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	if (fq_fastpath_check(sch, skb, now)) {
		q->internal.stat_fastpath_packets++;
		if (skb->sk == sk && q->rate_enable &&
		    READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
		return &q->internal;
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

	fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}

static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}

static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		skb->dev = qdisc_dev(sch);
	}
}

/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}

static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}
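/* Example: packets arriving with monotonically increasing time_to_send
 * (the common case for a paced TCP socket) always append to the
 * (head,tail) list in O(1); only a packet scheduled earlier than the
 * current tail is inserted into t_root, at O(log n) cost.
 */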

static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
				     const struct fq_sched_data *q, u64 now)
{
	return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
}
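/* Example: with the default 10 second horizon (see fq_init()), an skb
 * carrying an EDT timestamp more than 10 seconds in the future is either
 * dropped (horizon_drop == 1, the default) or has its timestamp capped
 * to now + q->horizon in fq_enqueue() below.
 */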

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;
	u64 now;
	u8 band;

	band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
	if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
		q->stat_band_drops[band]++;
		return qdisc_drop(skb, sch, to_free);
	}

	now = ktime_get_ns();
	if (!skb->tstamp) {
		fq_skb_cb(skb)->time_to_send = now;
	} else {
		/* Check if packet timestamp is too far in the future. */
		if (fq_packet_beyond_horizon(skb, q, now)) {
			if (q->horizon_drop) {
				q->stat_horizon_drops++;
				return qdisc_drop(skb, sch, to_free);
			}
			q->stat_horizon_caps++;
			skb->tstamp = now + q->horizon;
		}
		fq_skb_cb(skb)->time_to_send = skb->tstamp;
	}

	f = fq_classify(sch, skb, now);

	if (f != &q->internal) {
		if (unlikely(f->qlen >= q->flow_plimit)) {
			q->stat_flows_plimit++;
			return qdisc_drop(skb, sch, to_free);
		}

		if (fq_flow_is_detached(f)) {
			fq_flow_add_tail(q, f, NEW_FLOW);
			if (time_after(jiffies, f->age + q->flow_refill_delay))
				f->credit = max_t(u32, f->credit, q->quantum);
		}

		f->band = band;
		q->band_pkt_count[band]++;
		fq_skb_cb(skb)->band = band;
		if (f->qlen == 0)
			q->inactive_flows--;
	}

	f->qlen++;
	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}

static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	unsigned long sample;
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	/* Update unthrottle latency EWMA.
	 * This is cheap and can help diagnosing timer/latency problems.
	 */
	sample = (unsigned long)(now - q->time_next_delayed_flow);
	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
	q->unthrottle_latency_ns += sample >> 3;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		fq_flow_unset_throttled(q, f);
	}
}
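/* Note: unthrottle_latency_ns above is an EWMA with weight 1/8
 * (avg += (sample - avg) / 8), so roughly the last eight wakeup
 * latencies dominate the estimate.
 */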

static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
{
	if (pband->credit <= 0)
		return NULL;

	if (pband->new_flows.first)
		return &pband->new_flows;

	return pband->old_flows.first ? &pband->old_flows : NULL;
}

static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_perband_flows *pband;
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	int retry;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	skb = fq_peek(&q->internal);
	if (unlikely(skb)) {
		q->internal.qlen--;
		fq_dequeue_skb(sch, &q->internal, skb);
		goto out;
	}

	now = ktime_get_ns();
	fq_check_throttled(q, now);
	retry = 0;
	pband = &q->band_flows[q->band_nr];
begin:
	head = fq_pband_head_select(pband);
	if (!head) {
		while (++retry <= FQ_BANDS) {
			if (++q->band_nr == FQ_BANDS)
				q->band_nr = 0;
			pband = &q->band_flows[q->band_nr];
			pband->credit = min(pband->credit + pband->quantum,
					    pband->quantum);
			goto begin;
		}
		if (q->time_next_delayed_flow != ~0ULL)
			qdisc_watchdog_schedule_range_ns(&q->watchdog,
							 q->time_next_delayed_flow,
							 q->timer_slack);
		return NULL;
	}
	f = head->first;
	retry = 0;
	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(q, f, OLD_FLOW);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		prefetch(&skb->end);
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
		if (--f->qlen == 0)
			q->inactive_flows++;
		q->band_pkt_count[fq_skb_cb(skb)->band]--;
		fq_dequeue_skb(sch, f, skb);
	} else {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if (head == &pband->new_flows) {
			fq_flow_add_tail(q, f, OLD_FLOW);
		} else {
			fq_flow_set_detached(f);
		}
		goto begin;
	}
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;
	pband->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
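	/* Worked example (illustrative): plen = 1500 bytes at
	 * rate = 125000000 bytes/sec gives
	 * len = 1500 * NSEC_PER_SEC / rate = 12000 ns, i.e. the next
	 * packet of this flow is not eligible before now + 12 us.
	 */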
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed!
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}

static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}

static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	for (idx = 0; idx < FQ_BANDS; idx++) {
		q->band_flows[idx].new_flows.first = NULL;
		q->band_flows[idx].old_flows.first = NULL;
	}
	q->delayed = RB_ROOT;
	q->flows = 0;
	q->inactive_flows = 0;
	q->throttled_flows = 0;
}

static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = rb_entry(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}

static void fq_free(void *addr)
{
	kvfree(addr);
}

static int fq_resize(struct Qdisc *sch, u32 log)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *array;
	void *old_fq_root;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	/* If XPS was setup, we can allocate memory on right NUMA node */
	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
			      netdev_queue_numa_node_read(sch->dev_queue));
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	sch_tree_lock(sch);

	old_fq_root = q->fq_root;
	if (old_fq_root)
		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

	q->fq_root = array;
	WRITE_ONCE(q->fq_trees_log, log);

	sch_tree_unlock(sch);

	fq_free(old_fq_root);

	return 0;
}

static const struct netlink_range_validation iq_range = {
	.max = INT_MAX,
};

static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },

	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM]	= NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range),
	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
	[TCA_FQ_PRIOMAP]		= NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)),
	[TCA_FQ_WEIGHTS]		= NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)),
};

/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
{
	const int num_elems = TC_PRIO_MAX + 1;
	u8 tmp[FQ_PRIO2BAND_CRUMB_SIZE];
	int i;

	memset(tmp, 0, sizeof(tmp));
	for (i = 0; i < num_elems; i++)
		tmp[i / 4] |= in[i] << (2 * (i & 0x3));

	for (i = 0; i < FQ_PRIO2BAND_CRUMB_SIZE; i++)
		WRITE_ONCE(out[i], tmp[i]);
}

static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
{
	const int num_elems = TC_PRIO_MAX + 1;
	int i;

	for (i = 0; i < num_elems; i++)
		out[i] = fq_prio2band(in, i);
}

static int fq_load_weights(struct fq_sched_data *q,
			   const struct nlattr *attr,
			   struct netlink_ext_ack *extack)
{
	s32 *weights = nla_data(attr);
	int i;

	for (i = 0; i < FQ_BANDS; i++) {
		if (weights[i] < FQ_MIN_WEIGHT) {
			NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less than minimum allowed %d",
					       weights[i], FQ_MIN_WEIGHT);
			return -EINVAL;
		}
	}
	for (i = 0; i < FQ_BANDS; i++)
		WRITE_ONCE(q->band_flows[i].quantum, weights[i]);
	return 0;
}

static int fq_load_priomap(struct fq_sched_data *q,
			   const struct nlattr *attr,
			   struct netlink_ext_ack *extack)
{
	const struct tc_prio_qopt *map = nla_data(attr);
	int i;

	if (map->bands != FQ_BANDS) {
		NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
		return -EINVAL;
	}
	for (i = 0; i < TC_PRIO_MAX + 1; i++) {
		if (map->priomap[i] >= FQ_BANDS) {
			NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
					       i, map->priomap[i]);
			return -EINVAL;
		}
	}
	fq_prio2band_compress_crumb(map->priomap, q->prio2band);
	return 0;
}

static int fq_change(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	unsigned int drop_len = 0;
	u32 fq_log;

	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		WRITE_ONCE(sch->limit,
			   nla_get_u32(tb[TCA_FQ_PLIMIT]));

	if (tb[TCA_FQ_FLOW_PLIMIT])
		WRITE_ONCE(q->flow_plimit,
			   nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]));

	if (tb[TCA_FQ_QUANTUM]) {
		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

		if (quantum > 0 && quantum <= (1 << 20)) {
			WRITE_ONCE(q->quantum, quantum);
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
			err = -EINVAL;
		}
	}

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		WRITE_ONCE(q->initial_quantum,
			   nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]));

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

		WRITE_ONCE(q->flow_max_rate,
			   (rate == ~0U) ? ~0UL : rate);
	}
	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
		WRITE_ONCE(q->low_rate_threshold,
			   nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]));

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			WRITE_ONCE(q->rate_enable,
				   enable);
		else
			err = -EINVAL;
	}

	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

		WRITE_ONCE(q->flow_refill_delay,
			   usecs_to_jiffies(usecs_delay));
	}

	if (!err && tb[TCA_FQ_PRIOMAP])
		err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);

	if (!err && tb[TCA_FQ_WEIGHTS])
		err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);

	if (tb[TCA_FQ_ORPHAN_MASK])
		WRITE_ONCE(q->orphan_mask,
			   nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]));

	if (tb[TCA_FQ_CE_THRESHOLD])
		WRITE_ONCE(q->ce_threshold,
			   (u64)NSEC_PER_USEC *
			   nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]));

	if (tb[TCA_FQ_TIMER_SLACK])
		WRITE_ONCE(q->timer_slack,
			   nla_get_u32(tb[TCA_FQ_TIMER_SLACK]));

	if (tb[TCA_FQ_HORIZON])
		WRITE_ONCE(q->horizon,
			   (u64)NSEC_PER_USEC *
			   nla_get_u32(tb[TCA_FQ_HORIZON]));

	if (tb[TCA_FQ_HORIZON_DROP])
		WRITE_ONCE(q->horizon_drop,
			   nla_get_u8(tb[TCA_FQ_HORIZON_DROP]));

	if (!err) {
		sch_tree_unlock(sch);
		err = fq_resize(sch, fq_log);
		sch_tree_lock(sch);
	}
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		if (!skb)
			break;
		drop_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
		drop_count++;
	}
	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

	sch_tree_unlock(sch);
	return err;
}

static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	fq_reset(sch);
	fq_free(q->fq_root);
	qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt,
		   struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int i, err;

	sch->limit		= 10000;
	q->flow_plimit		= 100;
	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
	q->flow_refill_delay	= msecs_to_jiffies(40);
	q->flow_max_rate	= ~0UL;
	q->time_next_delayed_flow = ~0ULL;
	q->rate_enable		= 1;
	for (i = 0; i < FQ_BANDS; i++) {
		q->band_flows[i].new_flows.first = NULL;
		q->band_flows[i].old_flows.first = NULL;
	}
	q->band_flows[0].quantum = 9 << 16;
	q->band_flows[1].quantum = 3 << 16;
	q->band_flows[2].quantum = 1 << 16;
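	/* 9 << 16, 3 << 16 and 1 << 16 give default per-band quanta of
	 * 576KB, 192KB and 64KB, i.e. 9:3:1 weights between the bands.
	 */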
	q->delayed		= RB_ROOT;
	q->fq_root		= NULL;
	q->fq_trees_log		= ilog2(1024);
	q->orphan_mask		= 1024 - 1;
	q->low_rate_threshold	= 550000 / 8;

	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */

	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
	q->horizon_drop = 1; /* by default, drop packets beyond horizon */

	/* Default ce_threshold of 4294 seconds */
	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;

	fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

	if (opt)
		err = fq_change(sch, opt, extack);
	else
		err = fq_resize(sch, q->fq_trees_log);

	return err;
}

static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_prio_qopt prio = {
		.bands = FQ_BANDS,
	};
	struct nlattr *opts;
	u64 ce_threshold;
	s32 weights[3];
	u64 horizon;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

	ce_threshold = READ_ONCE(q->ce_threshold);
	do_div(ce_threshold, NSEC_PER_USEC);

	horizon = READ_ONCE(q->horizon);
	do_div(horizon, NSEC_PER_USEC);

	if (nla_put_u32(skb, TCA_FQ_PLIMIT,
			READ_ONCE(sch->limit)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT,
			READ_ONCE(q->flow_plimit)) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM,
			READ_ONCE(q->quantum)) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM,
			READ_ONCE(q->initial_quantum)) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE,
			READ_ONCE(q->rate_enable)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
			min_t(unsigned long,
			      READ_ONCE(q->flow_max_rate), ~0U)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(READ_ONCE(q->flow_refill_delay))) ||
	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK,
			READ_ONCE(q->orphan_mask)) ||
	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
			READ_ONCE(q->low_rate_threshold)) ||
	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG,
			READ_ONCE(q->fq_trees_log)) ||
	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK,
			READ_ONCE(q->timer_slack)) ||
	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP,
		       READ_ONCE(q->horizon_drop)))
		goto nla_put_failure;

	fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
	if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
		goto nla_put_failure;

	weights[0] = READ_ONCE(q->band_flows[0].quantum);
	weights[1] = READ_ONCE(q->band_flows[1].quantum);
	weights[2] = READ_ONCE(q->band_flows[2].quantum);
	if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_fq_qd_stats st;
	int i;

	st.pad = 0;

	sch_tree_lock(sch);

	st.gc_flows		  = q->stat_gc_flows;
	st.highprio_packets	  = 0;
	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
	st.tcp_retrans		  = 0;
	st.throttled		  = q->stat_throttled;
	st.flows_plimit		  = q->stat_flows_plimit;
	st.pkts_too_long	  = q->stat_pkts_too_long;
	st.allocation_errors	  = q->stat_allocation_errors;
	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
				    ktime_get_ns();
	st.flows		  = q->flows;
	st.inactive_flows	  = q->inactive_flows;
	st.throttled_flows	  = q->throttled_flows;
	st.unthrottle_latency_ns  = min_t(unsigned long,
					  q->unthrottle_latency_ns, ~0U);
	st.ce_mark		  = q->stat_ce_mark;
	st.horizon_drops	  = q->stat_horizon_drops;
	st.horizon_caps		  = q->stat_horizon_caps;
	for (i = 0; i < FQ_BANDS; i++) {
		st.band_drops[i]     = q->stat_band_drops[i];
		st.band_pkt_count[i] = q->band_pkt_count[i];
	}
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id		=	"fq",
	.priv_size	=	sizeof(struct fq_sched_data),

	.enqueue	=	fq_enqueue,
	.dequeue	=	fq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	fq_init,
	.reset		=	fq_reset,
	.destroy	=	fq_destroy,
	.change		=	fq_change,
	.dump		=	fq_dump,
	.dump_stats	=	fq_dump_stats,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("fq");

static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, SLAB_HWCACHE_ALIGN, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fair Queue Packet Scheduler");