/*
 * net/sched/sch_sfb.c	  Stochastic Fair Blue
 *
 * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
 * A New Class of Active Queue Management Algorithms.
 * U. Michigan CSE-TR-387-99, April 1999.
 *
 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/flow_keys.h>

/*
 * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
 * This implementation uses L = 8 and N = 16
 * This permits us to split one 32bit hash (provided per packet by rxhash or
 * external classifier) into 8 subhashes of 4 bits.
 */
#define SFB_BUCKET_SHIFT 4
#define SFB_NUMBUCKETS	(1 << SFB_BUCKET_SHIFT) /* N bins per level */
#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
#define SFB_LEVELS	(32 / SFB_BUCKET_SHIFT) /* L */
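
/* Illustrative example (made-up hash value): a 32bit hash of 0xA3F1C2D4 is
 * consumed four bits at a time, least-significant nibble first, giving the
 * per-level bin indexes 0x4, 0xD, 0x2, 0xC, 0x1, 0xF, 0x3, 0xA.
 */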

/* SFB algo uses a virtual queue, named "bin" */
struct sfb_bucket {
	u16		qlen; /* length of virtual queue */
	u16		p_mark; /* marking probability */
};

/* We use a double buffering right before hash change
 * (Section 4.4 of SFB reference : moving hash functions)
 */
struct sfb_bins {
	u32		  perturbation; /* jhash perturbation */
	struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
};

struct sfb_sched_data {
	struct Qdisc	*qdisc;
	struct tcf_proto *filter_list;
	unsigned long	rehash_interval;
	unsigned long	warmup_time;	/* double buffering warmup time in jiffies */
	u32		max;		/* drop if the packet's least-loaded bin reaches this qlen */
	u32		bin_size;	/* maximum queue length per bin */
	u32		increment;	/* d1 */
	u32		decrement;	/* d2 */
	u32		limit;		/* HARD maximal queue length */
	u32		penalty_rate;
	u32		penalty_burst;
	u32		tokens_avail;
	unsigned long	rehash_time;
	unsigned long	token_time;

	u8		slot;		/* current active bins (0 or 1) */
	bool		double_buffering;
	struct sfb_bins bins[2];

	struct {
		u32	earlydrop;
		u32	penaltydrop;
		u32	bucketdrop;
		u32	queuedrop;
		u32	childdrop;	/* drops in child qdisc */
		u32	marked;		/* ECN mark */
	} stats;
};

/*
 * Each queued skb might be hashed on one or two bins
 * We store in skb_cb the two hash values.
 * (A zero value means double buffering was not used)
 */
struct sfb_skb_cb {
	u32 hashes[2];
};

static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		     sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb));
	return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
}
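
/* Note: qdisc_skb_cb(skb)->data is the scratch area qdiscs may use for
 * per-packet private state; the BUILD_BUG_ON above verifies at compile time
 * that the two SFB hashes fit in what remains of skb->cb.
 */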

/*
 * If using 'internal' SFB flow classifier, hash comes from skb rxhash
 * If using external classifier, hash comes from the classid.
 */
static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
{
	return sfb_skb_cb(skb)->hashes[slot];
}

/* Probabilities are coded as Q0.16 fixed-point values,
 * with 0xFFFF representing 65535/65536 (almost 1.0)
 * Addition and subtraction are saturating in [0, 65535]
 */
static u32 prob_plus(u32 p1, u32 p2)
{
	u32 res = p1 + p2;

	return min_t(u32, res, SFB_MAX_PROB);
}

static u32 prob_minus(u32 p1, u32 p2)
{
	return p1 > p2 ? p1 - p2 : 0;
}
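
/* Illustrative example (made-up operands): in Q0.16, 0x8000 stands for 0.5.
 * prob_plus(0xC000, 0x8000) saturates to SFB_MAX_PROB (0xFFFF, almost 1.0)
 * and prob_minus(0x4000, 0x8000) clamps to 0, so p_mark stays in range.
 */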

/* Walk the [SFB_LEVELS][SFB_NUMBUCKETS] array as a flat block: four hash
 * bits pick the bucket within the current level, then the base pointer
 * advances by one full row to reach the next level.
 */
static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
{
	int i;
	struct sfb_bucket *b = &q->bins[slot].bins[0][0];

	for (i = 0; i < SFB_LEVELS; i++) {
		u32 hash = sfbhash & SFB_BUCKET_MASK;

		sfbhash >>= SFB_BUCKET_SHIFT;
		if (b[hash].qlen < 0xFFFF)
			b[hash].qlen++;
		b += SFB_NUMBUCKETS; /* next level */
	}
}

static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
{
	u32 sfbhash;

	sfbhash = sfb_hash(skb, 0);
	if (sfbhash)
		increment_one_qlen(sfbhash, 0, q);

	sfbhash = sfb_hash(skb, 1);
	if (sfbhash)
		increment_one_qlen(sfbhash, 1, q);
}

static void decrement_one_qlen(u32 sfbhash, u32 slot,
			       struct sfb_sched_data *q)
{
	int i;
	struct sfb_bucket *b = &q->bins[slot].bins[0][0];

	for (i = 0; i < SFB_LEVELS; i++) {
		u32 hash = sfbhash & SFB_BUCKET_MASK;

		sfbhash >>= SFB_BUCKET_SHIFT;
		if (b[hash].qlen > 0)
			b[hash].qlen--;
		b += SFB_NUMBUCKETS; /* next level */
	}
}

static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
{
	u32 sfbhash;

	sfbhash = sfb_hash(skb, 0);
	if (sfbhash)
		decrement_one_qlen(sfbhash, 0, q);

	sfbhash = sfb_hash(skb, 1);
	if (sfbhash)
		decrement_one_qlen(sfbhash, 1, q);
}

static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
{
	b->p_mark = prob_minus(b->p_mark, q->decrement);
}

static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
{
	b->p_mark = prob_plus(b->p_mark, q->increment);
}

static void sfb_zero_all_buckets(struct sfb_sched_data *q)
{
	memset(&q->bins, 0, sizeof(q->bins));
}

/*
 * compute max qlen, max p_mark, and avg p_mark
 */
static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
{
	int i;
	u32 qlen = 0, prob = 0, totalpm = 0;
	const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];

	for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
		if (qlen < b->qlen)
			qlen = b->qlen;
		totalpm += b->p_mark;
		if (prob < b->p_mark)
			prob = b->p_mark;
		b++;
	}
	*prob_r = prob;
	*avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
	return qlen;
}

static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
{
	q->bins[slot].perturbation = net_random();
}

static void sfb_swap_slot(struct sfb_sched_data *q)
{
	sfb_init_perturbation(q->slot, q);
	q->slot ^= 1;
	q->double_buffering = false;
}
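
/* Note: sfb_swap_slot() re-seeds the perturbation of the slot being retired,
 * so the bins that just became inactive are ready to serve as the fresh
 * double-buffered copy trained before the next rehash.
 */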

/* Non-elastic flows are allowed to use part of the bandwidth, expressed
 * in "penalty_rate" packets per second, with "penalty_burst" burst
 */
static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
{
	if (q->penalty_rate == 0 || q->penalty_burst == 0)
		return true;

	if (q->tokens_avail < 1) {
		unsigned long age = min(10UL * HZ, jiffies - q->token_time);

		q->tokens_avail = (age * q->penalty_rate) / HZ;
		if (q->tokens_avail > q->penalty_burst)
			q->tokens_avail = q->penalty_burst;
		q->token_time = jiffies;
		if (q->tokens_avail < 1)
			return true;
	}

	q->tokens_avail--;
	return false;
}
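
/* Illustrative refill math (made-up numbers): with penalty_rate = 10 pps,
 * penalty_burst = 20 and HZ = 1000, a flow idle for 500 jiffies earns
 * (500 * 10) / 1000 = 5 tokens; any idle time beyond 2 seconds is capped by
 * the 20-token burst, and ages are clamped to 10 * HZ jiffies.
 */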

static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
			 int *qerr, u32 *salt)
{
	struct tcf_result res;
	int result;

	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return false;
		}
#endif
		*salt = TC_H_MIN(res.classid);
		return true;
	}
	return false;
}

static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfb_sched_data *q = qdisc_priv(sch);
	struct Qdisc *child = q->qdisc;
	int i;
	u32 p_min = ~0;
	u32 minqlen = ~0;
	u32 r, slot, salt, sfbhash;
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	struct flow_keys keys;

	if (unlikely(sch->q.qlen >= q->limit)) {
		sch->qstats.overlimits++;
		q->stats.queuedrop++;
		goto drop;
	}

	if (q->rehash_interval > 0) {
		unsigned long limit = q->rehash_time + q->rehash_interval;

		if (unlikely(time_after(jiffies, limit))) {
			sfb_swap_slot(q);
			q->rehash_time = jiffies;
		} else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
				    time_after(jiffies, limit - q->warmup_time))) {
			q->double_buffering = true;
		}
	}
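
	/* Illustrative timeline (using the defaults set later in this file):
	 * with rehash_interval = 600 s and warmup_time = 60 s, the active and
	 * standby bins swap every 600 s, and double buffering starts training
	 * the standby bins during the final 60 s before each swap.
	 */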

	if (q->filter_list) {
		/* If using external classifiers, get result and record it. */
		if (!sfb_classify(skb, q, &ret, &salt))
			goto other_drop;
		keys.src = salt;
		keys.dst = 0;
		keys.ports = 0;
	} else {
		skb_flow_dissect(skb, &keys);
	}

	slot = q->slot;

	sfbhash = jhash_3words((__force u32)keys.dst,
			       (__force u32)keys.src,
			       (__force u32)keys.ports,
			       q->bins[slot].perturbation);
	if (!sfbhash)
		sfbhash = 1; /* 0 is reserved to mean "hash not set" */
	sfb_skb_cb(skb)->hashes[slot] = sfbhash;

	for (i = 0; i < SFB_LEVELS; i++) {
		u32 hash = sfbhash & SFB_BUCKET_MASK;
		struct sfb_bucket *b = &q->bins[slot].bins[i][hash];

		sfbhash >>= SFB_BUCKET_SHIFT;
		if (b->qlen == 0)
			decrement_prob(b, q);
		else if (b->qlen >= q->bin_size)
			increment_prob(b, q);
		if (minqlen > b->qlen)
			minqlen = b->qlen;
		if (p_min > b->p_mark)
			p_min = b->p_mark;
	}

	slot ^= 1;
	sfb_skb_cb(skb)->hashes[slot] = 0;

	if (unlikely(minqlen >= q->max)) {
		sch->qstats.overlimits++;
		q->stats.bucketdrop++;
		goto drop;
	}

	if (unlikely(p_min >= SFB_MAX_PROB)) {
		/* Inelastic flow */
		if (q->double_buffering) {
			sfbhash = jhash_3words((__force u32)keys.dst,
					       (__force u32)keys.src,
					       (__force u32)keys.ports,
					       q->bins[slot].perturbation);
			if (!sfbhash)
				sfbhash = 1;
			sfb_skb_cb(skb)->hashes[slot] = sfbhash;

			for (i = 0; i < SFB_LEVELS; i++) {
				u32 hash = sfbhash & SFB_BUCKET_MASK;
				struct sfb_bucket *b = &q->bins[slot].bins[i][hash];

				sfbhash >>= SFB_BUCKET_SHIFT;
				if (b->qlen == 0)
					decrement_prob(b, q);
				else if (b->qlen >= q->bin_size)
					increment_prob(b, q);
			}
		}
		if (sfb_rate_limit(skb, q)) {
			sch->qstats.overlimits++;
			q->stats.penaltydrop++;
			goto drop;
		}
		goto enqueue;
	}

	r = net_random() & SFB_MAX_PROB;

	if (unlikely(r < p_min)) {
		if (unlikely(p_min > SFB_MAX_PROB / 2)) {
			/* If we're marking that many packets, then either
			 * this flow is unresponsive, or we're badly congested.
			 * In either case, we want to start dropping packets.
			 */
			if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
				q->stats.earlydrop++;
				goto drop;
			}
		}
		if (INET_ECN_set_ce(skb)) {
			q->stats.marked++;
		} else {
			q->stats.earlydrop++;
			goto drop;
		}
	}
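
	/* Worked example (made-up p_min): for p_min = 0xC000 (0.75), packets
	 * with r below (0.75 - 0.5) * 2 = 0.5 are dropped early, while the
	 * rest of the r < p_min range (0.5 <= r < 0.75) is ECN-marked when
	 * the packet is ECN-capable and dropped otherwise.
	 */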

enqueue:
	ret = qdisc_enqueue(skb, child);
	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		increment_qlen(skb, q);
	} else if (net_xmit_drop_count(ret)) {
		q->stats.childdrop++;
		sch->qstats.drops++;
	}
	return ret;

drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_CN;
other_drop:
	if (ret & __NET_XMIT_BYPASS)
		sch->qstats.drops++;
	kfree_skb(skb);
	return ret;
}

static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
{
	struct sfb_sched_data *q = qdisc_priv(sch);
	struct Qdisc *child = q->qdisc;
	struct sk_buff *skb;

	skb = child->dequeue(q->qdisc);

	if (skb) {
		qdisc_bstats_update(sch, skb);
		sch->q.qlen--;
		decrement_qlen(skb, q);
	}

	return skb;
}
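
/* Bin bookkeeping is symmetric: increment_qlen() runs when the child qdisc
 * accepts a packet in sfb_enqueue(), and decrement_qlen() runs here once the
 * packet leaves, keeping the virtual queues in sync with real occupancy.
 */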

static struct sk_buff *sfb_peek(struct Qdisc *sch)
{
	struct sfb_sched_data *q = qdisc_priv(sch);
	struct Qdisc *child = q->qdisc;

	return child->ops->peek(child);
}

/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */

static void sfb_reset(struct Qdisc *sch)
{
	struct sfb_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->slot = 0;
	q->double_buffering = false;
	sfb_zero_all_buckets(q);
	sfb_init_perturbation(0, q);
}

static void sfb_destroy(struct Qdisc *sch)
{
	struct sfb_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	qdisc_destroy(q->qdisc);
}

static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
	[TCA_SFB_PARMS]	= { .len = sizeof(struct tc_sfb_qopt) },
};

static const struct tc_sfb_qopt sfb_default_ops = {
	.rehash_interval = 600 * MSEC_PER_SEC,
	.warmup_time = 60 * MSEC_PER_SEC,
	.limit = 0,
	.max = 25,
	.bin_size = 20,
	.increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
	.decrement = (SFB_MAX_PROB + 3000) / 6000,
	.penalty_rate = 10,
	.penalty_burst = 20,
};
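
/* Illustrative arithmetic for the defaults above: increment is
 * (0xFFFF + 500) / 1000 = 66, i.e. 66/65536 ~ 0.1 % per congestion event,
 * and decrement is (0xFFFF + 3000) / 6000 = 11, i.e. ~0.017 % per idle
 * observation, so p_mark rises about six times faster than it decays.
 */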

static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfb_sched_data *q = qdisc_priv(sch);
	struct Qdisc *child;
	struct nlattr *tb[TCA_SFB_MAX + 1];
	const struct tc_sfb_qopt *ctl = &sfb_default_ops;
	u32 limit;
	int err;

	if (opt) {
		err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
		if (err < 0)
			return -EINVAL;

		if (tb[TCA_SFB_PARMS] == NULL)
			return -EINVAL;

		ctl = nla_data(tb[TCA_SFB_PARMS]);
	}

	limit = ctl->limit;
	if (limit == 0)
		limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);

	child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
	if (IS_ERR(child))
		return PTR_ERR(child);

	sch_tree_lock(sch);

	qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
	qdisc_destroy(q->qdisc);
	q->qdisc = child;

	q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
	q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
	q->rehash_time = jiffies;
	q->limit = limit;
	q->increment = ctl->increment;
	q->decrement = ctl->decrement;
	q->max = ctl->max;
	q->bin_size = ctl->bin_size;
	q->penalty_rate = ctl->penalty_rate;
	q->penalty_burst = ctl->penalty_burst;
	q->tokens_avail = ctl->penalty_burst;
	q->token_time = jiffies;

	q->slot = 0;
	q->double_buffering = false;
	sfb_zero_all_buckets(q);
	sfb_init_perturbation(0, q);
	sfb_init_perturbation(1, q);

	sch_tree_unlock(sch);

	return 0;
}

static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfb_sched_data *q = qdisc_priv(sch);

	q->qdisc = &noop_qdisc;
	return sfb_change(sch, opt);
}

static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfb_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;
	struct tc_sfb_qopt opt = {
		.rehash_interval = jiffies_to_msecs(q->rehash_interval),
		.warmup_time = jiffies_to_msecs(q->warmup_time),
		.limit = q->limit,
		.max = q->max,
		.bin_size = q->bin_size,
		.increment = q->increment,
		.decrement = q->decrement,
		.penalty_rate = q->penalty_rate,
		.penalty_burst = q->penalty_burst,
	};

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	opts = nla_nest_start(skb, TCA_OPTIONS);
	NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt);
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct sfb_sched_data *q = qdisc_priv(sch);
	struct tc_sfb_xstats st = {
		.earlydrop = q->stats.earlydrop,
		.penaltydrop = q->stats.penaltydrop,
		.bucketdrop = q->stats.bucketdrop,
		.queuedrop = q->stats.queuedrop,
		.childdrop = q->stats.childdrop,
		.marked = q->stats.marked,
	};

	st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	return -ENOSYS;
}

static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct sfb_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);
	return 0;
}

static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct sfb_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void sfb_put(struct Qdisc *sch, unsigned long arg)
{
}

static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int sfb_delete(struct Qdisc *sch, unsigned long cl)
{
	return -ENOSYS;
}

static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfb_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static const struct Qdisc_class_ops sfb_class_ops = {
	.graft		= sfb_graft,
	.leaf		= sfb_leaf,
	.get		= sfb_get,
	.put		= sfb_put,
	.change		= sfb_change_class,
	.delete		= sfb_delete,
	.walk		= sfb_walk,
	.tcf_chain	= sfb_find_tcf,
	.bind_tcf	= sfb_bind,
	.unbind_tcf	= sfb_put,
	.dump		= sfb_dump_class,
};

static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
	.id		= "sfb",
	.priv_size	= sizeof(struct sfb_sched_data),
	.cl_ops		= &sfb_class_ops,
	.enqueue	= sfb_enqueue,
	.dequeue	= sfb_dequeue,
	.peek		= sfb_peek,
	.init		= sfb_init,
	.reset		= sfb_reset,
	.destroy	= sfb_destroy,
	.change		= sfb_change,
	.dump		= sfb_dump,
	.dump_stats	= sfb_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init sfb_module_init(void)
{
	return register_qdisc(&sfb_qdisc_ops);
}

static void __exit sfb_module_exit(void)
{
	unregister_qdisc(&sfb_qdisc_ops);
}

module_init(sfb_module_init)
module_exit(sfb_module_exit)

MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
MODULE_AUTHOR("Juliusz Chroboczek");
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");