net_sched: sfq: add optional RED on top of SFQ

Adds an optional Random Early Detection on each SFQ flow queue. Traditional SFQ limits count of packets, while RED permits to also control number of bytes per flow, and adds ECN capability as well. 1) We dont handle the idle time management in this RED implementation, since each 'new flow' begins with a null qavg. We really want to address backlogged flows. 2) if headdrop is selected, we try to ecn mark first packet instead of currently enqueued packet. This gives faster feedback for tcp flows compared to traditional RED [ marking the last packet in queue ] Example of use : tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \ limit 3000 headdrop flows 512 divisor 16384 \ redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop flows 512/16384 divisor 16384 ewma 6 min 8000b max 60000b probability 0.2 ecn prob_mark 0 prob_mark_head 4876 prob_drop 6131 forced_mark 0 forced_mark_head 0 forced_drop 0 Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007 requeues 0) rate 99483Kbit 8219pps backlog 689392b 456p requeues 0 In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled flows, we can see number of packets CE marked is smaller than number of drops (for non ECN flows) If same test is run, without RED, we can check backlog is much bigger. qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop flows 512/16384 divisor 16384 Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0) rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0 Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> CC: Stephen Hemminger <shemminger@vyatta.com> CC: Dave Taht <dave.taht@gmail.com> Tested-by: Dave Taht <dave.taht@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Eric Dumazet <eric.dumazet@gmail.com> 2012-01-06 07:31:44 +0100
committer: David S. Miller <davem@davemloft.net> 2012-01-13 05:05:28 +0100
commit: ddecf0f4db44ef94847a62d6ecf74456b4dcc66f (patch)
tree: 48c49b70f83d713c7e3f594e067ee726bb70777e
parent: dp83640: Fix NOHZ local_softirq_pending 08 warning (diff)
download: linux-ddecf0f4db44ef94847a62d6ecf74456b4dcc66f.tar.xz
linux-ddecf0f4db44ef94847a62d6ecf74456b4dcc66f.zip
3 files changed, 152 insertions, 17 deletions
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 8f1b928f777c..0d5b79365d03 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -162,10 +162,30 @@ struct tc_sfq_qopt {
 	unsigned	flows;		/* Maximal number of flows  */
 };
 
+struct tc_sfqred_stats {
+	__u32           prob_drop;      /* Early drops, below max threshold */
+	__u32           forced_drop;	/* Early drops, after max threshold */
+	__u32           prob_mark;      /* Marked packets, below max threshold */
+	__u32           forced_mark;    /* Marked packets, after max threshold */
+	__u32           prob_mark_head; /* Marked packets, below max threshold */
+	__u32           forced_mark_head;/* Marked packets, after max threshold */
+};
+
 struct tc_sfq_qopt_v1 {
 	struct tc_sfq_qopt v0;
 	unsigned int	depth;		/* max number of packets per flow */
 	unsigned int	headdrop;
+/* SFQRED parameters */
+	__u32		limit;		/* HARD maximal flow queue length (bytes) */
+	__u32		qth_min;	/* Min average length threshold (bytes) */
+	__u32		qth_max;	/* Max average length threshold (bytes) */
+	unsigned char   Wlog;		/* log(W)		*/
+	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/
+	unsigned char   Scell_log;	/* cell size for idle damping */
+	unsigned char	flags;
+	__u32		max_P;		/* probability, high resolution */
+/* SFQRED stats */
+	struct tc_sfqred_stats stats;
 };
 
 
diff --git a/include/net/red.h b/include/net/red.h
index baab385a4736..28068ec614b2 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -199,7 +199,8 @@ static inline void red_set_parms(struct red_parms *p,
 	p->Scell_log	= Scell_log;
 	p->Scell_max	= (255 << Scell_log);
 
-	memcpy(p->Stab, stab, sizeof(p->Stab));
+	if (stab)
+		memcpy(p->Stab, stab, sizeof(p->Stab));
 }
 
 static inline int red_is_idling(const struct red_vars *v)
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 0a7964009e8c..67494aef9acf 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -24,6 +24,7 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/flow_keys.h>
+#include <net/red.h>
 
 
 /*	Stochastic Fairness Queuing algorithm.
@@ -108,24 +109,30 @@ struct sfq_slot {
 	struct sfq_head dep; /* anchor in dep[] chains */
 	unsigned short	hash; /* hash value (index in ht[]) */
 	short		allot; /* credit for this slot */
+
+	unsigned int    backlog;
+	struct red_vars vars;
 };
 
 struct sfq_sched_data {
 /* frequently used fields */
 	int		limit;		/* limit of total number of packets in this qdisc */
 	unsigned int	divisor;	/* number of slots in hash table */
-	unsigned int	maxflows;	/* number of flows in flows array */
-	int		headdrop;
-	int		maxdepth;	/* limit of packets per flow */
+	u8		headdrop;
+	u8		maxdepth;	/* limit of packets per flow */
 
 	u32		perturbation;
-	struct tcf_proto *filter_list;
-	sfq_index	cur_depth;	/* depth of longest slot */
+	u8		cur_depth;	/* depth of longest slot */
+	u8		flags;
 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
-	struct sfq_slot *tail;		/* current slot in round */
+	struct tcf_proto *filter_list;
 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
 
+	struct red_parms *red_parms;
+	struct tc_sfqred_stats stats;
+	struct sfq_slot *tail;		/* current slot in round */
+
 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
 					/* Linked lists of slots, indexed by depth
 					 * dep[0] : list of unused flows
@@ -133,6 +140,7 @@ struct sfq_sched_data {
 					 * dep[X] : list of flows with X packets
 					 */
 
+	unsigned int	maxflows;	/* number of flows in flows array */
 	int		perturb_period;
 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
 	struct timer_list perturb_timer;
@@ -321,6 +329,7 @@ static unsigned int sfq_drop(struct Qdisc *sch)
 drop:
 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 		len = qdisc_pkt_len(skb);
+		slot->backlog -= len;
 		sfq_dec(q, x);
 		kfree_skb(skb);
 		sch->q.qlen--;
@@ -341,6 +350,23 @@ drop:
 	return 0;
 }
 
+/* Is ECN parameter configured */
+static int sfq_prob_mark(const struct sfq_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max threshold just be marked */
+static int sfq_hard_mark(const struct sfq_sched_data *q)
+{
+	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
+}
+
+static int sfq_headdrop(const struct sfq_sched_data *q)
+{
+	return q->headdrop;
+}
+
 static int
 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
@@ -349,6 +375,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	sfq_index x, qlen;
 	struct sfq_slot *slot;
 	int uninitialized_var(ret);
+	struct sk_buff *head;
+	int delta;
 
 	hash = sfq_classify(skb, sch, &ret);
 	if (hash == 0) {
@@ -368,24 +396,75 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->ht[hash] = x;
 		slot = &q->slots[x];
 		slot->hash = hash;
+		slot->backlog = 0; /* should already be 0 anyway... */
+		red_set_vars(&slot->vars);
+		goto enqueue;
 	}
+	if (q->red_parms) {
+		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
+							&slot->vars,
+							slot->backlog);
+		switch (red_action(q->red_parms,
+				   &slot->vars,
+				   slot->vars.qavg)) {
+		case RED_DONT_MARK:
+			break;
 
-	if (slot->qlen >= q->maxdepth) {
-		struct sk_buff *head;
+		case RED_PROB_MARK:
+			sch->qstats.overlimits++;
+			if (sfq_prob_mark(q)) {
+				/* We know we have at least one packet in queue */
+				if (sfq_headdrop(q) &&
+				    INET_ECN_set_ce(slot->skblist_next)) {
+					q->stats.prob_mark_head++;
+					break;
+				}
+				if (INET_ECN_set_ce(skb)) {
+					q->stats.prob_mark++;
+					break;
+				}
+			}
+			q->stats.prob_drop++;
+			goto congestion_drop;
+
+		case RED_HARD_MARK:
+			sch->qstats.overlimits++;
+			if (sfq_hard_mark(q)) {
+				/* We know we have at least one packet in queue */
+				if (sfq_headdrop(q) &&
+				    INET_ECN_set_ce(slot->skblist_next)) {
+					q->stats.forced_mark_head++;
+					break;
+				}
+				if (INET_ECN_set_ce(skb)) {
+					q->stats.forced_mark++;
+					break;
+				}
+			}
+			q->stats.forced_drop++;
+			goto congestion_drop;
+		}
+	}
 
-		if (!q->headdrop)
+	if (slot->qlen >= q->maxdepth) {
+congestion_drop:
+		if (!sfq_headdrop(q))
 			return qdisc_drop(skb, sch);
 
+		/* We know we have at least one packet in queue */
 		head = slot_dequeue_head(slot);
-		sch->qstats.backlog -= qdisc_pkt_len(head);
+		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
+		sch->qstats.backlog -= delta;
+		slot->backlog -= delta;
 		qdisc_drop(head, sch);
 
-		sch->qstats.backlog += qdisc_pkt_len(skb);
 		slot_queue_add(slot, skb);
 		return NET_XMIT_CN;
 	}
 
+enqueue:
 	sch->qstats.backlog += qdisc_pkt_len(skb);
+	slot->backlog += qdisc_pkt_len(skb);
 	slot_queue_add(slot, skb);
 	sfq_inc(q, x);
 	if (slot->qlen == 1) {		/* The flow is new */
@@ -396,6 +475,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 			slot->next = q->tail->next;
 			q->tail->next = x;
 		}
+		/* We could use a bigger initial quantum for new flows */
 		slot->allot = q->scaled_quantum;
 	}
 	if (++sch->q.qlen <= q->limit)
@@ -439,7 +519,7 @@ next_slot:
 	qdisc_bstats_update(sch, skb);
 	sch->q.qlen--;
 	sch->qstats.backlog -= qdisc_pkt_len(skb);
-
+	slot->backlog -= qdisc_pkt_len(skb);
 	/* Is the slot empty? */
 	if (slot->qlen == 0) {
 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
@@ -490,6 +570,8 @@ static void sfq_rehash(struct Qdisc *sch)
 			sfq_dec(q, i);
 			__skb_queue_tail(&list, skb);
 		}
+		slot->backlog = 0;
+		red_set_vars(&slot->vars);
 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 	}
 	q->tail = NULL;
@@ -514,6 +596,11 @@ drop:				sch->qstats.backlog -= qdisc_pkt_len(skb);
 		if (slot->qlen >= q->maxdepth)
 			goto drop;
 		slot_queue_add(slot, skb);
+		if (q->red_parms)
+			slot->vars.qavg = red_calc_qavg(q->red_parms,
+							&slot->vars,
+							slot->backlog);
+		slot->backlog += qdisc_pkt_len(skb);
 		sfq_inc(q, x);
 		if (slot->qlen == 1) {		/* The flow is new */
 			if (q->tail == NULL) {	/* It is the first flow */
@@ -552,6 +639,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	struct tc_sfq_qopt *ctl = nla_data(opt);
 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 	unsigned int qlen;
+	struct red_parms *p = NULL;
 
 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 		return -EINVAL;
@@ -560,7 +648,11 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (ctl->divisor &&
 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 		return -EINVAL;
-
+	if (ctl_v1 && ctl_v1->qth_min) {
+		p = kmalloc(sizeof(*p), GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+	}
 	sch_tree_lock(sch);
 	if (ctl->quantum) {
 		q->quantum = ctl->quantum;
@@ -576,6 +668,16 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (ctl_v1) {
 		if (ctl_v1->depth)
 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+		if (p) {
+			swap(q->red_parms, p);
+			red_set_parms(q->red_parms,
+				      ctl_v1->qth_min, ctl_v1->qth_max,
+				      ctl_v1->Wlog,
+				      ctl_v1->Plog, ctl_v1->Scell_log,
+				      NULL,
+				      ctl_v1->max_P);
+		}
+		q->flags = ctl_v1->flags;
 		q->headdrop = ctl_v1->headdrop;
 	}
 	if (ctl->limit) {
@@ -594,6 +696,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 		q->perturbation = net_random();
 	}
 	sch_tree_unlock(sch);
+	kfree(p);
 	return 0;
 }
 
@@ -625,6 +728,7 @@ static void sfq_destroy(struct Qdisc *sch)
 	del_timer_sync(&q->perturb_timer);
 	sfq_free(q->ht);
 	sfq_free(q->slots);
+	kfree(q->red_parms);
 }
 
 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
@@ -683,6 +787,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tc_sfq_qopt_v1 opt;
+	struct red_parms *p = q->red_parms;
 
 	memset(&opt, 0, sizeof(opt));
 	opt.v0.quantum	= q->quantum;
@@ -693,6 +798,17 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opt.depth	= q->maxdepth;
 	opt.headdrop	= q->headdrop;
 
+	if (p) {
+		opt.qth_min	= p->qth_min >> p->Wlog;
+		opt.qth_max	= p->qth_max >> p->Wlog;
+		opt.Wlog	= p->Wlog;
+		opt.Plog	= p->Plog;
+		opt.Scell_log	= p->Scell_log;
+		opt.max_P	= p->max_P;
+	}
+	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
+	opt.flags	= q->flags;
+
 	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 
 	return skb->len;
@@ -747,15 +863,13 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	sfq_index idx = q->ht[cl - 1];
 	struct gnet_stats_queue qs = { 0 };
 	struct tc_sfq_xstats xstats = { 0 };
-	struct sk_buff *skb;
 
 	if (idx != SFQ_EMPTY_SLOT) {
 		const struct sfq_slot *slot = &q->slots[idx];
 
 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 		qs.qlen = slot->qlen;
-		slot_queue_walk(slot, skb)
-			qs.backlog += qdisc_pkt_len(skb);
+		qs.backlog = slot->backlog;
 	}
 	if (gnet_stats_copy_queue(d, &qs) < 0)
 		return -1;
author	Eric Dumazet <eric.dumazet@gmail.com>	2012-01-06 07:31:44 +0100
committer	David S. Miller <davem@davemloft.net>	2012-01-13 05:05:28 +0100
commit	ddecf0f4db44ef94847a62d6ecf74456b4dcc66f (patch)
tree	48c49b70f83d713c7e3f594e067ee726bb70777e
parent	dp83640: Fix NOHZ local_softirq_pending 08 warning (diff)
download	linux-ddecf0f4db44ef94847a62d6ecf74456b4dcc66f.tar.xz linux-ddecf0f4db44ef94847a62d6ecf74456b4dcc66f.zip