net: gro: selective flush of packets

Current GRO can hold packets in gro_list for almost unlimited time, in case napi->poll() handler consumes its budget over and over. In this case, napi_complete()/napi_gro_flush() are not called. Another problem is that gro_list is flushed in non friendly way : We scan the list and complete packets in the reverse order. (youngest packets first, oldest packets last) This defeats priorities that sender could have cooked. Since GRO currently only store TCP packets, we dont really notice the bug because of retransmits, but this behavior can add unexpected latencies, particularly on mice flows clamped by elephant flows. This patch makes sure no packet can stay more than 1 ms in queue, and only in stress situations. It also complete packets in the right order to minimize latencies. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Jesse Gross <jesse@nicira.com> Cc: Tom Herbert <therbert@google.com> Cc: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Eric Dumazet <edumazet@google.com> 2012-10-06 10:08:49 +0200
committer: David S. Miller <davem@davemloft.net> 2012-10-08 20:51:51 +0200
commit: 2e71a6f8084e7ac87166dd77d99c44190fb844fc (patch)
tree: eb2e2d47361b35b2b5a3f26beac3d1fbd888c372 /net/core
parent: skge: Add DMA mask quirk for Marvell 88E8001 on ASUS P5NSLI motherboard (diff)
download: linux-2e71a6f8084e7ac87166dd77d99c44190fb844fc.tar.xz
linux-2e71a6f8084e7ac87166dd77d99c44190fb844fc.zip
1 files changed, 31 insertions, 7 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index de2bad717d56..d44668f63c88 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3471,17 +3471,31 @@ out:
 	return netif_receive_skb(skb);
 }
 
-inline void napi_gro_flush(struct napi_struct *napi)
+/* napi->gro_list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 {
-	struct sk_buff *skb, *next;
+	struct sk_buff *skb, *prev = NULL;
 
-	for (skb = napi->gro_list; skb; skb = next) {
-		next = skb->next;
+	/* scan list and build reverse chain */
+	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
+		skb->prev = prev;
+		prev = skb;
+	}
+
+	for (skb = prev; skb; skb = prev) {
 		skb->next = NULL;
+
+		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+			return;
+
+		prev = skb->prev;
 		napi_gro_complete(skb);
+		napi->gro_count--;
 	}
 
-	napi->gro_count = 0;
 	napi->gro_list = NULL;
 }
 EXPORT_SYMBOL(napi_gro_flush);
@@ -3542,6 +3556,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 
 	napi->gro_count++;
 	NAPI_GRO_CB(skb)->count = 1;
+	NAPI_GRO_CB(skb)->age = jiffies;
 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 	skb->next = napi->gro_list;
 	napi->gro_list = skb;
@@ -3878,7 +3893,7 @@ void napi_complete(struct napi_struct *n)
 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
 		return;
 
-	napi_gro_flush(n);
+	napi_gro_flush(n, false);
 	local_irq_save(flags);
 	__napi_complete(n);
 	local_irq_restore(flags);
@@ -3983,8 +3998,17 @@ static void net_rx_action(struct softirq_action *h)
 				local_irq_enable();
 				napi_complete(n);
 				local_irq_disable();
-			} else
+			} else {
+				if (n->gro_list) {
+					/* flush too old packets
+					 * If HZ < 1000, flush all packets.
+					 */
+					local_irq_enable();
+					napi_gro_flush(n, HZ >= 1000);
+					local_irq_disable();
+				}
 				list_move_tail(&n->poll_list, &sd->poll_list);
+			}
 		}
 
 		netpoll_poll_unlock(have);
author	Eric Dumazet <edumazet@google.com>	2012-10-06 10:08:49 +0200
committer	David S. Miller <davem@davemloft.net>	2012-10-08 20:51:51 +0200
commit	2e71a6f8084e7ac87166dd77d99c44190fb844fc (patch)
tree	eb2e2d47361b35b2b5a3f26beac3d1fbd888c372 /net/core
parent	skge: Add DMA mask quirk for Marvell 88E8001 on ASUS P5NSLI motherboard (diff)
download	linux-2e71a6f8084e7ac87166dd77d99c44190fb844fc.tar.xz linux-2e71a6f8084e7ac87166dd77d99c44190fb844fc.zip