tcp: support DUPACK threshold in RACK

This patch adds support for the classic DUPACK threshold rule (#DupThresh) in RACK. When the number of packets SACKed is greater or equal to the threshold, RACK sets the reordering window to zero which would immediately mark all the unsacked packets below the highest SACKed sequence lost. Since this approach is known to not work well with reordering, RACK only uses it if no reordering has been observed. The DUPACK threshold rule is a particularly useful extension to the fast recoveries triggered by RACK reordering timer. For example data-center transfers where the RTT is much smaller than a timer tick, or high RTT path where the default RTT/4 may take too long. Note that this patch differs slightly from RFC6675. RFC6675 considers a packet lost when at least #DupThresh higher-sequence packets are SACKed. With RACK, for connections that have seen reordering, RACK continues to use a dynamically-adaptive time-based reordering window to detect losses. But for connections on which we have not yet seen reordering, this patch considers a packet lost when at least one higher sequence packet is SACKed and the total number of SACKed packets is at least DupThresh. For example, suppose a connection has not seen reordering, and sends 10 packets, and packets 3, 5, 7 are SACKed. RFC6675 considers packets 1 and 2 lost. RACK considers packets 1, 2, 4, 6 lost. There is some small risk of spurious retransmits here due to reordering. However, this is mostly limited to the first flight of a connection on which the sender receives SACKs from reordering. And RFC 6675 and FACK loss detection have a similar risk on the first flight with reordering (it's just that the risk of spurious retransmits from reordering was slightly narrower for those older algorithms due to the margin of 3*MSS). Also the minimum reordering window is reduced from 1 msec to 0 to recover quicker on short RTT transfers. Therefore RACK is more aggressive in marking packets lost during recovery to reduce the reordering window timeouts. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com> Reviewed-by: Priyaranjan Jha <priyarjha@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Yuchung Cheng <ycheng@google.com> 2018-05-17 01:40:10 +0200
committer: David S. Miller <davem@davemloft.net> 2018-05-17 21:41:28 +0200
commit: 20b654dfe1beaca60ab51894ff405a049248433d (patch)
tree: a6f3037a9fd384c261db36c71a2b4e825104c8a4
parent: net: ethernet: ti: cpsw: disable mq feature for "AM33xx ES1.0" devices (diff)
download: linux-20b654dfe1beaca60ab51894ff405a049248433d.tar.xz
linux-20b654dfe1beaca60ab51894ff405a049248433d.zip
3 files changed, 29 insertions, 13 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 59afc9a10b4f..13bbac50dc8b 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -451,6 +451,7 @@ tcp_recovery - INTEGER
 	RACK: 0x1 enables the RACK loss detection for fast detection of lost
 	      retransmissions and tail drops.
 	RACK: 0x2 makes RACK's reordering window static (min_rtt/4).
+	RACK: 0x4 disables RACK's DUPACK threshold heuristic
 
 	Default: 0x1
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a08eab58ef70..994f869d793c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -245,6 +245,7 @@ extern long sysctl_tcp_mem[3];
 
 #define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
 #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
+#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 3a81720ac0c4..1c1bdf12a96f 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -21,6 +21,32 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
 }
 
+u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->rack.reord) {
+		/* If reordering has not been observed, be aggressive during
+		 * the recovery or starting the recovery by DUPACK threshold.
+		 */
+		if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
+			return 0;
+
+		if (tp->sacked_out >= tp->reordering &&
+		    !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
+			return 0;
+	}
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay.
+	 * Use min_rtt instead of the smoothed RTT because reordering is
+	 * often a path property and less related to queuing or delayed ACKs.
+	 * Upon receiving DSACKs, linearly increase the window up to the
+	 * smoothed RTT.
+	 */
+	return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
+		   tp->srtt_us >> 3);
+}
+
 /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
  *
  * Marks a packet lost, if some packet sent later has been (s)acked.
@@ -44,23 +70,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 min_rtt = tcp_min_rtt(tp);
 	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
 	*reo_timeout = 0;
-	/* To be more reordering resilient, allow min_rtt/4 settling delay
-	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
-	 * RTT because reordering is often a path property and less related
-	 * to queuing or delayed ACKs.
-	 */
-	reo_wnd = 1000;
-	if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
-	    min_rtt != ~0U) {
-		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
-		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
-	}
-
+	reo_wnd = tcp_rack_reo_wnd(sk);
 	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
 				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
author	Yuchung Cheng <ycheng@google.com>	2018-05-17 01:40:10 +0200
committer	David S. Miller <davem@davemloft.net>	2018-05-17 21:41:28 +0200
commit	20b654dfe1beaca60ab51894ff405a049248433d (patch)
tree	a6f3037a9fd384c261db36c71a2b4e825104c8a4
parent	net: ethernet: ti: cpsw: disable mq feature for "AM33xx ES1.0" devices (diff)
download	linux-20b654dfe1beaca60ab51894ff405a049248433d.tar.xz linux-20b654dfe1beaca60ab51894ff405a049248433d.zip