summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c160
1 files changed, 80 insertions, 80 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dad3e7eeed94..a27b9c0e27c0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
static void tcp_sndbuf_expand(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
int sndmem, per_mss;
u32 nr_segs;
@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
* Cubic needs 1.7 factor, rounded to 2 to include
* extra cushion (application might react slowly to POLLOUT)
*/
- sndmem = 2 * nr_segs * per_mss;
+ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+ sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
+/* Sum the number of packets on the wire we have marked as lost.
+ * There are two cases we care about here:
+ * a) Packet hasn't been marked lost (nor retransmitted),
+ * and this is the first loss.
+ * b) Packet has been marked both lost and retransmitted,
+ * and this means we think it was lost again.
+ */
+static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (!(sacked & TCPCB_LOST) ||
+ ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
+ tp->lost += tcp_skb_pcount(skb);
+}
+
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tcp_verify_retransmit_hint(tp, skb);
tp->lost_out += tcp_skb_pcount(skb);
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}
}
@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
+ tcp_sum_lost(tp, skb);
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+ struct rate_sample *rate;
int flag;
};
@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_advance_highest_sack(sk, skb);
tcp_skb_collapse_tstamp(prev, skb);
+ if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+ TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
dup_sack,
tcp_skb_pcount(skb),
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
- if (found_dup_sack)
+ if (found_dup_sack) {
state->flag |= FLAG_DSACKING_ACK;
+ tp->delivered++; /* A spurious retransmission is delivered */
+ }
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
+ bool mark_lost;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
+ mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+ is_reneg);
+ if (mark_lost)
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+ if (mark_lost) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -2329,10 +2362,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
- &np->daddr, ntohs(inet->inet_dport),
+ &sk->sk_v6_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2503,6 +2535,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (inet_csk(sk)->icsk_ca_ops->cong_control)
+ return;
+
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
(tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2879,67 +2914,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
*rexmit = REXMIT_LOST;
}
-/* Kathleen Nichols' algorithm for tracking the minimum value of
- * a data stream over some fixed time interval. (E.g., the minimum
- * RTT over the past five minutes.) It uses constant space and constant
- * time per update yet almost always delivers the same minimum as an
- * implementation that has to keep all the data in the window.
- *
- * The algorithm keeps track of the best, 2nd best & 3rd best min
- * values, maintaining an invariant that the measurement time of the
- * n'th best >= n-1'th best. It also makes sure that the three values
- * are widely separated in the time window since that bounds the worse
- * case error when that data is monotonically increasing over the window.
- *
- * Upon getting a new min, we can forget everything earlier because it
- * has no value - the new min is <= everything else in the window by
- * definition and it's the most recent. So we restart fresh on every new min
- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
- * best.
- */
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
- const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
- struct rtt_meas *m = tcp_sk(sk)->rtt_min;
- struct rtt_meas rttm = {
- .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
- .ts = now,
- };
- u32 elapsed;
-
- /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
- if (unlikely(rttm.rtt <= m[0].rtt))
- m[0] = m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[1].rtt)
- m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[2].rtt)
- m[2] = rttm;
-
- elapsed = now - m[0].ts;
- if (unlikely(elapsed > wlen)) {
- /* Passed entire window without a new min so make 2nd choice
- * the new min & 3rd choice the new 2nd. So forth and so on.
- */
- m[0] = m[1];
- m[1] = m[2];
- m[2] = rttm;
- if (now - m[0].ts > wlen) {
- m[0] = m[1];
- m[1] = rttm;
- if (now - m[0].ts > wlen)
- m[0] = rttm;
- }
- } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
- /* Passed a quarter of the window without a new min so
- * take 2nd choice from the 2nd quarter of the window.
- */
- m[2] = m[1] = rttm;
- } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
- /* Passed half the window without a new min so take the 3rd
- * choice from the last half of the window.
- */
- m[2] = rttm;
- }
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+
+ minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+ rtt_us ? : jiffies_to_usecs(1));
}
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@ -3102,10 +3083,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
- struct tcp_sacktag_state *sack)
+ struct tcp_sacktag_state *sack,
+ struct skb_mstamp *now)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct skb_mstamp first_ackt, last_ackt, now;
+ struct skb_mstamp first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3137,7 +3119,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
-
fully_acked = false;
} else {
/* Speedup tcp_unlink_write_queue() and next loop */
@@ -3173,6 +3154,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
+ tcp_rate_skb_delivered(sk, skb, sack->rate);
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
@@ -3205,16 +3187,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- skb_mstamp_get(&now);
if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
- seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
}
if (sack->first_sackt.v64) {
- sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+ sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
}
-
+ sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
ca_rtt_us);
@@ -3242,7 +3223,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
- sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@@ -3333,8 +3314,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
* information. All transmission or retransmission are delayed afterwards.
*/
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
- int flag)
+ int flag, const struct rate_sample *rs)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->cong_control) {
+ icsk->icsk_ca_ops->cong_control(sk, rs);
+ return;
+ }
+
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, acked_sacked, flag);
@@ -3579,17 +3567,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
+ struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- u32 prior_delivered = tp->delivered;
+ u32 delivered = tp->delivered;
+ u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
+ sack_state.rate = &rs;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3612,6 +3604,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ skb_mstamp_get(&now);
+
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -3622,6 +3616,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
}
prior_fackets = tp->fackets_out;
+ rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
@@ -3677,7 +3672,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state);
+ &sack_state, &now);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3694,7 +3689,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+ delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ lost = tp->lost - lost; /* freshly marked lost */
+ tcp_rate_gen(sk, delivered, lost, &now, &rs);
+ tcp_cong_control(sk, ack, delivered, flag, &rs);
tcp_xmit_recovery(sk, rexmit);
return 1;
@@ -5951,7 +5949,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* so release it.
*/
if (req) {
- tp->total_retrans = req->num_retrans;
+ inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
@@ -5993,7 +5991,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else
tcp_init_metrics(sk);
- tcp_update_pacing_rate(sk);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_time_stamp;
@@ -6326,6 +6325,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
+ inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);