summaryrefslogtreecommitdiffstats
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c201
1 files changed, 157 insertions, 44 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index a88924947815..ade648c3512b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -411,16 +411,29 @@ static void mptcp_set_datafin_timeout(const struct sock *sk)
TCP_RTO_MIN << icsk->icsk_retransmits);
}
-static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
+static void __mptcp_set_timeout(struct sock *sk, long tout)
{
- long tout = ssk && inet_csk(ssk)->icsk_pending ?
- inet_csk(ssk)->icsk_timeout - jiffies : 0;
-
- if (tout <= 0)
- tout = mptcp_sk(sk)->timer_ival;
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
+static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow)
+{
+ const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
+ inet_csk(ssk)->icsk_timeout - jiffies : 0;
+}
+
+static void mptcp_set_timeout(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ long tout = 0;
+
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow)
+ tout = max(tout, mptcp_timeout_from_subflow(subflow));
+ __mptcp_set_timeout(sk, tout);
+}
+
static bool tcp_can_send_ack(const struct sock *ssk)
{
return !((1 << inet_sk_state_load(ssk)) &
@@ -531,7 +544,6 @@ static bool mptcp_check_data_fin(struct sock *sk)
}
ret = true;
- mptcp_set_timeout(sk, NULL);
mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
@@ -791,10 +803,7 @@ static void mptcp_reset_timer(struct sock *sk)
if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
return;
- /* should never be called with mptcp level timer cleared */
- tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
- if (WARN_ON_ONCE(!tout))
- tout = TCP_RTO_MIN;
+ tout = mptcp_sk(sk)->timer_ival;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}
@@ -1046,8 +1055,14 @@ static void __mptcp_clean_una(struct sock *sk)
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
break;
- if (WARN_ON_ONCE(dfrag == msk->first_pending))
- break;
+ if (unlikely(dfrag == msk->first_pending)) {
+ /* in recovery mode can see ack after the current snd head */
+ if (WARN_ON_ONCE(!msk->recovery))
+ break;
+
+ WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ }
+
dfrag_clear(sk, dfrag);
cleaned = true;
}
@@ -1056,8 +1071,14 @@ static void __mptcp_clean_una(struct sock *sk)
if (dfrag && after64(snd_una, dfrag->data_seq)) {
u64 delta = snd_una - dfrag->data_seq;
- if (WARN_ON_ONCE(delta > dfrag->already_sent))
- goto out;
+ /* prevent wrap around in recovery mode */
+ if (unlikely(delta > dfrag->already_sent)) {
+ if (WARN_ON_ONCE(!msk->recovery))
+ goto out;
+ if (WARN_ON_ONCE(delta > dfrag->data_len))
+ goto out;
+ dfrag->already_sent += delta - dfrag->already_sent;
+ }
dfrag->data_seq += delta;
dfrag->offset += delta;
@@ -1068,6 +1089,10 @@ static void __mptcp_clean_una(struct sock *sk)
cleaned = true;
}
+ /* all retransmitted data acked, recovery completed */
+ if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt))
+ msk->recovery = false;
+
out:
if (cleaned) {
if (tcp_under_memory_pressure(sk)) {
@@ -1076,8 +1101,8 @@ out:
}
}
- if (snd_una == READ_ONCE(msk->snd_nxt)) {
- if (msk->timer_ival && !mptcp_data_fin_enabled(msk))
+ if (snd_una == READ_ONCE(msk->snd_nxt) && !msk->recovery) {
+ if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
mptcp_stop_timer(sk);
} else {
mptcp_reset_timer(sk);
@@ -1366,16 +1391,44 @@ struct subflow_send_info {
u64 ratio;
};
+void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
+{
+ if (!subflow->stale)
+ return;
+
+ subflow->stale = 0;
+ MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER);
+}
+
+bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+ if (unlikely(subflow->stale)) {
+ u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp);
+
+ if (subflow->stale_rcv_tstamp == rcv_tstamp)
+ return false;
+
+ mptcp_subflow_set_active(subflow);
+ }
+ return __mptcp_subflow_active(subflow);
+}
+
+/* implement the mptcp packet scheduler;
+ * returns the subflow that will transmit the next DSS
+ * additionally updates the rtx timeout
+ */
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
int i, nr_active = 0;
struct sock *ssk;
+ long tout = 0;
u64 ratio;
u32 pace;
- sock_owned_by_me((struct sock *)msk);
+ sock_owned_by_me(sk);
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
@@ -1386,8 +1439,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
/* re-use last subflow, if the burst allow that */
if (msk->last_snd && msk->snd_burst > 0 &&
sk_stream_memory_free(msk->last_snd) &&
- mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd)))
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+ mptcp_set_timeout(sk);
return msk->last_snd;
+ }
/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < 2; ++i) {
@@ -1400,6 +1455,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
if (!mptcp_subflow_active(subflow))
continue;
+ tout = max(tout, mptcp_timeout_from_subflow(subflow));
nr_active += !subflow->backup;
if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
continue;
@@ -1415,6 +1471,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
send_info[subflow->backup].ratio = ratio;
}
}
+ __mptcp_set_timeout(sk, tout);
/* pick the best backup if no other subflow is active */
if (!nr_active)
@@ -1433,12 +1490,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
struct mptcp_sendmsg_info *info)
{
- mptcp_set_timeout(sk, ssk);
tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
release_sock(ssk);
}
-static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+void __mptcp_push_pending(struct sock *sk, unsigned int flags)
{
struct sock *prev_ssk = NULL, *ssk = NULL;
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1459,15 +1515,19 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
- /* try to keep the subflow socket lock across
- * consecutive xmit on the same socket
+ /* First check. If the ssk has changed since
+ * the last round, release prev_ssk
*/
if (ssk != prev_ssk && prev_ssk)
mptcp_push_release(sk, prev_ssk, &info);
if (!ssk)
goto out;
- if (ssk != prev_ssk || !prev_ssk)
+ /* Need to lock the new subflow only if different
+ * from the previous one, otherwise we are still
+ * helding the relevant lock
+ */
+ if (ssk != prev_ssk)
lock_sock(ssk);
/* keep it simple and always provide a new skb for the
@@ -1501,12 +1561,11 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
mptcp_push_release(sk, ssk, &info);
out:
- if (copied) {
- /* start the timer, if it's not pending */
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+ /* ensure the rtx timer is running */
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+ if (copied)
__mptcp_check_send_data_fin(sk);
- }
}
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
@@ -1567,7 +1626,6 @@ out:
*/
__mptcp_update_wmem(sk);
if (copied) {
- mptcp_set_timeout(sk, ssk);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
if (!mptcp_timer_pending(sk))
@@ -2083,10 +2141,11 @@ static void mptcp_timeout_timer(struct timer_list *t)
*
* A backup subflow is returned only if that is the only kind available.
*/
-static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
+static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
{
+ struct sock *backup = NULL, *pick = NULL;
struct mptcp_subflow_context *subflow;
- struct sock *backup = NULL;
+ int min_stale_count = INT_MAX;
sock_owned_by_me((const struct sock *)msk);
@@ -2096,14 +2155,14 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- if (!mptcp_subflow_active(subflow))
+ if (!__mptcp_subflow_active(subflow))
continue;
- /* still data outstanding at TCP level? Don't retransmit. */
- if (!tcp_write_queue_empty(ssk)) {
- if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
- continue;
- return NULL;
+ /* still data outstanding at TCP level? skip this */
+ if (!tcp_rtx_and_write_queues_empty(ssk)) {
+ mptcp_pm_subflow_chk_stale(msk, ssk);
+ min_stale_count = min_t(int, min_stale_count, subflow->stale_count);
+ continue;
}
if (subflow->backup) {
@@ -2112,10 +2171,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
continue;
}
- return ssk;
+ if (!pick)
+ pick = ssk;
}
- return backup;
+ if (pick)
+ return pick;
+
+ /* use backup only if there are no progresses anywhere */
+ return min_stale_count > 1 ? backup : NULL;
}
static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
@@ -2126,6 +2190,50 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
}
}
+bool __mptcp_retransmit_pending_data(struct sock *sk)
+{
+ struct mptcp_data_frag *cur, *rtx_head;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (__mptcp_check_fallback(mptcp_sk(sk)))
+ return false;
+
+ if (tcp_rtx_and_write_queues_empty(sk))
+ return false;
+
+ /* the closing socket has some data untransmitted and/or unacked:
+ * some data in the mptcp rtx queue has not really xmitted yet.
+ * keep it simple and re-inject the whole mptcp level rtx queue
+ */
+ mptcp_data_lock(sk);
+ __mptcp_clean_una_wakeup(sk);
+ rtx_head = mptcp_rtx_head(sk);
+ if (!rtx_head) {
+ mptcp_data_unlock(sk);
+ return false;
+ }
+
+ /* will accept ack for reijected data before re-sending them */
+ if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt))
+ msk->recovery_snd_nxt = msk->snd_nxt;
+ msk->recovery = true;
+ mptcp_data_unlock(sk);
+
+ msk->first_pending = rtx_head;
+ msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq;
+ msk->snd_nxt = rtx_head->data_seq;
+ msk->snd_burst = 0;
+
+ /* be sure to clear the "sent status" on all re-injected fragments */
+ list_for_each_entry(cur, &msk->rtx_queue, list) {
+ if (!cur->already_sent)
+ break;
+ cur->already_sent = 0;
+ }
+
+ return true;
+}
+
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
@@ -2138,6 +2246,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ bool need_push;
list_del(&subflow->node);
@@ -2149,6 +2258,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (ssk->sk_socket)
sock_orphan(ssk);
+ need_push = __mptcp_retransmit_pending_data(sk);
subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
@@ -2176,6 +2286,9 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (msk->subflow && ssk == msk->subflow->sk)
mptcp_dispose_initial_subflow(msk);
+
+ if (need_push)
+ __mptcp_push_pending(sk, 0);
}
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
@@ -2313,7 +2426,6 @@ static void __mptcp_retrans(struct sock *sk)
info.size_goal);
}
- mptcp_set_timeout(sk, ssk);
release_sock(ssk);
reset_timer:
@@ -2384,10 +2496,12 @@ static int __mptcp_init_sock(struct sock *sk)
msk->wmem_reserved = 0;
WRITE_ONCE(msk->rmem_released, 0);
msk->tx_pending_data = 0;
+ msk->timer_ival = TCP_RTO_MIN;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+ msk->recovery = false;
mptcp_pm_data_init(msk);
@@ -2472,7 +2586,6 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
tcp_shutdown(ssk, how);
} else {
pr_debug("Sending DATA_FIN on subflow %p", ssk);
- mptcp_set_timeout(sk, ssk);
tcp_send_ack(ssk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
@@ -2723,7 +2836,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->token = subflow_req->token;
msk->subflow = NULL;
WRITE_ONCE(msk->fully_established, false);
- if (mp_opt->csum_reqd)
+ if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
WRITE_ONCE(msk->csum_enabled, true);
msk->write_seq = subflow_req->idsn + 1;
@@ -2732,7 +2845,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
- if (mp_opt->mp_capable) {
+ if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) {
msk->can_ack = true;
msk->remote_key = mp_opt->sndr_key;
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);