Diffstat (limited to 'net/mptcp')
-rw-r--r--  net/mptcp/Makefile     |   2
-rw-r--r--  net/mptcp/bpf.c        |  15
-rw-r--r--  net/mptcp/ctrl.c       |  14
-rw-r--r--  net/mptcp/pm.c         |   9
-rw-r--r--  net/mptcp/pm_netlink.c |  33
-rw-r--r--  net/mptcp/protocol.c   | 502
-rw-r--r--  net/mptcp/protocol.h   |  42
-rw-r--r--  net/mptcp/sched.c      | 173
-rw-r--r--  net/mptcp/sockopt.c    |  79
-rw-r--r--  net/mptcp/subflow.c    |  60
10 files changed, 587 insertions, 342 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index a3829ce548f9..84e531f86b82 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
- mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o
+ mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
index 5a0a84ad94af..8a16672b94e2 100644
--- a/net/mptcp/bpf.c
+++ b/net/mptcp/bpf.c
@@ -19,3 +19,18 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk)
return NULL;
}
+
+BTF_SET8_START(bpf_mptcp_fmodret_ids)
+BTF_ID_FLAGS(func, update_socket_protocol)
+BTF_SET8_END(bpf_mptcp_fmodret_ids)
+
+static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_mptcp_fmodret_ids,
+};
+
+static int __init bpf_mptcp_kfunc_init(void)
+{
+ return register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
+}
+late_initcall(bpf_mptcp_kfunc_init);
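
Registering update_socket_protocol() in an fmodret BTF ID set allows a BPF fmod_ret program to override the protocol chosen at socket() time. The following is a minimal sketch of such a program, loosely following the pattern used by the BPF selftests; the program name "mptcpify" and the spelled-out constants are illustrative, not part of this patch:

/* Sketch: force newly created TCP stream sockets to use MPTCP by
 * overriding the return value of update_socket_protocol().
 * Constants are defined locally since vmlinux.h does not carry the
 * AF_*/IPPROTO_* macros.
 */
#include "vmlinux.h"
#include <bpf/bpf_tracing.h>

#define AF_INET		2
#define AF_INET6	10
#define SOCK_STREAM	1
#define IPPROTO_TCP	6
#define IPPROTO_MPTCP	262

char _license[] SEC("license") = "GPL";

SEC("fmod_ret/update_socket_protocol")
int BPF_PROG(mptcpify, int family, int type, int protocol)
{
	if ((family == AF_INET || family == AF_INET6) &&
	    type == SOCK_STREAM &&
	    (!protocol || protocol == IPPROTO_TCP))
		return IPPROTO_MPTCP;

	return protocol;
}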
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 43e540328a52..e72b518c5d02 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -32,6 +32,7 @@ struct mptcp_pernet {
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
u8 pm_type;
+ char scheduler[MPTCP_SCHED_NAME_MAX];
};
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
@@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net)
return mptcp_get_pernet(net)->pm_type;
}
+const char *mptcp_get_scheduler(const struct net *net)
+{
+ return mptcp_get_pernet(net)->scheduler;
+}
+
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
@@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
+ strcpy(pernet->scheduler, "default");
}
#ifdef CONFIG_SYSCTL
@@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &mptcp_pm_type_max
},
+ {
+ .procname = "scheduler",
+ .maxlen = MPTCP_SCHED_NAME_MAX,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
{}
};
@@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type;
+ table[6].data = &pernet->scheduler;
hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
ARRAY_SIZE(mptcp_sysctl_table));
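
The new per-netns "scheduler" knob defaults to "default" and is consumed when an MPTCP socket is initialised. A condensed sketch of that flow, mirroring the mptcp_init_sock() hunk further down in this diff (the helper name mptcp_init_sock_sched() is hypothetical, used here only to isolate the relevant calls):

/* Sketch only: resolve the per-netns scheduler name into a
 * mptcp_sched_ops instance and attach it to the new msk.
 */
static int mptcp_init_sock_sched(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct mptcp_sched_ops *sched;

	/* an unknown or empty name falls back to the built-in scheduler */
	sched = mptcp_sched_find(mptcp_get_scheduler(net));
	return mptcp_init_sched(mptcp_sk(sk), sched);
}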
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 7dbbad1e4f55..d8da5374d9e1 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
msk = mptcp_sk(sk);
- if (subflow->backup != bkup) {
+ if (subflow->backup != bkup)
subflow->backup = bkup;
- mptcp_data_lock(sk);
- if (!sock_owned_by_user(sk))
- msk->last_snd = NULL;
- else
- __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags);
- mptcp_data_unlock(sk);
- }
mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 5692daf57a4d..9661f3812682 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -9,6 +9,7 @@
#include <linux/inet.h>
#include <linux/kernel.h>
#include <net/tcp.h>
+#include <net/inet_common.h>
#include <net/netns/generic.h>
#include <net/mptcp.h>
#include <net/genetlink.h>
@@ -471,9 +472,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con
slow = lock_sock_fast(ssk);
if (prio) {
- if (subflow->backup != backup)
- msk->last_snd = NULL;
-
subflow->send_mp_prio = 1;
subflow->backup = backup;
subflow->request_bkup = backup;
@@ -1005,8 +1003,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
bool is_ipv6 = sk->sk_family == AF_INET6;
int addrlen = sizeof(struct sockaddr_in);
struct sockaddr_storage addr;
- struct socket *ssock;
- struct sock *newsk;
+ struct sock *newsk, *ssk;
int backlog = 1024;
int err;
@@ -1032,28 +1029,32 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
&mptcp_keys[is_ipv6]);
lock_sock(newsk);
- ssock = __mptcp_nmpc_socket(mptcp_sk(newsk));
+ ssk = __mptcp_nmpc_sk(mptcp_sk(newsk));
release_sock(newsk);
- if (IS_ERR(ssock))
- return PTR_ERR(ssock);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (entry->addr.family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen);
+ if (ssk->sk_family == AF_INET)
+ err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (ssk->sk_family == AF_INET6)
+ err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
+#endif
if (err)
return err;
inet_sk_state_store(newsk, TCP_LISTEN);
- err = kernel_listen(ssock, backlog);
- if (err)
- return err;
-
- mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED);
-
- return 0;
+ lock_sock(ssk);
+ err = __inet_listen_sk(ssk, backlog);
+ if (!err)
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
+ release_sock(ssk);
+ return err;
}
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 3613489eb6e3..933b257eee02 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -67,11 +67,11 @@ static bool mptcp_is_tcpsk(struct sock *sk)
* Hand the socket over to tcp so all further socket ops
* bypass mptcp.
*/
- sock->ops = &inet_stream_ops;
+ WRITE_ONCE(sock->ops, &inet_stream_ops);
return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
- sock->ops = &inet6_stream_ops;
+ WRITE_ONCE(sock->ops, &inet6_stream_ops);
return true;
#endif
}
@@ -90,8 +90,8 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
if (err)
return err;
+ msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
WRITE_ONCE(msk->first, ssock->sk);
- WRITE_ONCE(msk->subflow, ssock);
subflow = mptcp_subflow_ctx(ssock->sk);
list_add(&subflow->node, &msk->conn_list);
sock_hold(ssock->sk);
@@ -101,6 +101,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
/* This is the first subflow, always with id 0 */
subflow->local_id_valid = 1;
mptcp_sock_graft(msk->first, sk->sk_socket);
+ iput(SOCK_INODE(ssock));
return 0;
}
@@ -108,7 +109,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
/* If the MPC handshake is not started, returns the first subflow,
* eventually allocating it.
*/
-struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk)
+struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
int ret;
@@ -116,10 +117,7 @@ struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk)
if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
return ERR_PTR(-EINVAL);
- if (!msk->subflow) {
- if (msk->first)
- return ERR_PTR(-EINVAL);
-
+ if (!msk->first) {
ret = __mptcp_socket_create(msk);
if (ret)
return ERR_PTR(ret);
@@ -127,7 +125,7 @@ struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk)
mptcp_sockopt_sync(msk, msk->first);
}
- return msk->subflow;
+ return msk->first;
}
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
@@ -1368,7 +1366,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
* returns the subflow that will transmit the next DSS
* additionally updates the rtx timeout
*/
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[SSK_MODE_MAX];
struct mptcp_subflow_context *subflow;
@@ -1379,23 +1377,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
u64 linger_time;
long tout = 0;
- msk_owned_by_me(msk);
-
- if (__mptcp_check_fallback(msk)) {
- if (!msk->first)
- return NULL;
- return __tcp_can_send(msk->first) &&
- sk_stream_memory_free(msk->first) ? msk->first : NULL;
- }
-
- /* re-use last subflow, if the burst allow that */
- if (msk->last_snd && msk->snd_burst > 0 &&
- sk_stream_memory_free(msk->last_snd) &&
- mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
- mptcp_set_timeout(sk);
- return msk->last_snd;
- }
-
/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < SSK_MODE_MAX; ++i) {
send_info[i].ssk = NULL;
@@ -1448,16 +1429,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
wmem = READ_ONCE(ssk->sk_wmem_queued);
- if (!burst) {
- msk->last_snd = NULL;
+ if (!burst)
return ssk;
- }
subflow = mptcp_subflow_ctx(ssk);
subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
READ_ONCE(ssk->sk_pacing_rate) * burst,
burst + wmem);
- msk->last_snd = ssk;
msk->snd_burst = burst;
return ssk;
}
@@ -1501,64 +1479,106 @@ void mptcp_check_and_set_pending(struct sock *sk)
mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
}
-void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
+ struct mptcp_sendmsg_info *info)
{
- struct sock *prev_ssk = NULL, *ssk = NULL;
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_sendmsg_info info = {
- .flags = flags,
- };
- bool do_check_data_fin = false;
struct mptcp_data_frag *dfrag;
- int len;
+ int len, copied = 0, err = 0;
while ((dfrag = mptcp_send_head(sk))) {
- info.sent = dfrag->already_sent;
- info.limit = dfrag->data_len;
+ info->sent = dfrag->already_sent;
+ info->limit = dfrag->data_len;
len = dfrag->data_len - dfrag->already_sent;
while (len > 0) {
int ret = 0;
- prev_ssk = ssk;
- ssk = mptcp_subflow_get_send(msk);
-
- /* First check. If the ssk has changed since
- * the last round, release prev_ssk
- */
- if (ssk != prev_ssk && prev_ssk)
- mptcp_push_release(prev_ssk, &info);
- if (!ssk)
- goto out;
-
- /* Need to lock the new subflow only if different
- * from the previous one, otherwise we are still
- * helding the relevant lock
- */
- if (ssk != prev_ssk)
- lock_sock(ssk);
-
- ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
if (ret <= 0) {
- if (ret == -EAGAIN)
- continue;
- mptcp_push_release(ssk, &info);
+ err = copied ? : ret;
goto out;
}
- do_check_data_fin = true;
- info.sent += ret;
+ info->sent += ret;
+ copied += ret;
len -= ret;
mptcp_update_post_push(msk, dfrag, ret);
}
WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+
+ if (msk->snd_burst <= 0 ||
+ !sk_stream_memory_free(ssk) ||
+ !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+ err = copied;
+ goto out;
+ }
+ mptcp_set_timeout(sk);
+ }
+ err = copied;
+
+out:
+ return err;
+}
+
+void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+{
+ struct sock *prev_ssk = NULL, *ssk = NULL;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info = {
+ .flags = flags,
+ };
+ bool do_check_data_fin = false;
+ int push_count = 1;
+
+ while (mptcp_send_head(sk) && (push_count > 0)) {
+ struct mptcp_subflow_context *subflow;
+ int ret = 0;
+
+ if (mptcp_sched_get_send(msk))
+ break;
+
+ push_count = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ mptcp_subflow_set_scheduled(subflow, false);
+
+ prev_ssk = ssk;
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (ssk != prev_ssk) {
+ /* First check. If the ssk has changed since
+ * the last round, release prev_ssk
+ */
+ if (prev_ssk)
+ mptcp_push_release(prev_ssk, &info);
+
+ /* Need to lock the new subflow only if different
+ * from the previous one, otherwise we are still
+ * helding the relevant lock
+ */
+ lock_sock(ssk);
+ }
+
+ push_count++;
+
+ ret = __subflow_push_pending(sk, ssk, &info);
+ if (ret <= 0) {
+ if (ret != -EAGAIN ||
+ (1 << ssk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE))
+ push_count--;
+ continue;
+ }
+ do_check_data_fin = true;
+ }
+ }
}
/* at this point we held the socket lock for the last subflow we used */
if (ssk)
mptcp_push_release(ssk, &info);
-out:
/* ensure the rtx timer is running */
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
@@ -1572,42 +1592,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
struct mptcp_sendmsg_info info = {
.data_lock_held = true,
};
- struct mptcp_data_frag *dfrag;
+ bool keep_pushing = true;
struct sock *xmit_ssk;
- int len, copied = 0;
+ int copied = 0;
info.flags = 0;
- while ((dfrag = mptcp_send_head(sk))) {
- info.sent = dfrag->already_sent;
- info.limit = dfrag->data_len;
- len = dfrag->data_len - dfrag->already_sent;
- while (len > 0) {
- int ret = 0;
+ while (mptcp_send_head(sk) && keep_pushing) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ int ret = 0;
- /* check for a different subflow usage only after
- * spooling the first chunk of data
- */
- xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
- if (!xmit_ssk)
- goto out;
- if (xmit_ssk != ssk) {
- mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
- MPTCP_DELEGATE_SEND);
- goto out;
- }
-
- ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ /* check for a different subflow usage only after
+ * spooling the first chunk of data
+ */
+ if (first) {
+ mptcp_subflow_set_scheduled(subflow, false);
+ ret = __subflow_push_pending(sk, ssk, &info);
+ first = false;
if (ret <= 0)
- goto out;
+ break;
+ copied += ret;
+ continue;
+ }
+
+ if (mptcp_sched_get_send(msk))
+ goto out;
- info.sent += ret;
+ if (READ_ONCE(subflow->scheduled)) {
+ mptcp_subflow_set_scheduled(subflow, false);
+ ret = __subflow_push_pending(sk, ssk, &info);
+ if (ret <= 0)
+ keep_pushing = false;
copied += ret;
- len -= ret;
- first = false;
+ }
- mptcp_update_post_push(msk, dfrag, ret);
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ xmit_ssk = mptcp_subflow_tcp_sock(subflow);
+ if (xmit_ssk != ssk) {
+ mptcp_subflow_delegate(subflow,
+ MPTCP_DELEGATE_SEND);
+ keep_pushing = false;
+ }
+ }
}
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
out:
@@ -1642,7 +1669,6 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
{
unsigned int saved_flags = msg->msg_flags;
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
struct sock *ssk;
int ret;
@@ -1653,9 +1679,9 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
* fastopen attempt, no need to check for additional subflow status.
*/
if (msg->msg_flags & MSG_FASTOPEN) {
- ssock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(ssock))
- return PTR_ERR(ssock);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
}
if (!msk->first)
return -EINVAL;
@@ -1689,7 +1715,7 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
if (!mptcp_disconnect(sk, 0))
sk->sk_socket->state = SS_UNCONNECTED;
}
- inet_sk(sk)->defer_connect = 0;
+ inet_clear_bit(DEFER_CONNECT, sk);
return ret;
}
@@ -1707,7 +1733,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
- if (unlikely(inet_sk(sk)->defer_connect || msg->msg_flags & MSG_FASTOPEN)) {
+ if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
+ msg->msg_flags & MSG_FASTOPEN)) {
int copied_syn = 0;
ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn);
@@ -1881,6 +1908,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
+ u8 scaling_ratio = U8_MAX;
u32 time, advmss = 1;
u64 rtt_us, mstamp;
@@ -1911,9 +1939,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
rtt_us = max(sf_rtt_us, rtt_us);
advmss = max(sf_advmss, advmss);
+ scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
}
msk->rcvq_space.rtt_us = rtt_us;
+ msk->scaling_ratio = scaling_ratio;
if (time < (rtt_us >> 3) || rtt_us == 0)
return;
@@ -1922,8 +1952,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvmem, rcvbuf;
u64 rcvwin, grow;
+ int rcvbuf;
rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
@@ -1932,18 +1962,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
do_div(grow, msk->rcvq_space.space);
rcvwin += (grow << 1);
- rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
- while (tcp_win_from_space(sk, rcvmem) < advmss)
- rcvmem += 128;
-
- do_div(rcvwin, advmss);
- rcvbuf = min_t(u64, rcvwin * rcvmem,
+ rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin),
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
u32 window_clamp;
- window_clamp = tcp_win_from_space(sk, rcvbuf);
+ window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf);
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make subflows follow along. If we do not do this, we
@@ -2202,17 +2227,12 @@ static void mptcp_timeout_timer(struct timer_list *t)
*
* A backup subflow is returned only if that is the only kind available.
*/
-static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
{
struct sock *backup = NULL, *pick = NULL;
struct mptcp_subflow_context *subflow;
int min_stale_count = INT_MAX;
- msk_owned_by_me(msk);
-
- if (__mptcp_check_fallback(msk))
- return NULL;
-
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -2243,14 +2263,6 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
return min_stale_count > 1 ? backup : NULL;
}
-static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
-{
- if (msk->subflow) {
- iput(SOCK_INODE(msk->subflow));
- WRITE_ONCE(msk->subflow, NULL);
- }
-}
-
bool __mptcp_retransmit_pending_data(struct sock *sk)
{
struct mptcp_data_frag *cur, *rtx_head;
@@ -2329,13 +2341,13 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
goto out_release;
}
- dispose_it = !msk->subflow || ssk != msk->subflow->sk;
+ dispose_it = msk->free_first || ssk != msk->first;
if (dispose_it)
list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
- if (flags & MPTCP_CF_FASTCLOSE) {
+ if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) {
/* be sure to force the tcp_disconnect() path,
* to generate the egress reset
*/
@@ -2350,7 +2362,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
* disconnect should never fail
*/
WARN_ON_ONCE(tcp_disconnect(ssk, 0));
- msk->subflow->state = SS_UNCONNECTED;
mptcp_subflow_ctx_reset(subflow);
release_sock(ssk);
@@ -2383,9 +2394,6 @@ out_release:
WRITE_ONCE(msk->first, NULL);
out:
- if (ssk == msk->last_snd)
- msk->last_snd = NULL;
-
if (need_push)
__mptcp_push_pending(sk, 0);
}
@@ -2502,16 +2510,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
static void __mptcp_retrans(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
struct mptcp_sendmsg_info info = {};
struct mptcp_data_frag *dfrag;
- size_t copied = 0;
struct sock *ssk;
- int ret;
+ int ret, err;
+ u16 len = 0;
mptcp_clean_una_wakeup(sk);
/* first check ssk: need to kick "stale" logic */
- ssk = mptcp_subflow_get_retrans(msk);
+ err = mptcp_sched_get_retrans(msk);
dfrag = mptcp_rtx_head(sk);
if (!dfrag) {
if (mptcp_data_fin_enabled(msk)) {
@@ -2530,32 +2539,45 @@ static void __mptcp_retrans(struct sock *sk)
goto reset_timer;
}
- if (!ssk)
+ if (err)
goto reset_timer;
- lock_sock(ssk);
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ u16 copied = 0;
- /* limit retransmission to the bytes already sent on some subflows */
- info.sent = 0;
- info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
- while (info.sent < info.limit) {
- ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
- if (ret <= 0)
- break;
+ mptcp_subflow_set_scheduled(subflow, false);
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
- copied += ret;
- info.sent += ret;
- }
- if (copied) {
- dfrag->already_sent = max(dfrag->already_sent, info.sent);
- msk->bytes_retrans += copied;
- tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
- info.size_goal);
- WRITE_ONCE(msk->allow_infinite_fallback, false);
+ ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+
+ /* limit retransmission to the bytes already sent on some subflows */
+ info.sent = 0;
+ info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
+ dfrag->already_sent;
+ while (info.sent < info.limit) {
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0)
+ break;
+
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+ copied += ret;
+ info.sent += ret;
+ }
+ if (copied) {
+ len = max(copied, len);
+ tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+ info.size_goal);
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
+ }
+
+ release_sock(ssk);
+ }
}
- release_sock(ssk);
+ msk->bytes_retrans += len;
+ dfrag->already_sent = max(dfrag->already_sent, len);
reset_timer:
mptcp_check_and_set_pending(sk);
@@ -2663,7 +2685,7 @@ unlock:
sock_put(sk);
}
-static int __mptcp_init_sock(struct sock *sk)
+static void __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -2690,8 +2712,6 @@ static int __mptcp_init_sock(struct sock *sk)
/* re-use the csk retrans timer for MPTCP-level retrans */
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
-
- return 0;
}
static void mptcp_ca_reset(struct sock *sk)
@@ -2711,9 +2731,7 @@ static int mptcp_init_sock(struct sock *sk)
struct net *net = sock_net(sk);
int ret;
- ret = __mptcp_init_sock(sk);
- if (ret)
- return ret;
+ __mptcp_init_sock(sk);
if (!mptcp_is_enabled(net))
return -ENOPROTOOPT;
@@ -2721,6 +2739,11 @@ static int mptcp_init_sock(struct sock *sk)
if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
+ ret = mptcp_init_sched(mptcp_sk(sk),
+ mptcp_sched_find(mptcp_get_scheduler(net)));
+ if (ret)
+ return ret;
+
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
@@ -2866,6 +2889,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
msk->pm.status = 0;
+ mptcp_release_sched(msk);
sk->sk_prot->destroy(sk);
@@ -3055,7 +3079,6 @@ static int mptcp_disconnect(struct sock *sk, int flags)
* subflow
*/
mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
- msk->last_snd = NULL;
WRITE_ONCE(msk->flags, 0);
msk->cb_flags = 0;
msk->push_pending = 0;
@@ -3111,7 +3134,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
msk = mptcp_sk(nsk);
msk->local_key = subflow_req->local_key;
msk->token = subflow_req->token;
- WRITE_ONCE(msk->subflow, NULL);
msk->in_accept_queue = 1;
WRITE_ONCE(msk->fully_established, false);
if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
@@ -3122,6 +3144,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
msk->snd_una = msk->write_seq;
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
+ mptcp_init_sched(msk, mptcp_sk(sk)->sched);
/* passive msk is created after the first/MPC subflow */
msk->subflow_id = 2;
@@ -3175,25 +3198,17 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
}
-static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
+static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err,
bool kern)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *listener;
struct sock *newsk;
- listener = READ_ONCE(msk->subflow);
- if (WARN_ON_ONCE(!listener)) {
- *err = -EINVAL;
- return NULL;
- }
-
- pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
- newsk = inet_csk_accept(listener->sk, flags, err, kern);
+ pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
+ newsk = inet_csk_accept(ssk, flags, err, kern);
if (!newsk)
return NULL;
- pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
+ pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
if (sk_is_mptcp(newsk)) {
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
@@ -3210,9 +3225,9 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
}
newsk = new_mptcp_sock;
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
} else {
- MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_INC_STATS(sock_net(ssk),
MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
}
@@ -3253,10 +3268,8 @@ static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- /* clears msk->subflow, allowing the following to close
- * even the initial subflow
- */
- mptcp_dispose_initial_subflow(msk);
+ /* allow the following to close even the initial subflow */
+ msk->free_first = 1;
mptcp_destroy_common(msk, 0);
sk_sockets_allocated_dec(sk);
}
@@ -3328,7 +3341,7 @@ static void mptcp_release_cb(struct sock *sk)
if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
__mptcp_clean_una_wakeup(sk);
- if (unlikely(&msk->cb_flags)) {
+ if (unlikely(msk->cb_flags)) {
/* be sure to set the current sk state before tacking actions
* depending on sk_state, that is processing MPTCP_ERROR_REPORT
*/
@@ -3336,8 +3349,6 @@ static void mptcp_release_cb(struct sock *sk)
__mptcp_set_connected(sk);
if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk);
- if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
- msk->last_snd = NULL;
}
__mptcp_update_rmem(sk);
@@ -3406,14 +3417,12 @@ static void mptcp_unhash(struct sock *sk)
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
- ssock = msk->subflow;
- pr_debug("msk=%p, subflow=%p", msk, ssock);
- if (WARN_ON_ONCE(!ssock))
+ pr_debug("msk=%p, ssk=%p", msk, msk->first);
+ if (WARN_ON_ONCE(!msk->first))
return -EINVAL;
- return inet_csk_get_port(ssock->sk, snum);
+ return inet_csk_get_port(msk->first, snum);
}
void mptcp_finish_connect(struct sock *ssk)
@@ -3588,25 +3597,24 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
int err = -EINVAL;
+ struct sock *ssk;
- ssock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(ssock))
- return PTR_ERR(ssock);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
- mptcp_token_destroy(msk);
inet_sk_state_store(sk, TCP_SYN_SENT);
- subflow = mptcp_subflow_ctx(ssock->sk);
+ subflow = mptcp_subflow_ctx(ssk);
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
* TCP option space.
*/
- if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
+ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info))
mptcp_subflow_early_fallback(msk, subflow);
#endif
- if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
- MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT);
mptcp_subflow_early_fallback(msk, subflow);
}
if (likely(!__mptcp_check_fallback(msk)))
@@ -3615,25 +3623,42 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* if reaching here via the fastopen/sendmsg path, the caller already
* acquired the subflow socket lock, too.
*/
- if (msk->fastopening)
- err = __inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK, 1);
- else
- err = inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK);
- inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;
+ if (!msk->fastopening)
+ lock_sock(ssk);
+
+ /* the following mirrors closely a very small chunk of code from
+ * __inet_stream_connect()
+ */
+ if (ssk->sk_state != TCP_CLOSE)
+ goto out;
+
+ if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) {
+ err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len);
+ if (err)
+ goto out;
+ }
+
+ err = ssk->sk_prot->connect(ssk, uaddr, addr_len);
+ if (err < 0)
+ goto out;
+
+ inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk));
+
+out:
+ if (!msk->fastopening)
+ release_sock(ssk);
/* on successful connect, the msk state will be moved to established by
* subflow_finish_connect()
*/
- if (unlikely(err && err != -EINPROGRESS)) {
- inet_sk_state_store(sk, inet_sk_state_load(ssock->sk));
+ if (unlikely(err)) {
+ /* avoid leaving a dangling token in an unconnected socket */
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sk, TCP_CLOSE);
return err;
}
- mptcp_copy_inaddrs(sk, ssock->sk);
-
- /* silence EINPROGRESS and let the caller inet_stream_connect
- * handle the connection in progress
- */
+ mptcp_copy_inaddrs(sk, ssk);
return 0;
}
@@ -3674,22 +3699,27 @@ static struct proto mptcp_prot = {
static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct socket *ssock;
- int err;
+ struct sock *ssk, *sk = sock->sk;
+ int err = -EINVAL;
- lock_sock(sock->sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(ssock)) {
- err = PTR_ERR(ssock);
+ lock_sock(sk);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ err = PTR_ERR(ssk);
goto unlock;
}
- err = ssock->ops->bind(ssock, uaddr, addr_len);
+ if (sk->sk_family == AF_INET)
+ err = inet_bind_sk(ssk, uaddr, addr_len);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (sk->sk_family == AF_INET6)
+ err = inet6_bind_sk(ssk, uaddr, addr_len);
+#endif
if (!err)
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
+ mptcp_copy_inaddrs(sk, ssk);
unlock:
- release_sock(sock->sk);
+ release_sock(sk);
return err;
}
@@ -3697,7 +3727,7 @@ static int mptcp_listen(struct socket *sock, int backlog)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct sock *sk = sock->sk;
- struct socket *ssock;
+ struct sock *ssk;
int err;
pr_debug("msk=%p", msk);
@@ -3708,25 +3738,26 @@ static int mptcp_listen(struct socket *sock, int backlog)
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
goto unlock;
- ssock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(ssock)) {
- err = PTR_ERR(ssock);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ err = PTR_ERR(ssk);
goto unlock;
}
- mptcp_token_destroy(msk);
inet_sk_state_store(sk, TCP_LISTEN);
sock_set_flag(sk, SOCK_RCU_FREE);
- err = ssock->ops->listen(ssock, backlog);
- inet_sk_state_store(sk, inet_sk_state_load(ssock->sk));
+ lock_sock(ssk);
+ err = __inet_listen_sk(ssk, backlog);
+ release_sock(ssk);
+ inet_sk_state_store(sk, inet_sk_state_load(ssk));
+
if (!err) {
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- mptcp_copy_inaddrs(sk, ssock->sk);
+ mptcp_copy_inaddrs(sk, ssk);
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
}
- mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED);
-
unlock:
release_sock(sk);
return err;
@@ -3736,8 +3767,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
int flags, bool kern)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct socket *ssock;
- struct sock *newsk;
+ struct sock *ssk, *newsk;
int err;
pr_debug("msk=%p", msk);
@@ -3745,11 +3775,11 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* Buggy applications can call accept on socket states other then LISTEN
* but no need to allocate the first subflow just to error out.
*/
- ssock = READ_ONCE(msk->subflow);
- if (!ssock)
+ ssk = READ_ONCE(msk->first);
+ if (!ssk)
return -EINVAL;
- newsk = mptcp_accept(sock->sk, flags, &err, kern);
+ newsk = mptcp_accept(ssk, flags, &err, kern);
if (!newsk)
return err;
@@ -3776,11 +3806,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* Do late cleanup for the first subflow as necessary. Also
* deal with bad peers not doing a complete shutdown.
*/
- if (msk->first &&
- unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) {
+ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) {
__mptcp_close_ssk(newsk, msk->first,
mptcp_subflow_ctx(msk->first), 0);
- if (unlikely(list_empty(&msk->conn_list)))
+ if (unlikely(list_is_singular(&msk->conn_list)))
inet_sk_state_store(newsk, TCP_CLOSE);
}
}
@@ -3819,12 +3848,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
state = inet_sk_state_load(sk);
pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
if (state == TCP_LISTEN) {
- struct socket *ssock = READ_ONCE(msk->subflow);
+ struct sock *ssk = READ_ONCE(msk->first);
- if (WARN_ON_ONCE(!ssock || !ssock->sk))
+ if (WARN_ON_ONCE(!ssk))
return 0;
- return inet_csk_listen_poll(ssock->sk);
+ return inet_csk_listen_poll(ssk);
}
shutdown = READ_ONCE(sk->sk_shutdown);
@@ -3839,7 +3868,8 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
mask |= EPOLLOUT | EPOLLWRNORM;
else
mask |= mptcp_check_writeable(msk);
- } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ } else if (state == TCP_SYN_SENT &&
+ inet_test_bit(DEFER_CONNECT, sk)) {
/* cf tcp_poll() note about TFO */
mask |= EPOLLOUT | EPOLLWRNORM;
}
@@ -3935,6 +3965,7 @@ void __init mptcp_proto_init(void)
mptcp_subflow_init();
mptcp_pm_init();
+ mptcp_sched_init();
mptcp_token_init();
if (proto_register(&mptcp_prot, 1) != 0)
@@ -3988,6 +4019,7 @@ int __init mptcp_proto_v6_init(void)
strcpy(mptcp_v6_prot.name, "MPTCPv6");
mptcp_v6_prot.slab = NULL;
mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
+ mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np);
err = proto_register(&mptcp_v6_prot, 1);
if (err)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 37fbe22e2433..7254b3562575 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -123,7 +123,6 @@
#define MPTCP_RETRANSMIT 4
#define MPTCP_FLUSH_JOIN_LIST 5
#define MPTCP_CONNECTED 6
-#define MPTCP_RESET_SCHEDULER 7
struct mptcp_skb_cb {
u64 map_seq;
@@ -269,7 +268,6 @@ struct mptcp_sock {
u64 rcv_data_fin_seq;
u64 bytes_retrans;
int rmem_fwd_alloc;
- struct sock *last_snd;
int snd_burst;
int old_wspace;
u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
@@ -299,7 +297,8 @@ struct mptcp_sock {
cork:1,
nodelay:1,
fastopening:1,
- in_accept_queue:1;
+ in_accept_queue:1,
+ free_first:1;
struct work_struct work;
struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue;
@@ -308,24 +307,23 @@ struct mptcp_sock {
struct list_head rtx_queue;
struct mptcp_data_frag *first_pending;
struct list_head join_list;
- struct socket *subflow; /* outgoing connect/listener/!mp_capable
- * The mptcp ops can safely dereference, using suitable
- * ONCE annotation, the subflow outside the socket
- * lock as such sock is freed after close().
- */
- struct sock *first;
+ struct sock *first; /* The mptcp ops can safely dereference, using suitable
+ * ONCE annotation, the subflow outside the socket
+ * lock as such sock is freed after close().
+ */
struct mptcp_pm_data pm;
+ struct mptcp_sched_ops *sched;
struct {
u32 space; /* bytes copied in last measurement window */
u32 copied; /* bytes copied in this measurement window */
u64 time; /* start time of measurement window */
u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space;
+ u8 scaling_ratio;
u32 subflow_id;
u32 setsockopt_seq;
char ca_name[TCP_CA_NAME_MAX];
- struct mptcp_sock *dl_next;
};
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
@@ -351,9 +349,14 @@ static inline int __mptcp_rmem(const struct sock *sk)
return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
}
+static inline int mptcp_win_from_space(const struct sock *sk, int space)
+{
+ return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space);
+}
+
static inline int __mptcp_space(const struct sock *sk)
{
- return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
+ return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
}
static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
@@ -488,6 +491,7 @@ struct mptcp_subflow_context {
is_mptfo : 1, /* subflow is doing TFO */
__unused : 9;
enum mptcp_data_avail data_avail;
+ bool scheduled;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -621,6 +625,7 @@ int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
+const char *mptcp_get_scheduler(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
@@ -635,7 +640,7 @@ void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
-struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk);
+struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
void mptcp_cancel_work(struct sock *sk);
void __mptcp_unaccepted_force_close(struct sock *sk);
@@ -653,6 +658,19 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct sockaddr_storage *addr,
unsigned short family);
+struct mptcp_sched_ops *mptcp_sched_find(const char *name);
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_sched_init(void);
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched);
+void mptcp_release_sched(struct mptcp_sock *msk);
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+ bool scheduled);
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
+int mptcp_sched_get_send(struct mptcp_sock *msk);
+int mptcp_sched_get_retrans(struct mptcp_sock *msk);
static inline bool __tcp_can_send(const struct sock *ssk)
{
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
new file mode 100644
index 000000000000..4ab0693c069c
--- /dev/null
+++ b/net/mptcp/sched.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, SUSE.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include "protocol.h"
+
+static DEFINE_SPINLOCK(mptcp_sched_list_lock);
+static LIST_HEAD(mptcp_sched_list);
+
+static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk,
+ struct mptcp_sched_data *data)
+{
+ struct sock *ssk;
+
+ ssk = data->reinject ? mptcp_subflow_get_retrans(msk) :
+ mptcp_subflow_get_send(msk);
+ if (!ssk)
+ return -EINVAL;
+
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+ return 0;
+}
+
+static struct mptcp_sched_ops mptcp_sched_default = {
+ .get_subflow = mptcp_sched_default_get_subflow,
+ .name = "default",
+ .owner = THIS_MODULE,
+};
+
+/* Must be called with rcu read lock held */
+struct mptcp_sched_ops *mptcp_sched_find(const char *name)
+{
+ struct mptcp_sched_ops *sched, *ret = NULL;
+
+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+ if (!strcmp(sched->name, name)) {
+ ret = sched;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (!sched->get_subflow)
+ return -EINVAL;
+
+ spin_lock(&mptcp_sched_list_lock);
+ if (mptcp_sched_find(sched->name)) {
+ spin_unlock(&mptcp_sched_list_lock);
+ return -EEXIST;
+ }
+ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
+ spin_unlock(&mptcp_sched_list_lock);
+
+ pr_debug("%s registered", sched->name);
+ return 0;
+}
+
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (sched == &mptcp_sched_default)
+ return;
+
+ spin_lock(&mptcp_sched_list_lock);
+ list_del_rcu(&sched->list);
+ spin_unlock(&mptcp_sched_list_lock);
+}
+
+void mptcp_sched_init(void)
+{
+ mptcp_register_scheduler(&mptcp_sched_default);
+}
+
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched)
+{
+ if (!sched)
+ sched = &mptcp_sched_default;
+
+ if (!bpf_try_module_get(sched, sched->owner))
+ return -EBUSY;
+
+ msk->sched = sched;
+ if (msk->sched->init)
+ msk->sched->init(msk);
+
+ pr_debug("sched=%s", msk->sched->name);
+
+ return 0;
+}
+
+void mptcp_release_sched(struct mptcp_sock *msk)
+{
+ struct mptcp_sched_ops *sched = msk->sched;
+
+ if (!sched)
+ return;
+
+ msk->sched = NULL;
+ if (sched->release)
+ sched->release(msk);
+
+ bpf_module_put(sched, sched->owner);
+}
+
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+ bool scheduled)
+{
+ WRITE_ONCE(subflow->scheduled, scheduled);
+}
+
+int mptcp_sched_get_send(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sched_data data;
+
+ msk_owned_by_me(msk);
+
+ /* the following check is moved out of mptcp_subflow_get_send */
+ if (__mptcp_check_fallback(msk)) {
+ if (msk->first &&
+ __tcp_can_send(msk->first) &&
+ sk_stream_memory_free(msk->first)) {
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled))
+ return 0;
+ }
+
+ data.reinject = false;
+ if (msk->sched == &mptcp_sched_default || !msk->sched)
+ return mptcp_sched_default_get_subflow(msk, &data);
+ return msk->sched->get_subflow(msk, &data);
+}
+
+int mptcp_sched_get_retrans(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sched_data data;
+
+ msk_owned_by_me(msk);
+
+ /* the following check is moved out of mptcp_subflow_get_retrans */
+ if (__mptcp_check_fallback(msk))
+ return -EINVAL;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled))
+ return 0;
+ }
+
+ data.reinject = true;
+ if (msk->sched == &mptcp_sched_default || !msk->sched)
+ return mptcp_sched_default_get_subflow(msk, &data);
+ return msk->sched->get_subflow(msk, &data);
+}
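
The new mptcp_sched_ops API makes the packet scheduler pluggable: a scheduler provides get_subflow(), which marks one or more subflows as scheduled, and registers itself so it can be selected by name. Below is a minimal sketch of an additional scheduler built on this API; the "example" name, the module boilerplate, and building against the private protocol.h header are assumptions for illustration (in-tree schedulers are expected to be registered much like mptcp_sched_default above, and BPF-based schedulers follow in later patches):

/* Sketch: a scheduler that reuses the default subflow selection but is
 * registered under its own name, selectable via net.mptcp.scheduler.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include "protocol.h"

static int mptcp_sched_example_get_subflow(struct mptcp_sock *msk,
					   struct mptcp_sched_data *data)
{
	struct sock *ssk;

	ssk = data->reinject ? mptcp_subflow_get_retrans(msk) :
			       mptcp_subflow_get_send(msk);
	if (!ssk)
		return -EINVAL;

	mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
	return 0;
}

static struct mptcp_sched_ops mptcp_sched_example = {
	.get_subflow	= mptcp_sched_example_get_subflow,
	.name		= "example",
	.owner		= THIS_MODULE,
};

static int __init mptcp_sched_example_init(void)
{
	return mptcp_register_scheduler(&mptcp_sched_example);
}

static void __exit mptcp_sched_example_exit(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_example);
}

module_init(mptcp_sched_example_init);
module_exit(mptcp_sched_example_exit);
MODULE_LICENSE("GPL");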
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 63f7a09335c5..8260202c0066 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -103,7 +103,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
break;
case SO_MARK:
if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) {
- ssk->sk_mark = sk->sk_mark;
+ WRITE_ONCE(ssk->sk_mark, sk->sk_mark);
sk_dst_reset(ssk);
}
break;
@@ -292,7 +292,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
sockptr_t optval, unsigned int optlen)
{
struct sock *sk = (struct sock *)msk;
- struct socket *ssock;
+ struct sock *ssk;
int ret;
switch (optname) {
@@ -301,22 +301,22 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
case SO_BINDTODEVICE:
case SO_BINDTOIFINDEX:
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(ssock)) {
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
release_sock(sk);
- return PTR_ERR(ssock);
+ return PTR_ERR(ssk);
}
- ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
+ ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen);
if (ret == 0) {
if (optname == SO_REUSEPORT)
- sk->sk_reuseport = ssock->sk->sk_reuseport;
+ sk->sk_reuseport = ssk->sk_reuseport;
else if (optname == SO_REUSEADDR)
- sk->sk_reuse = ssock->sk->sk_reuse;
+ sk->sk_reuse = ssk->sk_reuse;
else if (optname == SO_BINDTODEVICE)
- sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if;
+ sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
else if (optname == SO_BINDTOIFINDEX)
- sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if;
+ sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
}
release_sock(sk);
return ret;
@@ -390,20 +390,20 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
{
struct sock *sk = (struct sock *)msk;
int ret = -EOPNOTSUPP;
- struct socket *ssock;
+ struct sock *ssk;
switch (optname) {
case IPV6_V6ONLY:
case IPV6_TRANSPARENT:
case IPV6_FREEBIND:
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(ssock)) {
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
release_sock(sk);
- return PTR_ERR(ssock);
+ return PTR_ERR(ssk);
}
- ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
+ ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen);
if (ret != 0) {
release_sock(sk);
return ret;
@@ -413,13 +413,15 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
switch (optname) {
case IPV6_V6ONLY:
- sk->sk_ipv6only = ssock->sk->sk_ipv6only;
+ sk->sk_ipv6only = ssk->sk_ipv6only;
break;
case IPV6_TRANSPARENT:
- inet_sk(sk)->transparent = inet_sk(ssock->sk)->transparent;
+ inet_assign_bit(TRANSPARENT, sk,
+ inet_test_bit(TRANSPARENT, ssk));
break;
case IPV6_FREEBIND:
- inet_sk(sk)->freebind = inet_sk(ssock->sk)->freebind;
+ inet_assign_bit(FREEBIND, sk,
+ inet_test_bit(FREEBIND, ssk));
break;
}
@@ -684,8 +686,7 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o
sockptr_t optval, unsigned int optlen)
{
struct sock *sk = (struct sock *)msk;
- struct inet_sock *issk;
- struct socket *ssock;
+ struct sock *ssk;
int err;
err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
@@ -694,20 +695,19 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(ssock)) {
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
release_sock(sk);
- return PTR_ERR(ssock);
+ return PTR_ERR(ssk);
}
- issk = inet_sk(ssock->sk);
-
switch (optname) {
case IP_FREEBIND:
- issk->freebind = inet_sk(sk)->freebind;
+ inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
break;
case IP_TRANSPARENT:
- issk->transparent = inet_sk(sk)->transparent;
+ inet_assign_bit(TRANSPARENT, ssk,
+ inet_test_bit(TRANSPARENT, sk));
break;
default:
release_sock(sk);
@@ -763,18 +763,18 @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
sockptr_t optval, unsigned int optlen)
{
struct sock *sk = (struct sock *)msk;
- struct socket *sock;
+ struct sock *ssk;
int ret;
/* Limit to first subflow, before the connection establishment */
lock_sock(sk);
- sock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(sock)) {
- ret = PTR_ERR(sock);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ ret = PTR_ERR(ssk);
goto unlock;
}
- ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen);
+ ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
unlock:
release_sock(sk);
@@ -864,9 +864,8 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
char __user *optval, int __user *optlen)
{
struct sock *sk = (struct sock *)msk;
- struct socket *ssock;
- int ret;
struct sock *ssk;
+ int ret;
lock_sock(sk);
ssk = msk->first;
@@ -875,13 +874,13 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
goto out;
}
- ssock = __mptcp_nmpc_socket(msk);
- if (IS_ERR(ssock)) {
- ret = PTR_ERR(ssock);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ ret = PTR_ERR(ssk);
goto out;
}
- ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen);
+ ret = tcp_getsockopt(ssk, level, optname, optval, optlen);
out:
release_sock(sk);
@@ -1441,8 +1440,8 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
__tcp_sock_set_cork(ssk, !!msk->cork);
__tcp_sock_set_nodelay(ssk, !!msk->nodelay);
- inet_sk(ssk)->transparent = inet_sk(sk)->transparent;
- inet_sk(ssk)->freebind = inet_sk(sk)->freebind;
+ inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
+ inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
}
static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 9ee3b7abbaf6..9bf3c7bc1762 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
const struct sock *sk = subflow->conn;
*space = __mptcp_space(sk);
- *full_space = tcp_full_space(sk);
+ *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}
void __mptcp_error_report(struct sock *sk)
@@ -1793,16 +1793,31 @@ static void subflow_state_change(struct sock *sk)
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
{
struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
- struct mptcp_sock *msk, *next, *head = NULL;
- struct request_sock *req;
- struct sock *sk;
+ struct request_sock *req, *head, *tail;
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk, *ssk;
- /* build a list of all unaccepted mptcp sockets */
+ /* Due to lock dependencies no relevant lock can be acquired under rskq_lock.
+ * Splice the req list, so that accept() can not reach the pending ssk after
+ * the listener socket is released below.
+ */
spin_lock_bh(&queue->rskq_lock);
- for (req = queue->rskq_accept_head; req; req = req->dl_next) {
- struct mptcp_subflow_context *subflow;
- struct sock *ssk = req->sk;
+ head = queue->rskq_accept_head;
+ tail = queue->rskq_accept_tail;
+ queue->rskq_accept_head = NULL;
+ queue->rskq_accept_tail = NULL;
+ spin_unlock_bh(&queue->rskq_lock);
+
+ if (!head)
+ return;
+ /* can't acquire the msk socket lock under the subflow one,
+ * or will cause ABBA deadlock
+ */
+ release_sock(listener_ssk);
+
+ for (req = head; req; req = req->dl_next) {
+ ssk = req->sk;
if (!sk_is_mptcp(ssk))
continue;
@@ -1810,32 +1825,10 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s
if (!subflow || !subflow->conn)
continue;
- /* skip if already in list */
sk = subflow->conn;
- msk = mptcp_sk(sk);
- if (msk->dl_next || msk == head)
- continue;
-
sock_hold(sk);
- msk->dl_next = head;
- head = msk;
- }
- spin_unlock_bh(&queue->rskq_lock);
- if (!head)
- return;
-
- /* can't acquire the msk socket lock under the subflow one,
- * or will cause ABBA deadlock
- */
- release_sock(listener_ssk);
-
- for (msk = head; msk; msk = next) {
- sk = (struct sock *)msk;
lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
- next = msk->dl_next;
- msk->dl_next = NULL;
-
__mptcp_unaccepted_force_close(sk);
release_sock(sk);
@@ -1859,6 +1852,13 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s
/* we are still under the listener msk socket lock */
lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+
+ /* restore the listener queue, to let the TCP code clean it up */
+ spin_lock_bh(&queue->rskq_lock);
+ WARN_ON_ONCE(queue->rskq_accept_head);
+ queue->rskq_accept_head = head;
+ queue->rskq_accept_tail = tail;
+ spin_unlock_bh(&queue->rskq_lock);
}
static int subflow_ulp_init(struct sock *sk)