Diffstat (limited to 'net/mptcp')
-rw-r--r--  net/mptcp/Kconfig        |  24
-rw-r--r--  net/mptcp/Makefile       |   7
-rw-r--r--  net/mptcp/crypto.c       |  63
-rw-r--r--  net/mptcp/crypto_test.c  |  72
-rw-r--r--  net/mptcp/ctrl.c         |   1
-rw-r--r--  net/mptcp/mptcp_diag.c   | 169
-rw-r--r--  net/mptcp/options.c      |  71
-rw-r--r--  net/mptcp/pm.c           |  46
-rw-r--r--  net/mptcp/pm_netlink.c   |   2
-rw-r--r--  net/mptcp/protocol.c     | 853
-rw-r--r--  net/mptcp/protocol.h     | 130
-rw-r--r--  net/mptcp/subflow.c      | 341
-rw-r--r--  net/mptcp/syncookies.c   | 130
-rw-r--r--  net/mptcp/token.c        | 373
-rw-r--r--  net/mptcp/token_test.c   | 140
15 files changed, 1803 insertions, 619 deletions
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
index a9ed3bf1d93f..698bc3525160 100644
--- a/net/mptcp/Kconfig
+++ b/net/mptcp/Kconfig
@@ -13,17 +13,29 @@ config MPTCP
if MPTCP
+config INET_MPTCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+
config MPTCP_IPV6
bool "MPTCP: IPv6 support for Multipath TCP"
select IPV6
default y
-config MPTCP_HMAC_TEST
- bool "Tests for MPTCP HMAC implementation"
+endif
+
+config MPTCP_KUNIT_TESTS
+ tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS
+ select MPTCP
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
help
- This option enable boot time self-test for the HMAC implementation
- used by the MPTCP code
+ Currently covers the MPTCP crypto and token helpers.
+	  Only useful for kernel devs running the KUnit test harness; these
+	  tests are not intended for inclusion in a production build.
- Say N if you are unsure.
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
-endif
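
The new MPTCP_KUNIT_TESTS symbol wires the MPTCP crypto and token suites into the standard KUnit flow. As a minimal sketch (not part of the patch itself), a .kunitconfig fragment along these lines should be enough to build and run them with the KUnit wrapper script:

    # .kunitconfig sketch: MPTCP symbol names as added above,
    # CONFIG_KUNIT comes from the KUnit core
    CONFIG_KUNIT=y
    CONFIG_MPTCP=y
    CONFIG_MPTCP_KUNIT_TESTS=y

With such a fragment in place, running ./tools/testing/kunit/kunit.py run from the source tree builds a UML kernel and reports the mptcp-crypto suite added below, plus the token suite from token_test.c.
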
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index baa0640527c7..a611968be4d7 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -3,3 +3,10 @@ obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
mib.o pm_netlink.o
+
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
+
+mptcp_crypto_test-objs := crypto_test.o
+mptcp_token_test-objs := token_test.o
+obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index 82bd2b54d741..05d398d3fde4 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -78,65 +78,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac);
}
-#ifdef CONFIG_MPTCP_HMAC_TEST
-struct test_cast {
- char *key;
- char *msg;
- char *result;
-};
-
-/* we can't reuse RFC 4231 test vectors, as we have constraint on the
- * input and key size.
- */
-static struct test_cast tests[] = {
- {
- .key = "0b0b0b0b0b0b0b0b",
- .msg = "48692054",
- .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa",
- },
- {
- .key = "aaaaaaaaaaaaaaaa",
- .msg = "dddddddd",
- .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9",
- },
- {
- .key = "0102030405060708",
- .msg = "cdcdcdcd",
- .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d",
- },
-};
-
-static int __init test_mptcp_crypto(void)
-{
- char hmac[32], hmac_hex[65];
- u32 nonce1, nonce2;
- u64 key1, key2;
- u8 msg[8];
- int i, j;
-
- for (i = 0; i < ARRAY_SIZE(tests); ++i) {
- /* mptcp hmap will convert to be before computing the hmac */
- key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0]));
- key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8]));
- nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
- nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
-
- put_unaligned_be32(nonce1, &msg[0]);
- put_unaligned_be32(nonce2, &msg[4]);
-
- mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
- for (j = 0; j < 32; ++j)
- sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
- hmac_hex[64] = 0;
-
- if (memcmp(hmac_hex, tests[i].result, 64))
- pr_err("test %d failed, got %s expected %s", i,
- hmac_hex, tests[i].result);
- else
- pr_info("test %d [ ok ]", i);
- }
- return 0;
-}
-
-late_initcall(test_mptcp_crypto);
+#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS)
+EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha);
#endif
diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c
new file mode 100644
index 000000000000..017248dea038
--- /dev/null
+++ b/net/mptcp/crypto_test.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "protocol.h"
+
+struct test_case {
+ char *key;
+ char *msg;
+ char *result;
+};
+
+/* we can't reuse RFC 4231 test vectors, as we have constraints on the
+ * input and key size.
+ */
+static struct test_case tests[] = {
+ {
+ .key = "0b0b0b0b0b0b0b0b",
+ .msg = "48692054",
+ .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa",
+ },
+ {
+ .key = "aaaaaaaaaaaaaaaa",
+ .msg = "dddddddd",
+ .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9",
+ },
+ {
+ .key = "0102030405060708",
+ .msg = "cdcdcdcd",
+ .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d",
+ },
+};
+
+static void mptcp_crypto_test_basic(struct kunit *test)
+{
+ char hmac[32], hmac_hex[65];
+ u32 nonce1, nonce2;
+ u64 key1, key2;
+ u8 msg[8];
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+		/* the mptcp code converts key and nonce to big endian before hashing */
+ key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0]));
+ key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8]));
+ nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
+ nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
+
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
+ for (j = 0; j < 32; ++j)
+ sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
+ hmac_hex[64] = 0;
+
+ KUNIT_EXPECT_STREQ(test, &hmac_hex[0], tests[i].result);
+ }
+}
+
+static struct kunit_case mptcp_crypto_test_cases[] = {
+ KUNIT_CASE(mptcp_crypto_test_basic),
+ {}
+};
+
+static struct kunit_suite mptcp_crypto_suite = {
+ .name = "mptcp-crypto",
+ .test_cases = mptcp_crypto_test_cases,
+};
+
+kunit_test_suite(mptcp_crypto_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 8e39585d37f3..54b888f94009 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -112,6 +112,7 @@ static struct pernet_operations mptcp_pernet_ops = {
void __init mptcp_init(void)
{
+ mptcp_join_cookie_init();
mptcp_proto_init();
if (register_pernet_subsys(&mptcp_pernet_ops) < 0)
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
new file mode 100644
index 000000000000..5f390a97f556
--- /dev/null
+++ b/net/mptcp/mptcp_diag.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP socket monitoring support
+ *
+ * Copyright (c) 2020 Red Hat
+ *
+ * Author: Paolo Abeni <pabeni@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet_diag.h>
+#include <net/netlink.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ struct nlattr *bc, bool net_admin)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
+ net_admin);
+}
+
+static int mptcp_diag_dump_one(struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sk_buff *in_skb = cb->skb;
+ struct mptcp_sock *msk = NULL;
+ struct sk_buff *rep;
+ int err = -ENOENT;
+ struct net *net;
+ struct sock *sk;
+
+ net = sock_net(in_skb->sk);
+ msk = mptcp_token_get_sock(req->id.idiag_cookie[0]);
+ if (!msk)
+ goto out_nosk;
+
+ err = -ENOMEM;
+ sk = (struct sock *)msk;
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct mptcp_info)) +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(rep);
+ goto out;
+ }
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+out:
+ sock_put(sk);
+
+out_nosk:
+ return err;
+}
+
+static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct net *net = sock_net(skb->sk);
+ struct inet_diag_dump_data *cb_data;
+ struct mptcp_sock *msk;
+ struct nlattr *bc;
+
+ cb_data = cb->data;
+ bc = cb_data->inet_diag_nla_bc;
+
+ while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) !=
+ NULL) {
+ struct inet_sock *inet = (struct inet_sock *)msk;
+ struct sock *sk = (struct sock *)msk;
+ int ret = 0;
+
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+
+ ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+next:
+ sock_put(sk);
+ if (ret < 0) {
+ /* will retry on the same position */
+ cb->args[1]--;
+ break;
+ }
+ cond_resched();
+ }
+}
+
+static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_info *info = _info;
+ u32 flags = 0;
+ bool slow;
+ u8 val;
+
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+ if (!info)
+ return;
+
+ slow = lock_sock_fast(sk);
+ info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
+ info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
+ info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
+ info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
+ val = READ_ONCE(msk->pm.add_addr_signal_max);
+ info->mptcpi_add_addr_signal_max = val;
+ val = READ_ONCE(msk->pm.add_addr_accept_max);
+ info->mptcpi_add_addr_accepted_max = val;
+ if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
+ flags |= MPTCP_INFO_FLAG_FALLBACK;
+ if (READ_ONCE(msk->can_ack))
+ flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
+ info->mptcpi_flags = flags;
+ info->mptcpi_token = READ_ONCE(msk->token);
+ info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
+ info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
+ info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
+ unlock_sock_fast(sk, slow);
+}
+
+static const struct inet_diag_handler mptcp_diag_handler = {
+ .dump = mptcp_diag_dump,
+ .dump_one = mptcp_diag_dump_one,
+ .idiag_get_info = mptcp_diag_get_info,
+ .idiag_type = IPPROTO_MPTCP,
+ .idiag_info_size = sizeof(struct mptcp_info),
+};
+
+static int __init mptcp_diag_init(void)
+{
+ return inet_diag_register(&mptcp_diag_handler);
+}
+
+static void __exit mptcp_diag_exit(void)
+{
+ inet_diag_unregister(&mptcp_diag_handler);
+}
+
+module_init(mptcp_diag_init);
+module_exit(mptcp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);
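
mptcp_diag.c plugs MPTCP sockets into the generic sock_diag/inet_diag netlink machinery: dumps walk the token hash via mptcp_token_iter_next(), and each reply carries an inet_diag_msg plus a struct mptcp_info filled by mptcp_diag_get_info(). A minimal userspace sketch of a dump request against this handler follows (not part of the patch; the IPPROTO_MPTCP fallback define is an assumption for libcs lacking it, matching the 262 in the MODULE_ALIAS above):

    /* Sketch, not part of the patch: dump all AF_INET MPTCP sockets
     * through the inet_diag handler registered above.
     */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/sock_diag.h>
    #include <linux/inet_diag.h>

    #ifndef IPPROTO_MPTCP
    #define IPPROTO_MPTCP 262 /* assumption: matches the MODULE_ALIAS value */
    #endif

    int main(void)
    {
        struct {
            struct nlmsghdr nlh;
            struct inet_diag_req_v2 req;
        } msg = {
            .nlh = {
                .nlmsg_len   = sizeof(msg),
                .nlmsg_type  = SOCK_DIAG_BY_FAMILY,
                .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
            },
            .req = {
                .sdiag_family   = AF_INET,
                .sdiag_protocol = IPPROTO_MPTCP,
                .idiag_ext      = 1 << (INET_DIAG_INFO - 1), /* ask for mptcp_info */
                .idiag_states   = ~0U,                       /* all states */
            },
        };
        char buf[8192];
        ssize_t len;
        int fd;

        fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
        if (fd < 0 || send(fd, &msg, sizeof(msg), 0) < 0)
            return 1;

        /* each reply carries one inet_diag_msg per MPTCP socket, plus the
         * INET_DIAG_INFO attribute filled by mptcp_diag_get_info()
         */
        while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
            struct nlmsghdr *h = (struct nlmsghdr *)buf;

            for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
                if (h->nlmsg_type == NLMSG_DONE || h->nlmsg_type == NLMSG_ERROR) {
                    close(fd);
                    return 0;
                }
                printf("diag message, %u bytes\n", h->nlmsg_len);
            }
        }
        close(fd);
        return 0;
    }

The ss tool from recent iproute2 releases builds its MPTCP listing mode on the same kind of request; the sketch only illustrates the wire format this handler answers.
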
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 8f940be42f98..7fa822b55c34 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -451,6 +451,8 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
struct sk_buff *skb, struct mptcp_ext *ext)
{
+ u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq);
+
if (!ext->use_map || !skb->len) {
/* RFC6824 requires a DSS mapping with specific values
* if DATA_FIN is set but no data payload is mapped
@@ -458,10 +460,13 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
ext->data_fin = 1;
ext->use_map = 1;
ext->dsn64 = 1;
- ext->data_seq = subflow->data_fin_tx_seq;
+ /* The write_seq value has already been incremented, so
+ * the actual sequence number for the DATA_FIN is one less.
+ */
+ ext->data_seq = data_fin_tx_seq - 1;
ext->subflow_seq = 0;
ext->data_len = 1;
- } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) {
+ } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) {
/* If there's an existing DSS mapping and it is the
* final mapping, DATA_FIN consumes 1 additional byte of
* mapping space.
@@ -477,22 +482,17 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
unsigned int dss_size = 0;
+ u64 snd_data_fin_enable;
struct mptcp_ext *mpext;
- struct mptcp_sock *msk;
unsigned int ack_size;
bool ret = false;
- u8 tcp_fin;
- if (skb) {
- mpext = mptcp_get_ext(skb);
- tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
- } else {
- mpext = NULL;
- tcp_fin = 0;
- }
+ mpext = skb ? mptcp_get_ext(skb) : NULL;
+ snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable);
- if (!skb || (mpext && mpext->use_map) || tcp_fin) {
+ if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
unsigned int map_size;
map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
@@ -502,7 +502,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
if (mpext)
opts->ext_copy = *mpext;
- if (skb && tcp_fin && subflow->data_fin_tx_enable)
+ if (skb && snd_data_fin_enable)
mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
ret = true;
}
@@ -511,7 +511,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
* if the first subflow may have the already the remote key handy
*/
opts->ext_copy.use_ack = 0;
- msk = mptcp_sk(subflow->conn);
if (!READ_ONCE(msk->can_ack)) {
*size = ALIGN(dss_size, 4);
return ret;
@@ -624,6 +623,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
opts->suboptions = 0;
+ if (unlikely(mptcp_check_fallback(sk)))
+ return false;
+
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -706,6 +708,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
* additional ack.
*/
subflow->fully_established = 1;
+ WRITE_ONCE(msk->fully_established, true);
goto fully_established;
}
@@ -714,15 +717,14 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
*/
if (!mp_opt->mp_capable) {
subflow->mp_capable = 0;
- tcp_sk(sk)->is_mptcp = 0;
+ pr_fallback(msk);
+ __mptcp_do_fallback(msk);
return false;
}
if (unlikely(!READ_ONCE(msk->pm.server_side)))
pr_warn_once("bogus mpc option on established client sk");
- subflow->fully_established = 1;
- subflow->remote_key = mp_opt->sndr_key;
- subflow->can_ack = 1;
+ mptcp_subflow_fully_established(subflow, mp_opt);
fully_established:
if (likely(subflow->pm_notified))
@@ -780,6 +782,22 @@ static void update_una(struct mptcp_sock *msk,
}
}
+bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq)
+{
+ /* Skip if DATA_FIN was already received.
+ * If updating simultaneously with the recvmsg loop, values
+ * should match. If they mismatch, the peer is misbehaving and
+ * we will prefer the most recent information.
+ */
+ if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first))
+ return false;
+
+ WRITE_ONCE(msk->rcv_data_fin_seq, data_fin_seq);
+ WRITE_ONCE(msk->rcv_data_fin, 1);
+
+ return true;
+}
+
static bool add_addr_hmac_valid(struct mptcp_sock *msk,
struct mptcp_options_received *mp_opt)
{
@@ -814,6 +832,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
struct mptcp_options_received mp_opt;
struct mptcp_ext *mpext;
+ if (__mptcp_check_fallback(msk))
+ return;
+
mptcp_get_options(skb, &mp_opt);
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
return;
@@ -847,6 +868,20 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
if (mp_opt.use_ack)
update_una(msk, &mp_opt);
+ /* Zero-data-length packets are dropped by the caller and not
+ * propagated to the MPTCP layer, so the skb extension does not
+ * need to be allocated or populated. DATA_FIN information, if
+ * present, needs to be updated here before the skb is freed.
+ */
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ if (mp_opt.data_fin && mp_opt.data_len == 1 &&
+ mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) &&
+ schedule_work(&msk->work))
+ sock_hold(subflow->conn);
+
+ return;
+ }
+
mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext)
return;
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 977d9c8b1453..a8ad20559aaa 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -10,8 +10,6 @@
#include <net/mptcp.h>
#include "protocol.h"
-static struct workqueue_struct *pm_wq;
-
/* path manager command handlers */
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
@@ -78,7 +76,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
return false;
msk->pm.status |= BIT(new_status);
- if (queue_work(pm_wq, &msk->pm.work))
+ if (schedule_work(&msk->work))
sock_hold((struct sock *)msk);
return true;
}
@@ -181,35 +179,6 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return mptcp_pm_nl_get_local_id(msk, skc);
}
-static void pm_worker(struct work_struct *work)
-{
- struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data,
- work);
- struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm);
- struct sock *sk = (struct sock *)msk;
-
- lock_sock(sk);
- spin_lock_bh(&msk->pm.lock);
-
- pr_debug("msk=%p status=%x", msk, pm->status);
- if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
- pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
- mptcp_pm_nl_add_addr_received(msk);
- }
- if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
- pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
- mptcp_pm_nl_fully_established(msk);
- }
- if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
- pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
- mptcp_pm_nl_subflow_established(msk);
- }
-
- spin_unlock_bh(&msk->pm.lock);
- release_sock(sk);
- sock_put(sk);
-}
-
void mptcp_pm_data_init(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
@@ -223,22 +192,11 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
msk->pm.status = 0;
spin_lock_init(&msk->pm.lock);
- INIT_WORK(&msk->pm.work, pm_worker);
mptcp_pm_nl_data_init(msk);
}
-void mptcp_pm_close(struct mptcp_sock *msk)
-{
- if (cancel_work_sync(&msk->pm.work))
- sock_put((struct sock *)msk);
-}
-
-void mptcp_pm_init(void)
+void __init mptcp_pm_init(void)
{
- pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
- if (!pm_wq)
- panic("Failed to allocate workqueue");
-
mptcp_pm_nl_init();
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index b78edf237ba0..c8820c4156e6 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -851,7 +851,7 @@ static struct pernet_operations mptcp_pm_pernet_ops = {
.size = sizeof(struct pm_nl_pernet),
};
-void mptcp_pm_nl_init(void)
+void __init mptcp_pm_nl_init(void)
{
if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
panic("Failed to register MPTCP PM pernet subsystem.\n");
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c0abe738e7d3..8c1d1a595701 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -16,6 +16,7 @@
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
+#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
@@ -52,18 +53,10 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
return msk->subflow;
}
-static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
-{
- return msk->first && !sk_is_mptcp(msk->first);
-}
-
-static struct socket *mptcp_is_tcpsk(struct sock *sk)
+static bool mptcp_is_tcpsk(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
- if (sock->sk != sk)
- return NULL;
-
if (unlikely(sk->sk_prot == &tcp_prot)) {
/* we are being invoked after mptcp_accept() has
* accepted a non-mp-capable flow: sk is a tcp_sk,
@@ -73,59 +66,37 @@ static struct socket *mptcp_is_tcpsk(struct sock *sk)
* bypass mptcp.
*/
sock->ops = &inet_stream_ops;
- return sock;
+ return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
sock->ops = &inet6_stream_ops;
- return sock;
+ return true;
#endif
}
- return NULL;
+ return false;
}
-static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
+static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
- struct socket *sock;
-
sock_owned_by_me((const struct sock *)msk);
- sock = mptcp_is_tcpsk((struct sock *)msk);
- if (unlikely(sock))
- return sock;
-
- if (likely(!__mptcp_needs_tcp_fallback(msk)))
+ if (likely(!__mptcp_check_fallback(msk)))
return NULL;
- return msk->subflow;
-}
-
-static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
-{
- return !msk->first;
+ return msk->first;
}
-static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
+static int __mptcp_socket_create(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
struct socket *ssock;
int err;
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock))
- return ssock;
-
- ssock = __mptcp_nmpc_socket(msk);
- if (ssock)
- goto set_state;
-
- if (!__mptcp_can_create_subflow(msk))
- return ERR_PTR(-EINVAL);
-
err = mptcp_subflow_create_socket(sk, &ssock);
if (err)
- return ERR_PTR(err);
+ return err;
msk->first = ssock->sk;
msk->subflow = ssock;
@@ -133,10 +104,12 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
list_add(&subflow->node, &msk->conn_list);
subflow->request_mptcp = 1;
-set_state:
- if (state != MPTCP_SAME_STATE)
- inet_sk_state_store(sk, state);
- return ssock;
+	/* accept() will wait on the first subflow sk_wq, and we always wake up
+ * via msk->sk_socket
+ */
+ RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);
+
+ return 0;
}
static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@@ -170,6 +143,14 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
MPTCP_SKB_CB(skb)->offset = offset;
}
+static void mptcp_stop_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
+}
+
/* both sockets must be locked */
static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
struct sock *ssk)
@@ -191,6 +172,139 @@ static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
return mptcp_subflow_data_available(ssk);
}
+static void mptcp_check_data_fin_ack(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (__mptcp_check_fallback(msk))
+ return;
+
+ /* Look for an acknowledged DATA_FIN */
+ if (((1 << sk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
+ msk->write_seq == atomic64_read(&msk->snd_una)) {
+ mptcp_stop_timer(sk);
+
+ WRITE_ONCE(msk->snd_data_fin_enable, 0);
+
+ switch (sk->sk_state) {
+ case TCP_FIN_WAIT1:
+ inet_sk_state_store(sk, TCP_FIN_WAIT2);
+ sk->sk_state_change(sk);
+ break;
+ case TCP_CLOSING:
+ fallthrough;
+ case TCP_LAST_ACK:
+ inet_sk_state_store(sk, TCP_CLOSE);
+ sk->sk_state_change(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ }
+}
+
+static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (READ_ONCE(msk->rcv_data_fin) &&
+ ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
+ u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);
+
+ if (msk->ack_seq == rcv_data_fin_seq) {
+ if (seq)
+ *seq = rcv_data_fin_seq;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
+{
+ long tout = ssk && inet_csk(ssk)->icsk_pending ?
+ inet_csk(ssk)->icsk_timeout - jiffies : 0;
+
+ if (tout <= 0)
+ tout = mptcp_sk(sk)->timer_ival;
+ mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
+}
+
+static void mptcp_check_data_fin(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ u64 rcv_data_fin_seq;
+
+ if (__mptcp_check_fallback(msk) || !msk->first)
+ return;
+
+ /* Need to ack a DATA_FIN received from a peer while this side
+ * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
+ * msk->rcv_data_fin was set when parsing the incoming options
+ * at the subflow level and the msk lock was not held, so this
+ * is the first opportunity to act on the DATA_FIN and change
+ * the msk state.
+ *
+ * If we are caught up to the sequence number of the incoming
+ * DATA_FIN, send the DATA_ACK now and do state transition. If
+ * not caught up, do nothing and let the recv code send DATA_ACK
+ * when catching up.
+ */
+
+ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
+ struct mptcp_subflow_context *subflow;
+
+ msk->ack_seq++;
+ WRITE_ONCE(msk->rcv_data_fin, 0);
+
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+
+ switch (sk->sk_state) {
+ case TCP_ESTABLISHED:
+ inet_sk_state_store(sk, TCP_CLOSE_WAIT);
+ break;
+ case TCP_FIN_WAIT1:
+ inet_sk_state_store(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ inet_sk_state_store(sk, TCP_CLOSE);
+ // @@ Close subflows now?
+ break;
+ default:
+ /* Other states not expected */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ mptcp_set_timeout(sk, NULL);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+ tcp_send_ack(ssk);
+ release_sock(ssk);
+ }
+
+ sk->sk_state_change(sk);
+
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ }
+}
+
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct sock *ssk,
unsigned int *bytes)
@@ -207,13 +321,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
return false;
}
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
-
- if (rcvbuf > sk->sk_rcvbuf)
- sk->sk_rcvbuf = rcvbuf;
- }
-
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
@@ -229,6 +336,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
if (!skb)
break;
+ if (__mptcp_check_fallback(msk)) {
+ /* if we are running under the workqueue, TCP could have
+ * collapsed skbs between dummy map creation and now
+ * be sure to adjust the size
+ */
+ map_remaining = skb->len;
+ subflow->map_data_len = skb->len;
+ }
+
offset = seq - TCP_SKB_CB(skb)->seq;
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (fin) {
@@ -265,6 +381,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
*bytes = moved;
+ /* If the moves have caught up with the DATA_FIN sequence number
+ * it's time to ack the DATA_FIN and change socket state, but
+ * this is not a good place to change state. Let the workqueue
+ * do it.
+ */
+ if (mptcp_pending_data_fin(sk, NULL) &&
+ schedule_work(&msk->work))
+ sock_hold(sk);
+
return done;
}
@@ -329,16 +454,6 @@ static void __mptcp_flush_join_list(struct mptcp_sock *msk)
spin_unlock_bh(&msk->join_list_lock);
}
-static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
-{
- long tout = ssk && inet_csk(ssk)->icsk_pending ?
- inet_csk(ssk)->icsk_timeout - jiffies : 0;
-
- if (tout <= 0)
- tout = mptcp_sk(sk)->timer_ival;
- mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
-}
-
static bool mptcp_timer_pending(struct sock *sk)
{
return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
@@ -360,7 +475,8 @@ void mptcp_data_acked(struct sock *sk)
{
mptcp_reset_timer(sk);
- if (!sk_stream_is_writeable(sk) &&
+ if ((!sk_stream_is_writeable(sk) ||
+ (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
schedule_work(&mptcp_sk(sk)->work))
sock_hold(sk);
}
@@ -395,14 +511,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
}
}
-static void mptcp_stop_timer(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
- mptcp_sk(sk)->timer_ival = 0;
-}
-
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
const struct sock *sk = (const struct sock *)msk;
@@ -466,8 +574,15 @@ static void mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
- u64 snd_una = atomic64_read(&msk->snd_una);
bool cleaned = false;
+ u64 snd_una;
+
+ /* on fallback we just need to ignore snd_una, as this is really
+ * plain TCP
+ */
+ if (__mptcp_check_fallback(msk))
+ atomic64_set(&msk->snd_una, msk->write_seq);
+ snd_una = atomic64_read(&msk->snd_una);
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
@@ -479,15 +594,20 @@ static void mptcp_clean_una(struct sock *sk)
dfrag = mptcp_rtx_head(sk);
if (dfrag && after64(snd_una, dfrag->data_seq)) {
- u64 delta = dfrag->data_seq + dfrag->data_len - snd_una;
+ u64 delta = snd_una - dfrag->data_seq;
+
+ if (WARN_ON_ONCE(delta > dfrag->data_len))
+ goto out;
dfrag->data_seq += delta;
+ dfrag->offset += delta;
dfrag->data_len -= delta;
dfrag_uncharge(sk, delta);
cleaned = true;
}
+out:
if (cleaned) {
sk_mem_reclaim_partial(sk);
@@ -673,7 +793,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
out:
if (!retransmission)
pfrag->offset += frag_truesize;
- *write_seq += ret;
+ WRITE_ONCE(*write_seq, *write_seq + ret);
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
return ret;
@@ -740,7 +860,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
- struct socket *ssock;
size_t copied = 0;
struct sock *ssk;
bool tx_ok;
@@ -759,19 +878,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto out;
}
-fallback:
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock)) {
- release_sock(sk);
- pr_debug("fallback passthrough");
- ret = sock_sendmsg(ssock, msg);
- return ret >= 0 ? ret + copied : (copied ? copied : ret);
- }
-
pfrag = sk_page_frag(sk);
restart:
mptcp_clean_una(sk);
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
+ ret = -EPIPE;
+ goto out;
+ }
+
wait_for_sndbuf:
__mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
@@ -819,17 +934,6 @@ wait_for_sndbuf:
}
break;
}
- if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
- /* Can happen for passive sockets:
- * 3WHS negotiated MPTCP, but first packet after is
- * plain TCP (e.g. due to middlebox filtering unknown
- * options).
- *
- * Fall back to TCP.
- */
- release_sock(ssk);
- goto fallback;
- }
copied += ret;
@@ -880,7 +984,6 @@ wait_for_sndbuf:
mptcp_set_timeout(sk, ssk);
if (copied) {
- ret = copied;
tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
size_goal);
@@ -893,7 +996,7 @@ wait_for_sndbuf:
release_sock(ssk);
out:
release_sock(sk);
- return ret;
+ return copied ? : ret;
}
static void mptcp_wait_data(struct sock *sk, long *timeo)
@@ -949,6 +1052,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
return copied;
}
+/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
+ *
+ * Only difference: Use highest rtt estimate of the subflows in use.
+ */
+static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ u32 time, advmss = 1;
+ u64 rtt_us, mstamp;
+
+ sock_owned_by_me(sk);
+
+ if (copied <= 0)
+ return;
+
+ msk->rcvq_space.copied += copied;
+
+ mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
+ time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
+
+ rtt_us = msk->rcvq_space.rtt_us;
+ if (rtt_us && time < (rtt_us >> 3))
+ return;
+
+ rtt_us = 0;
+ mptcp_for_each_subflow(msk, subflow) {
+ const struct tcp_sock *tp;
+ u64 sf_rtt_us;
+ u32 sf_advmss;
+
+ tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
+
+ sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
+ sf_advmss = READ_ONCE(tp->advmss);
+
+ rtt_us = max(sf_rtt_us, rtt_us);
+ advmss = max(sf_advmss, advmss);
+ }
+
+ msk->rcvq_space.rtt_us = rtt_us;
+ if (time < (rtt_us >> 3) || rtt_us == 0)
+ return;
+
+ if (msk->rcvq_space.copied <= msk->rcvq_space.space)
+ goto new_measure;
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvmem, rcvbuf;
+ u64 rcvwin, grow;
+
+ rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
+
+ grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
+
+ do_div(grow, msk->rcvq_space.space);
+ rcvwin += (grow << 1);
+
+ rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(sk, rcvmem) < advmss)
+ rcvmem += 128;
+
+ do_div(rcvwin, advmss);
+ rcvbuf = min_t(u64, rcvwin * rcvmem,
+ sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+
+ if (rcvbuf > sk->sk_rcvbuf) {
+ u32 window_clamp;
+
+ window_clamp = tcp_win_from_space(sk, rcvbuf);
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+
+ /* Make subflows follow along. If we do not do this, we
+ * get drops at subflow level if skbs can't be moved to
+ * the mptcp rx queue fast enough (announced rcv_win can
+ * exceed ssk->sk_rcvbuf).
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
+ tcp_sk(ssk)->window_clamp = window_clamp;
+ }
+ }
+ }
+
+ msk->rcvq_space.space = msk->rcvq_space.copied;
+new_measure:
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.time = mstamp;
+}
+
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
unsigned int moved = 0;
@@ -972,7 +1169,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
int copied = 0;
int target;
long timeo;
@@ -981,16 +1177,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return -EOPNOTSUPP;
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock)) {
-fallback:
- release_sock(sk);
- pr_debug("fallback-read subflow=%p",
- mptcp_subflow_ctx(ssock->sk));
- copied = sock_recvmsg(ssock, msg, flags);
- return copied;
- }
-
timeo = sock_rcvtimeo(sk, nonblock);
len = min_t(size_t, len, INT_MAX);
@@ -1056,9 +1242,6 @@ fallback:
pr_debug("block timeout %ld", timeo);
mptcp_wait_data(sk, &timeo);
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock))
- goto fallback;
}
if (skb_queue_empty(&sk->sk_receive_queue)) {
@@ -1075,6 +1258,8 @@ fallback:
set_bit(MPTCP_DATA_READY, &msk->flags);
}
out_err:
+ mptcp_rcv_space_adjust(msk, copied);
+
release_sock(sk);
return copied;
}
@@ -1083,7 +1268,7 @@ static void mptcp_retransmit_handler(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (atomic64_read(&msk->snd_una) == msk->write_seq) {
+ if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) {
mptcp_stop_timer(sk);
} else {
set_bit(MPTCP_WORK_RTX, &msk->flags);
@@ -1172,6 +1357,29 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
return 0;
}
+static void pm_work(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ pr_debug("msk=%p status=%x", msk, pm->status);
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
+ mptcp_pm_nl_add_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
+ mptcp_pm_nl_fully_established(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
+ mptcp_pm_nl_subflow_established(msk);
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -1185,12 +1393,18 @@ static void mptcp_worker(struct work_struct *work)
lock_sock(sk);
mptcp_clean_una(sk);
+ mptcp_check_data_fin_ack(sk);
__mptcp_flush_join_list(msk);
__mptcp_move_skbs(msk);
+ if (msk->pm.status)
+ pm_work(msk);
+
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
+ mptcp_check_data_fin(sk);
+
if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
goto unlock;
@@ -1283,7 +1497,12 @@ static int mptcp_init_sock(struct sock *sk)
if (ret)
return ret;
+ ret = __mptcp_socket_create(mptcp_sk(sk));
+ if (ret)
+ return ret;
+
sk_sockets_allocated_inc(sk);
+ sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
return 0;
@@ -1308,8 +1527,7 @@ static void mptcp_cancel_work(struct sock *sk)
sock_put(sk);
}
-static void mptcp_subflow_shutdown(struct sock *ssk, int how,
- bool data_fin_tx_enable, u64 data_fin_tx_seq)
+static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{
lock_sock(ssk);
@@ -1322,36 +1540,84 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how,
tcp_disconnect(ssk, O_NONBLOCK);
break;
default:
- if (data_fin_tx_enable) {
- struct mptcp_subflow_context *subflow;
-
- subflow = mptcp_subflow_ctx(ssk);
- subflow->data_fin_tx_seq = data_fin_tx_seq;
- subflow->data_fin_tx_enable = 1;
+ if (__mptcp_check_fallback(mptcp_sk(sk))) {
+ pr_debug("Fallback");
+ ssk->sk_shutdown |= how;
+ tcp_shutdown(ssk, how);
+ } else {
+ pr_debug("Sending DATA_FIN on subflow %p", ssk);
+ mptcp_set_timeout(sk, ssk);
+ tcp_send_ack(ssk);
}
-
- ssk->sk_shutdown |= how;
- tcp_shutdown(ssk, how);
break;
}
- /* Wake up anyone sleeping in poll. */
- ssk->sk_state_change(ssk);
release_sock(ssk);
}
-/* Called with msk lock held, releases such lock before returning */
+static const unsigned char new_state[16] = {
+ /* current state: new state: action: */
+ [0 /* (Invalid) */] = TCP_CLOSE,
+ [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_SYN_SENT] = TCP_CLOSE,
+ [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
+ [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
+ [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */
+ [TCP_CLOSE] = TCP_CLOSE,
+ [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
+ [TCP_LAST_ACK] = TCP_LAST_ACK,
+ [TCP_LISTEN] = TCP_CLOSE,
+ [TCP_CLOSING] = TCP_CLOSING,
+ [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
+};
+
+static int mptcp_close_state(struct sock *sk)
+{
+ int next = (int)new_state[sk->sk_state];
+ int ns = next & TCP_STATE_MASK;
+
+ inet_sk_state_store(sk, ns);
+
+ return next & TCP_ACTION_FIN;
+}
+
static void mptcp_close(struct sock *sk, long timeout)
{
struct mptcp_subflow_context *subflow, *tmp;
struct mptcp_sock *msk = mptcp_sk(sk);
LIST_HEAD(conn_list);
- u64 data_fin_tx_seq;
lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ goto cleanup;
+ } else if (sk->sk_state == TCP_CLOSE) {
+ goto cleanup;
+ }
+
+ if (__mptcp_check_fallback(msk)) {
+ goto update_state;
+ } else if (mptcp_close_state(sk)) {
+ pr_debug("Sending DATA_FIN sk=%p", sk);
+ WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+ WRITE_ONCE(msk->snd_data_fin_enable, 1);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+ mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK);
+ }
+ }
+
+ sk_stream_wait_close(sk, timeout);
+update_state:
inet_sk_state_store(sk, TCP_CLOSE);
+cleanup:
/* be sure to always acquire the join list lock, to sync vs
* mptcp_finish_join().
*/
@@ -1360,22 +1626,16 @@ static void mptcp_close(struct sock *sk, long timeout)
spin_unlock_bh(&msk->join_list_lock);
list_splice_init(&msk->conn_list, &conn_list);
- data_fin_tx_seq = msk->write_seq;
-
__mptcp_clear_xmit(sk);
release_sock(sk);
list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- subflow->data_fin_tx_seq = data_fin_tx_seq;
- subflow->data_fin_tx_enable = 1;
__mptcp_close_ssk(sk, ssk, subflow, timeout);
}
mptcp_cancel_work(sk);
- mptcp_pm_close(msk);
__skb_queue_purge(&sk->sk_receive_queue);
@@ -1447,20 +1707,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->local_key = subflow_req->local_key;
msk->token = subflow_req->token;
msk->subflow = NULL;
-
- if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) {
- nsk->sk_state = TCP_CLOSE;
- bh_unlock_sock(nsk);
-
- /* we can't call into mptcp_close() here - possible BH context
- * free the sock directly.
- * sk_clone_lock() sets nsk refcnt to two, hence call sk_free()
- * too.
- */
- sk_common_release(nsk);
- sk_free(nsk);
- return NULL;
- }
+ WRITE_ONCE(msk->fully_established, false);
msk->write_seq = subflow_req->idsn + 1;
atomic64_set(&msk->snd_una, msk->write_seq);
@@ -1482,6 +1729,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
return nsk;
}
+void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
+
+ msk->rcvq_space.time = tp->tcp_mstamp;
+
+ /* initial rcv_space offering made to peer */
+ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
+ TCP_INIT_CWND * tp->advmss);
+ if (msk->rcvq_space.space == 0)
+ msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+}
+
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
bool kern)
{
@@ -1501,7 +1764,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
return NULL;
pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
-
if (sk_is_mptcp(newsk)) {
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
@@ -1529,8 +1791,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
newsk = new_mptcp_sock;
mptcp_copy_inaddrs(newsk, ssk);
list_add(&subflow->node, &msk->conn_list);
- inet_sk_state_store(newsk, TCP_ESTABLISHED);
+ mptcp_rcv_space_init(msk, ssk);
bh_unlock_sock(new_mptcp_sock);
__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
@@ -1547,21 +1809,82 @@ static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- mptcp_token_destroy(msk->token);
+ mptcp_token_destroy(msk);
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
sk_sockets_allocated_dec(sk);
}
+static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct socket *ssock;
+ int ret;
+
+ switch (optname) {
+ case SO_REUSEPORT:
+ case SO_REUSEADDR:
+ lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
+ if (ret == 0) {
+ if (optname == SO_REUSEPORT)
+ sk->sk_reuseport = ssock->sk->sk_reuseport;
+ else if (optname == SO_REUSEADDR)
+ sk->sk_reuse = ssock->sk->sk_reuse;
+ }
+ release_sock(sk);
+ return ret;
+ }
+
+ return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
+}
+
+static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ int ret = -EOPNOTSUPP;
+ struct socket *ssock;
+
+ switch (optname) {
+ case IPV6_V6ONLY:
+ lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
+ if (ret == 0)
+ sk->sk_ipv6only = ssock->sk->sk_ipv6only;
+
+ release_sock(sk);
+ break;
+ }
+
+ return ret;
+}
+
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
+ struct sock *ssk;
pr_debug("msk=%p", msk);
+ if (level == SOL_SOCKET)
+ return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
+
/* @@ the meaning of setsockopt() when the socket is connected and
* there are multiple subflows is not yet defined. It is up to the
* MPTCP-level socket to configure the subflows until the subflow
@@ -1569,11 +1892,13 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,
* to the one remaining subflow.
*/
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
+ ssk = __mptcp_tcp_fallback(msk);
release_sock(sk);
- if (ssock)
- return tcp_setsockopt(ssock->sk, level, optname, optval,
- optlen);
+ if (ssk)
+ return tcp_setsockopt(ssk, level, optname, optval, optlen);
+
+ if (level == SOL_IPV6)
+ return mptcp_setsockopt_v6(msk, optname, optval, optlen);
return -EOPNOTSUPP;
}
@@ -1582,7 +1907,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *option)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct socket *ssock;
+ struct sock *ssk;
pr_debug("msk=%p", msk);
@@ -1593,11 +1918,10 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
* to the one remaining subflow.
*/
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
+ ssk = __mptcp_tcp_fallback(msk);
release_sock(sk);
- if (ssock)
- return tcp_getsockopt(ssock->sk, level, optname, optval,
- option);
+ if (ssk)
+ return tcp_getsockopt(ssk, level, optname, optval, option);
return -EOPNOTSUPP;
}
@@ -1636,6 +1960,20 @@ static void mptcp_release_cb(struct sock *sk)
}
}
+static int mptcp_hash(struct sock *sk)
+{
+ /* should never be called,
+ * we hash the TCP subflows not the master socket
+ */
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static void mptcp_unhash(struct sock *sk)
+{
+ /* called from sk_common_release(), but nothing to do here */
+}
+
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1660,32 +1998,26 @@ void mptcp_finish_connect(struct sock *ssk)
sk = subflow->conn;
msk = mptcp_sk(sk);
- if (!subflow->mp_capable) {
- MPTCP_INC_STATS(sock_net(sk),
- MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
- return;
- }
-
pr_debug("msk=%p, token=%u", sk, subflow->token);
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
ack_seq++;
subflow->map_seq = ack_seq;
subflow->map_subflow_seq = 1;
- subflow->rel_write_seq = 1;
/* the socket is not connected yet, no msk/subflow ops can access/race
* accessing the field below
*/
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->local_key, subflow->local_key);
- WRITE_ONCE(msk->token, subflow->token);
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
atomic64_set(&msk->snd_una, msk->write_seq);
mptcp_pm_new_connection(msk, 0);
+
+ mptcp_rcv_space_init(msk, ssk);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
@@ -1708,7 +2040,7 @@ bool mptcp_finish_join(struct sock *sk)
pr_debug("msk=%p, subflow=%p", msk, subflow);
/* mptcp socket already closing? */
- if (inet_sk_state_load(parent) != TCP_ESTABLISHED)
+ if (!mptcp_is_fully_established(parent))
return false;
if (!msk->pm.server_side)
@@ -1761,8 +2093,8 @@ static struct proto mptcp_prot = {
.sendmsg = mptcp_sendmsg,
.recvmsg = mptcp_recvmsg,
.release_cb = mptcp_release_cb,
- .hash = inet_hash,
- .unhash = inet_unhash,
+ .hash = mptcp_hash,
+ .unhash = mptcp_unhash,
.get_port = mptcp_get_port,
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
@@ -1771,6 +2103,7 @@ static struct proto mptcp_prot = {
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_mem = sysctl_tcp_mem,
.obj_size = sizeof(struct mptcp_sock),
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.no_autobind = true,
};
@@ -1781,9 +2114,9 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
int err;
lock_sock(sock->sk);
- ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
- if (IS_ERR(ssock)) {
- err = PTR_ERR(ssock);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ err = -EINVAL;
goto unlock;
}
@@ -1796,10 +2129,18 @@ unlock:
return err;
}
+static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ subflow->request_mptcp = 0;
+ __mptcp_do_fallback(msk);
+}
+
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct mptcp_subflow_context *subflow;
struct socket *ssock;
int err;
@@ -1812,19 +2153,24 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
goto do_connect;
}
- ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
- if (IS_ERR(ssock)) {
- err = PTR_ERR(ssock);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ err = -EINVAL;
goto unlock;
}
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sock->sk, TCP_SYN_SENT);
+ subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
* TCP option space.
*/
if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
- mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
+ mptcp_subflow_early_fallback(msk, subflow);
#endif
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
+ mptcp_subflow_early_fallback(msk, subflow);
do_connect:
err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
@@ -1843,42 +2189,6 @@ unlock:
return err;
}
-static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
-{
- if (sock->sk->sk_prot == &tcp_prot) {
- /* we are being invoked from __sys_accept4, after
- * mptcp_accept() has just accepted a non-mp-capable
- * flow: sk is a tcp_sk, not an mptcp one.
- *
- * Hand the socket over to tcp so all further socket ops
- * bypass mptcp.
- */
- sock->ops = &inet_stream_ops;
- }
-
- return inet_getname(sock, uaddr, peer);
-}
-
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
-{
- if (sock->sk->sk_prot == &tcpv6_prot) {
- /* we are being invoked from __sys_accept4 after
- * mptcp_accept() has accepted a non-mp-capable
- * subflow: sk is a tcp_sk, not mptcp.
- *
- * Hand the socket over to tcp so all further
- * socket ops bypass mptcp.
- */
- sock->ops = &inet6_stream_ops;
- }
-
- return inet6_getname(sock, uaddr, peer);
-}
-#endif
-
static int mptcp_listen(struct socket *sock, int backlog)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
@@ -1888,12 +2198,14 @@ static int mptcp_listen(struct socket *sock, int backlog)
pr_debug("msk=%p", msk);
lock_sock(sock->sk);
- ssock = __mptcp_socket_create(msk, TCP_LISTEN);
- if (IS_ERR(ssock)) {
- err = PTR_ERR(ssock);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ err = -EINVAL;
goto unlock;
}
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sock->sk, TCP_LISTEN);
sock_set_flag(sock->sk, SOCK_RCU_FREE);
err = ssock->ops->listen(ssock, backlog);
@@ -1906,15 +2218,6 @@ unlock:
return err;
}
-static bool is_tcp_proto(const struct proto *p)
-{
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- return p == &tcp_prot || p == &tcpv6_prot;
-#else
- return p == &tcp_prot;
-#endif
-}
-
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
int flags, bool kern)
{
@@ -1932,11 +2235,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (!ssock)
goto unlock_fail;
+ clear_bit(MPTCP_DATA_READY, &msk->flags);
sock_hold(ssock->sk);
release_sock(sock->sk);
err = ssock->ops->accept(sock, newsock, flags, kern);
- if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) {
+ if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
struct mptcp_sock *msk = mptcp_sk(newsock->sk);
struct mptcp_subflow_context *subflow;
@@ -1944,7 +2248,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
* This is needed so NOSPACE flag can be set from tcp stack.
*/
__mptcp_flush_join_list(msk);
- list_for_each_entry(subflow, &msk->conn_list, node) {
+ mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!ssk->sk_socket)
@@ -1952,6 +2256,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
}
}
+ if (inet_csk_listen_poll(ssock->sk))
+ set_bit(MPTCP_DATA_READY, &msk->flags);
sock_put(ssock->sk);
return err;
@@ -1960,39 +2266,36 @@ unlock_fail:
return -EINVAL;
}
+static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+{
+ return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
+ 0;
+}
+
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
struct sock *sk = sock->sk;
struct mptcp_sock *msk;
- struct socket *ssock;
__poll_t mask = 0;
+ int state;
msk = mptcp_sk(sk);
- lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (!ssock)
- ssock = __mptcp_nmpc_socket(msk);
- if (ssock) {
- mask = ssock->ops->poll(file, ssock, wait);
- release_sock(sk);
- return mask;
- }
-
- release_sock(sk);
sock_poll_wait(file, sock, wait);
- lock_sock(sk);
- if (test_bit(MPTCP_DATA_READY, &msk->flags))
- mask = EPOLLIN | EPOLLRDNORM;
- if (sk_stream_is_writeable(sk) &&
- test_bit(MPTCP_SEND_SPACE, &msk->flags))
- mask |= EPOLLOUT | EPOLLWRNORM;
+ state = inet_sk_state_load(sk);
+ if (state == TCP_LISTEN)
+ return mptcp_check_readable(msk);
+
+ if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
+ mask |= mptcp_check_readable(msk);
+ if (sk_stream_is_writeable(sk) &&
+ test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ }
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
- release_sock(sk);
-
return mask;
}
@@ -2000,23 +2303,13 @@ static int mptcp_shutdown(struct socket *sock, int how)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct mptcp_subflow_context *subflow;
- struct socket *ssock;
int ret = 0;
pr_debug("sk=%p, how=%d", msk, how);
lock_sock(sock->sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (ssock) {
- release_sock(sock->sk);
- return inet_shutdown(ssock, how);
- }
-
- if (how == SHUT_WR || how == SHUT_RDWR)
- inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
how++;
-
if ((how & ~SHUTDOWN_MASK) || !how) {
ret = -EINVAL;
goto out_unlock;
@@ -2030,13 +2323,36 @@ static int mptcp_shutdown(struct socket *sock, int how)
sock->state = SS_CONNECTED;
}
- __mptcp_flush_join_list(msk);
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+ /* If we've already sent a FIN, or it's a closed state, skip this. */
+ if (__mptcp_check_fallback(msk)) {
+ if (how == SHUT_WR || how == SHUT_RDWR)
+ inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
- mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+ mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
+ }
+ } else if ((how & SEND_SHUTDOWN) &&
+ ((1 << sock->sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+ TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) &&
+ mptcp_close_state(sock->sk)) {
+ __mptcp_flush_join_list(msk);
+
+ WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+ WRITE_ONCE(msk->snd_data_fin_enable, 1);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+ mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
+ }
}
+ /* Wake up anyone sleeping in poll. */
+ sock->sk->sk_state_change(sock->sk);
+
out_unlock:
release_sock(sock->sk);
@@ -2051,7 +2367,7 @@ static const struct proto_ops mptcp_stream_ops = {
.connect = mptcp_stream_connect,
.socketpair = sock_no_socketpair,
.accept = mptcp_stream_accept,
- .getname = mptcp_v4_getname,
+ .getname = inet_getname,
.poll = mptcp_poll,
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
@@ -2063,10 +2379,6 @@ static const struct proto_ops mptcp_stream_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
};
static struct inet_protosw mptcp_protosw = {
@@ -2077,7 +2389,7 @@ static struct inet_protosw mptcp_protosw = {
.flags = INET_PROTOSW_ICSK,
};
-void mptcp_proto_init(void)
+void __init mptcp_proto_init(void)
{
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
@@ -2086,6 +2398,7 @@ void mptcp_proto_init(void)
mptcp_subflow_init();
mptcp_pm_init();
+ mptcp_token_init();
if (proto_register(&mptcp_prot, 1) != 0)
panic("Failed to register MPTCP proto.\n");
@@ -2104,7 +2417,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.connect = mptcp_stream_connect,
.socketpair = sock_no_socketpair,
.accept = mptcp_stream_accept,
- .getname = mptcp_v6_getname,
+ .getname = inet6_getname,
.poll = mptcp_poll,
.ioctl = inet6_ioctl,
.gettstamp = sock_gettstamp,
@@ -2118,8 +2431,6 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet6_compat_ioctl,
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
@@ -2139,7 +2450,7 @@ static struct inet_protosw mptcp_v6_protosw = {
.flags = INET_PROTOSW_ICSK,
};
-int mptcp_proto_v6_init(void)
+int __init mptcp_proto_v6_init(void)
{
int err;
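
The reworked mptcp_shutdown() above leans on a small trick: the userspace SHUT_RD/SHUT_WR/SHUT_RDWR values are 0/1/2, so incrementing how turns them directly into the RCV_SHUTDOWN/SEND_SHUTDOWN bit mask that the later SEND_SHUTDOWN test relies on. The stand-alone sketch below (userspace C, hypothetical demo_* names, assuming the usual RCV_SHUTDOWN=1 / SEND_SHUTDOWN=2 encoding) only illustrates that conversion and validity check; it is not the kernel code path.

#include <stdio.h>
#include <sys/socket.h>

#define DEMO_RCV_SHUTDOWN  1
#define DEMO_SEND_SHUTDOWN 2
#define DEMO_SHUTDOWN_MASK 3

/* SHUT_RD/SHUT_WR/SHUT_RDWR are 0/1/2; adding one maps them onto the
 * RCV_SHUTDOWN/SEND_SHUTDOWN bits, mirroring the "how++" in mptcp_shutdown()
 */
static int demo_shutdown_mask(int how)
{
	how++;
	if ((how & ~DEMO_SHUTDOWN_MASK) || !how)
		return -1;			/* -EINVAL in the kernel */
	return how;
}

int main(void)
{
	printf("SHUT_RD   -> %d\n", demo_shutdown_mask(SHUT_RD));	/* 1: receive side only */
	printf("SHUT_WR   -> %d\n", demo_shutdown_mask(SHUT_WR));	/* 2: send side only */
	printf("SHUT_RDWR -> %d\n", demo_shutdown_mask(SHUT_RDWR));	/* 3: both directions */
	return 0;
}
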
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index c6eeaf3e8dcb..60b27d44c184 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -89,6 +89,7 @@
#define MPTCP_SEND_SPACE 1
#define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3
+#define MPTCP_FALLBACK_DONE 4
struct mptcp_options_received {
u64 sndr_key;
@@ -173,8 +174,6 @@ struct mptcp_pm_data {
u8 local_addr_max;
u8 subflows_max;
u8 status;
-
- struct work_struct work;
};
struct mptcp_data_frag {
@@ -194,11 +193,15 @@ struct mptcp_sock {
u64 remote_key;
u64 write_seq;
u64 ack_seq;
+ u64 rcv_data_fin_seq;
atomic64_t snd_una;
unsigned long timer_ival;
u32 token;
unsigned long flags;
bool can_ack;
+ bool fully_established;
+ bool rcv_data_fin;
+ bool snd_data_fin_enable;
spinlock_t join_list_lock;
struct work_struct work;
struct list_head conn_list;
@@ -208,6 +211,12 @@ struct mptcp_sock {
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
struct mptcp_pm_data pm;
+ struct {
+ u32 space; /* bytes copied in last measurement window */
+ u32 copied; /* bytes copied in this measurement window */
+ u64 time; /* start time of measurement window */
+ u64 rtt_us; /* last maximum rtt of subflows */
+ } rcvq_space;
};
#define mptcp_for_each_subflow(__msk, __subflow) \
@@ -250,6 +259,7 @@ struct mptcp_subflow_request_sock {
u32 local_nonce;
u32 remote_nonce;
struct mptcp_sock *msk;
+ struct hlist_nulls_node token_node;
};
static inline struct mptcp_subflow_request_sock *
@@ -284,10 +294,8 @@ struct mptcp_subflow_context {
backup : 1,
data_avail : 1,
rx_eof : 1,
- data_fin_tx_enable : 1,
use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
	can_ack : 1; /* only after processing the remote key */
- u64 data_fin_tx_seq;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -336,8 +344,10 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
}
int mptcp_is_enabled(struct net *net);
+void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
+ struct mptcp_options_received *mp_opt);
bool mptcp_subflow_data_available(struct sock *sk);
-void mptcp_subflow_init(void);
+void __init mptcp_subflow_init(void);
/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
@@ -355,14 +365,9 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
}
-extern const struct inet_connection_sock_af_ops ipv4_specific;
+void __init mptcp_proto_init(void);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-extern const struct inet_connection_sock_af_ops ipv6_specific;
-#endif
-
-void mptcp_proto_init(void);
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-int mptcp_proto_v6_init(void);
+int __init mptcp_proto_v6_init(void);
#endif
struct sock *mptcp_sk_clone(const struct sock *sk,
@@ -372,36 +377,41 @@ void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
+static inline bool mptcp_is_fully_established(struct sock *sk)
+{
+ return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
+ READ_ONCE(mptcp_sk(sk)->fully_established);
+}
+void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
void mptcp_data_acked(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk);
+bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq);
+
+void __init mptcp_token_init(void);
+static inline void mptcp_token_init_request(struct request_sock *req)
+{
+ mptcp_subflow_rsk(req)->token_node.pprev = NULL;
+}
int mptcp_token_new_request(struct request_sock *req);
-void mptcp_token_destroy_request(u32 token);
+void mptcp_token_destroy_request(struct request_sock *req);
int mptcp_token_new_connect(struct sock *sk);
-int mptcp_token_new_accept(u32 token, struct sock *conn);
+void mptcp_token_accept(struct mptcp_subflow_request_sock *r,
+ struct mptcp_sock *msk);
+bool mptcp_token_exists(u32 token);
struct mptcp_sock *mptcp_token_get_sock(u32 token);
-void mptcp_token_destroy(u32 token);
+struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
+ long *s_num);
+void mptcp_token_destroy(struct mptcp_sock *msk);
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
-static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
-{
- /* we might consider a faster version that computes the key as a
- * hash of some information available in the MPTCP socket. Use
- * random data at the moment, as it's probably the safest option
- * in case multiple sockets are opened in different namespaces at
- * the same time.
- */
- get_random_bytes(key, sizeof(u64));
- mptcp_crypto_key_sha(*key, token, idsn);
-}
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
-void mptcp_pm_init(void);
+void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
-void mptcp_pm_close(struct mptcp_sock *msk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side);
void mptcp_pm_fully_established(struct mptcp_sock *msk);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
@@ -433,7 +443,7 @@ bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_addr_info *saddr);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
-void mptcp_pm_nl_init(void);
+void __init mptcp_pm_nl_init(void);
void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
@@ -454,4 +464,66 @@ static inline bool before64(__u64 seq1, __u64 seq2)
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
+static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+{
+ return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+}
+
+static inline bool mptcp_check_fallback(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ return __mptcp_check_fallback(msk);
+}
+
+static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
+{
+ if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
+ pr_debug("TCP fallback already done (msk=%p)", msk);
+ return;
+ }
+ set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+}
+
+static inline void mptcp_do_fallback(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ __mptcp_do_fallback(msk);
+}
+
+#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
+
+static inline bool subflow_simultaneous_connect(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ return sk->sk_state == TCP_ESTABLISHED &&
+ !mptcp_sk(parent)->pm.server_side &&
+ !subflow->conn_finished;
+}
+
+#ifdef CONFIG_SYN_COOKIES
+void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb);
+bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb);
+void __init mptcp_join_cookie_init(void);
+#else
+static inline void
+subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb) {}
+static inline bool
+mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline void mptcp_join_cookie_init(void) {}
+#endif
+
#endif /* __MPTCP_PROTOCOL_H */
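
The new fallback helpers above are deliberately one-way: once MPTCP_FALLBACK_DONE is set in msk->flags the connection behaves as plain TCP for the rest of its life, and every later check is a cheap lock-free bit test. Below is a stand-alone sketch of that sticky-flag idiom (userspace C with C11 atomics and hypothetical demo_* names; the kernel uses set_bit()/test_bit() on the unsigned long flags word instead).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* simplified stand-in for msk->flags; bit 4 mirrors MPTCP_FALLBACK_DONE */
#define DEMO_FALLBACK_DONE (1UL << 4)

struct demo_msk {
	atomic_ulong flags;
};

static bool demo_check_fallback(struct demo_msk *msk)
{
	/* lock-free read: any context may test the flag at any time */
	return atomic_load(&msk->flags) & DEMO_FALLBACK_DONE;
}

static void demo_do_fallback(struct demo_msk *msk)
{
	/* one-way transition: setting the bit again is harmless */
	if (demo_check_fallback(msk)) {
		printf("fallback already done\n");
		return;
	}
	atomic_fetch_or(&msk->flags, DEMO_FALLBACK_DONE);
}

int main(void)
{
	struct demo_msk msk = { .flags = 0 };

	demo_do_fallback(&msk);		/* first caller flips the flag */
	demo_do_fallback(&msk);		/* later callers see it already set */
	printf("fallback=%d\n", demo_check_fallback(&msk));
	return 0;
}
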
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 3838a0b3a21f..96f4f2fe50ad 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -29,40 +29,6 @@ static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}
-static int subflow_rebuild_header(struct sock *sk)
-{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- int local_id, err = 0;
-
- if (subflow->request_mptcp && !subflow->token) {
- pr_debug("subflow=%p", sk);
- err = mptcp_token_new_connect(sk);
- } else if (subflow->request_join && !subflow->local_nonce) {
- struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
-
- pr_debug("subflow=%p", sk);
-
- do {
- get_random_bytes(&subflow->local_nonce, sizeof(u32));
- } while (!subflow->local_nonce);
-
- if (subflow->local_id)
- goto out;
-
- local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
- if (local_id < 0)
- return -EINVAL;
-
- subflow->local_id = local_id;
- }
-
-out:
- if (err)
- return err;
-
- return subflow->icsk_af_ops->rebuild_header(sk);
-}
-
static void subflow_req_destructor(struct request_sock *req)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@ -72,8 +38,7 @@ static void subflow_req_destructor(struct request_sock *req)
if (subflow_req->msk)
sock_put((struct sock *)subflow_req->msk);
- if (subflow_req->mp_capable)
- mptcp_token_destroy_request(subflow_req->token);
+ mptcp_token_destroy_request(req);
tcp_request_sock_ops.destructor(req);
}
@@ -88,6 +53,12 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}
+static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
+{
+ return mptcp_is_fully_established((void *)msk) &&
+ READ_ONCE(msk->pm.accept_subflow);
+}
+
/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
const struct sk_buff *skb)
@@ -120,30 +91,43 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
return msk;
}
-static void subflow_init_req(struct request_sock *req,
- const struct sock *sk_listener,
- struct sk_buff *skb)
+static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
- struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
- struct mptcp_options_received mp_opt;
-
- pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
-
- mptcp_get_options(skb, &mp_opt);
subflow_req->mp_capable = 0;
subflow_req->mp_join = 0;
subflow_req->msk = NULL;
+ mptcp_token_init_request(req);
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
* TCP option space.
*/
if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
- return;
+ return -EINVAL;
#endif
+ return 0;
+}
+
+static void subflow_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_options_received mp_opt;
+ int ret;
+
+ pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
+
+ ret = __subflow_init_req(req, sk_listener);
+ if (ret)
+ return;
+
+ mptcp_get_options(skb, &mp_opt);
+
if (mp_opt.mp_capable) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
@@ -154,13 +138,33 @@ static void subflow_init_req(struct request_sock *req,
}
if (mp_opt.mp_capable && listener->request_mptcp) {
- int err;
+ int err, retries = 4;
+
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+again:
+ do {
+ get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
+ } while (subflow_req->local_key == 0);
+
+ if (unlikely(req->syncookie)) {
+ mptcp_crypto_key_sha(subflow_req->local_key,
+ &subflow_req->token,
+ &subflow_req->idsn);
+ if (mptcp_token_exists(subflow_req->token)) {
+ if (retries-- > 0)
+ goto again;
+ } else {
+ subflow_req->mp_capable = 1;
+ }
+ return;
+ }
err = mptcp_token_new_request(req);
if (err == 0)
subflow_req->mp_capable = 1;
+ else if (retries-- > 0)
+ goto again;
- subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
} else if (mp_opt.mp_join && listener->request_mptcp) {
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
subflow_req->mp_join = 1;
@@ -169,11 +173,60 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->token = mp_opt.token;
subflow_req->remote_nonce = mp_opt.nonce;
subflow_req->msk = subflow_token_join_request(req, skb);
+
+ if (unlikely(req->syncookie) && subflow_req->msk) {
+ if (mptcp_can_accept_new_subflow(subflow_req->msk))
+ subflow_init_req_cookie_join_save(subflow_req, skb);
+ }
+
pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
subflow_req->remote_nonce, subflow_req->msk);
}
}
+int mptcp_subflow_init_cookie_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_options_received mp_opt;
+ int err;
+
+ err = __subflow_init_req(req, sk_listener);
+ if (err)
+ return err;
+
+ mptcp_get_options(skb, &mp_opt);
+
+ if (mp_opt.mp_capable && mp_opt.mp_join)
+ return -EINVAL;
+
+ if (mp_opt.mp_capable && listener->request_mptcp) {
+ if (mp_opt.sndr_key == 0)
+ return -EINVAL;
+
+ subflow_req->local_key = mp_opt.rcvr_key;
+ err = mptcp_token_new_request(req);
+ if (err)
+ return err;
+
+ subflow_req->mp_capable = 1;
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
+ } else if (mp_opt.mp_join && listener->request_mptcp) {
+ if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
+ return -EINVAL;
+
+ if (mptcp_can_accept_new_subflow(subflow_req->msk))
+ subflow_req->mp_join = 1;
+
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
+
static void subflow_v4_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
@@ -222,7 +275,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_options_received mp_opt;
struct sock *parent = subflow->conn;
- struct tcp_sock *tp = tcp_sk(sk);
subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
@@ -235,46 +287,40 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (subflow->conn_finished)
return;
+ subflow->rel_write_seq = 1;
subflow->conn_finished = 1;
+ subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+ pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
mptcp_get_options(skb, &mp_opt);
- if (subflow->request_mptcp && mp_opt.mp_capable) {
+ if (subflow->request_mptcp) {
+ if (!mp_opt.mp_capable) {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+ mptcp_do_fallback(sk);
+ pr_fallback(mptcp_sk(subflow->conn));
+ goto fallback;
+ }
+
subflow->mp_capable = 1;
subflow->can_ack = 1;
subflow->remote_key = mp_opt.sndr_key;
pr_debug("subflow=%p, remote_key=%llu", subflow,
subflow->remote_key);
- } else if (subflow->request_join && mp_opt.mp_join) {
- subflow->mp_join = 1;
+ mptcp_finish_connect(sk);
+ } else if (subflow->request_join) {
+ u8 hmac[SHA256_DIGEST_SIZE];
+
+ if (!mp_opt.mp_join)
+ goto do_reset;
+
subflow->thmac = mp_opt.thmac;
subflow->remote_nonce = mp_opt.nonce;
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
subflow->thmac, subflow->remote_nonce);
- } else if (subflow->request_mptcp) {
- tp->is_mptcp = 0;
- }
- if (!tp->is_mptcp)
- return;
-
- if (subflow->mp_capable) {
- pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
- subflow->remote_key);
- mptcp_finish_connect(sk);
-
- if (skb) {
- pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
- subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
- }
- } else if (subflow->mp_join) {
- u8 hmac[SHA256_DIGEST_SIZE];
-
- pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
- subflow, subflow->thmac,
- subflow->remote_nonce);
if (!subflow_thmac_valid(subflow)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
- subflow->mp_join = 0;
goto do_reset;
}
@@ -282,24 +328,26 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->local_nonce,
subflow->remote_nonce,
hmac);
-
memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
- if (skb)
- subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
-
if (!mptcp_finish_join(sk))
goto do_reset;
+ subflow->mp_join = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
- } else {
-do_reset:
- tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_done(sk);
+ } else if (mptcp_check_fallback(sk)) {
+fallback:
+ mptcp_rcv_space_init(mptcp_sk(parent), sk);
}
+ return;
+
+do_reset:
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
}
-static struct request_sock_ops subflow_request_sock_ops;
+struct request_sock_ops mptcp_subflow_request_sock_ops;
+EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -312,7 +360,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
- return tcp_conn_request(&subflow_request_sock_ops,
+ return tcp_conn_request(&mptcp_subflow_request_sock_ops,
&subflow_request_sock_ipv4_ops,
sk, skb);
drop:
@@ -337,7 +385,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (!ipv6_unicast_destination(skb))
goto drop;
- return tcp_conn_request(&subflow_request_sock_ops,
+ return tcp_conn_request(&mptcp_subflow_request_sock_ops,
&subflow_request_sock_ipv6_ops, sk, skb);
drop:
@@ -386,7 +434,7 @@ static void mptcp_sock_destruct(struct sock *sk)
sock_orphan(sk);
}
- mptcp_token_destroy(mptcp_sk(sk)->token);
+ mptcp_token_destroy(mptcp_sk(sk));
inet_sock_destruct(sk);
}
@@ -421,6 +469,17 @@ static void subflow_drop_ctx(struct sock *ssk)
kfree_rcu(ctx, rcu);
}
+void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
+ struct mptcp_options_received *mp_opt)
+{
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ subflow->remote_key = mp_opt->sndr_key;
+ subflow->fully_established = 1;
+ subflow->can_ack = 1;
+ WRITE_ONCE(msk->fully_established, true);
+}
+
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb,
struct request_sock *req,
@@ -444,7 +503,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
/* hopefully temporary handling for MP_JOIN+syncookie */
subflow_req = mptcp_subflow_rsk(req);
- fallback_is_fatal = subflow_req->mp_join;
+ fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
fallback = !tcp_rsk(req)->is_mptcp;
if (fallback)
goto create_child;
@@ -472,6 +531,7 @@ create_msk:
} else if (subflow_req->mp_join) {
mptcp_get_options(skb, &mp_opt);
if (!mp_opt.mp_join ||
+ !mptcp_can_accept_new_subflow(subflow_req->msk) ||
!subflow_hmac_valid(req, &mp_opt)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
fallback = true;
@@ -500,20 +560,25 @@ create_child:
}
if (ctx->mp_capable) {
+ /* this can't race with mptcp_close(), as the msk is
+ * not yet exposed to user-space
+ */
+ inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
+
/* new mpc subflow takes ownership of the newly
* created mptcp socket
*/
new_msk->sk_destruct = mptcp_sock_destruct;
mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
+ mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
ctx->conn = new_msk;
new_msk = NULL;
/* with OoO packets we can reach here without ingress
* mpc option
*/
- ctx->remote_key = mp_opt.sndr_key;
- ctx->fully_established = mp_opt.mp_capable;
- ctx->can_ack = mp_opt.mp_capable;
+ if (mp_opt.mp_capable)
+ mptcp_subflow_fully_established(ctx, &mp_opt);
} else if (ctx->mp_join) {
struct mptcp_sock *owner;
@@ -548,9 +613,9 @@ out:
dispose_child:
subflow_drop_ctx(child);
tcp_rsk(req)->drop_req = true;
- tcp_send_active_reset(child, GFP_ATOMIC);
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
+ req->rsk_ops->send_reset(sk, skb);
/* The last child reference will be released by the caller */
return child;
@@ -562,7 +627,8 @@ enum mapping_status {
MAPPING_OK,
MAPPING_INVALID,
MAPPING_EMPTY,
- MAPPING_DATA_FIN
+ MAPPING_DATA_FIN,
+ MAPPING_DUMMY
};
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
@@ -614,7 +680,8 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
return true;
}
-static enum mapping_status get_mapping_status(struct sock *ssk)
+static enum mapping_status get_mapping_status(struct sock *ssk,
+ struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_ext *mpext;
@@ -626,6 +693,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
if (!skb)
return MAPPING_EMPTY;
+ if (mptcp_check_fallback(ssk))
+ return MAPPING_DUMMY;
+
mpext = mptcp_get_ext(skb);
if (!mpext || !mpext->use_map) {
if (!subflow->map_valid && !skb->len) {
@@ -661,7 +731,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
if (mpext->data_fin == 1) {
if (data_len == 1) {
- pr_debug("DATA_FIN with no payload");
+ mptcp_update_rcv_data_fin(msk, mpext->data_seq);
+ pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
if (subflow->map_valid) {
/* A DATA_FIN might arrive in a DSS
* option before the previous mapping
@@ -673,6 +744,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
} else {
return MAPPING_DATA_FIN;
}
+ } else {
+ mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len);
+ pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len);
}
/* Adjust for DATA_FIN using 1 byte of sequence space */
@@ -761,12 +835,22 @@ static bool subflow_check_data_avail(struct sock *ssk)
u64 ack_seq;
u64 old_ack;
- status = get_mapping_status(ssk);
+ status = get_mapping_status(ssk, msk);
pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
if (status == MAPPING_INVALID) {
ssk->sk_err = EBADMSG;
goto fatal;
}
+ if (status == MAPPING_DUMMY) {
+ __mptcp_do_fallback(msk);
+ skb = skb_peek(&ssk->sk_receive_queue);
+ subflow->map_valid = 1;
+ subflow->map_seq = READ_ONCE(msk->ack_seq);
+ subflow->map_data_len = skb->len;
+ subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
+ subflow->ssn_offset;
+ return true;
+ }
if (status != MAPPING_OK)
return false;
@@ -889,15 +973,20 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
static void subflow_data_ready(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ u16 state = 1 << inet_sk_state_load(sk);
struct sock *parent = subflow->conn;
+ struct mptcp_sock *msk;
- if (!subflow->mp_capable && !subflow->mp_join) {
- subflow->tcp_data_ready(sk);
-
+ msk = mptcp_sk(parent);
+ if (state & TCPF_LISTEN) {
+ set_bit(MPTCP_DATA_READY, &msk->flags);
parent->sk_data_ready(parent);
return;
}
+ WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
+ !subflow->mp_join && !(state & TCPF_CLOSE));
+
if (mptcp_subflow_data_available(sk))
mptcp_data_ready(parent, sk);
}
@@ -974,19 +1063,34 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_subflow_context *subflow;
struct sockaddr_storage addr;
+ int local_id = loc->id;
struct socket *sf;
+ struct sock *ssk;
u32 remote_token;
int addrlen;
int err;
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (!mptcp_is_fully_established(sk))
return -ENOTCONN;
err = mptcp_subflow_create_socket(sk, &sf);
if (err)
return err;
- subflow = mptcp_subflow_ctx(sf->sk);
+ ssk = sf->sk;
+ subflow = mptcp_subflow_ctx(ssk);
+ do {
+ get_random_bytes(&subflow->local_nonce, sizeof(u32));
+ } while (!subflow->local_nonce);
+
+ if (!local_id) {
+ err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
+ if (err < 0)
+ goto failed;
+
+ local_id = err;
+ }
+
subflow->remote_key = msk->remote_key;
subflow->local_key = msk->local_key;
subflow->token = msk->token;
@@ -997,15 +1101,16 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
if (loc->family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- sf->sk->sk_bound_dev_if = ifindex;
+ ssk->sk_bound_dev_if = ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
goto failed;
mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
- pr_debug("msk=%p remote_token=%u", msk, remote_token);
+ pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token,
+ local_id);
subflow->remote_token = remote_token;
- subflow->local_id = loc->id;
+ subflow->local_id = local_id;
subflow->request_join = 1;
subflow->request_bkup = 1;
mptcp_info2sockaddr(remote, &addr);
@@ -1032,6 +1137,12 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
struct socket *sf;
int err;
+ /* un-accepted server sockets can reach here - on bad configuration
+ * bail early to avoid greater trouble later
+ */
+ if (unlikely(!sk->sk_socket))
+ return -EINVAL;
+
err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
&sf);
if (err)
@@ -1118,14 +1229,26 @@ static void subflow_state_change(struct sock *sk)
__subflow_state_change(sk);
+ if (subflow_simultaneous_connect(sk)) {
+ mptcp_do_fallback(sk);
+ mptcp_rcv_space_init(mptcp_sk(parent), sk);
+ pr_fallback(mptcp_sk(parent));
+ subflow->conn_finished = 1;
+ if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
+ inet_sk_state_store(parent, TCP_ESTABLISHED);
+ parent->sk_state_change(parent);
+ }
+ }
+
/* as recvmsg() does not acquire the subflow socket for ssk selection
* a fin packet carrying a DSS can be unnoticed if we don't trigger
* the data available machinery here.
*/
- if (subflow->mp_capable && mptcp_subflow_data_available(sk))
+ if (mptcp_subflow_data_available(sk))
mptcp_data_ready(parent, sk);
- if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
+ if (__mptcp_check_fallback(mptcp_sk(parent)) &&
+ !(parent->sk_shutdown & RCV_SHUTDOWN) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
mptcp_subflow_eof(parent);
@@ -1255,10 +1378,10 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops)
return 0;
}
-void mptcp_subflow_init(void)
+void __init mptcp_subflow_init(void)
{
- subflow_request_sock_ops = tcp_request_sock_ops;
- if (subflow_ops_init(&subflow_request_sock_ops) != 0)
+ mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
+ if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
panic("MPTCP: failed to init subflow request sock ops\n");
subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
@@ -1268,7 +1391,6 @@ void mptcp_subflow_init(void)
subflow_specific.conn_request = subflow_v4_conn_request;
subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
subflow_specific.sk_rx_dst_set = subflow_finish_connect;
- subflow_specific.rebuild_header = subflow_rebuild_header;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
@@ -1278,7 +1400,6 @@ void mptcp_subflow_init(void)
subflow_v6_specific.conn_request = subflow_v6_conn_request;
subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
- subflow_v6_specific.rebuild_header = subflow_rebuild_header;
subflow_v6m_specific = subflow_v6_specific;
subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
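
The rewritten subflow_finish_connect() above makes the SYN-ACK outcome explicit: an initial MP_CAPABLE subflow that gets a plain TCP SYN-ACK falls back to TCP, while an MP_JOIN subflow that gets no MP_JOIN echo is reset, since a join subflow is useless without MPTCP. The condensed model below (userspace C, hypothetical demo_* names, HMAC and state handling omitted) only restates that decision table, not the actual kernel flow.

#include <stdbool.h>
#include <stdio.h>

enum demo_action {
	DEMO_MPC_ESTABLISH,	/* peer confirmed MP_CAPABLE */
	DEMO_JOIN_CONTINUE,	/* peer confirmed MP_JOIN, HMAC check follows */
	DEMO_FALLBACK_TCP,	/* MP_CAPABLE request ignored: plain TCP from now on */
	DEMO_RESET		/* MP_JOIN request ignored: subflow is useless */
};

static enum demo_action demo_synack_action(bool request_mptcp, bool request_join,
					   bool mp_capable, bool mp_join)
{
	if (request_mptcp)
		return mp_capable ? DEMO_MPC_ESTABLISH : DEMO_FALLBACK_TCP;
	if (request_join)
		return mp_join ? DEMO_JOIN_CONTINUE : DEMO_RESET;
	return DEMO_FALLBACK_TCP;	/* neither requested: already a fallback socket */
}

int main(void)
{
	/* MP_CAPABLE requested but not echoed: fall back to TCP (2) */
	printf("%d\n", demo_synack_action(true, false, false, false));
	/* MP_JOIN requested but not echoed: reset the subflow (3) */
	printf("%d\n", demo_synack_action(false, true, false, false));
	return 0;
}
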
diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c
new file mode 100644
index 000000000000..abe0fd099746
--- /dev/null
+++ b/net/mptcp/syncookies.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/skbuff.h>
+
+#include "protocol.h"
+
+/* Syncookies do not work for JOIN requests.
+ *
+ * Unlike MP_CAPABLE, where the ACK cookie contains the needed MPTCP
+ * options to reconstruct the initial syn state, MP_JOIN does not contain
+ * the token to obtain the mptcp socket nor the server-generated nonce
+ * that was used in the cookie SYN/ACK response.
+ *
+ * Keep a small best effort state table to store the syn/synack data,
+ * indexed by skb hash.
+ *
+ * An MP_JOIN SYN packet handled by syn cookies is only stored if the 32-bit
+ * token matches a known mptcp connection that can still accept more subflows.
+ *
+ * There is no timeout handling -- state is only reconstructed
+ * when the TCP ACK has passed the cookie validation check.
+ */
+
+struct join_entry {
+ u32 token;
+ u32 remote_nonce;
+ u32 local_nonce;
+ u8 join_id;
+ u8 local_id;
+ u8 backup;
+ u8 valid;
+};
+
+#define COOKIE_JOIN_SLOTS 1024
+
+static struct join_entry join_entries[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp;
+static spinlock_t join_entry_locks[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp;
+
+static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net)
+{
+ u32 i = skb_get_hash(skb) ^ net_hash_mix(net);
+
+ return i % ARRAY_SIZE(join_entries);
+}
+
+static void mptcp_join_store_state(struct join_entry *entry,
+ const struct mptcp_subflow_request_sock *subflow_req)
+{
+ entry->token = subflow_req->token;
+ entry->remote_nonce = subflow_req->remote_nonce;
+ entry->local_nonce = subflow_req->local_nonce;
+ entry->backup = subflow_req->backup;
+ entry->join_id = subflow_req->remote_id;
+ entry->local_id = subflow_req->local_id;
+ entry->valid = 1;
+}
+
+void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb)
+{
+ struct net *net = read_pnet(&subflow_req->sk.req.ireq_net);
+ u32 i = mptcp_join_entry_hash(skb, net);
+
+ /* No use in waiting if other cpu is already using this slot --
+ * would overwrite the data that got stored.
+ */
+ spin_lock_bh(&join_entry_locks[i]);
+ mptcp_join_store_state(&join_entries[i], subflow_req);
+ spin_unlock_bh(&join_entry_locks[i]);
+}
+
+/* Called for a cookie-ack with MP_JOIN option present.
+ * Look up the saved state based on skb hash & check token matches msk
+ * in same netns.
+ *
+ * Caller will check msk can still accept another subflow. The hmac
+ * present in the cookie ACK mptcp option space will be checked later.
+ */
+bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb)
+{
+ struct net *net = read_pnet(&subflow_req->sk.req.ireq_net);
+ u32 i = mptcp_join_entry_hash(skb, net);
+ struct mptcp_sock *msk;
+ struct join_entry *e;
+
+ e = &join_entries[i];
+
+ spin_lock_bh(&join_entry_locks[i]);
+
+ if (e->valid == 0) {
+ spin_unlock_bh(&join_entry_locks[i]);
+ return false;
+ }
+
+ e->valid = 0;
+
+ msk = mptcp_token_get_sock(e->token);
+ if (!msk) {
+ spin_unlock_bh(&join_entry_locks[i]);
+ return false;
+ }
+
+	/* If this fails, the token got reused in the meantime by another
+	 * mptcp socket in a different netns, i.e. the entry is outdated.
+ */
+ if (!net_eq(sock_net((struct sock *)msk), net))
+ goto err_put;
+
+ subflow_req->remote_nonce = e->remote_nonce;
+ subflow_req->local_nonce = e->local_nonce;
+ subflow_req->backup = e->backup;
+ subflow_req->remote_id = e->join_id;
+ subflow_req->token = e->token;
+ subflow_req->msk = msk;
+ spin_unlock_bh(&join_entry_locks[i]);
+ return true;
+
+err_put:
+ spin_unlock_bh(&join_entry_locks[i]);
+ sock_put((struct sock *)msk);
+ return false;
+}
+
+void __init mptcp_join_cookie_init(void)
+{
+ int i;
+
+ for (i = 0; i < COOKIE_JOIN_SLOTS; i++)
+ spin_lock_init(&join_entry_locks[i]);
+}
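
The join-cookie table above is intentionally lossy: each of the 1024 slots holds at most one pending MP_JOIN, a later SYN hashing to the same slot simply overwrites it, and a cookie ACK consumes the entry whether or not the lookup succeeds. A minimal userspace model of that store/consume pattern is sketched below (hypothetical demo_* names, pthread mutexes instead of spin_lock_bh, no netns or msk refcount handling).

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_SLOTS 1024

struct demo_join_entry {
	uint32_t token;
	uint32_t remote_nonce;
	uint8_t  valid;
};

static struct demo_join_entry demo_entries[DEMO_SLOTS];
static pthread_mutex_t demo_locks[DEMO_SLOTS];

static void demo_join_init(void)
{
	int i;

	for (i = 0; i < DEMO_SLOTS; i++)
		pthread_mutex_init(&demo_locks[i], NULL);
}

/* store state for a cookie SYN: later SYNs hitting the same slot
 * silently overwrite it -- best effort, no chaining, no timeout
 */
static void demo_join_save(uint32_t hash, const struct demo_join_entry *e)
{
	uint32_t i = hash % DEMO_SLOTS;

	pthread_mutex_lock(&demo_locks[i]);
	demo_entries[i] = *e;
	demo_entries[i].valid = 1;
	pthread_mutex_unlock(&demo_locks[i]);
}

/* consume state on the cookie ACK: the slot is invalidated even when
 * the stored token does not match, mirroring the one-shot semantics
 */
static bool demo_join_restore(uint32_t hash, uint32_t token,
			      struct demo_join_entry *out)
{
	uint32_t i = hash % DEMO_SLOTS;
	bool hit = false;

	pthread_mutex_lock(&demo_locks[i]);
	if (demo_entries[i].valid && demo_entries[i].token == token) {
		*out = demo_entries[i];
		hit = true;
	}
	demo_entries[i].valid = 0;
	pthread_mutex_unlock(&demo_locks[i]);
	return hit;
}

int main(void)
{
	struct demo_join_entry in = { .token = 42, .remote_nonce = 7 }, out;

	demo_join_init();
	demo_join_save(1234, &in);
	printf("restore: %d\n", demo_join_restore(1234, 42, &out));	/* 1: hit */
	printf("restore: %d\n", demo_join_restore(1234, 42, &out));	/* 0: consumed */
	return 0;
}
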
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 33352dd99d4d..8b47c4bb1c6b 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -24,7 +24,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/radix-tree.h>
+#include <linux/memblock.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <net/sock.h>
@@ -33,10 +33,67 @@
#include <net/mptcp.h>
#include "protocol.h"
-static RADIX_TREE(token_tree, GFP_ATOMIC);
-static RADIX_TREE(token_req_tree, GFP_ATOMIC);
-static DEFINE_SPINLOCK(token_tree_lock);
-static int token_used __read_mostly;
+#define TOKEN_MAX_RETRIES 4
+#define TOKEN_MAX_CHAIN_LEN 4
+
+struct token_bucket {
+ spinlock_t lock;
+ int chain_len;
+ struct hlist_nulls_head req_chain;
+ struct hlist_nulls_head msk_chain;
+};
+
+static struct token_bucket *token_hash __read_mostly;
+static unsigned int token_mask __read_mostly;
+
+static struct token_bucket *token_bucket(u32 token)
+{
+ return &token_hash[token & token_mask];
+}
+
+/* called with bucket lock held */
+static struct mptcp_subflow_request_sock *
+__token_lookup_req(struct token_bucket *t, u32 token)
+{
+ struct mptcp_subflow_request_sock *req;
+ struct hlist_nulls_node *pos;
+
+ hlist_nulls_for_each_entry_rcu(req, pos, &t->req_chain, token_node)
+ if (req->token == token)
+ return req;
+ return NULL;
+}
+
+/* called with bucket lock held */
+static struct mptcp_sock *
+__token_lookup_msk(struct token_bucket *t, u32 token)
+{
+ struct hlist_nulls_node *pos;
+ struct sock *sk;
+
+ sk_nulls_for_each_rcu(sk, pos, &t->msk_chain)
+ if (mptcp_sk(sk)->token == token)
+ return mptcp_sk(sk);
+ return NULL;
+}
+
+static bool __token_bucket_busy(struct token_bucket *t, u32 token)
+{
+ return !token || t->chain_len >= TOKEN_MAX_CHAIN_LEN ||
+ __token_lookup_req(t, token) || __token_lookup_msk(t, token);
+}
+
+static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
+{
+ /* we might consider a faster version that computes the key as a
+ * hash of some information available in the MPTCP socket. Use
+ * random data at the moment, as it's probably the safest option
+ * in case multiple sockets are opened in different namespaces at
+ * the same time.
+ */
+ get_random_bytes(key, sizeof(u64));
+ mptcp_crypto_key_sha(*key, token, idsn);
+}
/**
* mptcp_token_new_request - create new key/idsn/token for subflow_request
@@ -52,30 +109,28 @@ static int token_used __read_mostly;
int mptcp_token_new_request(struct request_sock *req)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
- int err;
-
- while (1) {
- u32 token;
-
- mptcp_crypto_key_gen_sha(&subflow_req->local_key,
- &subflow_req->token,
- &subflow_req->idsn);
- pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n",
- req, subflow_req->local_key, subflow_req->token,
- subflow_req->idsn);
-
- token = subflow_req->token;
- spin_lock_bh(&token_tree_lock);
- if (!radix_tree_lookup(&token_req_tree, token) &&
- !radix_tree_lookup(&token_tree, token))
- break;
- spin_unlock_bh(&token_tree_lock);
+ struct token_bucket *bucket;
+ u32 token;
+
+ mptcp_crypto_key_sha(subflow_req->local_key,
+ &subflow_req->token,
+ &subflow_req->idsn);
+ pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n",
+ req, subflow_req->local_key, subflow_req->token,
+ subflow_req->idsn);
+
+ token = subflow_req->token;
+ bucket = token_bucket(token);
+ spin_lock_bh(&bucket->lock);
+ if (__token_bucket_busy(bucket, token)) {
+ spin_unlock_bh(&bucket->lock);
+ return -EBUSY;
}
- err = radix_tree_insert(&token_req_tree,
- subflow_req->token, &token_used);
- spin_unlock_bh(&token_tree_lock);
- return err;
+ hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain);
+ bucket->chain_len++;
+ spin_unlock_bh(&bucket->lock);
+ return 0;
}
/**
@@ -97,48 +152,82 @@ int mptcp_token_new_request(struct request_sock *req)
int mptcp_token_new_connect(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sock *mptcp_sock = subflow->conn;
- int err;
-
- while (1) {
- u32 token;
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ int retries = TOKEN_MAX_RETRIES;
+ struct token_bucket *bucket;
- mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token,
- &subflow->idsn);
+ pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
+ sk, subflow->local_key, subflow->token, subflow->idsn);
- pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
- sk, subflow->local_key, subflow->token, subflow->idsn);
+again:
+ mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token,
+ &subflow->idsn);
- token = subflow->token;
- spin_lock_bh(&token_tree_lock);
- if (!radix_tree_lookup(&token_req_tree, token) &&
- !radix_tree_lookup(&token_tree, token))
- break;
- spin_unlock_bh(&token_tree_lock);
+ bucket = token_bucket(subflow->token);
+ spin_lock_bh(&bucket->lock);
+ if (__token_bucket_busy(bucket, subflow->token)) {
+ spin_unlock_bh(&bucket->lock);
+ if (!--retries)
+ return -EBUSY;
+ goto again;
}
- err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock);
- spin_unlock_bh(&token_tree_lock);
- return err;
+ WRITE_ONCE(msk->token, subflow->token);
+ __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
+ bucket->chain_len++;
+ spin_unlock_bh(&bucket->lock);
+ return 0;
}
/**
- * mptcp_token_new_accept - insert token for later processing
- * @token: the token to insert to the tree
- * @conn: the just cloned socket linked to the new connection
+ * mptcp_token_accept - replace a req sk with full sock in token hash
+ * @req: the request socket to be removed
+ * @msk: the just cloned socket linked to the new connection
*
* Called when a SYN packet creates a new logical connection, i.e.
* is not a join request.
*/
-int mptcp_token_new_accept(u32 token, struct sock *conn)
+void mptcp_token_accept(struct mptcp_subflow_request_sock *req,
+ struct mptcp_sock *msk)
{
- int err;
+ struct mptcp_subflow_request_sock *pos;
+ struct token_bucket *bucket;
- spin_lock_bh(&token_tree_lock);
- err = radix_tree_insert(&token_tree, token, conn);
- spin_unlock_bh(&token_tree_lock);
+ bucket = token_bucket(req->token);
+ spin_lock_bh(&bucket->lock);
- return err;
+ /* pedantic lookup check for the moved token */
+ pos = __token_lookup_req(bucket, req->token);
+ if (!WARN_ON_ONCE(pos != req))
+ hlist_nulls_del_init_rcu(&req->token_node);
+ __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
+ spin_unlock_bh(&bucket->lock);
+}
+
+bool mptcp_token_exists(u32 token)
+{
+ struct hlist_nulls_node *pos;
+ struct token_bucket *bucket;
+ struct mptcp_sock *msk;
+ struct sock *sk;
+
+ rcu_read_lock();
+ bucket = token_bucket(token);
+
+again:
+ sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
+ msk = mptcp_sk(sk);
+ if (READ_ONCE(msk->token) == token)
+ goto found;
+ }
+ if (get_nulls_value(pos) != (token & token_mask))
+ goto again;
+
+ rcu_read_unlock();
+ return false;
+found:
+ rcu_read_unlock();
+ return true;
}
/**
@@ -152,45 +241,171 @@ int mptcp_token_new_accept(u32 token, struct sock *conn)
*/
struct mptcp_sock *mptcp_token_get_sock(u32 token)
{
- struct sock *conn;
-
- spin_lock_bh(&token_tree_lock);
- conn = radix_tree_lookup(&token_tree, token);
- if (conn) {
- /* token still reserved? */
- if (conn == (struct sock *)&token_used)
- conn = NULL;
- else
- sock_hold(conn);
+ struct hlist_nulls_node *pos;
+ struct token_bucket *bucket;
+ struct mptcp_sock *msk;
+ struct sock *sk;
+
+ rcu_read_lock();
+ bucket = token_bucket(token);
+
+again:
+ sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
+ msk = mptcp_sk(sk);
+ if (READ_ONCE(msk->token) != token)
+ continue;
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto not_found;
+ if (READ_ONCE(msk->token) != token) {
+ sock_put(sk);
+ goto again;
+ }
+ goto found;
+ }
+ if (get_nulls_value(pos) != (token & token_mask))
+ goto again;
+
+not_found:
+ msk = NULL;
+
+found:
+ rcu_read_unlock();
+ return msk;
+}
+EXPORT_SYMBOL_GPL(mptcp_token_get_sock);
+
+/**
+ * mptcp_token_iter_next - iterate over the token container from given pos
+ * @net: namespace to be iterated
+ * @s_slot: start slot number
+ * @s_num: start number inside the given slot
+ *
+ * This function returns the first mptcp connection structure found inside the
+ * token container starting from the specified position, or NULL.
+ *
+ * On successful iteration, the iterator is moved to the next position and a
+ * reference to the returned socket is acquired.
+ */
+struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
+ long *s_num)
+{
+ struct mptcp_sock *ret = NULL;
+ struct hlist_nulls_node *pos;
+ int slot, num;
+
+ for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) {
+ struct token_bucket *bucket = &token_hash[slot];
+ struct sock *sk;
+
+ num = 0;
+
+ if (hlist_nulls_empty(&bucket->msk_chain))
+ continue;
+
+ rcu_read_lock();
+ sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
+ ++num;
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num <= *s_num)
+ continue;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ continue;
+
+ if (!net_eq(sock_net(sk), net)) {
+ sock_put(sk);
+ continue;
+ }
+
+ ret = mptcp_sk(sk);
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
}
- spin_unlock_bh(&token_tree_lock);
- return mptcp_sk(conn);
+out:
+ *s_slot = slot;
+ *s_num = num;
+ return ret;
}
+EXPORT_SYMBOL_GPL(mptcp_token_iter_next);
/**
* mptcp_token_destroy_request - remove mptcp connection/token
- * @token: token of mptcp connection to remove
+ * @req: mptcp request socket dropping the token
*
- * Remove not-yet-fully-established incoming connection identified
- * by @token.
+ * Remove the token associated to @req.
*/
-void mptcp_token_destroy_request(u32 token)
+void mptcp_token_destroy_request(struct request_sock *req)
{
- spin_lock_bh(&token_tree_lock);
- radix_tree_delete(&token_req_tree, token);
- spin_unlock_bh(&token_tree_lock);
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_subflow_request_sock *pos;
+ struct token_bucket *bucket;
+
+ if (hlist_nulls_unhashed(&subflow_req->token_node))
+ return;
+
+ bucket = token_bucket(subflow_req->token);
+ spin_lock_bh(&bucket->lock);
+ pos = __token_lookup_req(bucket, subflow_req->token);
+ if (!WARN_ON_ONCE(pos != subflow_req)) {
+ hlist_nulls_del_init_rcu(&pos->token_node);
+ bucket->chain_len--;
+ }
+ spin_unlock_bh(&bucket->lock);
}
/**
* mptcp_token_destroy - remove mptcp connection/token
- * @token: token of mptcp connection to remove
+ * @msk: mptcp connection dropping the token
*
- * Remove the connection identified by @token.
+ * Remove the token associated to @msk
*/
-void mptcp_token_destroy(u32 token)
+void mptcp_token_destroy(struct mptcp_sock *msk)
{
- spin_lock_bh(&token_tree_lock);
- radix_tree_delete(&token_tree, token);
- spin_unlock_bh(&token_tree_lock);
+ struct token_bucket *bucket;
+ struct mptcp_sock *pos;
+
+ if (sk_unhashed((struct sock *)msk))
+ return;
+
+ bucket = token_bucket(msk->token);
+ spin_lock_bh(&bucket->lock);
+ pos = __token_lookup_msk(bucket, msk->token);
+ if (!WARN_ON_ONCE(pos != msk)) {
+ __sk_nulls_del_node_init_rcu((struct sock *)pos);
+ bucket->chain_len--;
+ }
+ spin_unlock_bh(&bucket->lock);
}
+
+void __init mptcp_token_init(void)
+{
+ int i;
+
+ token_hash = alloc_large_system_hash("MPTCP token",
+ sizeof(struct token_bucket),
+ 0,
+				       20, /* one slot per 1MB of memory */
+ HASH_ZERO,
+ NULL,
+ &token_mask,
+ 0,
+ 64 * 1024);
+ for (i = 0; i < token_mask + 1; ++i) {
+ INIT_HLIST_NULLS_HEAD(&token_hash[i].req_chain, i);
+ INIT_HLIST_NULLS_HEAD(&token_hash[i].msk_chain, i);
+ spin_lock_init(&token_hash[i].lock);
+ }
+}
+
+#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS)
+EXPORT_SYMBOL_GPL(mptcp_token_new_request);
+EXPORT_SYMBOL_GPL(mptcp_token_new_connect);
+EXPORT_SYMBOL_GPL(mptcp_token_accept);
+EXPORT_SYMBOL_GPL(mptcp_token_destroy_request);
+EXPORT_SYMBOL_GPL(mptcp_token_destroy);
+#endif
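
The token container introduced above is a classic power-of-two hash: token & token_mask selects a bucket, each bucket has its own spinlock plus separate nulls-terminated chains for request socks and established msks, and an insertion is refused when the chain already holds TOKEN_MAX_CHAIN_LEN entries or the token is in use, forcing the caller to retry with a fresh key. The sketch below (userspace C, hypothetical demo_* names, a calloc'ed array standing in for alloc_large_system_hash, no RCU or per-bucket locking) only illustrates the bucket selection and the busy check.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_CHAIN_MAX 4		/* mirrors TOKEN_MAX_CHAIN_LEN */

struct demo_node {
	uint32_t token;
	struct demo_node *next;
};

struct demo_bucket {
	int chain_len;
	struct demo_node *chain;
};

static struct demo_bucket *demo_hash;
static unsigned int demo_mask;		/* table size - 1, size is a power of two */

static struct demo_bucket *demo_bucket(uint32_t token)
{
	return &demo_hash[token & demo_mask];
}

/* loosely mirrors __token_bucket_busy(): reject the zero token,
 * over-long chains and tokens already present in the bucket
 */
static bool demo_bucket_busy(struct demo_bucket *b, uint32_t token)
{
	struct demo_node *n;

	if (!token || b->chain_len >= DEMO_CHAIN_MAX)
		return true;
	for (n = b->chain; n; n = n->next)
		if (n->token == token)
			return true;
	return false;
}

static int demo_token_insert(struct demo_node *node)
{
	struct demo_bucket *b = demo_bucket(node->token);

	if (demo_bucket_busy(b, node->token))
		return -1;		/* caller retries with a fresh key/token */
	node->next = b->chain;
	b->chain = node;
	b->chain_len++;
	return 0;
}

int main(void)
{
	struct demo_node a = { .token = 0xdeadbeef };

	demo_mask = (1u << 10) - 1;	/* 1024 buckets */
	demo_hash = calloc(demo_mask + 1, sizeof(*demo_hash));
	if (!demo_hash)
		return 1;

	printf("insert: %d\n", demo_token_insert(&a));	/* 0: stored */
	printf("again:  %d\n", demo_token_insert(&a));	/* -1: token busy */
	free(demo_hash);
	return 0;
}
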
diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c
new file mode 100644
index 000000000000..e1bd6f0a0676
--- /dev/null
+++ b/net/mptcp/token_test.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "protocol.h"
+
+static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test)
+{
+ struct mptcp_subflow_request_sock *req;
+
+ req = kunit_kzalloc(test, sizeof(struct mptcp_subflow_request_sock),
+ GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req);
+ mptcp_token_init_request((struct request_sock *)req);
+ return req;
+}
+
+static void mptcp_token_test_req_basic(struct kunit *test)
+{
+ struct mptcp_subflow_request_sock *req = build_req_sock(test);
+ struct mptcp_sock *null_msk = NULL;
+
+ KUNIT_ASSERT_EQ(test, 0,
+ mptcp_token_new_request((struct request_sock *)req));
+ KUNIT_EXPECT_NE(test, 0, (int)req->token);
+ KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token));
+
+ /* cleanup */
+ mptcp_token_destroy_request((struct request_sock *)req);
+}
+
+static struct inet_connection_sock *build_icsk(struct kunit *test)
+{
+ struct inet_connection_sock *icsk;
+
+ icsk = kunit_kzalloc(test, sizeof(struct inet_connection_sock),
+ GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, icsk);
+ return icsk;
+}
+
+static struct mptcp_subflow_context *build_ctx(struct kunit *test)
+{
+ struct mptcp_subflow_context *ctx;
+
+ ctx = kunit_kzalloc(test, sizeof(struct mptcp_subflow_context),
+ GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, ctx);
+ return ctx;
+}
+
+static struct mptcp_sock *build_msk(struct kunit *test)
+{
+ struct mptcp_sock *msk;
+
+ msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk);
+ refcount_set(&((struct sock *)msk)->sk_refcnt, 1);
+ return msk;
+}
+
+static void mptcp_token_test_msk_basic(struct kunit *test)
+{
+ struct inet_connection_sock *icsk = build_icsk(test);
+ struct mptcp_subflow_context *ctx = build_ctx(test);
+ struct mptcp_sock *msk = build_msk(test);
+ struct mptcp_sock *null_msk = NULL;
+ struct sock *sk;
+
+ rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+ ctx->conn = (struct sock *)msk;
+ sk = (struct sock *)msk;
+
+ KUNIT_ASSERT_EQ(test, 0,
+ mptcp_token_new_connect((struct sock *)icsk));
+ KUNIT_EXPECT_NE(test, 0, (int)ctx->token);
+ KUNIT_EXPECT_EQ(test, ctx->token, msk->token);
+ KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token));
+ KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt));
+
+ mptcp_token_destroy(msk);
+ KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token));
+}
+
+static void mptcp_token_test_accept(struct kunit *test)
+{
+ struct mptcp_subflow_request_sock *req = build_req_sock(test);
+ struct mptcp_sock *msk = build_msk(test);
+
+ KUNIT_ASSERT_EQ(test, 0,
+ mptcp_token_new_request((struct request_sock *)req));
+ msk->token = req->token;
+ mptcp_token_accept(req, msk);
+ KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token));
+
+ /* this is now a no-op */
+ mptcp_token_destroy_request((struct request_sock *)req);
+ KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token));
+
+ /* cleanup */
+ mptcp_token_destroy(msk);
+}
+
+static void mptcp_token_test_destroyed(struct kunit *test)
+{
+ struct mptcp_subflow_request_sock *req = build_req_sock(test);
+ struct mptcp_sock *msk = build_msk(test);
+ struct mptcp_sock *null_msk = NULL;
+ struct sock *sk;
+
+ sk = (struct sock *)msk;
+
+ KUNIT_ASSERT_EQ(test, 0,
+ mptcp_token_new_request((struct request_sock *)req));
+ msk->token = req->token;
+ mptcp_token_accept(req, msk);
+
+ /* simulate race on removal */
+ refcount_set(&sk->sk_refcnt, 0);
+ KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token));
+
+ /* cleanup */
+ mptcp_token_destroy(msk);
+}
+
+static struct kunit_case mptcp_token_test_cases[] = {
+ KUNIT_CASE(mptcp_token_test_req_basic),
+ KUNIT_CASE(mptcp_token_test_msk_basic),
+ KUNIT_CASE(mptcp_token_test_accept),
+ KUNIT_CASE(mptcp_token_test_destroyed),
+ {}
+};
+
+static struct kunit_suite mptcp_token_suite = {
+ .name = "mptcp-token",
+ .test_cases = mptcp_token_test_cases,
+};
+
+kunit_test_suite(mptcp_token_suite);
+
+MODULE_LICENSE("GPL");