Diffstat (limited to 'net/mptcp')
-rw-r--r--  net/mptcp/mib.c           8
-rw-r--r--  net/mptcp/mib.h           8
-rw-r--r--  net/mptcp/mptcp_diag.c    8
-rw-r--r--  net/mptcp/options.c      73
-rw-r--r--  net/mptcp/pm.c           46
-rw-r--r--  net/mptcp/pm_netlink.c  826
-rw-r--r--  net/mptcp/protocol.c    312
-rw-r--r--  net/mptcp/protocol.h    155
-rw-r--r--  net/mptcp/subflow.c     262
9 files changed, 1395 insertions, 303 deletions
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index b921cbdd9aaa..3780c29c321d 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -29,8 +29,16 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA),
SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR),
SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD),
+ SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD),
+ SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX),
+ SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX),
+ SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX),
+ SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX),
SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR),
SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
+ SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
+ SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 47bcecce1106..72afbc135f8e 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -22,8 +22,16 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */
MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */
MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */
+ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */
+ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */
+ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */
+ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */
+ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */
+ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */
MPTCP_MIB_RMADDR, /* Received RM_ADDR */
MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */
+ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */
+ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */
__MPTCP_MIB_MAX
};
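
The counters added above surface, like the existing MPTCP MIB entries, on the "MPTcpExt" name/value line pair of /proc/net/netstat (the same file nstat parses). A minimal userspace sketch to dump them, assuming only the usual netstat layout of matching name and value lines:

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <string.h>

int main(void)
{
	char names[4096], values[4096];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f)
		return 1;

	/* the file is made of matching name-line / value-line pairs */
	while (fgets(names, sizeof(names), f) &&
	       fgets(values, sizeof(values), f)) {
		char *np, *vp, *n, *v;

		if (strncmp(names, "MPTcpExt:", 9))
			continue;

		n = strtok_r(names + 9, " \n", &np);
		v = strtok_r(values + 9, " \n", &vp);
		while (n && v) {
			printf("%-28s %s\n", n, v);	/* e.g. "PortAdd 2" */
			n = strtok_r(NULL, " \n", &np);
			v = strtok_r(NULL, " \n", &vp);
		}
	}
	fclose(f);
	return 0;
}
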
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index b70ae4ba3000..f16d9b5ee978 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -128,11 +128,13 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
- info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
- val = READ_ONCE(msk->pm.add_addr_signal_max);
+ info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
+ info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk);
+ val = mptcp_pm_get_add_addr_signal_max(msk);
info->mptcpi_add_addr_signal_max = val;
- val = READ_ONCE(msk->pm.add_addr_accept_max);
+ val = mptcp_pm_get_add_addr_accept_max(msk);
info->mptcpi_add_addr_accepted_max = val;
+ info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk);
if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
flags |= MPTCP_INFO_FLAG_FALLBACK;
if (READ_ONCE(msk->can_ack))
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index e0d21c0607e5..b63574d6b812 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -282,6 +282,15 @@ static void mptcp_parse_option(const struct sk_buff *skb,
pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
break;
+ case MPTCPOPT_MP_PRIO:
+ if (opsize != TCPOLEN_MPTCP_PRIO)
+ break;
+
+ mp_opt->mp_prio = 1;
+ mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
+ pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
+ break;
+
case MPTCPOPT_MP_FASTCLOSE:
if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
break;
@@ -313,6 +322,7 @@ void mptcp_get_options(const struct sk_buff *skb,
mp_opt->port = 0;
mp_opt->rm_addr = 0;
mp_opt->dss = 0;
+ mp_opt->mp_prio = 0;
length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
@@ -498,8 +508,8 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ u64 snd_data_fin_enable, ack_seq;
unsigned int dss_size = 0;
- u64 snd_data_fin_enable;
struct mptcp_ext *mpext;
unsigned int ack_size;
bool ret = false;
@@ -531,13 +541,14 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return ret;
}
+ ack_seq = READ_ONCE(msk->ack_seq);
if (READ_ONCE(msk->use_64bit_ack)) {
ack_size = TCPOLEN_MPTCP_DSS_ACK64;
- opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
+ opts->ext_copy.data_ack = ack_seq;
opts->ext_copy.ack64 = 1;
} else {
ack_size = TCPOLEN_MPTCP_DSS_ACK32;
- opts->ext_copy.data_ack32 = (uint32_t)READ_ONCE(msk->ack_seq);
+ opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
opts->ext_copy.ack64 = 0;
}
opts->ext_copy.use_ack = 1;
@@ -679,6 +690,29 @@ static bool mptcp_established_options_rm_addr(struct sock *sk,
return true;
}
+static bool mptcp_established_options_mp_prio(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ if (!subflow->send_mp_prio)
+ return false;
+
+ /* account for the trailing 'nop' option */
+ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN)
+ return false;
+
+ *size = TCPOLEN_MPTCP_PRIO_ALIGN;
+ opts->suboptions |= OPTION_MPTCP_PRIO;
+ opts->backup = subflow->request_bkup;
+
+ pr_debug("prio=%d", opts->backup);
+
+ return true;
+}
+
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts)
@@ -721,6 +755,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
ret = true;
}
+ if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ }
+
return ret;
}
@@ -828,7 +868,7 @@ fully_established:
clear_3rdack_retransmission(ssk);
mptcp_pm_subflow_established(msk, subflow);
} else {
- mptcp_pm_fully_established(msk);
+ mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC);
}
return true;
@@ -879,8 +919,7 @@ static void ack_update_msk(struct mptcp_sock *msk,
msk->wnd_end = new_wnd_end;
/* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
- if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)) &&
- sk_stream_memory_free(ssk))
+ if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)))
__mptcp_check_push(sk, ssk);
if (after64(new_snd_una, old_snd_una)) {
@@ -986,6 +1025,10 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
mptcp_pm_del_add_timer(msk, &addr);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
}
+
+ if (mp_opt.port)
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD);
+
mp_opt.add_addr = 0;
}
@@ -994,6 +1037,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
mp_opt.rm_addr = 0;
}
+ if (mp_opt.mp_prio) {
+ mptcp_pm_mp_prio_received(sk, mp_opt.backup);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX);
+ mp_opt.mp_prio = 0;
+ }
+
if (!mp_opt.dss)
return;
@@ -1168,6 +1217,18 @@ mp_capable_done:
0, opts->rm_id);
}
+ if (OPTION_MPTCP_PRIO & opts->suboptions) {
+ const struct sock *ssk = (const struct sock *)tp;
+ struct mptcp_subflow_context *subflow;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->send_mp_prio = 0;
+
+ *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
+ TCPOLEN_MPTCP_PRIO,
+ opts->backup, TCPOPT_NOP);
+ }
+
if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_SYN,
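
The MP_PRIO suboption written above is packed into a single 32-bit word by the mptcp_option() helper, and since TCPOLEN_MPTCP_PRIO is only 3 bytes a trailing NOP pads it to the 4-byte TCPOLEN_MPTCP_PRIO_ALIGN that mptcp_established_options_mp_prio() checks for. A standalone sketch of that packing; the helper body and the constant values (kind 30, subtype 5, NOP 1, MPTCP_PRIO_BKUP 0x01) are assumptions taken from RFC 8684 and the kernel headers, not part of this patch:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_MPTCP		30
#define TCPOPT_NOP		1
#define MPTCPOPT_MP_PRIO	5
#define TCPOLEN_MPTCP_PRIO	3

/* assumed equivalent of the kernel's mptcp_option() packing helper */
static uint32_t mptcp_option(uint8_t subopt, uint8_t len, uint8_t nib, uint8_t field)
{
	return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
		     ((nib & 0xf) << 8) | field);
}

int main(void)
{
	/* backup flag set; the fourth byte is the padding NOP */
	uint32_t word = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO,
				     1, TCPOPT_NOP);
	const uint8_t *p = (const uint8_t *)&word;

	/* prints "1e 03 51 01": kind=30, len=3, subtype=5 with B=1, NOP.
	 * On receive, mptcp_parse_option() masks that third byte with
	 * MPTCP_PRIO_BKUP (0x01) to recover the backup flag.
	 */
	printf("%02x %02x %02x %02x\n", p[0], p[1], p[2], p[3]);
	return 0;
}
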
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index da2ed576f289..6fd4b2c1b076 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -20,6 +20,8 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk,
pr_debug("msk=%p, local_id=%d", msk, addr->id);
+ lockdep_assert_held(&msk->pm.lock);
+
if (add_addr) {
pr_warn("addr_signal error, add_addr=%d", add_addr);
return -EINVAL;
@@ -66,22 +68,26 @@ int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id)
/* path manager event handlers */
-void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
+void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
{
struct mptcp_pm_data *pm = &msk->pm;
pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
WRITE_ONCE(pm->server_side, server_side);
+ mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
}
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
+ unsigned int subflows_max;
int ret = 0;
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
- pm->subflows_max, READ_ONCE(pm->accept_subflow));
+ subflows_max, READ_ONCE(pm->accept_subflow));
/* try to avoid acquiring the lock below */
if (!READ_ONCE(pm->accept_subflow))
@@ -89,8 +95,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
spin_lock_bh(&pm->lock);
if (READ_ONCE(pm->accept_subflow)) {
- ret = pm->subflows < pm->subflows_max;
- if (ret && ++pm->subflows == pm->subflows_max)
+ ret = pm->subflows < subflows_max;
+ if (ret && ++pm->subflows == subflows_max)
WRITE_ONCE(pm->accept_subflow, false);
}
spin_unlock_bh(&pm->lock);
@@ -114,16 +120,13 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
return true;
}
-void mptcp_pm_fully_established(struct mptcp_sock *msk)
+void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp)
{
struct mptcp_pm_data *pm = &msk->pm;
+ bool announce = false;
pr_debug("msk=%p", msk);
- /* try to avoid acquiring the lock below */
- if (!READ_ONCE(pm->work_pending))
- return;
-
spin_lock_bh(&pm->lock);
/* mptcp_pm_fully_established() can be invoked by multiple
@@ -133,9 +136,15 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk)
if (READ_ONCE(pm->work_pending) &&
!(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
- msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
+ if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
+ announce = true;
+
+ msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
spin_unlock_bh(&pm->lock);
+
+ if (announce)
+ mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp);
}
void mptcp_pm_connection_closed(struct mptcp_sock *msk)
@@ -174,6 +183,8 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
READ_ONCE(pm->accept_addr));
+ mptcp_event_addr_announced(msk, addr);
+
spin_lock_bh(&pm->lock);
if (!READ_ONCE(pm->accept_addr)) {
@@ -188,8 +199,7 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
{
- if (!mptcp_pm_should_add_signal_ipv6(msk) &&
- !mptcp_pm_should_add_signal_port(msk))
+ if (!mptcp_pm_should_add_signal(msk))
return;
mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
@@ -201,12 +211,24 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id)
pr_debug("msk=%p remote_id=%d", msk, rm_id);
+ mptcp_event_addr_removed(msk, rm_id);
+
spin_lock_bh(&pm->lock);
mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED);
pm->rm_id = rm_id;
spin_unlock_bh(&pm->lock);
}
+void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
+ subflow->backup = bkup;
+
+ mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC);
+}
+
/* path manager helpers */
bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index a6d983d80576..8e8e35fa4002 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -26,6 +26,7 @@ struct mptcp_pm_addr_entry {
struct list_head list;
struct mptcp_addr_info addr;
struct rcu_head rcu;
+ struct socket *lsk;
};
struct mptcp_pm_add_entry {
@@ -36,6 +37,9 @@ struct mptcp_pm_add_entry {
u8 retrans_times;
};
+#define MAX_ADDR_ID 255
+#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG)
+
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
@@ -46,25 +50,33 @@ struct pm_nl_pernet {
unsigned int local_addr_max;
unsigned int subflows_max;
unsigned int next_id;
+ unsigned long id_bitmap[BITMAP_SZ];
};
#define MPTCP_PM_ADDR_MAX 8
#define ADD_ADDR_RETRANS_MAX 3
+static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk);
+
static bool addresses_equal(const struct mptcp_addr_info *a,
struct mptcp_addr_info *b, bool use_port)
{
bool addr_equals = false;
- if (a->family != b->family)
- return false;
-
- if (a->family == AF_INET)
- addr_equals = a->addr.s_addr == b->addr.s_addr;
+ if (a->family == b->family) {
+ if (a->family == AF_INET)
+ addr_equals = a->addr.s_addr == b->addr.s_addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- else
- addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
+ else
+ addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
+ } else if (a->family == AF_INET) {
+ if (ipv6_addr_v4mapped(&b->addr6))
+ addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3];
+ } else if (b->family == AF_INET) {
+ if (ipv6_addr_v4mapped(&a->addr6))
+ addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr;
#endif
+ }
if (!addr_equals)
return false;
@@ -81,14 +93,14 @@ static bool address_zero(const struct mptcp_addr_info *addr)
memset(&zero, 0, sizeof(zero));
zero.family = addr->family;
- return addresses_equal(addr, &zero, false);
+ return addresses_equal(addr, &zero, true);
}
static void local_address(const struct sock_common *skc,
struct mptcp_addr_info *addr)
{
- addr->port = 0;
addr->family = skc->skc_family;
+ addr->port = htons(skc->skc_num);
if (addr->family == AF_INET)
addr->addr.s_addr = skc->skc_rcv_saddr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -121,7 +133,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list,
skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
local_address(skc, &cur);
- if (addresses_equal(&cur, saddr, false))
+ if (addresses_equal(&cur, saddr, saddr->port))
return true;
}
@@ -133,6 +145,9 @@ select_local_address(const struct pm_nl_pernet *pernet,
struct mptcp_sock *msk)
{
struct mptcp_pm_addr_entry *entry, *ret = NULL;
+ struct sock *sk = (struct sock *)msk;
+
+ msk_owned_by_me(msk);
rcu_read_lock();
__mptcp_flush_join_list(msk);
@@ -140,11 +155,20 @@ select_local_address(const struct pm_nl_pernet *pernet,
if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
+ if (entry->addr.family != sk->sk_family) {
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if ((entry->addr.family == AF_INET &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ||
+ (sk->sk_family == AF_INET &&
+ !ipv6_addr_v4mapped(&entry->addr.addr6)))
+#endif
+ continue;
+ }
+
/* avoid any address already in use by subflows and
* pending join
*/
- if (entry->addr.family == ((struct sock *)msk)->sk_family &&
- !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
+ if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
ret = entry;
break;
}
@@ -177,11 +201,47 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
return ret;
}
+unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ return READ_ONCE(pernet->add_addr_signal_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
+
+unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ return READ_ONCE(pernet->add_addr_accept_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
+
+unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ return READ_ONCE(pernet->subflows_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
+
+unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ return READ_ONCE(pernet->local_addr_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
+
static void check_work_pending(struct mptcp_sock *msk)
{
- if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max &&
- (msk->pm.local_addr_used == msk->pm.local_addr_max ||
- msk->pm.subflows == msk->pm.subflows_max))
+ if (msk->pm.add_addr_signaled == mptcp_pm_get_add_addr_signal_max(msk) &&
+ (msk->pm.local_addr_used == mptcp_pm_get_local_addr_max(msk) ||
+ msk->pm.subflows == mptcp_pm_get_subflows_max(msk)))
WRITE_ONCE(msk->pm.work_pending, false);
}
@@ -191,14 +251,37 @@ lookup_anno_list_by_saddr(struct mptcp_sock *msk,
{
struct mptcp_pm_add_entry *entry;
+ lockdep_assert_held(&msk->pm.lock);
+
list_for_each_entry(entry, &msk->pm.anno_list, list) {
- if (addresses_equal(&entry->addr, addr, false))
+ if (addresses_equal(&entry->addr, addr, true))
return entry;
}
return NULL;
}
+bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
+{
+ struct mptcp_pm_add_entry *entry;
+ struct mptcp_addr_info saddr;
+ bool ret = false;
+
+ local_address((struct sock_common *)sk, &saddr);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(entry, &msk->pm.anno_list, list) {
+ if (addresses_equal(&entry->addr, &saddr, true)) {
+ ret = true;
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
static void mptcp_pm_add_timer(struct timer_list *timer)
{
struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
@@ -266,6 +349,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
struct sock *sk = (struct sock *)msk;
struct net *net = sock_net(sk);
+ lockdep_assert_held(&msk->pm.lock);
+
if (lookup_anno_list_by_saddr(msk, &entry->addr))
return false;
@@ -306,20 +391,26 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
- struct mptcp_addr_info remote = { 0 };
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *local;
+ unsigned int add_addr_signal_max;
+ unsigned int local_addr_max;
struct pm_nl_pernet *pernet;
+ unsigned int subflows_max;
pernet = net_generic(sock_net(sk), pm_nl_pernet_id);
+ add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk);
+ local_addr_max = mptcp_pm_get_local_addr_max(msk);
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
- msk->pm.local_addr_used, msk->pm.local_addr_max,
- msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max,
- msk->pm.subflows, msk->pm.subflows_max);
+ msk->pm.local_addr_used, local_addr_max,
+ msk->pm.add_addr_signaled, add_addr_signal_max,
+ msk->pm.subflows, subflows_max);
/* check first for announce */
- if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) {
+ if (msk->pm.add_addr_signaled < add_addr_signal_max) {
local = select_signal_address(pernet,
msk->pm.add_addr_signaled);
@@ -331,22 +422,23 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
}
} else {
/* pick failed, avoid further attempts later */
- msk->pm.local_addr_used = msk->pm.add_addr_signal_max;
+ msk->pm.local_addr_used = add_addr_signal_max;
}
check_work_pending(msk);
}
/* check if should create a new subflow */
- if (msk->pm.local_addr_used < msk->pm.local_addr_max &&
- msk->pm.subflows < msk->pm.subflows_max) {
- remote_address((struct sock_common *)sk, &remote);
-
+ if (msk->pm.local_addr_used < local_addr_max &&
+ msk->pm.subflows < subflows_max) {
local = select_local_address(pernet, msk);
if (local) {
+ struct mptcp_addr_info remote = { 0 };
+
msk->pm.local_addr_used++;
msk->pm.subflows++;
check_work_pending(msk);
+ remote_address((struct sock_common *)sk, &remote);
spin_unlock_bh(&msk->pm.lock);
__mptcp_subflow_connect(sk, &local->addr, &remote);
spin_lock_bh(&msk->pm.lock);
@@ -354,35 +446,40 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
}
/* lookup failed, avoid further attempts later */
- msk->pm.local_addr_used = msk->pm.local_addr_max;
+ msk->pm.local_addr_used = local_addr_max;
check_work_pending(msk);
}
}
-void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
+static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
{
mptcp_pm_create_subflow_or_signal_addr(msk);
}
-void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
+static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
{
mptcp_pm_create_subflow_or_signal_addr(msk);
}
-void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
+static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
+ unsigned int add_addr_accept_max;
struct mptcp_addr_info remote;
struct mptcp_addr_info local;
+ unsigned int subflows_max;
bool use_port = false;
+ add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
pr_debug("accepted %d:%d remote family %d",
- msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max,
+ msk->pm.add_addr_accepted, add_addr_accept_max,
msk->pm.remote.family);
msk->pm.add_addr_accepted++;
msk->pm.subflows++;
- if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max ||
- msk->pm.subflows >= msk->pm.subflows_max)
+ if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
+ msk->pm.subflows >= subflows_max)
WRITE_ONCE(msk->pm.accept_addr, false);
/* connect to the specified remote address, using whatever
@@ -404,12 +501,14 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
mptcp_pm_nl_add_addr_send_ack(msk);
}
-void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk)
+static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
- if (!mptcp_pm_should_add_signal_ipv6(msk) &&
- !mptcp_pm_should_add_signal_port(msk))
+ msk_owned_by_me(msk);
+ lockdep_assert_held(&msk->pm.lock);
+
+ if (!mptcp_pm_should_add_signal(msk))
return;
__mptcp_flush_join_list(msk);
@@ -419,10 +518,9 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk)
u8 add_addr;
spin_unlock_bh(&msk->pm.lock);
- if (mptcp_pm_should_add_signal_ipv6(msk))
- pr_debug("send ack for add_addr6");
- if (mptcp_pm_should_add_signal_port(msk))
- pr_debug("send ack for add_addr_port");
+ pr_debug("send ack for add_addr%s%s",
+ mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "",
+ mptcp_pm_should_add_signal_port(msk) ? " [port]" : "");
lock_sock(ssk);
tcp_send_ack(ssk);
@@ -438,13 +536,50 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk)
}
}
-void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ u8 bkup)
+{
+ struct mptcp_subflow_context *subflow;
+
+ pr_debug("bkup=%d", bkup);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info local;
+
+ local_address((struct sock_common *)ssk, &local);
+ if (!addresses_equal(&local, addr, addr->port))
+ continue;
+
+ subflow->backup = bkup;
+ subflow->send_mp_prio = 1;
+ subflow->request_bkup = bkup;
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIOTX);
+
+ spin_unlock_bh(&msk->pm.lock);
+ pr_debug("send ack for mp_prio");
+ lock_sock(ssk);
+ tcp_send_ack(ssk);
+ release_sock(ssk);
+ spin_lock_bh(&msk->pm.lock);
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *tmp;
struct sock *sk = (struct sock *)msk;
pr_debug("address rm_id %d", msk->pm.rm_id);
+ msk_owned_by_me(msk);
+
if (!msk->pm.rm_id)
return;
@@ -460,7 +595,7 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how);
- __mptcp_close_ssk(sk, ssk, subflow);
+ mptcp_close_ssk(sk, ssk, subflow);
spin_lock_bh(&msk->pm.lock);
msk->pm.add_addr_accepted--;
@@ -473,6 +608,39 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
}
}
+void mptcp_pm_nl_work(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ msk_owned_by_me(msk);
+
+ spin_lock_bh(&msk->pm.lock);
+
+ pr_debug("msk=%p status=%x", msk, pm->status);
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
+ mptcp_pm_nl_add_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
+ mptcp_pm_nl_add_addr_send_ack(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
+ mptcp_pm_nl_rm_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
+ mptcp_pm_nl_fully_established(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
+ mptcp_pm_nl_subflow_established(msk);
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+}
+
void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
{
struct mptcp_subflow_context *subflow, *tmp;
@@ -480,6 +648,8 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
pr_debug("subflow rm_id %d", rm_id);
+ msk_owned_by_me(msk);
+
if (!rm_id)
return;
@@ -495,7 +665,7 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how);
- __mptcp_close_ssk(sk, ssk, subflow);
+ mptcp_close_ssk(sk, ssk, subflow);
spin_lock_bh(&msk->pm.lock);
msk->pm.local_addr_used--;
@@ -518,16 +688,19 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
struct mptcp_pm_addr_entry *entry)
{
struct mptcp_pm_addr_entry *cur;
+ unsigned int addr_max;
int ret = -EINVAL;
spin_lock_bh(&pernet->lock);
/* to keep the code simple, don't do IDR-like allocation for address ID,
* just bail when we exceed limits
*/
- if (pernet->next_id > 255)
- goto out;
+ if (pernet->next_id == MAX_ADDR_ID)
+ pernet->next_id = 1;
if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
goto out;
+ if (test_bit(entry->addr.id, pernet->id_bitmap))
+ goto out;
/* do not insert duplicate address, differentiate on port only
* singled addresses
@@ -539,12 +712,34 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
goto out;
}
- if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
- pernet->add_addr_signal_max++;
- if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
- pernet->local_addr_max++;
+ if (!entry->addr.id) {
+find_next:
+ entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
+ MAX_ADDR_ID + 1,
+ pernet->next_id);
+ if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) &&
+ pernet->next_id != 1) {
+ pernet->next_id = 1;
+ goto find_next;
+ }
+ }
+
+ if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID)
+ goto out;
+
+ __set_bit(entry->addr.id, pernet->id_bitmap);
+ if (entry->addr.id > pernet->next_id)
+ pernet->next_id = entry->addr.id;
+
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ addr_max = pernet->add_addr_signal_max;
+ WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1);
+ }
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ addr_max = pernet->local_addr_max;
+ WRITE_ONCE(pernet->local_addr_max, addr_max + 1);
+ }
- entry->addr.id = pernet->next_id++;
pernet->addrs++;
list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
ret = entry->addr.id;
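
Address IDs are now handed out from a per-netns bitmap (id_bitmap) rather than from a monotonically increasing counter, so IDs released by a later del-addr can be recycled, and an explicitly requested entry->addr.id is honoured when it is still free. A userspace sketch of the wrap-around search the hunk above performs with find_next_zero_bit(), using a plain bool array instead of a bitmap (not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ADDR_ID	255

static unsigned int pick_id(const bool *used, unsigned int next_id)
{
	unsigned int id;

retry:
	for (id = next_id; id <= MAX_ADDR_ID; id++)
		if (!used[id])
			return id;
	if (next_id != 1) {		/* reached the top: retry once from 1 */
		next_id = 1;
		goto retry;
	}
	return 0;			/* nothing free, caller bails out */
}

int main(void)
{
	bool used[MAX_ADDR_ID + 1] = { [254] = true, [255] = true };

	/* with IDs 254-255 taken and next_id pointing past the free ones,
	 * the search wraps and hands out ID 1
	 */
	printf("%u\n", pick_id(used, 254));	/* prints 1 */
	return 0;
}
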
@@ -554,6 +749,53 @@ out:
return ret;
}
+static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct sockaddr_storage addr;
+ struct mptcp_sock *msk;
+ struct socket *ssock;
+ int backlog = 1024;
+ int err;
+
+ err = sock_create_kern(sock_net(sk), entry->addr.family,
+ SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk);
+ if (err)
+ return err;
+
+ msk = mptcp_sk(entry->lsk->sk);
+ if (!msk) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
+ err = kernel_bind(ssock, (struct sockaddr *)&addr,
+ sizeof(struct sockaddr_in));
+ if (err) {
+ pr_warn("kernel_bind error, err=%d", err);
+ goto out;
+ }
+
+ err = kernel_listen(ssock, backlog);
+ if (err) {
+ pr_warn("kernel_listen error, err=%d", err);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ sock_release(entry->lsk);
+ return err;
+}
+
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
struct mptcp_pm_addr_entry *entry;
@@ -580,7 +822,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (addresses_equal(&entry->addr, &skc_local, false)) {
+ if (addresses_equal(&entry->addr, &skc_local, entry->addr.port)) {
ret = entry->addr.id;
break;
}
@@ -597,6 +839,9 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
entry->addr = skc_local;
entry->addr.ifindex = 0;
entry->addr.flags = 0;
+ entry->addr.id = 0;
+ entry->addr.port = 0;
+ entry->lsk = NULL;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0)
kfree(entry);
@@ -607,26 +852,23 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
- struct pm_nl_pernet *pernet;
bool subflows;
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
-
- pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max);
- pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max);
- pm->local_addr_max = READ_ONCE(pernet->local_addr_max);
- pm->subflows_max = READ_ONCE(pernet->subflows_max);
- subflows = !!pm->subflows_max;
- WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) ||
- !!pm->add_addr_signal_max);
- WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows);
+ subflows = !!mptcp_pm_get_subflows_max(msk);
+ WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows) ||
+ !!mptcp_pm_get_add_addr_signal_max(msk));
+ WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows);
WRITE_ONCE(pm->accept_subflow, subflows);
}
-#define MPTCP_PM_CMD_GRP_OFFSET 0
+#define MPTCP_PM_CMD_GRP_OFFSET 0
+#define MPTCP_PM_EV_GRP_OFFSET 1
static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
[MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, },
+ [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
};
static const struct nla_policy
@@ -722,6 +964,9 @@ skip_family:
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+ if (tb[MPTCP_PM_ADDR_ATTR_PORT])
+ entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
+
return 0;
}
@@ -730,6 +975,31 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
return net_generic(genl_info_net(info), pm_nl_pernet_id);
}
+static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
+{
+ struct mptcp_sock *msk;
+ long s_slot = 0, s_num = 0;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (!READ_ONCE(msk->fully_established))
+ goto next;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
@@ -748,13 +1018,25 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
}
*entry = addr;
+ if (entry->addr.port) {
+ ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
+ if (ret) {
+ GENL_SET_ERR_MSG(info, "create listen socket error");
+ kfree(entry);
+ return ret;
+ }
+ }
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0) {
GENL_SET_ERR_MSG(info, "too many addresses or duplicate one");
+ if (entry->lsk)
+ sock_release(entry->lsk);
kfree(entry);
return ret;
}
+ mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk));
+
return 0;
}
@@ -832,11 +1114,44 @@ next:
return 0;
}
+struct addr_entry_release_work {
+ struct rcu_work rwork;
+ struct mptcp_pm_addr_entry *entry;
+};
+
+static void mptcp_pm_release_addr_entry(struct work_struct *work)
+{
+ struct addr_entry_release_work *w;
+ struct mptcp_pm_addr_entry *entry;
+
+ w = container_of(to_rcu_work(work), struct addr_entry_release_work, rwork);
+ entry = w->entry;
+ if (entry) {
+ if (entry->lsk)
+ sock_release(entry->lsk);
+ kfree(entry);
+ }
+ kfree(w);
+}
+
+static void mptcp_pm_free_addr_entry(struct mptcp_pm_addr_entry *entry)
+{
+ struct addr_entry_release_work *w;
+
+ w = kmalloc(sizeof(*w), GFP_ATOMIC);
+ if (w) {
+ INIT_RCU_WORK(&w->rwork, mptcp_pm_release_addr_entry);
+ w->entry = entry;
+ queue_rcu_work(system_wq, &w->rwork);
+ }
+}
+
static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
+ unsigned int addr_max;
int ret;
ret = mptcp_pm_parse_addr(attr, info, false, &addr);
@@ -850,17 +1165,22 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
spin_unlock_bh(&pernet->lock);
return -EINVAL;
}
- if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
- pernet->add_addr_signal_max--;
- if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
- pernet->local_addr_max--;
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ addr_max = pernet->add_addr_signal_max;
+ WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1);
+ }
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ addr_max = pernet->local_addr_max;
+ WRITE_ONCE(pernet->local_addr_max, addr_max - 1);
+ }
pernet->addrs--;
list_del_rcu(&entry->list);
+ __clear_bit(entry->addr.id, pernet->id_bitmap);
spin_unlock_bh(&pernet->lock);
mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr);
- kfree_rcu(entry, rcu);
+ mptcp_pm_free_addr_entry(entry);
return ret;
}
@@ -874,15 +1194,15 @@ static void __flush_addrs(struct net *net, struct list_head *list)
struct mptcp_pm_addr_entry, list);
mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr);
list_del_rcu(&cur->list);
- kfree_rcu(cur, rcu);
+ mptcp_pm_free_addr_entry(cur);
}
}
static void __reset_counters(struct pm_nl_pernet *pernet)
{
- pernet->add_addr_signal_max = 0;
- pernet->add_addr_accept_max = 0;
- pernet->local_addr_max = 0;
+ WRITE_ONCE(pernet->add_addr_signal_max, 0);
+ WRITE_ONCE(pernet->add_addr_accept_max, 0);
+ WRITE_ONCE(pernet->local_addr_max, 0);
pernet->addrs = 0;
}
@@ -894,6 +1214,8 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
spin_lock_bh(&pernet->lock);
list_splice_init(&pernet->local_addr_list, &free_list);
__reset_counters(pernet);
+ pernet->next_id = 1;
+ bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1);
spin_unlock_bh(&pernet->lock);
__flush_addrs(sock_net(skb->sk), &free_list);
return 0;
@@ -911,6 +1233,8 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb,
if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family))
goto nla_put_failure;
+ if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port)))
+ goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
goto nla_put_failure;
if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags))
@@ -994,27 +1318,34 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
struct pm_nl_pernet *pernet;
int id = cb->args[0];
void *hdr;
+ int i;
pernet = net_generic(net, pm_nl_pernet_id);
spin_lock_bh(&pernet->lock);
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if (entry->addr.id <= id)
- continue;
-
- hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, &mptcp_genl_family,
- NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
- if (!hdr)
- break;
+ for (i = id; i < MAX_ADDR_ID + 1; i++) {
+ if (test_bit(i, pernet->id_bitmap)) {
+ entry = __lookup_addr_by_id(pernet, i);
+ if (!entry)
+ break;
+
+ if (entry->addr.id <= id)
+ continue;
+
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &mptcp_genl_family,
+ NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
+ if (!hdr)
+ break;
+
+ if (mptcp_nl_fill_addr(msg, entry) < 0) {
+ genlmsg_cancel(msg, hdr);
+ break;
+ }
- if (mptcp_nl_fill_addr(msg, entry) < 0) {
- genlmsg_cancel(msg, hdr);
- break;
+ id = entry->addr.id;
+ genlmsg_end(msg, hdr);
}
-
- id = entry->addr.id;
- genlmsg_end(msg, hdr);
}
spin_unlock_bh(&pernet->lock);
@@ -1096,6 +1427,321 @@ fail:
return -EMSGSIZE;
}
+static int mptcp_nl_addr_backup(struct net *net,
+ struct mptcp_addr_info *addr,
+ u8 bkup)
+{
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (list_empty(&msk->conn_list))
+ goto next;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return ret;
+}
+
+static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct net *net = sock_net(skb->sk);
+ u8 bkup = 0;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, true, &addr);
+ if (ret < 0)
+ return ret;
+
+ if (addr.addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
+ bkup = 1;
+
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if (addresses_equal(&entry->addr, &addr.addr, true)) {
+ ret = mptcp_nl_addr_backup(net, &entry->addr, bkup);
+ if (ret)
+ return ret;
+
+ if (bkup)
+ entry->addr.flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+ else
+ entry->addr.flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+ }
+ }
+
+ return 0;
+}
+
+static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp)
+{
+ genlmsg_multicast_netns(&mptcp_genl_family, net,
+ nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp);
+}
+
+static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk)
+{
+ const struct inet_sock *issk = inet_sk(ssk);
+ const struct mptcp_subflow_context *sf;
+
+ if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family))
+ return -EMSGSIZE;
+
+ switch (ssk->sk_family) {
+ case AF_INET:
+ if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr))
+ return -EMSGSIZE;
+ if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr))
+ return -EMSGSIZE;
+ break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ case AF_INET6: {
+ const struct ipv6_pinfo *np = inet6_sk(ssk);
+
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr))
+ return -EMSGSIZE;
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr))
+ return -EMSGSIZE;
+ break;
+ }
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ return -EMSGSIZE;
+ }
+
+ if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport))
+ return -EMSGSIZE;
+ if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport))
+ return -EMSGSIZE;
+
+ sf = mptcp_subflow_ctx(ssk);
+ if (WARN_ON_ONCE(!sf))
+ return -EINVAL;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, sf->local_id))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int mptcp_event_put_token_and_ssk(struct sk_buff *skb,
+ const struct mptcp_sock *msk,
+ const struct sock *ssk)
+{
+ const struct sock *sk = (const struct sock *)msk;
+ const struct mptcp_subflow_context *sf;
+ u8 sk_err;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ return -EMSGSIZE;
+
+ if (mptcp_event_add_subflow(skb, ssk))
+ return -EMSGSIZE;
+
+ sf = mptcp_subflow_ctx(ssk);
+ if (WARN_ON_ONCE(!sf))
+ return -EINVAL;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup))
+ return -EMSGSIZE;
+
+ if (ssk->sk_bound_dev_if &&
+ nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if))
+ return -EMSGSIZE;
+
+ sk_err = ssk->sk_err;
+ if (sk_err && sk->sk_state == TCP_ESTABLISHED &&
+ nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int mptcp_event_sub_established(struct sk_buff *skb,
+ const struct mptcp_sock *msk,
+ const struct sock *ssk)
+{
+ return mptcp_event_put_token_and_ssk(skb, msk, ssk);
+}
+
+static int mptcp_event_sub_closed(struct sk_buff *skb,
+ const struct mptcp_sock *msk,
+ const struct sock *ssk)
+{
+ if (mptcp_event_put_token_and_ssk(skb, msk, ssk))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int mptcp_event_created(struct sk_buff *skb,
+ const struct mptcp_sock *msk,
+ const struct sock *ssk)
+{
+ int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token);
+
+ if (err)
+ return err;
+
+ return mptcp_event_add_subflow(skb, ssk);
+}
+
+void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id)
+{
+ struct net *net = sock_net((const struct sock *)msk);
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+
+ if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED);
+ if (!nlh)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id))
+ goto nla_put_failure;
+
+ genlmsg_end(skb, nlh);
+ mptcp_nl_mcast_send(net, skb, GFP_ATOMIC);
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+}
+
+void mptcp_event_addr_announced(const struct mptcp_sock *msk,
+ const struct mptcp_addr_info *info)
+{
+ struct net *net = sock_net((const struct sock *)msk);
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+
+ if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0,
+ MPTCP_EVENT_ANNOUNCED);
+ if (!nlh)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id))
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port))
+ goto nla_put_failure;
+
+ switch (info->family) {
+ case AF_INET:
+ if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr))
+ goto nla_put_failure;
+ break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ case AF_INET6:
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6))
+ goto nla_put_failure;
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(skb, nlh);
+ mptcp_nl_mcast_send(net, skb, GFP_ATOMIC);
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+}
+
+void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
+ const struct sock *ssk, gfp_t gfp)
+{
+ struct net *net = sock_net((const struct sock *)msk);
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+
+ if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!skb)
+ return;
+
+ nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type);
+ if (!nlh)
+ goto nla_put_failure;
+
+ switch (type) {
+ case MPTCP_EVENT_UNSPEC:
+ WARN_ON_ONCE(1);
+ break;
+ case MPTCP_EVENT_CREATED:
+ case MPTCP_EVENT_ESTABLISHED:
+ if (mptcp_event_created(skb, msk, ssk) < 0)
+ goto nla_put_failure;
+ break;
+ case MPTCP_EVENT_CLOSED:
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0)
+ goto nla_put_failure;
+ break;
+ case MPTCP_EVENT_ANNOUNCED:
+ case MPTCP_EVENT_REMOVED:
+ /* call mptcp_event_addr_announced()/removed instead */
+ WARN_ON_ONCE(1);
+ break;
+ case MPTCP_EVENT_SUB_ESTABLISHED:
+ case MPTCP_EVENT_SUB_PRIORITY:
+ if (mptcp_event_sub_established(skb, msk, ssk) < 0)
+ goto nla_put_failure;
+ break;
+ case MPTCP_EVENT_SUB_CLOSED:
+ if (mptcp_event_sub_closed(skb, msk, ssk) < 0)
+ goto nla_put_failure;
+ break;
+ }
+
+ genlmsg_end(skb, nlh);
+ mptcp_nl_mcast_send(net, skb, gfp);
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+}
+
static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_ADD_ADDR,
@@ -1126,6 +1772,11 @@ static const struct genl_small_ops mptcp_pm_ops[] = {
.cmd = MPTCP_PM_CMD_GET_LIMITS,
.doit = mptcp_nl_cmd_get_limits,
},
+ {
+ .cmd = MPTCP_PM_CMD_SET_FLAGS,
+ .doit = mptcp_nl_cmd_set_flags,
+ .flags = GENL_ADMIN_PERM,
+ },
};
static struct genl_family mptcp_genl_family __ro_after_init = {
@@ -1148,6 +1799,7 @@ static int __net_init pm_nl_init_net(struct net *net)
INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
__reset_counters(pernet);
pernet->next_id = 1;
+ bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1);
spin_lock_init(&pernet->lock);
return 0;
}
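
The mptcp_event*() helpers added to this file multicast path-manager events (connection created/established/closed, address announced/removed, subflow established/closed/priority change) on the new MPTCP_PM_EV_GRP_NAME group; GENL_UNS_ADMIN_PERM means a listener needs CAP_NET_ADMIN in its network namespace. A minimal monitoring sketch, assuming libnl-3/libnl-genl-3 and the uapi names "mptcp_pm" (family) and "mptcp_pm_events" (group):

#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <stdio.h>

static int on_event(struct nl_msg *msg, void *arg)
{
	const struct genlmsghdr *ghdr = nlmsg_data(nlmsg_hdr(msg));

	/* ghdr->cmd is the mptcp_event_type (CREATED, ESTABLISHED,
	 * ANNOUNCED, SUB_PRIORITY, ...); MPTCP_ATTR_* attributes carrying
	 * token, addresses and ports follow the genl header.
	 */
	printf("mptcp_pm event: cmd=%u\n", ghdr->cmd);
	return NL_OK;
}

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	int grp;

	if (!sk || genl_connect(sk))
		return 1;

	grp = genl_ctrl_resolve_grp(sk, "mptcp_pm", "mptcp_pm_events");
	if (grp < 0 || nl_socket_add_membership(sk, grp))
		return 1;

	/* multicast notifications do not carry matching sequence numbers */
	nl_socket_disable_seq_check(sk);
	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, on_event, NULL);

	for (;;)
		nl_recvmsgs_default(sk);
}
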
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f998a077c7dd..a57f3eab7b6a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -45,11 +45,14 @@ static struct percpu_counter mptcp_sockets_allocated;
static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+static struct net_device mptcp_napi_dev;
+
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
*/
-static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
+struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
if (!msk->subflow || READ_ONCE(msk->can_ack))
return NULL;
@@ -114,11 +117,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
list_add(&subflow->node, &msk->conn_list);
sock_hold(ssock->sk);
subflow->request_mptcp = 1;
-
- /* accept() will wait on first subflow sk_wq, and we always wakes up
- * via msk->sk_socket
- */
- RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);
+ mptcp_sock_graft(msk->first, sk->sk_socket);
return 0;
}
@@ -364,8 +363,6 @@ static void mptcp_check_data_fin_ack(struct sock *sk)
/* Look for an acknowledged DATA_FIN */
if (mptcp_pending_data_fin_ack(sk)) {
- mptcp_stop_timer(sk);
-
WRITE_ONCE(msk->snd_data_fin_enable, 0);
switch (sk->sk_state) {
@@ -459,7 +456,18 @@ static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
{
struct sock *ack_hint = READ_ONCE(msk->ack_hint);
+ int old_space = READ_ONCE(msk->old_wspace);
struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ bool cleanup;
+
+ /* this is a simple superset of what tcp_cleanup_rbuf() implements
+ * so that we don't have to acquire the ssk socket lock most of the time
+ * to do actually nothing
+ */
+ cleanup = __mptcp_space(sk) - old_space >= max(0, old_space);
+ if (!cleanup)
+ return;
/* if the hinted ssk is still active, try to use it */
if (likely(ack_hint)) {
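
In practice the new check means the per-subflow cleanup is attempted only once the MPTCP-level receive space has at least doubled (when old_wspace is positive) since the value cached in msk->old_wspace: with old_wspace at 128 KiB, for example, nothing is done until __mptcp_space() reports 256 KiB or more, so the common receive path avoids taking the ssk lock just to find there is no window update worth sending.
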
@@ -734,10 +742,14 @@ wake:
void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
+ struct mptcp_subflow_context *subflow;
+
if (likely(list_empty(&msk->join_list)))
return;
spin_lock_bh(&msk->join_list_lock);
+ list_for_each_entry(subflow, &msk->join_list, node)
+ mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
list_splice_tail_init(&msk->join_list, &msk->conn_list);
spin_unlock_bh(&msk->join_list_lock);
}
@@ -1037,13 +1049,6 @@ out:
__mptcp_update_wmem(sk);
sk_mem_reclaim_partial(sk);
}
-
- if (sk_stream_is_writeable(sk)) {
- /* pairs with memory barrier in mptcp_poll */
- smp_mb();
- if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
- sk_stream_write_space(sk);
- }
}
if (snd_una == READ_ONCE(msk->snd_nxt)) {
@@ -1362,8 +1367,7 @@ struct subflow_send_info {
u64 ratio;
};
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
- u32 *sndbuf)
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
@@ -1374,24 +1378,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
sock_owned_by_me((struct sock *)msk);
- *sndbuf = 0;
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
- *sndbuf = msk->first->sk_sndbuf;
return sk_stream_memory_free(msk->first) ? msk->first : NULL;
}
/* re-use last subflow, if the burst allow that */
if (msk->last_snd && msk->snd_burst > 0 &&
sk_stream_memory_free(msk->last_snd) &&
- mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
- mptcp_for_each_subflow(msk, subflow) {
- ssk = mptcp_subflow_tcp_sock(subflow);
- *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
- }
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd)))
return msk->last_snd;
- }
/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < 2; ++i) {
@@ -1404,8 +1401,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
continue;
nr_active += !subflow->backup;
- *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
- if (!sk_stream_memory_free(subflow->tcp_sock))
+ if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
continue;
pace = READ_ONCE(ssk->sk_pacing_rate);
@@ -1431,9 +1427,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
if (send_info[0].ssk) {
msk->last_snd = send_info[0].ssk;
msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
- sk_stream_wspace(msk->last_snd));
+ tcp_sk(msk->last_snd)->snd_wnd);
return msk->last_snd;
}
+
return NULL;
}
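
With the sndbuf autotuning removed from this path, the scheduler now also skips subflows whose TCP send window is zero and caps the transmit burst with tcp_sk(ssk)->snd_wnd rather than the socket write space; among the remaining candidates it still prefers, roughly, the subflow that can drain its queued data the fastest given its pacing rate. For example, 64 KiB queued at a 100 MB/s pacing rate (about 0.7 ms to drain) wins over 16 KiB queued at 10 MB/s (about 1.6 ms).
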
@@ -1454,7 +1451,6 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
};
struct mptcp_data_frag *dfrag;
int len, copied = 0;
- u32 sndbuf;
while ((dfrag = mptcp_send_head(sk))) {
info.sent = dfrag->already_sent;
@@ -1465,12 +1461,7 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
prev_ssk = ssk;
__mptcp_flush_join_list(msk);
- ssk = mptcp_subflow_get_send(msk, &sndbuf);
-
- /* do auto tuning */
- if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
- sndbuf > READ_ONCE(sk->sk_sndbuf))
- WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+ ssk = mptcp_subflow_get_send(msk);
/* try to keep the subflow socket lock across
* consecutive xmit on the same socket
@@ -1527,7 +1518,9 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_sendmsg_info info;
struct mptcp_data_frag *dfrag;
+ struct sock *xmit_ssk;
int len, copied = 0;
+ bool first = true;
info.flags = 0;
while ((dfrag = mptcp_send_head(sk))) {
@@ -1537,10 +1530,17 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
while (len > 0) {
int ret = 0;
- /* do auto tuning */
- if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
- ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
- WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+ /* the caller already invoked the packet scheduler,
+ * check for a different subflow usage only after
+ * spooling the first chunk of data
+ */
+ xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
+ if (!xmit_ssk)
+ goto out;
+ if (xmit_ssk != ssk) {
+ mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
+ goto out;
+ }
if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
__mptcp_update_wmem(sk);
@@ -1560,6 +1560,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
msk->tx_pending_data -= ret;
copied += ret;
len -= ret;
+ first = false;
}
WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
@@ -1573,12 +1574,24 @@ out:
mptcp_set_timeout(sk, ssk);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+
if (msk->snd_data_fin_enable &&
msk->snd_nxt + 1 == msk->write_seq)
mptcp_schedule_work(sk);
}
}
+static void mptcp_set_nospace(struct sock *sk)
+{
+ /* enable autotune */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* will be cleared on avail space */
+ set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1680,7 +1693,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
continue;
wait_for_memory:
- set_bit(MPTCP_NOSPACE, &msk->flags);
+ mptcp_set_nospace(sk);
mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
@@ -1867,7 +1880,7 @@ static void __mptcp_splice_receive_queue(struct sock *sk)
skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
}
-static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
+static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
unsigned int moved = 0;
@@ -1887,13 +1900,10 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
slowpath = lock_sock_fast(ssk);
mptcp_data_lock(sk);
+ __mptcp_update_rmem(sk);
done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
mptcp_data_unlock(sk);
- if (moved && rcv) {
- WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
- tcp_cleanup_rbuf(ssk, 1);
- WRITE_ONCE(msk->rmem_pending, 0);
- }
+ tcp_cleanup_rbuf(ssk, moved);
unlock_sock_fast(ssk, slowpath);
} while (!done);
@@ -1906,6 +1916,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
ret |= __mptcp_ofo_queue(msk);
__mptcp_splice_receive_queue(sk);
mptcp_data_unlock(sk);
+ mptcp_cleanup_rbuf(msk);
}
if (ret)
mptcp_check_data_fin((struct sock *)msk);
@@ -1935,7 +1946,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
while (copied < len) {
- int bytes_read, old_space;
+ int bytes_read;
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
if (unlikely(bytes_read < 0)) {
@@ -1946,14 +1957,11 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
copied += bytes_read;
- if (skb_queue_empty(&msk->receive_queue) &&
- __mptcp_move_skbs(msk, len - copied))
- continue;
-
/* be sure to advertise window change */
- old_space = READ_ONCE(msk->old_wspace);
- if ((tcp_space(sk) - old_space) >= old_space)
- mptcp_cleanup_rbuf(msk);
+ mptcp_cleanup_rbuf(msk);
+
+ if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk))
+ continue;
/* only the master socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
@@ -1981,7 +1989,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
/* race breaker: the shutdown could be after the
* previous receive queue check
*/
- if (__mptcp_move_skbs(msk, len - copied))
+ if (__mptcp_move_skbs(msk))
continue;
break;
}
@@ -2014,7 +2022,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
/* .. race-breaker: ssk might have gotten new data
* after last __mptcp_move_skbs() returned false.
*/
- if (unlikely(__mptcp_move_skbs(msk, 0)))
+ if (unlikely(__mptcp_move_skbs(msk)))
set_bit(MPTCP_DATA_READY, &msk->flags);
} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
/* data to read but mptcp_wait_data() cleared DATA_READY */
@@ -2113,12 +2121,9 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
* so we need to use tcp_close() after detaching them from the mptcp
* parent socket.
*/
-void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
- struct mptcp_subflow_context *subflow)
+static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow)
{
- bool dispose_socket = false;
- struct socket *sock;
-
list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
@@ -2126,11 +2131,8 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
/* if we are invoked by the msk cleanup code, the subflow is
* already orphaned
*/
- sock = ssk->sk_socket;
- if (sock) {
- dispose_socket = sock != sk->sk_socket;
+ if (ssk->sk_socket)
sock_orphan(ssk);
- }
subflow->disposable = 1;
@@ -2148,59 +2150,40 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
__sock_put(ssk);
}
release_sock(ssk);
- if (dispose_socket)
- iput(SOCK_INODE(sock));
sock_put(ssk);
}
-static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
+void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow)
{
- return 0;
+ if (sk->sk_state == TCP_ESTABLISHED)
+ mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
+ __mptcp_close_ssk(sk, ssk, subflow);
}
-static void pm_work(struct mptcp_sock *msk)
+static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
- struct mptcp_pm_data *pm = &msk->pm;
-
- spin_lock_bh(&msk->pm.lock);
-
- pr_debug("msk=%p status=%x", msk, pm->status);
- if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
- pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
- mptcp_pm_nl_add_addr_received(msk);
- }
- if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
- pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
- mptcp_pm_nl_add_addr_send_ack(msk);
- }
- if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
- pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
- mptcp_pm_nl_rm_addr_received(msk);
- }
- if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
- pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
- mptcp_pm_nl_fully_established(msk);
- }
- if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
- pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
- mptcp_pm_nl_subflow_established(msk);
- }
-
- spin_unlock_bh(&msk->pm.lock);
+ return 0;
}
static void __mptcp_close_subflow(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *tmp;
+ might_sleep();
+
list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (inet_sk_state_load(ssk) != TCP_CLOSE)
continue;
- __mptcp_close_ssk((struct sock *)msk, ssk, subflow);
+ /* 'subflow_data_ready' will re-sched once rx queue is empty */
+ if (!skb_queue_empty_lockless(&ssk->sk_receive_queue))
+ continue;
+
+ mptcp_close_ssk((struct sock *)msk, ssk, subflow);
}
}
@@ -2272,11 +2255,8 @@ static void mptcp_worker(struct work_struct *work)
mptcp_check_fastclose(msk);
- if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
- __mptcp_close_subflow(msk);
-
if (msk->pm.status)
- pm_work(msk);
+ mptcp_pm_nl_work(msk);
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
@@ -2296,9 +2276,13 @@ static void mptcp_worker(struct work_struct *work)
goto unlock;
}
+ if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ __mptcp_close_subflow(msk);
+
if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
goto unlock;
+ __mptcp_clean_una(sk);
dfrag = mptcp_rtx_head(sk);
if (!dfrag)
goto unlock;
@@ -2536,6 +2520,14 @@ static void __mptcp_destroy_sock(struct sock *sk)
pr_debug("msk=%p", msk);
+ might_sleep();
+
+ /* dispose the ancillary tcp socket, if any */
+ if (msk->subflow) {
+ iput(SOCK_INODE(msk->subflow));
+ msk->subflow = NULL;
+ }
+
/* be sure to always acquire the join list lock, to sync vs
* mptcp_finish_join().
*/
@@ -2586,20 +2578,10 @@ cleanup:
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- bool slow, dispose_socket;
- struct socket *sock;
+ bool slow = lock_sock_fast(ssk);
- slow = lock_sock_fast(ssk);
- sock = ssk->sk_socket;
- dispose_socket = sock && sock != sk->sk_socket;
sock_orphan(ssk);
unlock_sock_fast(ssk, slow);
-
- /* for the outgoing subflows we additionally need to free
- * the associated socket
- */
- if (dispose_socket)
- iput(SOCK_INODE(sock));
}
sock_orphan(sk);
@@ -2614,6 +2596,10 @@ cleanup:
release_sock(sk);
if (do_cancel_work)
mptcp_cancel_work(sk);
+
+ if (mptcp_sk(sk)->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+
sock_put(sk);
}
@@ -2928,10 +2914,16 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
if (!mptcp_send_head(sk))
return;
- if (!sock_owned_by_user(sk))
- __mptcp_subflow_push_pending(sk, ssk);
- else
+ if (!sock_owned_by_user(sk)) {
+ struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
+
+ if (xmit_ssk == ssk)
+ __mptcp_subflow_push_pending(sk, ssk);
+ else if (xmit_ssk)
+ mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
+ } else {
set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ }
}
#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
@@ -2959,6 +2951,8 @@ static void mptcp_release_cb(struct sock *sk)
mptcp_push_pending(sk, 0);
spin_lock_bh(&sk->sk_lock.slock);
}
+ if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
+ __mptcp_error_report(sk);
/* clear any wmem reservation and errors */
__mptcp_update_wmem(sk);
@@ -2979,6 +2973,20 @@ static void mptcp_release_cb(struct sock *sk)
}
}
+void mptcp_subflow_process_delegated(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_subflow_push_pending(sk, ssk);
+ else
+ set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ mptcp_data_unlock(sk);
+ mptcp_subflow_delegated_done(subflow);
+}
+
static int mptcp_hash(struct sock *sk)
{
/* should never be called,
@@ -3036,12 +3044,12 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->can_ack, 1);
WRITE_ONCE(msk->snd_una, msk->write_seq);
- mptcp_pm_new_connection(msk, 0);
+ mptcp_pm_new_connection(msk, ssk, 0);
mptcp_rcv_space_init(msk, ssk);
}
-static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
+void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
write_lock_bh(&sk->sk_callback_lock);
rcu_assign_pointer(sk->sk_wq, &parent->wq);
@@ -3065,7 +3073,7 @@ bool mptcp_finish_join(struct sock *ssk)
return false;
if (!msk->pm.server_side)
- return true;
+ goto out;
if (!mptcp_pm_allow_new_subflow(msk))
return false;
@@ -3092,6 +3100,8 @@ bool mptcp_finish_join(struct sock *ssk)
if (parent_sock && !ssk->sk_socket)
mptcp_sock_graft(ssk, parent_sock);
subflow->map_seq = READ_ONCE(msk->ack_seq);
+out:
+ mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
return true;
}
@@ -3268,9 +3278,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
struct mptcp_sock *msk = mptcp_sk(newsock->sk);
struct mptcp_subflow_context *subflow;
struct sock *newsk = newsock->sk;
- bool slowpath;
- slowpath = lock_sock_fast(newsk);
+ lock_sock(newsk);
/* PM/worker can now acquire the first subflow socket
* lock without racing with listener queue cleanup,
@@ -3280,10 +3289,11 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
list_add(&subflow->node, &msk->conn_list);
sock_hold(msk->first);
if (mptcp_is_fully_established(newsk))
- mptcp_pm_fully_established(msk);
+ mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
mptcp_copy_inaddrs(newsk, msk->first);
mptcp_rcv_space_init(msk, msk->first);
+ mptcp_propagate_sndbuf(newsk, msk->first);
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
@@ -3295,7 +3305,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (!ssk->sk_socket)
mptcp_sock_graft(ssk, newsock);
}
- unlock_sock_fast(newsk, slowpath);
+ release_sock(newsk);
}
if (inet_csk_listen_poll(ssock->sk))
@@ -3319,12 +3329,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
struct sock *sk = (struct sock *)msk;
if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
- return 0;
+ return EPOLLOUT | EPOLLWRNORM;
if (sk_stream_is_writeable(sk))
return EPOLLOUT | EPOLLWRNORM;
- set_bit(MPTCP_NOSPACE, &msk->flags);
+ mptcp_set_nospace(sk);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
if (sk_stream_is_writeable(sk))
return EPOLLOUT | EPOLLWRNORM;
@@ -3352,9 +3362,16 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
mask |= mptcp_check_readable(msk);
mask |= mptcp_check_writeable(msk);
}
+ if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+ mask |= EPOLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+ /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ smp_rmb();
+ if (sk->sk_err)
+ mask |= EPOLLERR;
+
return mask;
}
@@ -3388,13 +3405,58 @@ static struct inet_protosw mptcp_protosw = {
.flags = INET_PROTOSW_ICSK,
};
+static int mptcp_napi_poll(struct napi_struct *napi, int budget)
+{
+ struct mptcp_delegated_action *delegated;
+ struct mptcp_subflow_context *subflow;
+ int work_done = 0;
+
+ delegated = container_of(napi, struct mptcp_delegated_action, napi);
+ while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ bh_lock_sock_nested(ssk);
+ if (!sock_owned_by_user(ssk) &&
+ mptcp_subflow_has_delegated_action(subflow))
+ mptcp_subflow_process_delegated(ssk);
+ /* ... elsewhere tcp_release_cb_override already processed
+ * the action or will do at next release_sock().
+ * In both cases we must dequeue the subflow here - on the same
+ * CPU that scheduled it.
+ */
+ bh_unlock_sock(ssk);
+ sock_put(ssk);
+
+ if (++work_done == budget)
+ return budget;
+ }
+
+ /* always provide a 0 'work_done' argument, so that napi_complete_done
+ * will not try accessing the NULL napi->dev ptr
+ */
+ napi_complete_done(napi, 0);
+ return work_done;
+}
+
void __init mptcp_proto_init(void)
{
+ struct mptcp_delegated_action *delegated;
+ int cpu;
+
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
panic("Failed to allocate MPTCP pcpu counter\n");
+ init_dummy_netdev(&mptcp_napi_dev);
+ for_each_possible_cpu(cpu) {
+ delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
+ INIT_LIST_HEAD(&delegated->head);
+ netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll,
+ NAPI_POLL_WEIGHT);
+ napi_enable(&delegated->napi);
+ }
+
mptcp_subflow_init();
mptcp_pm_init();
mptcp_token_init();
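
The per-CPU delegated-action machinery added above (a subflow is queued on a per-CPU list and later drained by mptcp_napi_poll(), which handles each entry under its own socket lock) is, at its core, a "flag plus intrusive list" de-duplication pattern. Below is a minimal, single-threaded userspace sketch of that pattern only; the names are illustrative, and the kernel's per-CPU handling, memory barriers and NAPI scheduling are deliberately left out.

/* Sketch (not the kernel API): an item is queued at most once thanks to a
 * status flag, and the drain loop removes the node from the list before
 * clearing the flag so the item can be scheduled again afterwards.
 */
#include <stdbool.h>
#include <stdio.h>

struct action {
        const char *name;
        bool pending;           /* stands in for the MPTCP_DELEGATE_SEND bit */
        struct action *next;    /* stands in for delegated_node */
};

static struct action *head;     /* stands in for the per-CPU delegated list */

static void delegate(struct action *a)
{
        if (a->pending)         /* already scheduled: nothing to do */
                return;
        a->pending = true;
        a->next = head;
        head = a;
        printf("queued %s\n", a->name);
}

static void drain(void)
{
        while (head) {
                struct action *a = head;

                head = a->next;         /* dequeue first ... */
                printf("processing %s\n", a->name);
                a->pending = false;     /* ... then allow re-scheduling */
        }
}

int main(void)
{
        struct action tx = { .name = "push pending data" };

        delegate(&tx);
        delegate(&tx);          /* de-duplicated: still queued only once */
        drain();
        delegate(&tx);          /* can be scheduled again after the drain */
        drain();
        return 0;
}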
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index d67de793d363..91827d949766 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -10,6 +10,7 @@
#include <linux/random.h>
#include <net/tcp.h>
#include <net/inet_connection_sock.h>
+#include <uapi/linux/mptcp.h>
#define MPTCP_SUPPORTED_VERSION 1
@@ -24,6 +25,7 @@
#define OPTION_MPTCP_ADD_ADDR6 BIT(7)
#define OPTION_MPTCP_RM_ADDR BIT(8)
#define OPTION_MPTCP_FASTCLOSE BIT(9)
+#define OPTION_MPTCP_PRIO BIT(10)
/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0
@@ -59,6 +61,8 @@
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24
#define TCPOLEN_MPTCP_PORT_LEN 4
#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
+#define TCPOLEN_MPTCP_PRIO 3
+#define TCPOLEN_MPTCP_PRIO_ALIGN 4
#define TCPOLEN_MPTCP_FASTCLOSE 12
/* MPTCP MP_JOIN flags */
@@ -86,6 +90,9 @@
#define MPTCP_ADDR_IPVERSION_4 4
#define MPTCP_ADDR_IPVERSION_6 6
+/* MPTCP MP_PRIO flags */
+#define MPTCP_PRIO_BKUP BIT(0)
+
/* MPTCP socket flags */
#define MPTCP_DATA_READY 0
#define MPTCP_NOSPACE 1
@@ -95,6 +102,7 @@
#define MPTCP_WORK_CLOSE_SUBFLOW 5
#define MPTCP_PUSH_PENDING 6
#define MPTCP_CLEAN_UNA 7
+#define MPTCP_ERROR_REPORT 8
static inline bool before64(__u64 seq1, __u64 seq2)
{
@@ -116,6 +124,7 @@ struct mptcp_options_received {
dss : 1,
add_addr : 1,
rm_addr : 1,
+ mp_prio : 1,
family : 4,
echo : 1,
backup : 1;
@@ -196,10 +205,6 @@ struct mptcp_pm_data {
u8 add_addr_accepted;
u8 local_addr_used;
u8 subflows;
- u8 add_addr_signal_max;
- u8 add_addr_accept_max;
- u8 local_addr_max;
- u8 subflows_max;
u8 status;
u8 rm_id;
};
@@ -233,7 +238,6 @@ struct mptcp_sock {
u64 wnd_end;
unsigned long timer_ival;
u32 token;
- int rmem_pending;
int rmem_released;
unsigned long flags;
bool can_ack;
@@ -285,6 +289,11 @@ struct mptcp_sock {
#define mptcp_for_each_subflow(__msk, __subflow) \
list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+static inline void msk_owned_by_me(const struct mptcp_sock *msk)
+{
+ sock_owned_by_me((const struct sock *)msk);
+}
+
static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
{
return (struct mptcp_sock *)sk;
@@ -292,7 +301,7 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
static inline int __mptcp_space(const struct sock *sk)
{
- return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending);
+ return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released);
}
static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
@@ -325,20 +334,13 @@ static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk)
return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
}
-static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
+static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una)))
+ if (msk->snd_una == READ_ONCE(msk->snd_nxt))
return NULL;
- return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
-}
-
-static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
-
return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list);
}
@@ -372,6 +374,15 @@ enum mptcp_data_avail {
MPTCP_SUBFLOW_OOO_DATA
};
+struct mptcp_delegated_action {
+ struct napi_struct napi;
+ struct list_head head;
+};
+
+DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+
+#define MPTCP_DELEGATE_SEND 0
+
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
@@ -396,6 +407,7 @@ struct mptcp_subflow_context {
map_valid : 1,
mpc_map : 1,
backup : 1,
+ send_mp_prio : 1,
rx_eof : 1,
can_ack : 1, /* only after processing the remote key */
disposable : 1; /* ctx can be freed at ulp release time */
@@ -408,12 +420,16 @@ struct mptcp_subflow_context {
u8 local_id;
u8 remote_id;
+ long delegated_status;
+ struct list_head delegated_node; /* link into delegated_action, protected by local BH */
+
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
const struct inet_connection_sock_af_ops *icsk_af_ops;
void (*tcp_data_ready)(struct sock *sk);
void (*tcp_state_change)(struct sock *sk);
void (*tcp_write_space)(struct sock *sk);
+ void (*tcp_error_report)(struct sock *sk);
struct rcu_head rcu;
};
@@ -456,6 +472,61 @@ static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk,
spin_unlock_bh(&msk->join_list_lock);
}
+void mptcp_subflow_process_delegated(struct sock *ssk);
+
+static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow)
+{
+ struct mptcp_delegated_action *delegated;
+ bool schedule;
+
+ /* The implied barrier pairs with mptcp_subflow_delegated_done(), and
+ * ensures the below list check sees list updates done prior to status
+ * bit changes
+ */
+ if (!test_and_set_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) {
+ /* still on delegated list from previous scheduling */
+ if (!list_empty(&subflow->delegated_node))
+ return;
+
+ /* the caller held the subflow bh socket lock */
+ lockdep_assert_in_softirq();
+
+ delegated = this_cpu_ptr(&mptcp_delegated_actions);
+ schedule = list_empty(&delegated->head);
+ list_add_tail(&subflow->delegated_node, &delegated->head);
+ sock_hold(mptcp_subflow_tcp_sock(subflow));
+ if (schedule)
+ napi_schedule(&delegated->napi);
+ }
+}
+
+static inline struct mptcp_subflow_context *
+mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated)
+{
+ struct mptcp_subflow_context *ret;
+
+ if (list_empty(&delegated->head))
+ return NULL;
+
+ ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node);
+ list_del_init(&ret->delegated_node);
+ return ret;
+}
+
+static inline bool mptcp_subflow_has_delegated_action(const struct mptcp_subflow_context *subflow)
+{
+ return test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status);
+}
+
+static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *subflow)
+{
+ /* pairs with mptcp_subflow_delegate(), ensures delegated_node is updated before
+ * touching the status bit
+ */
+ smp_wmb();
+ clear_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status);
+}
+
int mptcp_is_enabled(struct net *net);
unsigned int mptcp_get_add_addr_timeout(struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
@@ -463,14 +534,19 @@ void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
-void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
- struct mptcp_subflow_context *subflow);
+void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow);
void mptcp_subflow_reset(struct sock *ssk);
+void mptcp_sock_graft(struct sock *sk, struct socket *parent);
+struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
+void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr,
+ unsigned short family);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
struct mptcp_subflow_context *ctx)
@@ -478,6 +554,7 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
sk->sk_data_ready = ctx->tcp_data_ready;
sk->sk_state_change = ctx->tcp_state_change;
sk->sk_write_space = ctx->tcp_write_space;
+ sk->sk_error_report = ctx->tcp_error_report;
inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
}
@@ -505,6 +582,7 @@ bool mptcp_finish_join(struct sock *sk);
bool mptcp_schedule_work(struct sock *sk);
void __mptcp_check_push(struct sock *sk, struct sock *ssk);
void __mptcp_data_acked(struct sock *sk);
+void __mptcp_error_report(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
void __mptcp_flush_join_list(struct mptcp_sock *msk);
@@ -514,6 +592,25 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}
+static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+ if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf))
+ return false;
+
+ WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+ return true;
+}
+
+static inline void mptcp_write_space(struct sock *sk)
+{
+ if (sk_stream_is_writeable(sk)) {
+ /* pairs with memory barrier in mptcp_poll */
+ smp_mb();
+ if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
+ sk_stream_write_space(sk);
+ }
+}
+
void mptcp_destroy_common(struct mptcp_sock *msk);
void __init mptcp_token_init(void);
@@ -539,8 +636,8 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
-void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side);
-void mptcp_pm_fully_established(struct mptcp_sock *msk);
+void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
+void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
void mptcp_pm_connection_closed(struct mptcp_sock *msk);
void mptcp_pm_subflow_established(struct mptcp_sock *msk,
@@ -550,7 +647,12 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id);
+void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ u8 bkup);
void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
+bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk);
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
struct mptcp_addr_info *addr);
@@ -561,6 +663,11 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk,
int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id);
int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id);
+void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
+ const struct sock *ssk, gfp_t gfp);
+void mptcp_event_addr_announced(const struct mptcp_sock *msk, const struct mptcp_addr_info *info);
+void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id);
+
static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
{
return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL);
@@ -608,13 +715,13 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
void __init mptcp_pm_nl_init(void);
void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
-void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
-void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
-void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk);
-void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk);
-void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk);
+void mptcp_pm_nl_work(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
+unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
{
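
mptcp_propagate_sndbuf() above lets the MPTCP-level socket inherit a subflow's larger send buffer unless userspace pinned the size (the SOCK_SNDBUF_LOCK case). A small userspace sketch of the same rule, assuming plain integers and a boolean in place of the struct sock fields:

/* Illustrative only: mirrors the "grow the parent sndbuf unless locked" check. */
#include <stdbool.h>
#include <stdio.h>

struct fake_sock {
        int sndbuf;
        bool sndbuf_locked;     /* stands in for sk_userlocks & SOCK_SNDBUF_LOCK */
};

static bool propagate_sndbuf(struct fake_sock *parent, const struct fake_sock *subflow)
{
        if (parent->sndbuf_locked || subflow->sndbuf <= parent->sndbuf)
                return false;   /* nothing to propagate */

        parent->sndbuf = subflow->sndbuf;
        return true;
}

int main(void)
{
        struct fake_sock msk = { .sndbuf = 16384 };
        struct fake_sock ssk = { .sndbuf = 262144 };

        if (propagate_sndbuf(&msk, &ssk))
                printf("parent sndbuf grown to %d\n", msk.sndbuf);

        msk.sndbuf_locked = true;       /* e.g. after SO_SNDBUF from userspace */
        ssk.sndbuf = 524288;
        printf("locked: propagated=%d, sndbuf=%d\n",
               propagate_sndbuf(&msk, &ssk), msk.sndbuf);
        return 0;
}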
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 278cbe3e539e..06e233410e0e 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -18,12 +18,15 @@
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
+#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"
+static void mptcp_subflow_ops_undo_override(struct sock *ssk);
+
static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
enum linux_mptcp_mib_field field)
{
@@ -61,11 +64,23 @@ static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
}
/* validate received token and create truncated hmac and nonce for SYN-ACK */
-static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
- const struct sk_buff *skb)
+static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
{
- struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_sock *msk = subflow_req->msk;
u8 hmac[SHA256_DIGEST_SIZE];
+
+ get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
+
+ subflow_generate_hmac(msk->local_key, msk->remote_key,
+ subflow_req->local_nonce,
+ subflow_req->remote_nonce, hmac);
+
+ subflow_req->thmac = get_unaligned_be64(hmac);
+}
+
+static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct mptcp_sock *msk;
int local_id;
@@ -82,17 +97,10 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
}
subflow_req->local_id = local_id;
- get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
-
- subflow_generate_hmac(msk->local_key, msk->remote_key,
- subflow_req->local_nonce,
- subflow_req->remote_nonce, hmac);
-
- subflow_req->thmac = get_unaligned_be64(hmac);
return msk;
}
-static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
+static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@ -100,16 +108,11 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li
subflow_req->mp_join = 0;
subflow_req->msk = NULL;
mptcp_token_init_request(req);
+}
-#ifdef CONFIG_TCP_MD5SIG
- /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
- * TCP option space.
- */
- if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
- return -EINVAL;
-#endif
-
- return 0;
+static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
+{
+ return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
}
/* Init mptcp request socket.
@@ -117,20 +120,23 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li
* Returns an error code if a JOIN has failed and a TCP reset
* should be sent.
*/
-static int subflow_init_req(struct request_sock *req,
- const struct sock *sk_listener,
- struct sk_buff *skb)
+static int subflow_check_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
{
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct mptcp_options_received mp_opt;
- int ret;
pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
- ret = __subflow_init_req(req, sk_listener);
- if (ret)
- return 0;
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+ return -EINVAL;
+#endif
mptcp_get_options(skb, &mp_opt);
@@ -178,12 +184,30 @@ again:
subflow_req->remote_id = mp_opt.join_id;
subflow_req->token = mp_opt.token;
subflow_req->remote_nonce = mp_opt.nonce;
- subflow_req->msk = subflow_token_join_request(req, skb);
+ subflow_req->msk = subflow_token_join_request(req);
/* Can't fall back to TCP in this case. */
if (!subflow_req->msk)
return -EPERM;
+ if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
+ pr_debug("syn inet_sport=%d %d",
+ ntohs(inet_sk(sk_listener)->inet_sport),
+ ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
+ if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
+ sock_put((struct sock *)subflow_req->msk);
+ mptcp_token_destroy_request(req);
+ tcp_request_sock_ops.destructor(req);
+ subflow_req->msk = NULL;
+ subflow_req->mp_join = 0;
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
+ return -EPERM;
+ }
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
+ }
+
+ subflow_req_create_thmac(subflow_req);
+
if (unlikely(req->syncookie)) {
if (mptcp_can_accept_new_subflow(subflow_req->msk))
subflow_init_req_cookie_join_save(subflow_req, skb);
@@ -205,10 +229,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
struct mptcp_options_received mp_opt;
int err;
- err = __subflow_init_req(req, sk_listener);
- if (err)
- return err;
-
+ subflow_init_req(req, sk_listener);
mptcp_get_options(skb, &mp_opt);
if (mp_opt.mp_capable && mp_opt.mp_join)
@@ -248,12 +269,13 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
int err;
tcp_rsk(req)->is_mptcp = 1;
+ subflow_init_req(req, sk);
dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
if (!dst)
return NULL;
- err = subflow_init_req(req, sk, skb);
+ err = subflow_check_req(req, sk, skb);
if (err == 0)
return dst;
@@ -273,12 +295,13 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
int err;
tcp_rsk(req)->is_mptcp = 1;
+ subflow_init_req(req, sk);
dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
if (!dst)
return NULL;
- err = subflow_init_req(req, sk, skb);
+ err = subflow_check_req(req, sk, skb);
if (err == 0)
return dst;
@@ -326,6 +349,11 @@ void mptcp_subflow_reset(struct sock *ssk)
sock_put(sk);
}
+static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
+{
+ return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -343,6 +371,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (subflow->conn_finished)
return;
+ mptcp_propagate_sndbuf(parent, sk);
subflow->rel_write_seq = 1;
subflow->conn_finished = 1;
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
@@ -391,6 +420,13 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->mp_join = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+
+ if (subflow_use_different_dport(mptcp_sk(parent), sk)) {
+ pr_debug("synack inet_dport=%d %d",
+ ntohs(inet_sk(sk)->inet_dport),
+ ntohs(inet_sk(parent)->inet_dport));
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
+ }
} else if (mptcp_check_fallback(sk)) {
fallback:
mptcp_rcv_space_init(mptcp_sk(parent), sk);
@@ -427,6 +463,7 @@ drop:
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;
+static struct proto tcpv6_prot_override;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
@@ -508,6 +545,8 @@ static void subflow_ulp_fallback(struct sock *sk,
icsk->icsk_ulp_ops = NULL;
rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
tcp_sk(sk)->is_mptcp = 0;
+
+ mptcp_subflow_ops_undo_override(sk);
}
static void subflow_drop_ctx(struct sock *ssk)
@@ -628,7 +667,7 @@ create_child:
* created mptcp socket
*/
new_msk->sk_destruct = mptcp_sock_destruct;
- mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
+ mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
ctx->conn = new_msk;
new_msk = NULL;
@@ -653,6 +692,17 @@ create_child:
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
tcp_rsk(req)->drop_req = true;
+
+ if (subflow_use_different_sport(owner, sk)) {
+ pr_debug("ack inet_sport=%d %d",
+ ntohs(inet_sk(sk)->inet_sport),
+ ntohs(inet_sk((struct sock *)owner)->inet_sport));
+ if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
+ goto out;
+ }
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
+ }
}
}
@@ -681,6 +731,7 @@ dispose_child:
}
static struct inet_connection_sock_af_ops subflow_specific;
+static struct proto tcp_prot_override;
enum mapping_status {
MAPPING_OK,
@@ -894,6 +945,22 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
subflow->map_valid = 0;
}
+/* sched mptcp worker to remove the subflow if no more data is pending */
+static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct sock *sk = (struct sock *)msk;
+
+ if (likely(ssk->sk_state != TCP_CLOSE))
+ return;
+
+ if (skb_queue_empty(&ssk->sk_receive_queue) &&
+ !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) {
+ sock_hold(sk);
+ if (!schedule_work(&msk->work))
+ sock_put(sk);
+ }
+}
+
static bool subflow_check_data_avail(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@@ -932,11 +999,11 @@ static bool subflow_check_data_avail(struct sock *ssk)
}
if (status != MAPPING_OK)
- return false;
+ goto no_data;
skb = skb_peek(&ssk->sk_receive_queue);
if (WARN_ON_ONCE(!skb))
- return false;
+ goto no_data;
/* if msk lacks the remote key, this subflow must provide an
* MP_CAPABLE-based mapping
@@ -970,6 +1037,9 @@ static bool subflow_check_data_avail(struct sock *ssk)
}
return true;
+no_data:
+ subflow_sched_work_if_closed(msk, ssk);
+ return false;
fatal:
/* fatal protocol error, close the socket */
/* This barrier is coupled with smp_rmb() in tcp_poll() */
@@ -1040,7 +1110,50 @@ static void subflow_data_ready(struct sock *sk)
static void subflow_write_space(struct sock *ssk)
{
- /* we take action in __mptcp_clean_una() */
+ struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+
+ mptcp_propagate_sndbuf(sk, ssk);
+ mptcp_write_space(sk);
+}
+
+void __mptcp_error_report(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int err = sock_error(ssk);
+
+ if (!err)
+ continue;
+
+ /* only propagate errors on fallen-back sockets or
+ * on MPC connect
+ */
+ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
+ continue;
+
+ inet_sk_state_store(sk, inet_sk_state_load(ssk));
+ sk->sk_err = -err;
+
+ /* This barrier is coupled with smp_rmb() in mptcp_poll() */
+ smp_wmb();
+ sk->sk_error_report(sk);
+ break;
+ }
+}
+
+static void subflow_error_report(struct sock *ssk)
+{
+ struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_error_report(sk);
+ else
+ set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags);
+ mptcp_data_unlock(sk);
}
static struct inet_connection_sock_af_ops *
@@ -1073,22 +1186,32 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
}
#endif
-static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
- struct sockaddr_storage *addr)
+void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr,
+ unsigned short family)
{
memset(addr, 0, sizeof(*addr));
- addr->ss_family = info->family;
+ addr->ss_family = family;
if (addr->ss_family == AF_INET) {
struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
- in_addr->sin_addr = info->addr;
+ if (info->family == AF_INET)
+ in_addr->sin_addr = info->addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (ipv6_addr_v4mapped(&info->addr6))
+ in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
+#endif
in_addr->sin_port = info->port;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (addr->ss_family == AF_INET6) {
struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
- in6_addr->sin6_addr = info->addr6;
+ if (info->family == AF_INET)
+ ipv6_addr_set_v4mapped(info->addr.s_addr,
+ &in6_addr->sin6_addr);
+ else
+ in6_addr->sin6_addr = info->addr6;
in6_addr->sin6_port = info->port;
}
#endif
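
The reworked mptcp_info2sockaddr() above bridges address families: when the subflow socket is AF_INET6 but the advertised address is IPv4, the address is encoded in v4-mapped form (::ffff:a.b.c.d). A standalone userspace sketch of that encoding, using only standard socket headers; names are illustrative and no kernel helpers are assumed:

#include <arpa/inet.h>
#include <sys/socket.h>
#include <stdio.h>
#include <string.h>

static void ipv4_to_sockaddr(const struct in_addr *v4, in_port_t port,
                             sa_family_t sock_family, struct sockaddr_storage *ss)
{
        memset(ss, 0, sizeof(*ss));
        if (sock_family == AF_INET) {
                struct sockaddr_in *sin = (struct sockaddr_in *)ss;

                sin->sin_family = AF_INET;
                sin->sin_addr = *v4;
                sin->sin_port = port;
        } else {
                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;

                sin6->sin6_family = AF_INET6;
                /* build ::ffff:a.b.c.d by hand */
                sin6->sin6_addr.s6_addr[10] = 0xff;
                sin6->sin6_addr.s6_addr[11] = 0xff;
                memcpy(&sin6->sin6_addr.s6_addr[12], &v4->s_addr, sizeof(v4->s_addr));
                sin6->sin6_port = port;
        }
}

int main(void)
{
        struct sockaddr_storage ss;
        struct in_addr v4;
        char buf[INET6_ADDRSTRLEN];

        inet_pton(AF_INET, "192.0.2.1", &v4);
        ipv4_to_sockaddr(&v4, htons(8080), AF_INET6, &ss);
        inet_ntop(AF_INET6, &((struct sockaddr_in6 *)&ss)->sin6_addr, buf, sizeof(buf));
        printf("v4-mapped form: %s\n", buf);    /* prints ::ffff:192.0.2.1 */
        return 0;
}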
@@ -1132,11 +1255,11 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
subflow->remote_key = msk->remote_key;
subflow->local_key = msk->local_key;
subflow->token = msk->token;
- mptcp_info2sockaddr(loc, &addr);
+ mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (loc->family == AF_INET6)
+ if (addr.ss_family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
ssk->sk_bound_dev_if = loc->ifindex;
@@ -1152,13 +1275,16 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
subflow->remote_id = remote_id;
subflow->request_join = 1;
subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
- mptcp_info2sockaddr(remote, &addr);
+ mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
mptcp_add_pending_subflow(msk, subflow);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
if (err && err != -EINPROGRESS)
goto failed_unlink;
+ /* discard the subflow socket */
+ mptcp_sock_graft(ssk, sk->sk_socket);
+ iput(SOCK_INODE(sf));
return err;
failed_unlink:
@@ -1196,6 +1322,25 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
#endif /* CONFIG_SOCK_CGROUP_DATA */
}
+static void mptcp_subflow_ops_override(struct sock *ssk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (ssk->sk_prot == &tcpv6_prot)
+ ssk->sk_prot = &tcpv6_prot_override;
+ else
+#endif
+ ssk->sk_prot = &tcp_prot_override;
+}
+
+static void mptcp_subflow_ops_undo_override(struct sock *ssk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (ssk->sk_prot == &tcpv6_prot_override)
+ ssk->sk_prot = &tcpv6_prot;
+ else
+#endif
+ ssk->sk_prot = &tcp_prot;
+}
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
struct mptcp_subflow_context *subflow;
@@ -1251,6 +1396,7 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
*new_sock = sf;
sock_hold(sk);
subflow->conn = sk;
+ mptcp_subflow_ops_override(sf->sk);
return 0;
}
@@ -1267,6 +1413,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
INIT_LIST_HEAD(&ctx->node);
+ INIT_LIST_HEAD(&ctx->delegated_node);
pr_debug("subflow=%p", ctx);
@@ -1299,6 +1446,7 @@ static void subflow_state_change(struct sock *sk)
__subflow_state_change(sk);
if (subflow_simultaneous_connect(sk)) {
+ mptcp_propagate_sndbuf(parent, sk);
mptcp_do_fallback(sk);
mptcp_rcv_space_init(mptcp_sk(parent), sk);
pr_fallback(mptcp_sk(parent));
@@ -1316,6 +1464,8 @@ static void subflow_state_change(struct sock *sk)
if (mptcp_subflow_data_available(sk))
mptcp_data_ready(parent, sk);
+ subflow_sched_work_if_closed(mptcp_sk(parent), sk);
+
if (__mptcp_check_fallback(mptcp_sk(parent)) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
@@ -1352,9 +1502,11 @@ static int subflow_ulp_init(struct sock *sk)
ctx->tcp_data_ready = sk->sk_data_ready;
ctx->tcp_state_change = sk->sk_state_change;
ctx->tcp_write_space = sk->sk_write_space;
+ ctx->tcp_error_report = sk->sk_error_report;
sk->sk_data_ready = subflow_data_ready;
sk->sk_write_space = subflow_write_space;
sk->sk_state_change = subflow_state_change;
+ sk->sk_error_report = subflow_error_report;
out:
return err;
}
@@ -1378,6 +1530,7 @@ static void subflow_ulp_release(struct sock *ssk)
sock_put(sk);
}
+ mptcp_subflow_ops_undo_override(ssk);
if (release)
kfree_rcu(ctx, rcu);
}
@@ -1407,6 +1560,7 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
new_ctx->tcp_state_change = old_ctx->tcp_state_change;
new_ctx->tcp_write_space = old_ctx->tcp_write_space;
+ new_ctx->tcp_error_report = old_ctx->tcp_error_report;
new_ctx->rel_write_seq = 1;
new_ctx->tcp_sock = newsk;
@@ -1431,6 +1585,16 @@ static void subflow_ulp_clone(const struct request_sock *req,
}
}
+static void tcp_release_cb_override(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ if (mptcp_subflow_has_delegated_action(subflow))
+ mptcp_subflow_process_delegated(ssk);
+
+ tcp_release_cb(ssk);
+}
+
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
.name = "mptcp",
.owner = THIS_MODULE,
@@ -1471,6 +1635,9 @@ void __init mptcp_subflow_init(void)
subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+ tcp_prot_override = tcp_prot;
+ tcp_prot_override.release_cb = tcp_release_cb_override;
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
@@ -1486,6 +1653,9 @@ void __init mptcp_subflow_init(void)
subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
subflow_v6m_specific.net_frag_header_len = 0;
+
+ tcpv6_prot_override = tcpv6_prot;
+ tcpv6_prot_override.release_cb = tcp_release_cb_override;
#endif
mptcp_diag_subflow_init(&subflow_ulp_ops);
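
tcp_release_cb_override and the tcp_prot_override/tcpv6_prot_override tables above follow a common idiom: copy an existing ops table and patch a single hook, so every other callback keeps the stock behaviour. A tiny userspace analogy of that idiom, with made-up names and a function-pointer table standing in for struct proto:

/* Illustrative only: the "copy the table, override one entry" pattern. */
#include <stdio.h>

struct sock_hooks {
        void (*release_cb)(const char *who);
};

static void tcp_style_release_cb(const char *who)
{
        printf("%s: stock release_cb\n", who);
}

static void mptcp_style_release_cb(const char *who)
{
        printf("%s: process delegated action first\n", who);
        tcp_style_release_cb(who);      /* then fall through to the original hook */
}

static const struct sock_hooks stock_hooks = { .release_cb = tcp_style_release_cb };

int main(void)
{
        struct sock_hooks override = stock_hooks;       /* copy the whole table */

        override.release_cb = mptcp_style_release_cb;   /* patch one entry */

        stock_hooks.release_cb("plain tcp socket");
        override.release_cb("mptcp subflow");
        return 0;
}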