diff options
Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/Kconfig | 2 | ||||
-rw-r--r-- | net/mptcp/Makefile | 4 | ||||
-rw-r--r-- | net/mptcp/crypto.c | 2 | ||||
-rw-r--r-- | net/mptcp/mib.c | 3 | ||||
-rw-r--r-- | net/mptcp/mib.h | 3 | ||||
-rw-r--r-- | net/mptcp/options.c | 298 | ||||
-rw-r--r-- | net/mptcp/pm.c | 64 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 328 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 356 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 117 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 756 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 58 | ||||
-rw-r--r-- | net/mptcp/token.c | 2 |
13 files changed, 1426 insertions, 567 deletions
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index a014149aa323..20328920f6ed 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -22,7 +22,7 @@ config MPTCP_IPV6 depends on IPV6=y default y -config MPTCP_KUNIT_TESTS +config MPTCP_KUNIT_TEST tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS depends on KUNIT default KUNIT_ALL_TESTS diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index a611968be4d7..e54daceac58b 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,11 +2,11 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o + mib.o pm_netlink.o sockopt.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o -obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o +obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index b472dc149856..a8931349933c 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -78,6 +78,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac); } -#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha); #endif diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 3780c29c321d..eb2dc6dbe212 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -10,9 +10,12 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), + SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), + SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), + SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 72afbc135f8e..f0da4f060fe1 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -3,9 +3,12 @@ enum linux_mptcp_mib_field { MPTCP_MIB_NUM = 0, MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEACTIVE, /* Sent SYN with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEACTIVEACK, /* Received SYN/ACK with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ + MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 89a4225ed321..99fc21406168 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -13,6 +13,8 @@ #include "protocol.h" #include "mib.h" +#include <trace/events/mptcp.h> + static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; @@ -26,6 +28,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, int expected_opsize; u8 version; u8 flags; + u8 i; switch (subtype) { case MPTCPOPT_MP_CAPABLE: @@ -219,45 +222,45 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (!mp_opt->echo) { if (opsize == TCPOLEN_MPTCP_ADD_ADDR || opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) - mp_opt->family = MPTCP_ADDR_IPVERSION_4; + mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) - mp_opt->family = MPTCP_ADDR_IPVERSION_6; + mp_opt->addr.family = AF_INET6; #endif else break; } else { if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) - mp_opt->family = MPTCP_ADDR_IPVERSION_4; + mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) - mp_opt->family = MPTCP_ADDR_IPVERSION_6; + mp_opt->addr.family = AF_INET6; #endif else break; } mp_opt->add_addr = 1; - mp_opt->addr_id = *ptr++; - if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { - memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); + mp_opt->addr.id = *ptr++; + if (mp_opt->addr.family == AF_INET) { + memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { - mp_opt->port = get_unaligned_be16(ptr); + mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else { - memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); + memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16); ptr += 16; if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { - mp_opt->port = get_unaligned_be16(ptr); + mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } @@ -267,19 +270,22 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 8; } pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d", - (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "", - mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port); + (mp_opt->addr.family == AF_INET6) ? "6" : "", + mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; case MPTCPOPT_RM_ADDR: - if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) + if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 || + opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX) break; ptr++; mp_opt->rm_addr = 1; - mp_opt->rm_id = *ptr++; - pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); + mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; + for (i = 0; i < mp_opt->rm_list.nr; i++) + mp_opt->rm_list.ids[i] = *ptr++; + pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: @@ -301,6 +307,18 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->fastclose = 1; break; + case MPTCPOPT_RST: + if (opsize != TCPOLEN_MPTCP_RST) + break; + + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) + break; + mp_opt->reset = 1; + flags = *ptr++; + mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; + mp_opt->reset_reason = *ptr; + break; + default: break; } @@ -319,10 +337,11 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->add_addr = 0; mp_opt->ahmac = 0; mp_opt->fastclose = 0; - mp_opt->port = 0; + mp_opt->addr.port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; mp_opt->mp_prio = 0; + mp_opt->reset = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -566,39 +585,32 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return true; } -static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, - struct in_addr *addr, u16 port) -{ - u8 hmac[SHA256_DIGEST_SIZE]; - u8 msg[7]; - - msg[0] = addr_id; - memcpy(&msg[1], &addr->s_addr, 4); - msg[5] = port >> 8; - msg[6] = port & 0xFF; - - mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); - - return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); -} - -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, - struct in6_addr *addr, u16 port) +static u64 add_addr_generate_hmac(u64 key1, u64 key2, + struct mptcp_addr_info *addr) { + u16 port = ntohs(addr->port); u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; + int i = 0; - msg[0] = addr_id; - memcpy(&msg[1], &addr->s6_addr, 16); - msg[17] = port >> 8; - msg[18] = port & 0xFF; + msg[i++] = addr->id; + if (addr->family == AF_INET) { + memcpy(&msg[i], &addr->addr.s_addr, 4); + i += 4; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) { + memcpy(&msg[i], &addr->addr6.s6_addr, 16); + i += 16; + } +#endif + msg[i++] = port >> 8; + msg[i++] = port & 0xFF; - mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); + mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac); return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } -#endif static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, @@ -609,13 +621,13 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; - struct mptcp_addr_info saddr; bool echo; bool port; int len; if ((mptcp_pm_should_add_signal_ipv6(msk) || - mptcp_pm_should_add_signal_port(msk)) && + mptcp_pm_should_add_signal_port(msk) || + mptcp_pm_should_add_signal_echo(msk)) && skb && skb_is_tcp_pure_ack(skb)) { pr_debug("drop other suboptions"); opts->suboptions = 0; @@ -626,45 +638,24 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * } if (!mptcp_pm_should_add_signal(msk) || - !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port))) + !(mptcp_pm_add_addr_signal(msk, remaining, &opts->addr, &echo, &port))) return false; - len = mptcp_add_addr_len(saddr.family, echo, port); + len = mptcp_add_addr_len(opts->addr.family, echo, port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) *size -= opt_size; - opts->addr_id = saddr.id; - if (port) - opts->port = ntohs(saddr.port); - if (saddr.family == AF_INET) { - opts->suboptions |= OPTION_MPTCP_ADD_ADDR; - opts->addr = saddr.addr; - if (!echo) { - opts->ahmac = add_addr_generate_hmac(msk->local_key, - msk->remote_key, - opts->addr_id, - &opts->addr, - opts->port); - } - } -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (saddr.family == AF_INET6) { - opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; - opts->addr6 = saddr.addr6; - if (!echo) { - opts->ahmac = add_addr6_generate_hmac(msk->local_key, - msk->remote_key, - opts->addr_id, - &opts->addr6, - opts->port); - } + opts->suboptions |= OPTION_MPTCP_ADD_ADDR; + if (!echo) { + opts->ahmac = add_addr_generate_hmac(msk->local_key, + msk->remote_key, + &opts->addr); } -#endif pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", - opts->addr_id, opts->ahmac, echo, opts->port); + opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } @@ -676,20 +667,25 @@ static bool mptcp_established_options_rm_addr(struct sock *sk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - u8 rm_id; + struct mptcp_rm_list rm_list; + int i, len; if (!mptcp_pm_should_rm_signal(msk) || - !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id))) + !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; - if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE) + len = mptcp_rm_addr_len(&rm_list); + if (len < 0) + return false; + if (remaining < len) return false; - *size = TCPOLEN_MPTCP_RM_ADDR_BASE; + *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; - opts->rm_id = rm_id; + opts->rm_list = rm_list; - pr_debug("rm_id=%d", opts->rm_id); + for (i = 0; i < opts->rm_list.nr; i++) + pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]); return true; } @@ -717,6 +713,22 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, return true; } +static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (remaining < TCPOLEN_MPTCP_RST) + return; + + *size = TCPOLEN_MPTCP_RST; + opts->suboptions |= OPTION_MPTCP_RST; + opts->reset_transient = subflow->reset_transient; + opts->reset_reason = subflow->reset_reason; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -732,11 +744,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(__mptcp_check_fallback(msk))) return false; - /* prevent adding of any MPTCP related options on reset packet - * until we support MP_TCPRST/MP_FASTCLOSE - */ - if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) - return false; + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { + mptcp_established_options_rst(sk, skb, size, remaining, opts); + return true; + } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) @@ -873,7 +884,7 @@ fully_established: subflow->pm_notified = 1; if (subflow->mp_join) { clear_3rdack_retransmission(ssk); - mptcp_pm_subflow_established(msk, subflow); + mptcp_pm_subflow_established(msk); } else { mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC); } @@ -934,6 +945,10 @@ static void ack_update_msk(struct mptcp_sock *msk, __mptcp_data_acked(sk); } mptcp_data_unlock(sk); + + trace_ack_update_msk(mp_opt->data_ack, + old_snd_una, new_snd_una, + new_wnd_end, msk->wnd_end); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) @@ -943,7 +958,7 @@ bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool us * should match. If they mismatch, the peer is misbehaving and * we will prefer the most recent information. */ - if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first)) + if (READ_ONCE(msk->rcv_data_fin)) return false; WRITE_ONCE(msk->rcv_data_fin_seq, @@ -961,18 +976,9 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, if (mp_opt->echo) return true; - if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) - hmac = add_addr_generate_hmac(msk->remote_key, - msk->local_key, - mp_opt->addr_id, &mp_opt->addr, - mp_opt->port); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else - hmac = add_addr6_generate_hmac(msk->remote_key, - msk->local_key, - mp_opt->addr_id, &mp_opt->addr6, - mp_opt->port); -#endif + hmac = add_addr_generate_hmac(msk->remote_key, + msk->local_key, + &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", msk, (unsigned long long)hmac, @@ -1013,36 +1019,23 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { - struct mptcp_addr_info addr; - - addr.port = htons(mp_opt.port); - addr.id = mp_opt.addr_id; - if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) { - addr.family = AF_INET; - addr.addr = mp_opt.addr; - } -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) { - addr.family = AF_INET6; - addr.addr6 = mp_opt.addr6; - } -#endif if (!mp_opt.echo) { - mptcp_pm_add_addr_received(msk, &addr); + mptcp_pm_add_addr_received(msk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { - mptcp_pm_del_add_timer(msk, &addr); + mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); + mptcp_pm_del_add_timer(msk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } - if (mp_opt.port) + if (mp_opt.addr.port) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); mp_opt.add_addr = 0; } if (mp_opt.rm_addr) { - mptcp_pm_rm_addr_received(msk, mp_opt.rm_id); + mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); mp_opt.rm_addr = 0; } @@ -1052,6 +1045,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mp_opt.mp_prio = 0; } + if (mp_opt.reset) { + subflow->reset_seen = 1; + subflow->reset_reason = mp_opt.reset_reason; + subflow->reset_transient = mp_opt.reset_transient; + } + if (!mp_opt.dss) return; @@ -1160,20 +1159,16 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, } mp_capable_done: - if ((OPTION_MPTCP_ADD_ADDR -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - | OPTION_MPTCP_ADD_ADDR6 -#endif - ) & opts->suboptions) { + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) + if (opts->addr.family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif - if (opts->port) + if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { @@ -1182,28 +1177,30 @@ mp_capable_done: } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - len, echo, opts->addr_id); - if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { - memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + len, echo, opts->addr.id); + if (opts->addr.family == AF_INET) { + memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { - memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + else if (opts->addr.family == AF_INET6) { + memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif - if (!opts->port) { + if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { + u16 port = ntohs(opts->addr.port); + if (opts->ahmac) { u8 *bptr = (u8 *)ptr; - put_unaligned_be16(opts->port, bptr); + put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; @@ -1212,7 +1209,7 @@ mp_capable_done: ptr += 3; } else { - put_unaligned_be32(opts->port << 16 | + put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; @@ -1221,9 +1218,23 @@ mp_capable_done: } if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { + u8 i = 1; + *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, - TCPOLEN_MPTCP_RM_ADDR_BASE, - 0, opts->rm_id); + TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, + 0, opts->rm_list.ids[0]); + + while (i < opts->rm_list.nr) { + u8 id1, id2, id3, id4; + + id1 = opts->rm_list.ids[i]; + id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; + id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; + id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; + put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); + ptr += 1; + i += 4; + } } if (OPTION_MPTCP_PRIO & opts->suboptions) { @@ -1265,6 +1276,12 @@ mp_capable_done: ptr += 5; } + if (OPTION_MPTCP_RST & opts->suboptions) + *ptr++ = mptcp_option(MPTCPOPT_RST, + TCPOLEN_MPTCP_RST, + opts->reset_transient, + opts->reset_reason); + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; @@ -1316,3 +1333,20 @@ mp_capable_done: if (tp) mptcp_set_rwin(tp); } + +__be32 mptcp_get_reset_option(const struct sk_buff *skb) +{ + const struct mptcp_ext *ext = mptcp_get_ext(skb); + u8 flags, reason; + + if (ext) { + flags = ext->reset_transient; + reason = ext->reset_reason; + + return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, + flags, reason); + } + + return htonl(0u); +} +EXPORT_SYMBOL_GPL(mptcp_get_reset_option); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 6fd4b2c1b076..9d00fa6d22e9 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -14,7 +14,7 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo, bool port) + bool echo) { u8 add_addr = READ_ONCE(msk->pm.addr_signal); @@ -33,35 +33,36 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); if (addr->family == AF_INET6) add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); - if (port) + if (addr->port) add_addr |= BIT(MPTCP_ADD_ADDR_PORT); WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } -int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) +int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { u8 rm_addr = READ_ONCE(msk->pm.addr_signal); - pr_debug("msk=%p, local_id=%d", msk, local_id); + pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); if (rm_addr) { pr_warn("addr_signal error, rm_addr=%d", rm_addr); return -EINVAL; } - msk->pm.rm_id = local_id; + msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); + mptcp_pm_nl_addr_send_ack(msk); return 0; } -int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id) +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { - pr_debug("msk=%p, local_id=%d", msk, local_id); + pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, local_id); + mptcp_pm_nl_rm_subflow_received(msk, rm_list); spin_unlock_bh(&msk->pm.lock); return 0; } @@ -152,8 +153,7 @@ void mptcp_pm_connection_closed(struct mptcp_sock *msk) pr_debug("msk=%p", msk); } -void mptcp_pm_subflow_established(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) +void mptcp_pm_subflow_established(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; @@ -188,7 +188,7 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, spin_lock_bh(&pm->lock); if (!READ_ONCE(pm->accept_addr)) { - mptcp_pm_announce_addr(msk, addr, true, addr->port); + mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; @@ -197,6 +197,21 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, spin_unlock_bh(&pm->lock); } +void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, + struct mptcp_addr_info *addr) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p", msk); + + spin_lock_bh(&pm->lock); + + if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending)) + mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); + + spin_unlock_bh(&pm->lock); +} + void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) { if (!mptcp_pm_should_add_signal(msk)) @@ -205,17 +220,20 @@ void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); } -void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) +void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) { struct mptcp_pm_data *pm = &msk->pm; + u8 i; - pr_debug("msk=%p remote_id=%d", msk, rm_id); + pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr); - mptcp_event_addr_removed(msk, rm_id); + for (i = 0; i < rm_list->nr; i++) + mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); - pm->rm_id = rm_id; + pm->rm_list_rx = *rm_list; spin_unlock_bh(&pm->lock); } @@ -258,9 +276,9 @@ out_unlock: } bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - u8 *rm_id) + struct mptcp_rm_list *rm_list) { - int ret = false; + int ret = false, len; spin_lock_bh(&msk->pm.lock); @@ -268,10 +286,15 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_rm_signal(msk)) goto out_unlock; - if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE) + len = mptcp_rm_addr_len(&msk->pm.rm_list_tx); + if (len < 0) { + WRITE_ONCE(msk->pm.addr_signal, 0); + goto out_unlock; + } + if (remaining < len) goto out_unlock; - *rm_id = msk->pm.rm_id; + *rm_list = msk->pm.rm_list_tx; WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; @@ -291,7 +314,8 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.add_addr_accepted = 0; msk->pm.local_addr_used = 0; msk->pm.subflows = 0; - msk->pm.rm_id = 0; + msk->pm.rm_list_tx.nr = 0; + msk->pm.rm_list_rx.nr = 0; WRITE_ONCE(msk->pm.work_pending, false); WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 8e8e35fa4002..6ba040897738 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -25,6 +25,8 @@ static int pm_nl_pernet_id; struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; + u8 flags; + int ifindex; struct rcu_head rcu; struct socket *lsk; }; @@ -56,8 +58,6 @@ struct pm_nl_pernet { #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 -static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk); - static bool addresses_equal(const struct mptcp_addr_info *a, struct mptcp_addr_info *b, bool use_port) { @@ -140,6 +140,24 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, return false; } +static bool lookup_subflow_by_daddr(const struct list_head *list, + struct mptcp_addr_info *daddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + struct sock_common *skc; + + list_for_each_entry(subflow, list, node) { + skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + + remote_address(skc, &cur); + if (addresses_equal(&cur, daddr, daddr->port)) + return true; + } + + return false; +} + static struct mptcp_pm_addr_entry * select_local_address(const struct pm_nl_pernet *pernet, struct mptcp_sock *msk) @@ -152,7 +170,7 @@ select_local_address(const struct pm_nl_pernet *pernet, rcu_read_lock(); __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; if (entry->addr.family != sk->sk_family) { @@ -190,7 +208,7 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) * can lead to additional addresses not being announced. */ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) continue; if (i++ == pos) { ret = entry; @@ -245,9 +263,9 @@ static void check_work_pending(struct mptcp_sock *msk) WRITE_ONCE(msk->pm.work_pending, false); } -static struct mptcp_pm_add_entry * -lookup_anno_list_by_saddr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) +struct mptcp_pm_add_entry * +mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, + struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; @@ -308,7 +326,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!mptcp_pm_should_add_signal(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); - mptcp_pm_announce_addr(msk, &entry->addr, false, entry->addr.port); + mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } @@ -319,6 +337,9 @@ static void mptcp_pm_add_timer(struct timer_list *timer) spin_unlock_bh(&msk->pm.lock); + if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) + mptcp_pm_subflow_established(msk); + out: __sock_put(sk); } @@ -331,7 +352,7 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct sock *sk = (struct sock *)msk; spin_lock_bh(&msk->pm.lock); - entry = lookup_anno_list_by_saddr(msk, addr); + entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry) entry->retrans_times = ADD_ADDR_RETRANS_MAX; spin_unlock_bh(&msk->pm.lock); @@ -351,7 +372,7 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, lockdep_assert_held(&msk->pm.lock); - if (lookup_anno_list_by_saddr(msk, &entry->addr)) + if (mptcp_lookup_anno_list_by_saddr(msk, &entry->addr)) return false; add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); @@ -417,8 +438,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false, local->addr.port); - mptcp_pm_nl_add_addr_send_ack(msk); + mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_nl_addr_send_ack(msk); } } else { /* pick failed, avoid fourther attempts later */ @@ -440,7 +461,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) check_work_pending(msk); remote_address((struct sock_common *)sk, &remote); spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local->addr, &remote); + __mptcp_subflow_connect(sk, &local->addr, &remote, + local->flags, local->ifindex); spin_lock_bh(&msk->pm.lock); return; } @@ -468,7 +490,6 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) struct mptcp_addr_info remote; struct mptcp_addr_info local; unsigned int subflows_max; - bool use_port = false; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); @@ -476,6 +497,10 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) pr_debug("accepted %d:%d remote family %d", msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); + + if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) + goto add_addr_echo; + msk->pm.add_addr_accepted++; msk->pm.subflows++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || @@ -488,37 +513,37 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) remote = msk->pm.remote; if (!remote.port) remote.port = sk->sk_dport; - else - use_port = true; memset(&local, 0, sizeof(local)); local.family = remote.family; spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local, &remote); + __mptcp_subflow_connect(sk, &local, &remote, 0, 0); spin_lock_bh(&msk->pm.lock); - mptcp_pm_announce_addr(msk, &remote, true, use_port); - mptcp_pm_nl_add_addr_send_ack(msk); +add_addr_echo: + mptcp_pm_announce_addr(msk, &msk->pm.remote, true); + mptcp_pm_nl_addr_send_ack(msk); } -static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) +void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); - if (!mptcp_pm_should_add_signal(msk)) + if (!mptcp_pm_should_add_signal(msk) && + !mptcp_pm_should_rm_signal(msk)) return; __mptcp_flush_join_list(msk); subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); if (subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - u8 add_addr; spin_unlock_bh(&msk->pm.lock); - pr_debug("send ack for add_addr%s%s", + pr_debug("send ack for %s%s%s", + mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr", mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "", mptcp_pm_should_add_signal_port(msk) ? " [port]" : ""); @@ -526,13 +551,6 @@ static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) tcp_send_ack(ssk); release_sock(ssk); spin_lock_bh(&msk->pm.lock); - - add_addr = READ_ONCE(msk->pm.addr_signal); - if (mptcp_pm_should_add_signal_ipv6(msk)) - add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); - if (mptcp_pm_should_add_signal_port(msk)) - add_addr &= ~BIT(MPTCP_ADD_ADDR_PORT); - WRITE_ONCE(msk->pm.addr_signal, add_addr); } } @@ -571,43 +589,68 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, return -EINVAL; } -static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) +static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list, + enum linux_mptcp_mib_field rm_type) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; + u8 i; - pr_debug("address rm_id %d", msk->pm.rm_id); + pr_debug("%s rm_list_nr %d", + rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); msk_owned_by_me(msk); - if (!msk->pm.rm_id) + if (!rm_list->nr) return; if (list_empty(&msk->conn_list)) return; - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - int how = RCV_SHUTDOWN | SEND_SHUTDOWN; + for (i = 0; i < rm_list->nr; i++) { + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int how = RCV_SHUTDOWN | SEND_SHUTDOWN; + u8 id = subflow->local_id; - if (msk->pm.rm_id != subflow->remote_id) - continue; + if (rm_type == MPTCP_MIB_RMADDR) + id = subflow->remote_id; - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, how); - mptcp_close_ssk(sk, ssk, subflow); - spin_lock_bh(&msk->pm.lock); - - msk->pm.add_addr_accepted--; - msk->pm.subflows--; - WRITE_ONCE(msk->pm.accept_addr, true); + if (rm_list->ids[i] != id) + continue; - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMADDR); + pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u", + rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", + i, rm_list->ids[i], subflow->local_id, subflow->remote_id); + spin_unlock_bh(&msk->pm.lock); + mptcp_subflow_shutdown(sk, ssk, how); + mptcp_close_ssk(sk, ssk, subflow); + spin_lock_bh(&msk->pm.lock); - break; + if (rm_type == MPTCP_MIB_RMADDR) { + msk->pm.add_addr_accepted--; + WRITE_ONCE(msk->pm.accept_addr, true); + } else if (rm_type == MPTCP_MIB_RMSUBFLOW) { + msk->pm.local_addr_used--; + } + msk->pm.subflows--; + __MPTCP_INC_STATS(sock_net(sk), rm_type); + } } } +static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) +{ + mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); +} + +void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) +{ + mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); +} + void mptcp_pm_nl_work(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; @@ -623,7 +666,7 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) } if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - mptcp_pm_nl_add_addr_send_ack(msk); + mptcp_pm_nl_addr_send_ack(msk); } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); @@ -641,45 +684,9 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } -void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) -{ - struct mptcp_subflow_context *subflow, *tmp; - struct sock *sk = (struct sock *)msk; - - pr_debug("subflow rm_id %d", rm_id); - - msk_owned_by_me(msk); - - if (!rm_id) - return; - - if (list_empty(&msk->conn_list)) - return; - - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - - if (rm_id != subflow->local_id) - continue; - - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, how); - mptcp_close_ssk(sk, ssk, subflow); - spin_lock_bh(&msk->pm.lock); - - msk->pm.local_addr_used--; - msk->pm.subflows--; - - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); - - break; - } -} - static bool address_use_port(struct mptcp_pm_addr_entry *entry) { - return (entry->addr.flags & + return (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == MPTCP_PM_ADDR_FLAG_SIGNAL; } @@ -731,11 +738,11 @@ find_next: if (entry->addr.id > pernet->next_id) pernet->next_id = entry->addr.id; - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); } - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max + 1); } @@ -837,10 +844,10 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return -ENOMEM; entry->addr = skc_local; - entry->addr.ifindex = 0; - entry->addr.flags = 0; entry->addr.id = 0; entry->addr.port = 0; + entry->ifindex = 0; + entry->flags = 0; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) @@ -955,14 +962,14 @@ skip_family: if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); - entry->addr.ifindex = val; + entry->ifindex = val; } if (tb[MPTCP_PM_ADDR_ATTR_ID]) entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) - entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); @@ -1071,12 +1078,15 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, struct mptcp_addr_info *addr, bool force) { + struct mptcp_rm_list list = { .nr = 0 }; bool ret; + list.ids[list.nr++] = addr->id; + ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); - mptcp_pm_remove_addr(msk, addr->id); + mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } return ret; @@ -1087,9 +1097,12 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, { struct mptcp_sock *msk; long s_slot = 0, s_num = 0; + struct mptcp_rm_list list = { .nr = 0 }; pr_debug("remove_id=%d", addr->id); + list.ids[list.nr++] = addr->id; + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; bool remove_subflow; @@ -1103,7 +1116,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow); if (remove_subflow) - mptcp_pm_remove_subflow(msk, addr->id); + mptcp_pm_remove_subflow(msk, &list); release_sock(sk); next: @@ -1146,6 +1159,41 @@ static void mptcp_pm_free_addr_entry(struct mptcp_pm_addr_entry *entry) } } +static int mptcp_nl_remove_id_zero_address(struct net *net, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + list.ids[list.nr++] = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info msk_local; + + if (list_empty(&msk->conn_list)) + goto next; + + local_address((struct sock_common *)msk, &msk_local); + if (!addresses_equal(&msk_local, addr, addr->port)) + goto next; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + mptcp_pm_remove_addr(msk, &list); + mptcp_pm_nl_rm_subflow_received(msk, &list); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; @@ -1158,6 +1206,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; + /* the zero id address is special: the first address used by the msk + * always gets such an id, so different subflows can have different zero + * id addresses. Additionally zero id is not accounted for in id_bitmap. + * Let's use an 'mptcp_rm_list' instead of the common remove code. + */ + if (addr.addr.id == 0) + return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); + spin_lock_bh(&pernet->lock); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { @@ -1165,11 +1221,11 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) spin_unlock_bh(&pernet->lock); return -EINVAL; } - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); } - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max - 1); } @@ -1185,14 +1241,61 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void __flush_addrs(struct net *net, struct list_head *list) +static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) +{ + struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, rm_list, list) { + if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && + alist.nr < MPTCP_RM_IDS_MAX && + slist.nr < MPTCP_RM_IDS_MAX) { + alist.ids[alist.nr++] = entry->addr.id; + slist.ids[slist.nr++] = entry->addr.id; + } else if (remove_anno_list_by_saddr(msk, &entry->addr) && + alist.nr < MPTCP_RM_IDS_MAX) { + alist.ids[alist.nr++] = entry->addr.id; + } + } + + if (alist.nr) { + spin_lock_bh(&msk->pm.lock); + mptcp_pm_remove_addr(msk, &alist); + spin_unlock_bh(&msk->pm.lock); + } + if (slist.nr) + mptcp_pm_remove_subflow(msk, &slist); +} + +static void mptcp_nl_remove_addrs_list(struct net *net, + struct list_head *rm_list) +{ + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + if (list_empty(rm_list)) + return; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + lock_sock(sk); + mptcp_pm_remove_addrs_and_subflows(msk, rm_list); + release_sock(sk); + + sock_put(sk); + cond_resched(); + } +} + +static void __flush_addrs(struct list_head *list) { while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); - mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr); list_del_rcu(&cur->list); mptcp_pm_free_addr_entry(cur); } @@ -1217,7 +1320,8 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); - __flush_addrs(sock_net(skb->sk), &free_list); + mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list); + __flush_addrs(&free_list); return 0; } @@ -1237,10 +1341,10 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb, goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; - if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags)) + if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) goto nla_put_failure; - if (entry->addr.ifindex && - nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->addr.ifindex)) + if (entry->ifindex && + nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) goto nla_put_failure; if (addr->family == AF_INET && @@ -1468,7 +1572,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; - if (addr.addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; list_for_each_entry(entry, &pernet->local_addr_list, list) { @@ -1478,9 +1582,9 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) return ret; if (bkup) - entry->addr.flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; else - entry->addr.flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; } } @@ -1586,9 +1690,21 @@ static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { + const struct mptcp_subflow_context *sf; + if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; + sf = mptcp_subflow_ctx(ssk); + if (!sf->reset_seen) + return 0; + + if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) + return -EMSGSIZE; + + if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) + return -EMSGSIZE; + return 0; } @@ -1814,7 +1930,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) /* net is removed from namespace list, can't race with * other modifiers */ - __flush_addrs(net, &pernet->local_addr_list); + __flush_addrs(&pernet->local_addr_list); } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 61329b8181ea..29a2d690d8d5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -25,6 +25,9 @@ #include "protocol.h" #include "mib.h" +#define CREATE_TRACE_POINTS +#include <trace/events/mptcp.h> + #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct mptcp6_sock { struct mptcp_sock msk; @@ -90,16 +93,6 @@ static bool mptcp_is_tcpsk(struct sock *sk) return false; } -static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) -{ - sock_owned_by_me((const struct sock *)msk); - - if (likely(!__mptcp_check_fallback(msk))) - return NULL; - - return msk->first; -} - static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -417,18 +410,6 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } -static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ - if (subflow->request_join && !subflow->fully_established) - return false; - - /* only send if our side has not closed yet */ - return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); -} - static bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & @@ -499,7 +480,7 @@ static bool mptcp_check_data_fin(struct sock *sk) u64 rcv_data_fin_seq; bool ret = false; - if (__mptcp_check_fallback(msk) || !msk->first) + if (__mptcp_check_fallback(msk)) return ret; /* Need to ack a DATA_FIN received from a peer while this side @@ -748,18 +729,47 @@ wake: sk->sk_data_ready(sk); } -void __mptcp_flush_join_list(struct mptcp_sock *msk) +static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; + bool ret = false; if (likely(list_empty(&msk->join_list))) - return; + return false; spin_lock_bh(&msk->join_list_lock); - list_for_each_entry(subflow, &msk->join_list, node) + list_for_each_entry(subflow, &msk->join_list, node) { + u32 sseq = READ_ONCE(subflow->setsockopt_seq); + mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); + if (READ_ONCE(msk->setsockopt_seq) != sseq) + ret = true; + } list_splice_tail_init(&msk->join_list, &msk->conn_list); spin_unlock_bh(&msk->join_list_lock); + + return ret; +} + +void __mptcp_flush_join_list(struct mptcp_sock *msk) +{ + if (likely(!mptcp_do_flush_join_list(msk))) + return; + + if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags)) + mptcp_schedule_work((struct sock *)msk); +} + +static void mptcp_flush_join_list(struct mptcp_sock *msk) +{ + bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags); + + might_sleep(); + + if (!mptcp_do_flush_join_list(msk) && !sync_needed) + return; + + mptcp_sockopt_sync_all(msk); } static bool mptcp_timer_pending(struct sock *sk) @@ -1283,7 +1293,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, int avail_size; size_t ret = 0; - pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", + pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); /* compute send limit */ @@ -1411,6 +1421,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) send_info[i].ratio = -1; } mptcp_for_each_subflow(msk, subflow) { + trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; @@ -1431,10 +1442,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } } - pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld", - msk, nr_active, send_info[0].ssk, send_info[0].ratio, - send_info[1].ssk, send_info[1].ratio); - /* pick the best backup if no other subflow is active */ if (!nr_active) send_info[0].ssk = send_info[1].ssk; @@ -1475,7 +1482,7 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags) int ret = 0; prev_ssk = ssk; - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); /* try to keep the subflow socket lock across @@ -1615,9 +1622,13 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int ret = 0; long timeo; - if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + /* we don't support FASTOPEN yet */ + if (msg->msg_flags & MSG_FASTOPEN) return -EOPNOTSUPP; + /* silently ignore everything else */ + msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL; + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1701,7 +1712,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!msk->first_pending) WRITE_ONCE(msk->first_pending, dfrag); } - pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk, + pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); @@ -1740,36 +1751,41 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, - size_t len) + size_t len, int flags) { - struct sk_buff *skb; + struct sk_buff *skb, *tmp; int copied = 0; - while ((skb = skb_peek(&msk->receive_queue)) != NULL) { + skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); int err; - err = skb_copy_datagram_msg(skb, offset, msg, count); - if (unlikely(err < 0)) { - if (!copied) - return err; - break; + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_msg(skb, offset, msg, count); + if (unlikely(err < 0)) { + if (!copied) + return err; + break; + } } copied += count; if (count < data_len) { - MPTCP_SKB_CB(skb)->offset += count; + if (!(flags & MSG_PEEK)) + MPTCP_SKB_CB(skb)->offset += count; break; } - /* we will bulk release the skb memory later */ - skb->destructor = NULL; - msk->rmem_released += skb->truesize; - __skb_unlink(skb, &msk->receive_queue); - __kfree_skb(skb); + if (!(flags & MSG_PEEK)) { + /* we will bulk release the skb memory later */ + skb->destructor = NULL; + msk->rmem_released += skb->truesize; + __skb_unlink(skb, &msk->receive_queue); + __kfree_skb(skb); + } if (copied >= len) break; @@ -1901,7 +1917,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) unsigned int moved = 0; bool ret, done; - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; @@ -1946,8 +1962,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int target; long timeo; - if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) - return -EOPNOTSUPP; + /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); if (unlikely(sk->sk_state == TCP_LISTEN)) { @@ -1963,7 +1980,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); + bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2047,34 +2064,28 @@ out_err: pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), skb_queue_empty_lockless(&sk->sk_receive_queue), copied); - mptcp_rcv_space_adjust(msk, copied); + if (!(flags & MSG_PEEK)) + mptcp_rcv_space_adjust(msk, copied); release_sock(sk); return copied; } -static void mptcp_retransmit_handler(struct sock *sk) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - set_bit(MPTCP_WORK_RTX, &msk->flags); - mptcp_schedule_work(sk); -} - static void mptcp_retransmit_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; + struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - mptcp_retransmit_handler(sk); + /* we need a process context to retransmit */ + if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) + mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, - &sk->sk_tsq_flags)) - sock_hold(sk); + set_bit(MPTCP_RETRANSMIT, &msk->flags); } bh_unlock_sock(sk); sock_put(sk); @@ -2343,7 +2354,7 @@ static void mptcp_worker(struct work_struct *work) goto unlock; mptcp_check_data_fin_ack(sk); - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_check_fastclose(msk); @@ -2406,6 +2417,9 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); + + tcp_assign_congestion_control(sk); + return 0; } @@ -2545,7 +2559,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk) } } - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2601,6 +2615,8 @@ static void __mptcp_destroy_sock(struct sock *sk) WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); + + tcp_cleanup_congestion_control(sk); sk_refcnt_debug_release(sk); mptcp_dispose_initial_subflow(msk); sock_put(sk); @@ -2627,7 +2643,7 @@ static void mptcp_close(struct sock *sk, long timeout) cleanup: /* orphan all the subflows */ inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; - list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) { + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); @@ -2682,7 +2698,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - __mptcp_flush_join_list(msk); + mptcp_do_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -2731,6 +2748,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->snd_nxt = msk->write_seq; msk->snd_una = msk->write_seq; msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; + msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; if (mp_opt->mp_capable) { msk->can_ack = true; @@ -2839,161 +2857,6 @@ static void mptcp_destroy(struct sock *sk) sk_sockets_allocated_dec(sk); } -static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct sock *sk = (struct sock *)msk; - struct socket *ssock; - int ret; - - switch (optname) { - case SO_REUSEPORT: - case SO_REUSEADDR: - lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - release_sock(sk); - return -EINVAL; - } - - ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); - if (ret == 0) { - if (optname == SO_REUSEPORT) - sk->sk_reuseport = ssock->sk->sk_reuseport; - else if (optname == SO_REUSEADDR) - sk->sk_reuse = ssock->sk->sk_reuse; - } - release_sock(sk); - return ret; - } - - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); -} - -static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct sock *sk = (struct sock *)msk; - int ret = -EOPNOTSUPP; - struct socket *ssock; - - switch (optname) { - case IPV6_V6ONLY: - lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - release_sock(sk); - return -EINVAL; - } - - ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); - if (ret == 0) - sk->sk_ipv6only = ssock->sk->sk_ipv6only; - - release_sock(sk); - break; - } - - return ret; -} - -static bool mptcp_unsupported(int level, int optname) -{ - if (level == SOL_IP) { - switch (optname) { - case IP_ADD_MEMBERSHIP: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: - return true; - } - return false; - } - if (level == SOL_IPV6) { - switch (optname) { - case IPV6_ADDRFORM: - case IPV6_ADD_MEMBERSHIP: - case IPV6_DROP_MEMBERSHIP: - case IPV6_JOIN_ANYCAST: - case IPV6_LEAVE_ANYCAST: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: - return true; - } - return false; - } - return false; -} - -static int mptcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - pr_debug("msk=%p", msk); - - if (mptcp_unsupported(level, optname)) - return -ENOPROTOOPT; - - if (level == SOL_SOCKET) - return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); - - /* @@ the meaning of setsockopt() when the socket is connected and - * there are multiple subflows is not yet defined. It is up to the - * MPTCP-level socket to configure the subflows until the subflow - * is in TCP fallback, when TCP socket options are passed through - * to the one remaining subflow. - */ - lock_sock(sk); - ssk = __mptcp_tcp_fallback(msk); - release_sock(sk); - if (ssk) - return tcp_setsockopt(ssk, level, optname, optval, optlen); - - if (level == SOL_IPV6) - return mptcp_setsockopt_v6(msk, optname, optval, optlen); - - return -EOPNOTSUPP; -} - -static int mptcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - pr_debug("msk=%p", msk); - - /* @@ the meaning of setsockopt() when the socket is connected and - * there are multiple subflows is not yet defined. It is up to the - * MPTCP-level socket to configure the subflows until the subflow - * is in TCP fallback, when socket options are passed through - * to the one remaining subflow. - */ - lock_sock(sk); - ssk = __mptcp_tcp_fallback(msk); - release_sock(sk); - if (ssk) - return tcp_getsockopt(ssk, level, optname, optval, option); - - return -EOPNOTSUPP; -} - void __mptcp_data_acked(struct sock *sk) { if (!sock_owned_by_user(sk)) @@ -3022,17 +2885,16 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) } } -#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) - /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) { - unsigned long flags, nflags; - for (;;) { - flags = 0; + unsigned long flags = 0; + if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) flags |= BIT(MPTCP_PUSH_PENDING); + if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags)) + flags |= BIT(MPTCP_RETRANSMIT); if (!flags) break; @@ -3047,6 +2909,8 @@ static void mptcp_release_cb(struct sock *sk) spin_unlock_bh(&sk->sk_lock.slock); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); + if (flags & BIT(MPTCP_RETRANSMIT)) + __mptcp_retrans(sk); cond_resched(); spin_lock_bh(&sk->sk_lock.slock); @@ -3062,20 +2926,6 @@ static void mptcp_release_cb(struct sock *sk) */ __mptcp_update_wmem(sk); __mptcp_update_rmem(sk); - - do { - flags = sk->sk_tsq_flags; - if (!(flags & MPTCP_DEFERRED_ALL)) - return; - nflags = flags & ~MPTCP_DEFERRED_ALL; - } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); - - sock_release_ownership(sk); - - if (flags & TCPF_WRITE_TIMER_DEFERRED) { - mptcp_retransmit_handler(sk); - __sock_put(sk); - } } void mptcp_subflow_process_delegated(struct sock *ssk) @@ -3174,14 +3024,18 @@ bool mptcp_finish_join(struct sock *ssk) pr_debug("msk=%p, subflow=%p", msk, subflow); /* mptcp socket already closing? */ - if (!mptcp_is_fully_established(parent)) + if (!mptcp_is_fully_established(parent)) { + subflow->reset_reason = MPTCP_RST_EMPTCP; return false; + } if (!msk->pm.server_side) goto out; - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* active connections are already on conn_list, and we can't acquire * msk lock here. @@ -3195,8 +3049,10 @@ bool mptcp_finish_join(struct sock *ssk) sock_hold(ssk); } spin_unlock_bh(&msk->join_list_lock); - if (!ret) + if (!ret) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* attach to msk socket only after we are sure he will deal with us * at close time @@ -3308,8 +3164,12 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { + MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); + } + if (likely(!__mptcp_check_fallback(msk))) + MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE); do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); @@ -3406,7 +3266,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e21a5bc36cf0..edc0128730df 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -22,10 +22,10 @@ #define OPTION_MPTCP_MPJ_SYNACK BIT(4) #define OPTION_MPTCP_MPJ_ACK BIT(5) #define OPTION_MPTCP_ADD_ADDR BIT(6) -#define OPTION_MPTCP_ADD_ADDR6 BIT(7) -#define OPTION_MPTCP_RM_ADDR BIT(8) -#define OPTION_MPTCP_FASTCLOSE BIT(9) -#define OPTION_MPTCP_PRIO BIT(10) +#define OPTION_MPTCP_RM_ADDR BIT(7) +#define OPTION_MPTCP_FASTCLOSE BIT(8) +#define OPTION_MPTCP_PRIO BIT(9) +#define OPTION_MPTCP_RST BIT(10) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -36,6 +36,7 @@ #define MPTCPOPT_MP_PRIO 5 #define MPTCPOPT_MP_FAIL 6 #define MPTCPOPT_MP_FASTCLOSE 7 +#define MPTCPOPT_RST 8 /* MPTCP suboption lengths */ #define TCPOLEN_MPTCP_MPC_SYN 4 @@ -61,10 +62,11 @@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 #define TCPOLEN_MPTCP_PORT_LEN 2 #define TCPOLEN_MPTCP_PORT_ALIGN 2 -#define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_RM_ADDR_BASE 3 #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 +#define TCPOLEN_MPTCP_RST 4 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -88,12 +90,13 @@ /* MPTCP ADD_ADDR flags */ #define MPTCP_ADDR_ECHO BIT(0) -#define MPTCP_ADDR_IPVERSION_4 4 -#define MPTCP_ADDR_IPVERSION_6 6 /* MPTCP MP_PRIO flags */ #define MPTCP_PRIO_BKUP BIT(0) +/* MPTCP TCPRST flags */ +#define MPTCP_RST_TRANSIENT BIT(0) + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_NOSPACE 1 @@ -104,6 +107,8 @@ #define MPTCP_PUSH_PENDING 6 #define MPTCP_CLEAN_UNA 7 #define MPTCP_ERROR_REPORT 8 +#define MPTCP_RETRANSMIT 9 +#define MPTCP_WORK_SYNC_SETSOCKOPT 10 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -122,11 +127,11 @@ struct mptcp_options_received { u16 mp_capable : 1, mp_join : 1, fastclose : 1, + reset : 1, dss : 1, add_addr : 1, rm_addr : 1, mp_prio : 1, - family : 4, echo : 1, backup : 1; u32 token; @@ -141,16 +146,11 @@ struct mptcp_options_received { ack64:1, mpc_map:1, __unused:2; - u8 addr_id; - u8 rm_id; - union { - struct in_addr addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - struct in6_addr addr6; -#endif - }; + struct mptcp_addr_info addr; + struct mptcp_rm_list rm_list; u64 ahmac; - u16 port; + u8 reset_reason:4; + u8 reset_transient:1; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) @@ -159,20 +159,6 @@ static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) ((nib & 0xF) << 8) | field); } -struct mptcp_addr_info { - sa_family_t family; - __be16 port; - u8 id; - u8 flags; - int ifindex; - union { - struct in_addr addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - struct in6_addr addr6; -#endif - }; -}; - enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, MPTCP_PM_ADD_ADDR_SEND_ACK, @@ -207,7 +193,8 @@ struct mptcp_pm_data { u8 local_addr_used; u8 subflows; u8 status; - u8 rm_id; + struct mptcp_rm_list rm_list_tx; + struct mptcp_rm_list rm_list_rx; }; struct mptcp_data_frag { @@ -269,6 +256,8 @@ struct mptcp_sock { u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; + + u32 setsockopt_seq; }; #define mptcp_lock_sock(___sk, cb) do { \ @@ -420,10 +409,15 @@ struct mptcp_subflow_context { u8 hmac[MPTCPOPT_HMAC_LEN]; u8 local_id; u8 remote_id; + u8 reset_seen:1; + u8 reset_transient:1; + u8 reset_reason:4; long delegated_status; struct list_head delegated_node; /* link into delegated_action, protected by local BH */ + u32 setsockopt_seq; + struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; @@ -543,12 +537,25 @@ struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *remote); + const struct mptcp_addr_info *remote, + u8 flags, int ifindex); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); +static inline bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ + if (subflow->request_join && !subflow->fully_established) + return false; + + /* only send if our side has not closed yet */ + return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} + static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { @@ -581,6 +588,11 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); +int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option); + void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); @@ -641,13 +653,16 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); -void mptcp_pm_subflow_established(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow); +void mptcp_pm_subflow_established(struct mptcp_sock *msk); void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); +void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, + struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); -void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, @@ -657,12 +672,15 @@ bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_addr_info *addr); +struct mptcp_pm_add_entry * +mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, + struct mptcp_addr_info *addr); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo, bool port); -int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); -int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); + bool echo); +int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); @@ -709,23 +727,38 @@ static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) return len; } +static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) +{ + if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX) + return -EINVAL; + + return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; +} + bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr, bool *echo, bool *port); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - u8 *rm_id); + struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_work(struct mptcp_sock *msk); -void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); -static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); + +void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); +void mptcp_sockopt_sync_all(struct mptcp_sock *msk); + +static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c new file mode 100644 index 000000000000..00d941b66c1e --- /dev/null +++ b/net/mptcp/sockopt.c @@ -0,0 +1,756 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2021, Red Hat. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <net/sock.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/mptcp.h> +#include "protocol.h" + +static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); + + if (likely(!__mptcp_check_fallback(msk))) + return NULL; + + return msk->first; +} + +static u32 sockopt_seq_reset(const struct sock *sk) +{ + sock_owned_by_me(sk); + + /* Highbits contain state. Allows to distinguish sockopt_seq + * of listener and established: + * s0 = new_listener() + * sockopt(s0) - seq is 1 + * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0) + * sockopt(s0) - seq increments to 2 on s0 + * sockopt(s1) // seq increments to 2 on s1 (different option) + * new ssk completes join, inherits options from s0 // seq 2 + * Needs sync from mptcp join logic, but ssk->seq == msk->seq + * + * Set High order bits to sk_state so ssk->seq == msk->seq test + * will fail. + */ + + return (u32)sk->sk_state << 24u; +} + +static void sockopt_seq_inc(struct mptcp_sock *msk) +{ + u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff; + + msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq; +} + +static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen, int *val) +{ + if (optlen < sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(val, optval, sizeof(*val))) + return -EFAULT; + + return 0; +} + +static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + + lock_sock(sk); + sockopt_seq_inc(msk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + switch (optname) { + case SO_DEBUG: + sock_valbool_flag(ssk, SOCK_DBG, !!val); + break; + case SO_KEEPALIVE: + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, !!val); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val); + break; + case SO_PRIORITY: + ssk->sk_priority = val; + break; + case SO_SNDBUF: + case SO_SNDBUFFORCE: + ssk->sk_userlocks |= SOCK_SNDBUF_LOCK; + WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + break; + case SO_RCVBUF: + case SO_RCVBUFFORCE: + ssk->sk_userlocks |= SOCK_RCVBUF_LOCK; + WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + break; + case SO_MARK: + if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { + ssk->sk_mark = sk->sk_mark; + sk_dst_reset(ssk); + } + break; + case SO_INCOMING_CPU: + WRITE_ONCE(ssk->sk_incoming_cpu, val); + break; + } + + subflow->setsockopt_seq = msk->setsockopt_seq; + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); +} + +static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val) +{ + sockptr_t optval = KERNEL_SOCKPTR(&val); + struct sock *sk = (struct sock *)msk; + int ret; + + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + optval, sizeof(val)); + if (ret) + return ret; + + mptcp_sol_socket_sync_intval(msk, optname, val); + return 0; +} + +static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) +{ + struct sock *sk = (struct sock *)msk; + + WRITE_ONCE(sk->sk_incoming_cpu, val); + + mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); +} + +static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + int val, ret; + + ret = mptcp_get_int_option(msk, optval, optlen, &val); + if (ret) + return ret; + + switch (optname) { + case SO_KEEPALIVE: + mptcp_sol_socket_sync_intval(msk, optname, val); + return 0; + case SO_DEBUG: + case SO_MARK: + case SO_PRIORITY: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + return mptcp_sol_socket_intval(msk, optname, val); + case SO_INCOMING_CPU: + mptcp_so_incoming_cpu(msk, val); + return 0; + } + + return -ENOPROTOOPT; +} + +static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct linger ling; + sockptr_t kopt; + int ret; + + if (optlen < sizeof(ling)) + return -EINVAL; + + if (copy_from_sockptr(&ling, optval, sizeof(ling))) + return -EFAULT; + + kopt = KERNEL_SOCKPTR(&ling); + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER, kopt, sizeof(ling)); + if (ret) + return ret; + + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + if (!ling.l_onoff) { + sock_reset_flag(ssk, SOCK_LINGER); + } else { + ssk->sk_lingertime = sk->sk_lingertime; + sock_set_flag(ssk, SOCK_LINGER); + } + + subflow->setsockopt_seq = msk->setsockopt_seq; + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); + return 0; +} + +static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret; + + switch (optname) { + case SO_REUSEPORT: + case SO_REUSEADDR: + case SO_BINDTODEVICE: + case SO_BINDTOIFINDEX: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + if (ret == 0) { + if (optname == SO_REUSEPORT) + sk->sk_reuseport = ssock->sk->sk_reuseport; + else if (optname == SO_REUSEADDR) + sk->sk_reuse = ssock->sk->sk_reuse; + else if (optname == SO_BINDTODEVICE) + sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + else if (optname == SO_BINDTOIFINDEX) + sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + } + release_sock(sk); + return ret; + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + case SO_MARK: + case SO_INCOMING_CPU: + case SO_DEBUG: + return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); + case SO_LINGER: + return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); + case SO_NO_CHECK: + case SO_DONTROUTE: + case SO_BROADCAST: + case SO_BSDCOMPAT: + case SO_PASSCRED: + case SO_PASSSEC: + case SO_RXQ_OVFL: + case SO_WIFI_STATUS: + case SO_NOFCS: + case SO_SELECT_ERR_QUEUE: + return 0; + } + + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); +} + +static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + switch (optname) { + case IPV6_V6ONLY: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + if (ret == 0) + sk->sk_ipv6only = ssock->sk->sk_ipv6only; + + release_sock(sk); + break; + } + + return ret; +} + +static bool mptcp_supported_sockopt(int level, int optname) +{ + if (level == SOL_SOCKET) { + switch (optname) { + case SO_DEBUG: + case SO_REUSEPORT: + case SO_REUSEADDR: + + /* the following ones need a better implementation, + * but are quite common we want to preserve them + */ + case SO_BINDTODEVICE: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_LINGER: + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + case SO_RCVLOWAT: + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + case SO_MARK: + case SO_INCOMING_CPU: + case SO_BINDTOIFINDEX: + case SO_BUSY_POLL: + case SO_PREFER_BUSY_POLL: + case SO_BUSY_POLL_BUDGET: + + /* next ones are no-op for plain TCP */ + case SO_NO_CHECK: + case SO_DONTROUTE: + case SO_BROADCAST: + case SO_BSDCOMPAT: + case SO_PASSCRED: + case SO_PASSSEC: + case SO_RXQ_OVFL: + case SO_WIFI_STATUS: + case SO_NOFCS: + case SO_SELECT_ERR_QUEUE: + return true; + } + + /* SO_OOBINLINE is not supported, let's avoid the related mess */ + /* SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, + * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, + * we must be careful with subflows + */ + /* SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks + * explicitly the sk_protocol field + */ + /* SO_PEEK_OFF is unsupported, as it is for plain TCP */ + /* SO_MAX_PACING_RATE is unsupported, we must be careful with subflows */ + /* SO_CNX_ADVICE is currently unsupported, could possibly be relevant, + * but likely needs careful design + */ + /* SO_ZEROCOPY is currently unsupported, TODO in sndmsg */ + /* SO_TXTIME is currently unsupported */ + return false; + } + if (level == SOL_IP) { + switch (optname) { + /* should work fine */ + case IP_FREEBIND: + case IP_TRANSPARENT: + + /* the following are control cmsg related */ + case IP_PKTINFO: + case IP_RECVTTL: + case IP_RECVTOS: + case IP_RECVOPTS: + case IP_RETOPTS: + case IP_PASSSEC: + case IP_RECVORIGDSTADDR: + case IP_CHECKSUM: + case IP_RECVFRAGSIZE: + + /* common stuff that need some love */ + case IP_TOS: + case IP_TTL: + case IP_BIND_ADDRESS_NO_PORT: + case IP_MTU_DISCOVER: + case IP_RECVERR: + + /* possibly less common may deserve some love */ + case IP_MINTTL: + + /* the following is apparently a no-op for plain TCP */ + case IP_RECVERR_RFC4884: + return true; + } + + /* IP_OPTIONS is not supported, needs subflow care */ + /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */ + /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF, + * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP, + * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE, + * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP, + * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, + * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal + * with mcast stuff + */ + /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */ + return false; + } + if (level == SOL_IPV6) { + switch (optname) { + case IPV6_V6ONLY: + + /* the following are control cmsg related */ + case IPV6_RECVPKTINFO: + case IPV6_2292PKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_2292HOPLIMIT: + case IPV6_RECVRTHDR: + case IPV6_2292RTHDR: + case IPV6_RECVHOPOPTS: + case IPV6_2292HOPOPTS: + case IPV6_RECVDSTOPTS: + case IPV6_2292DSTOPTS: + case IPV6_RECVTCLASS: + case IPV6_FLOWINFO: + case IPV6_RECVPATHMTU: + case IPV6_RECVORIGDSTADDR: + case IPV6_RECVFRAGSIZE: + + /* the following ones need some love but are quite common */ + case IPV6_TCLASS: + case IPV6_TRANSPARENT: + case IPV6_FREEBIND: + case IPV6_PKTINFO: + case IPV6_2292PKTOPTIONS: + case IPV6_UNICAST_HOPS: + case IPV6_MTU_DISCOVER: + case IPV6_MTU: + case IPV6_RECVERR: + case IPV6_FLOWINFO_SEND: + case IPV6_FLOWLABEL_MGR: + case IPV6_MINHOPCOUNT: + case IPV6_DONTFRAG: + case IPV6_AUTOFLOWLABEL: + + /* the following one is a no-op for plain TCP */ + case IPV6_RECVERR_RFC4884: + return true; + } + + /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are + * not supported + */ + /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF, + * IPV6_MULTICAST_IF, IPV6_ADDRFORM, + * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST, + * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP, + * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP, + * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER + * are not supported better not deal with mcast + */ + /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */ + + /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */ + /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */ + return false; + } + if (level == SOL_TCP) { + switch (optname) { + /* the following are no-op or should work just fine */ + case TCP_THIN_DUPACK: + case TCP_DEFER_ACCEPT: + + /* the following need some love */ + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_CONGESTION: + case TCP_ULP: + case TCP_CORK: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_SAVE_SYN: + case TCP_LINGER2: + case TCP_WINDOW_CLAMP: + case TCP_QUICKACK: + case TCP_USER_TIMEOUT: + case TCP_TIMESTAMP: + case TCP_NOTSENT_LOWAT: + case TCP_TX_DELAY: + return true; + } + + /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */ + + /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, + * TCP_REPAIR_WINDOW are not supported, better avoid this mess + */ + /* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE, + * are not supported fastopen is currently unsupported + */ + /* TCP_INQ is currently unsupported, needs some recvmsg work */ + } + return false; +} + +static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + char name[TCP_CA_NAME_MAX]; + bool cap_net_admin; + int ret; + + if (optlen < 1) + return -EINVAL; + + ret = strncpy_from_sockptr(name, optval, + min_t(long, TCP_CA_NAME_MAX - 1, optlen)); + if (ret < 0) + return -EFAULT; + + name[ret] = 0; + + cap_net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN); + + ret = 0; + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err; + + lock_sock(ssk); + err = tcp_set_congestion_control(ssk, name, true, cap_net_admin); + if (err < 0 && ret == 0) + ret = err; + subflow->setsockopt_seq = msk->setsockopt_seq; + release_sock(ssk); + } + + if (ret == 0) + tcp_set_congestion_control(sk, name, false, cap_net_admin); + + release_sock(sk); + return ret; +} + +static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + switch (optname) { + case TCP_ULP: + return -EOPNOTSUPP; + case TCP_CONGESTION: + return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen); + } + + return -EOPNOTSUPP; +} + +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sock *ssk; + + pr_debug("msk=%p", msk); + + if (!mptcp_supported_sockopt(level, optname)) + return -ENOPROTOOPT; + + if (level == SOL_SOCKET) + return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when TCP socket options are passed through + * to the one remaining subflow. + */ + lock_sock(sk); + ssk = __mptcp_tcp_fallback(msk); + release_sock(sk); + if (ssk) + return tcp_setsockopt(ssk, level, optname, optval, optlen); + + if (level == SOL_IPV6) + return mptcp_setsockopt_v6(msk, optname, optval, optlen); + + if (level == SOL_TCP) + return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen); + + return -EOPNOTSUPP; +} + +static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret = -EINVAL; + struct sock *ssk; + + lock_sock(sk); + ssk = msk->first; + if (ssk) { + ret = tcp_getsockopt(ssk, level, optname, optval, optlen); + goto out; + } + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + goto out; + + ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen); + +out: + release_sock(sk); + return ret; +} + +static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, + char __user *optval, int __user *optlen) +{ + switch (optname) { + case TCP_ULP: + case TCP_CONGESTION: + case TCP_INFO: + case TCP_CC_INFO: + return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, + optval, optlen); + } + return -EOPNOTSUPP; +} + +int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sock *ssk; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when socket options are passed through + * to the one remaining subflow. + */ + lock_sock(sk); + ssk = __mptcp_tcp_fallback(msk); + release_sock(sk); + if (ssk) + return tcp_getsockopt(ssk, level, optname, optval, option); + + if (level == SOL_TCP) + return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); + return -EOPNOTSUPP; +} + +static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) +{ + static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; + struct sock *sk = (struct sock *)msk; + + if (ssk->sk_prot->keepalive) { + if (sock_flag(sk, SOCK_KEEPOPEN)) + ssk->sk_prot->keepalive(ssk, 1); + else + ssk->sk_prot->keepalive(ssk, 0); + } + + ssk->sk_priority = sk->sk_priority; + ssk->sk_bound_dev_if = sk->sk_bound_dev_if; + ssk->sk_incoming_cpu = sk->sk_incoming_cpu; + + if (sk->sk_userlocks & tx_rx_locks) { + ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks; + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + } + + if (sock_flag(sk, SOCK_LINGER)) { + ssk->sk_lingertime = sk->sk_lingertime; + sock_set_flag(ssk, SOCK_LINGER); + } else { + sock_reset_flag(ssk, SOCK_LINGER); + } + + if (sk->sk_mark != ssk->sk_mark) { + ssk->sk_mark = sk->sk_mark; + sk_dst_reset(ssk); + } + + sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG)); + + if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops) + tcp_set_congestion_control(ssk, inet_csk(sk)->icsk_ca_ops->name, false, true); +} + +static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) +{ + bool slow = lock_sock_fast(ssk); + + sync_socket_options(msk, ssk); + + unlock_sock_fast(ssk, slow); +} + +void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + msk_owned_by_me(msk); + + if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { + __mptcp_sockopt_sync(msk, ssk); + + subflow->setsockopt_seq = msk->setsockopt_seq; + } +} + +void mptcp_sockopt_sync_all(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + u32 seq; + + seq = sockopt_seq_reset(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u32 sseq = READ_ONCE(subflow->setsockopt_seq); + + if (sseq != msk->setsockopt_seq) { + __mptcp_sockopt_sync(msk, ssk); + WRITE_ONCE(subflow->setsockopt_seq, seq); + } else if (sseq != seq) { + WRITE_ONCE(subflow->setsockopt_seq, seq); + } + + cond_resched(); + } + + msk->setsockopt_seq = seq; +} diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d17d39ccdf34..82e91b00ad39 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -25,6 +25,8 @@ #include "protocol.h" #include "mib.h" +#include <trace/events/mptcp.h> + static void mptcp_subflow_ops_undo_override(struct sock *ssk); static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, @@ -115,6 +117,16 @@ static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } +static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) +{ + struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + + if (mpext) { + memset(mpext, 0, sizeof(*mpext)); + mpext->reset_reason = reason; + } +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset @@ -165,6 +177,7 @@ again: if (mptcp_token_exists(subflow_req->token)) { if (retries-- > 0) goto again; + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else { subflow_req->mp_capable = 1; } @@ -176,6 +189,8 @@ again: subflow_req->mp_capable = 1; else if (retries-- > 0) goto again; + else + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else if (mp_opt.mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; @@ -187,8 +202,10 @@ again: subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. */ - if (!subflow_req->msk) + if (!subflow_req->msk) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; + } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d", @@ -392,12 +409,15 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_key = mp_opt.sndr_key; pr_debug("subflow=%p, remote_key=%llu", subflow, subflow->remote_key); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!mp_opt.mp_join) + if (!mp_opt.mp_join) { + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; + } subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; @@ -406,6 +426,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } @@ -434,6 +455,7 @@ fallback: return; do_reset: + subflow->reset_transient = 0; mptcp_subflow_reset(sk); } @@ -650,13 +672,18 @@ create_child: * to reset the context to non MPTCP status. */ if (!ctx || fallback) { - if (fallback_is_fatal) + if (fallback_is_fatal) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; + } subflow_drop_ctx(child); goto out; } + /* ssk inherits options of listener sk */ + ctx->setsockopt_seq = listener->setsockopt_seq; + if (ctx->mp_capable) { /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space @@ -672,6 +699,7 @@ create_child: * created mptcp socket */ new_msk->sk_destruct = mptcp_sock_destruct; + mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; @@ -686,8 +714,10 @@ create_child: struct mptcp_sock *owner; owner = subflow_req->msk; - if (!owner) + if (!owner) { + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; + } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; @@ -834,9 +864,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, goto validate_seq; } - pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d", - mpext->data_seq, mpext->dsn64, mpext->subflow_seq, - mpext->data_len, mpext->data_fin); + trace_get_mapping_status(mpext); data_len = mpext->data_len; if (data_len == 0) { @@ -974,8 +1002,6 @@ static bool subflow_check_data_avail(struct sock *ssk) struct mptcp_sock *msk; struct sk_buff *skb; - pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, - subflow->data_avail, skb_peek(&ssk->sk_receive_queue)); if (!skb_peek(&ssk->sk_receive_queue)) subflow->data_avail = 0; if (subflow->data_avail) @@ -987,7 +1013,7 @@ static bool subflow_check_data_avail(struct sock *ssk) u64 old_ack; status = get_mapping_status(ssk, msk); - pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); + trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (status == MAPPING_INVALID) { ssk->sk_err = EBADMSG; goto fatal; @@ -1052,6 +1078,8 @@ fatal: smp_wmb(); ssk->sk_error_report(ssk); tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); subflow->data_avail = 0; return false; @@ -1081,7 +1109,7 @@ bool mptcp_subflow_data_available(struct sock *sk) * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, - * as far as mptcp peer is concerened that data is still inflight. + * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue. */ void mptcp_space(const struct sock *ssk, int *space, int *full_space) @@ -1230,7 +1258,8 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *remote) + const struct mptcp_addr_info *remote, + u8 flags, int ifindex) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; @@ -1274,7 +1303,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - ssk->sk_bound_dev_if = loc->ifindex; + ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; @@ -1286,10 +1315,11 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->local_id = local_id; subflow->remote_id = remote_id; subflow->request_join = 1; - subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr, ssk->sk_family); mptcp_add_pending_subflow(msk, subflow); + mptcp_sockopt_sync(msk, ssk); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) goto failed_unlink; diff --git a/net/mptcp/token.c b/net/mptcp/token.c index feb4b9ffd462..8f0270a780ce 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -402,7 +402,7 @@ void __init mptcp_token_init(void) } } -#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) EXPORT_SYMBOL_GPL(mptcp_token_new_request); EXPORT_SYMBOL_GPL(mptcp_token_new_connect); EXPORT_SYMBOL_GPL(mptcp_token_accept); |