diff options
Diffstat (limited to 'net/ipv4')
34 files changed, 1438 insertions, 855 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e31108e5ef79..ce4aa827be05 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -217,11 +217,12 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); @@ -826,6 +827,7 @@ int inet_shutdown(struct socket *sock, int how) err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for POLLHUP, even on eg. unconnected UDP sockets -- RR */ + /* fall through */ default: sk->sk_shutdown |= how; if (sk->sk_prot->shutdown) @@ -839,7 +841,7 @@ int inet_shutdown(struct socket *sock, int how) case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; - /* Fall through */ + /* fall through */ case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7c45b8896709..a8d7c5a9fb05 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1180,6 +1180,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; + /* fall through */ case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..a4573bccd6da 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { - if (ifa->ifa_local == addr) { - struct net_device *dev = ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; - result = dev; - break; - } - } - if (!result) { + ifa = inet_lookup_ifaddr_rcu(net, addr); + if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; @@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); + } else { + result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); @@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) } EXPORT_SYMBOL(__ip_dev_find); +/* called under RCU lock */ +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) +{ + u32 hash = inet_addr_hash(net, addr); + struct in_ifaddr *ifa; + + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) + if (ifa->ifa_local == addr && + net_eq(dev_net(ifa->ifa_dev->dev), net)) + return ifa; + + return NULL; +} + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -438,7 +444,7 @@ static void check_lifetime(struct work_struct *work); static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -483,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; + ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); @@ -515,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { - return __inet_insert_ifa(ifa, NULL, 0); + return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) @@ -896,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, + extack); } else { inet_free_ifa(ifa); @@ -1516,6 +1524,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ + /* fall through */ case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; @@ -1751,7 +1760,7 @@ static int inet_validate_link_af(const struct net_device *dev, struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rtnl(dev)) + if (dev && !__in_dev_get_rcu(dev)) return -EAFNOSUPPORT; err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); @@ -1775,7 +1784,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..f52d27a422c3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -73,6 +73,11 @@ fail: fib_free_table(main_table); return -ENOMEM; } + +static bool fib4_has_custom_rules(struct net *net) +{ + return false; +} #else struct fib_table *fib_new_table(struct net *net, u32 id) @@ -128,6 +133,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } return NULL; } + +static bool fib4_has_custom_rules(struct net *net) +{ + return net->ipv4.fib_has_custom_rules; +} #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, @@ -345,9 +355,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(net) && - (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) - goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -402,13 +409,28 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + struct net *net = dev_net(dev); - if (!r && !fib_num_tclassid_users(dev_net(dev)) && - IN_DEV_ACCEPT_LOCAL(idev) && + if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + if (IN_DEV_ACCEPT_LOCAL(idev)) + goto ok; + /* with custom local routes in place, checking local addresses + * only will be too optimistic, with custom rules, checking + * local addresses only can be too strict, e.g. due to vrf + */ + if (net->ipv4.fib_has_custom_local_routes || + fib4_has_custom_rules(net)) + goto full_check; + if (inet_lookup_ifaddr_rcu(net, src)) + return -EINVAL; + +ok: *itag = 0; return 0; } + +full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } @@ -759,6 +781,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, } err = fib_table_insert(net, tb, &cfg, extack); + if (!err && cfg.fc_type == RTN_LOCAL) + net->ipv4.fib_has_custom_local_routes = true; errout: return err; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 01ed22139ac2..589caaa90613 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); } - -static inline void fib_add_weight(struct fib_info *fi, - const struct fib_nh *nh) -{ - fi->fib_weight += nh->nh_weight; -} - #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define fib_rebalance(fi) do { } while (0) -#define fib_add_weight(fi, nh) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -774,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh, struct netlink_ext_ack *extack) +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -1258,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh, extack); + err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) @@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); - fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) fib_rebalance(fi); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c636650a6a70..5ddc4aefff12 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -87,32 +87,32 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa) { struct fib_entry_notifier_info info = { .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } @@ -1216,9 +1216,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, fi, - new_fa->fa_tos, cfg->fc_type, - tb->tb_id); + key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1273,8 +1271,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type, - tb->tb_id); + call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1574,8 +1571,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete->fa_info, tos, - fa_to_delete->fa_type, tb->tb_id); + fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1892,9 +1888,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); + KEYLENGTH - fa->fa_slen, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -1932,8 +1927,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, continue; call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, - fa->fa_type, fa->tb_id); + KEYLENGTH - fa->fa_slen, fa); } } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 681e33998e03..1617604c9284 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -782,7 +782,7 @@ static bool icmp_tag_validation(int proto) } /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ @@ -810,7 +810,8 @@ static bool icmp_unreach(struct sk_buff *skb) if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; - if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->type) { + case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: @@ -846,8 +847,16 @@ static bool icmp_unreach(struct sk_buff *skb) } if (icmph->code > NR_ICMP_UNREACH) goto out; - } else if (icmph->type == ICMP_PARAMETERPROB) + break; + case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; + break; + case ICMP_TIME_EXCEEDED: + __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS); + if (icmph->code == ICMP_EXC_FRAGTIME) + goto out; + break; + } /* * Throw it at our lower layers @@ -959,8 +968,9 @@ static bool icmp_timestamp(struct sk_buff *skb) */ icmp_param.data.times[1] = inet_current_timestamp(); icmp_param.data.times[2] = icmp_param.data.times[1]; - if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) - BUG(); + + BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)); + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; icmp_param.data.icmph.code = 0; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b47a59cb3573..4ca46dc08e63 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -39,11 +39,11 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ -static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, - const struct in6_addr *sk2_rcv_saddr6, - __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk1_ipv6only, bool sk2_ipv6only, - bool match_wildcard) +static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, + const struct in6_addr *sk2_rcv_saddr6, + __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk1_ipv6only, bool sk2_ipv6only, + bool match_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -52,29 +52,29 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; + return true; if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; + return true; if (addr_type == IPV6_ADDR_ANY && match_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; + return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) - return 1; + return true; - return 0; + return false; } #endif @@ -82,20 +82,20 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, * match_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk2_ipv6only, bool match_wildcard) +static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk2_ipv6only, bool match_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) @@ -495,17 +495,15 @@ EXPORT_SYMBOL(inet_csk_accept); * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)) + void (*retransmit_handler)(struct timer_list *t), + void (*delack_handler)(struct timer_list *t), + void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); - setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, - (unsigned long)sk); - setup_timer(&icsk->icsk_delack_timer, delack_handler, - (unsigned long)sk); - setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); + timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); + timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); + timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); @@ -676,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); -static void reqsk_timer_handler(unsigned long data) +static void reqsk_timer_handler(struct timer_list *t) { - struct request_sock *req = (struct request_sock *)data; + struct request_sock *req = from_timer(req, t, rsk_timer); struct sock *sk_listener = req->rsk_listener; struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); @@ -749,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, req->num_timeout = 0; req->sk = NULL; - setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler, - (unsigned long)req); + timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..7f3ef5c287a1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) spin_unlock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire((unsigned long) fq); + f->frag_expire(&fq->timer); return evicted; } @@ -366,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, f->constructor(q, arg); add_frag_mem_limit(nf, f->qsize); - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); refcount_set(&q->refcnt, 1); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b039159e67a..a4bab81f1462 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -142,9 +142,9 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -static void tw_timer_handler(unsigned long data) +static void tw_timer_handler(struct timer_list *t) { - struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; + struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); if (tw->tw_kill) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); @@ -188,8 +188,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - setup_pinned_timer(&tw->tw_timer, tw_timer_handler, - (unsigned long)tw); + timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index b20c8ac64081..914d56928578 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *p, *n; + struct rb_node *p = rb_first(&base->rb_root); - rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { - inet_putpeer(p); + while (p) { + struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); + + p = rb_next(p); + rb_erase(&peer->rb_node, &base->rb_root); + inet_putpeer(peer); cond_resched(); } - base->rb_root = RB_ROOT; base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index df8fe0503de0..bbf1b94942c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -191,12 +191,13 @@ static bool frag_expire_skip_icmp(u32 user) /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ -static void ip_expire(unsigned long arg) +static void ip_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct ipq *qp; struct net *net; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); rcu_read_lock(); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 467e44d7587d..c105a315b1a3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -579,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, false)) goto err_free_rt; - if (skb->len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -731,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len - dev->hard_header_len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -1011,15 +1011,14 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit = ipgre_exit_net, + .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1542,15 +1541,14 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_net(struct net *net) +static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, + .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1561,16 +1559,14 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_net(struct net *net) +static void __net_exit erspan_exit_batch_net(struct list_head *net_list) { - struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); - - ip_tunnel_delete_net(itn, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit = erspan_exit_net, + .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e9805ad664ac..fe6fee728ce4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, } } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, + struct rtnl_link_ops *ops) { + struct ip_tunnel_net *itn; + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list, ops); + list_for_each_entry(net, net_list, exit_list) { + itn = net_generic(net, id); + ip_tunnel_destroy(itn, &list, ops); + } unregister_netdevice_many(&list); rtnl_unlock(); } -EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 89453cf62158..949f432a5f04 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -198,15 +198,6 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); @@ -453,15 +444,14 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_net(struct net *net) +static void __net_exit vti_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - ip_tunnel_delete_net(itn, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit = vti_exit_net, + .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index cdd627355ed1..c891235b4966 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -659,15 +659,14 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit = ipip_exit_net, + .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..40a43ad294cb 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/nexthop.h> +#include <net/switchdev.h> struct ipmr_rule { struct fib_rule common; @@ -264,6 +265,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +315,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +620,82 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -594,6 +703,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { + struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; @@ -603,6 +713,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; + if (VIF_EXISTS(mrt, vifi)) + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); + write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; @@ -652,10 +766,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -754,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -828,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; @@ -851,6 +976,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, if (vifi+1 > mrt->maxvif) mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); return 0; } @@ -949,6 +1075,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1150,6 +1277,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ @@ -1161,8 +1289,9 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1189,6 +1318,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1238,6 +1369,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1245,6 +1377,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c, *tmp; LIST_HEAD(list); int i; @@ -1263,8 +1396,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { @@ -1393,6 +1528,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; + /* fall through */ case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) { @@ -1724,10 +1860,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } +#ifdef CONFIG_NET_SWITCHDEV +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + struct vif_device *out_vif = &mrt->vif_table[out_vifi]; + struct vif_device *in_vif = &mrt->vif_table[in_vifi]; + + if (!skb->offload_mr_fwd_mark) + return false; + if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) + return false; + return netdev_phys_item_id_same(&out_vif->dev_parent_id, + &in_vif->dev_parent_id); +} +#else +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + return false; +} +#endif + /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, + struct mfc_cache *c, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -1748,6 +1907,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -1925,8 +2087,8 @@ forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, - psend); + ipmr_queue_xmit(net, mrt, true_vifi, + skb2, cache, psend); } psend = ct; } @@ -1937,9 +2099,10 @@ last_forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb2, + cache, psend); } else { - ipmr_queue_xmit(net, mrt, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); return; } } @@ -2156,6 +2319,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; @@ -3048,14 +3214,87 @@ static const struct net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block *nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +static int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3072,7 +3311,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3082,6 +3323,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); ipmr_rules_exit(net); } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index a0f37b208268..0443ca4120b0 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -276,7 +276,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3d9f1c2f81c5..bc40bd411196 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1250,7 +1250,7 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size, + unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, ip_rt_min_advmss); return min(advmss, IPV4_MAX_PMTU - header_size); @@ -3038,7 +3038,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { - int rc = 0; int cpu; ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); @@ -3095,7 +3094,7 @@ int __init ip_rt_init(void) #endif register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); - return rc; + return 0; } #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 77cf32a80952..fda37f2862c9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -385,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0989e739d098..a82b44038308 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,6 +26,7 @@ #include <net/inet_frag.h> #include <net/ping.h> #include <net/protocol.h> +#include <net/netevent.h> static int zero; static int one = 1; @@ -252,10 +253,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; @@ -266,7 +269,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -283,12 +286,8 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, NULL, user_key, + TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -359,11 +358,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -386,15 +387,25 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_policy); + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + + return ret; +} +#endif + static struct ctl_table ipv4_table[] = { { - .procname = "tcp_retrans_collapse", - .data = &sysctl_tcp_retrans_collapse, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -402,48 +413,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, - { - .procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, - { - .procname = "tcp_abort_on_overflow", - .data = &sysctl_tcp_abort_on_overflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_stdurg", - .data = &sysctl_tcp_stdurg, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_rfc1337", - .data = &sysctl_tcp_rfc1337, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -465,34 +434,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "tcp_fack", - .data = &sysctl_tcp_fack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_recovery", - .data = &sysctl_tcp_recovery, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_max_reordering", - .data = &sysctl_tcp_max_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_dsack", - .data = &sysctl_tcp_dsack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_mem", .maxlen = sizeof(sysctl_tcp_mem), .data = &sysctl_tcp_mem, @@ -516,36 +457,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &one, }, { - .procname = "tcp_app_win", - .data = &sysctl_tcp_app_win, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_adv_win_scale", - .data = &sysctl_tcp_adv_win_scale, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_adv_win_scale_min, - .extra2 = &tcp_adv_win_scale_max, - }, - { - .procname = "tcp_frto", - .data = &sysctl_tcp_frto, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_min_rtt_wlen", - .data = &sysctl_tcp_min_rtt_wlen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), @@ -553,60 +464,11 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_no_metrics_save", - .data = &sysctl_tcp_nometrics_save, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_moderate_rcvbuf", - .data = &sysctl_tcp_moderate_rcvbuf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_tso_win_divisor", - .data = &sysctl_tcp_tso_win_divisor, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "tcp_congestion_control", .mode = 0644, .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_workaround_signed_windows", - .data = &sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_limit_output_bytes", - .data = &sysctl_tcp_limit_output_bytes, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_challenge_ack_limit", - .data = &sysctl_tcp_challenge_ack_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_slow_start_after_idle", - .data = &sysctl_tcp_slow_start_after_idle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -650,65 +512,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_allowed_congestion_control, }, { - .procname = "tcp_thin_linear_timeouts", - .data = &sysctl_tcp_thin_linear_timeouts, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_early_retrans", - .data = &sysctl_tcp_early_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &four, - }, - { - .procname = "tcp_min_tso_segs", - .data = &sysctl_tcp_min_tso_segs, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &gso_max_segs, - }, - { - .procname = "tcp_pacing_ss_ratio", - .data = &sysctl_tcp_pacing_ss_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, - { - .procname = "tcp_pacing_ca_ratio", - .data = &sysctl_tcp_pacing_ca_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, - { - .procname = "tcp_autocorking", - .data = &sysctl_tcp_autocorking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "tcp_invalid_ratelimit", - .data = &sysctl_tcp_invalid_ratelimit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, - { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, .mode = 0444, @@ -1086,6 +889,28 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", @@ -1101,7 +926,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, .extra2 = &one, }, @@ -1145,6 +970,200 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_early_retrans", + .data = &init_net.ipv4.sysctl_tcp_early_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &four, + }, + { + .procname = "tcp_recovery", + .data = &init_net.ipv4.sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_slow_start_after_idle", + .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_retrans_collapse", + .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_stdurg", + .data = &init_net.ipv4.sysctl_tcp_stdurg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_rfc1337", + .data = &init_net.ipv4.sysctl_tcp_rfc1337, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_abort_on_overflow", + .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fack", + .data = &init_net.ipv4.sysctl_tcp_fack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_reordering", + .data = &init_net.ipv4.sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_dsack", + .data = &init_net.ipv4.sysctl_tcp_dsack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_app_win", + .data = &init_net.ipv4.sysctl_tcp_app_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_adv_win_scale", + .data = &init_net.ipv4.sysctl_tcp_adv_win_scale, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, + }, + { + .procname = "tcp_frto", + .data = &init_net.ipv4.sysctl_tcp_frto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_no_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_nometrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_moderate_rcvbuf", + .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_tso_win_divisor", + .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_workaround_signed_windows", + .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_limit_output_bytes", + .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_challenge_ack_limit", + .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_min_tso_segs", + .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &gso_max_segs, + }, + { + .procname = "tcp_min_rtt_wlen", + .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_autocorking", + .data = &init_net.ipv4.sysctl_tcp_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "tcp_invalid_ratelimit", + .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "tcp_pacing_ss_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..a7a0f316eb86 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> +#include <linux/static_key.h> #include <net/icmp.h> #include <net/inet_common.h> @@ -282,9 +283,7 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> -int sysctl_tcp_min_tso_segs __read_mostly = 2; - -int sysctl_tcp_autocorking __read_mostly = 1; +#include <trace/events/tcp.h> struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -300,6 +299,11 @@ EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +#if IS_ENABLED(CONFIG_SMC) +DEFINE_STATIC_KEY_FALSE(tcp_have_smc); +EXPORT_SYMBOL(tcp_have_smc); +#endif + /* * Current number of TCP sockets. */ @@ -413,8 +417,10 @@ void tcp_init_sock(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; + sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -456,8 +462,22 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) +void tcp_init_transfer(struct sock *sk, int bpf_op) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + tcp_call_bpf(sk, bpf_op); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { + struct sk_buff *skb = tcp_write_queue_tail(sk); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -675,7 +695,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && - sysctl_tcp_autocorking && + sock_net(sk)->ipv4.sysctl_tcp_autocorking && skb != tcp_write_queue_head(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } @@ -686,10 +706,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!tcp_send_head(sk)) - return; - skb = tcp_write_queue_tail(sk); + if (!skb) + return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); @@ -869,6 +888,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); @@ -948,14 +968,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + if (!skb || (copy = size_goal - skb->len) <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + tcp_rtx_and_write_queues_empty(sk)); if (!skb) goto wait_for_memory; @@ -1027,7 +1047,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sk->sk_tsflags); if (!(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } @@ -1126,7 +1146,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -1183,7 +1203,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; + skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; @@ -1259,7 +1279,7 @@ restart: int max = size_goal; skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { + if (skb) { if (skb->ip_summed == CHECKSUM_NONE) max = mss_now; copy = max - skb->len; @@ -1279,7 +1299,7 @@ new_segment: process_backlog = false; goto restart; } - first_skb = skb_queue_empty(&sk->sk_write_queue); + first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, @@ -1404,7 +1424,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: @@ -1505,6 +1525,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } + skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -2017,6 +2044,8 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + trace_tcp_set_state(sk, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) @@ -2304,6 +2333,37 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +static void tcp_rtx_queue_purge(struct sock *sk) +{ + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + /* Since we are deleting whole queue, no need to + * list_del(&skb->tcp_tsorted_anchor) + */ + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); + } +} + +void tcp_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + sk_wmem_free_skb(sk, skb); + } + tcp_rtx_queue_purge(sk); + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); + sk_mem_reclaim(sk); + tcp_clear_all_retrans_hints(tcp_sk(sk)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2362,7 +2422,6 @@ int tcp_disconnect(struct sock *sk, int flags) * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; - tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(sk->sk_rx_dst); @@ -2454,7 +2513,7 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if (sysctl_tcp_fack) + if (sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: @@ -2518,6 +2577,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + + if (optlen != sizeof(key)) + return -EINVAL; + + if (copy_from_user(key, optval, optlen)) + return -EFAULT; + + return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + } default: /* fallthru */ break; @@ -2749,7 +2819,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { @@ -2759,7 +2829,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else @@ -2768,6 +2838,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EOPNOTSUPP; } break; + case TCP_FASTOPEN_NO_COOKIE: + if (val > 1 || val < 0) + err = -EINVAL; + else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + err = -EINVAL; + else + tp->fastopen_no_cookie = val; + break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; @@ -3104,6 +3182,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctx; + + if (get_user(len, optlen)) + return -EFAULT; + + rcu_read_lock(); + ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); + if (ctx) + memcpy(key, ctx->key, sizeof(key)); + else + len = 0; + rcu_read_unlock(); + + len = min_t(unsigned int, len, sizeof(key)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, key, len)) + return -EFAULT; + return 0; + } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; @@ -3166,6 +3266,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_connect; break; + case TCP_FASTOPEN_NO_COOKIE: + val = tp->fastopen_no_cookie; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 66ac69f7bd19..06fbe102a425 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -389,7 +389,7 @@ static void tcp_cdg_release(struct sock *sk) kfree(ca->gradients); } -struct tcp_congestion_ops tcp_cdg __read_mostly = { +static struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index fbbeda647774..78c192ee03a4 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -10,15 +10,18 @@ #include <net/inetpeer.h> #include <net/tcp.h> -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -26,8 +29,8 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key)) && publish) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -38,10 +41,37 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_destroy_cipher(struct sock *sk) +{ + struct tcp_fastopen_context *ctx; + + ctx = rcu_dereference_protected( + inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); + if (ctx) + call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); +} + +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, + void *key, unsigned int len) { - int err; struct tcp_fastopen_context *ctx, *octx; + struct fastopen_queue *q; + int err; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -62,26 +92,37 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + if (sk) { + q = &inet_csk(sk)->icsk_accept_queue.fastopenq; + octx = rcu_dereference_protected(q->ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(q->ctx, ctx); + } else { + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + } + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + + ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); + if (!ctx) + ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -97,7 +138,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. */ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct sock *sk, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -105,7 +147,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(sk, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -113,13 +155,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(sk, buf, foc); } } #endif @@ -217,12 +259,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_buffer_space(child); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; @@ -272,33 +309,45 @@ static bool tcp_fastopen_queue_check(struct sock *sk) return true; } +static bool tcp_fastopen_no_cookie(const struct sock *sk, + const struct dst_entry *dst, + int flag) +{ + return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) || + tcp_sk(sk)->fastopen_no_cookie || + (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); +} + /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && + tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { @@ -331,6 +380,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { unsigned long last_syn_loss = 0; + const struct dst_entry *dst; int syn_loss = 0; tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); @@ -348,7 +398,9 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + dst = __sk_dst_get(sk); + + if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } @@ -402,25 +454,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. */ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -429,17 +472,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -458,27 +502,25 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node *p; - struct sk_buff *skb; struct dst_entry *dst; + struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { - p = rb_first(&tp->out_of_order_queue); - if (p && !rb_next(p)) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); + if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5a87a00641d3..8393b405ea98 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,25 +76,10 @@ #include <linux/ipsec.h> #include <asm/unaligned.h> #include <linux/errqueue.h> +#include <trace/events/tcp.h> +#include <linux/static_key.h> -int sysctl_tcp_fack __read_mostly; -int sysctl_tcp_max_reordering __read_mostly = 300; -int sysctl_tcp_dsack __read_mostly = 1; -int sysctl_tcp_app_win __read_mostly = 31; -int sysctl_tcp_adv_win_scale __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); - -/* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 1000; - -int sysctl_tcp_stdurg __read_mostly; -int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly = 2; -int sysctl_tcp_min_rtt_wlen __read_mostly = 300; -int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_early_retrans __read_mostly = 3; -int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -368,8 +353,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(skb->truesize) >> 1; - int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; + int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; + int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -394,7 +379,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, skb->truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); @@ -420,7 +405,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency * Allow enough cushion so that sender is not limited by our window */ - if (sysctl_tcp_moderate_rcvbuf) + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) @@ -432,6 +417,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) */ void tcp_init_buffer_space(struct sock *sk) { + int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -450,14 +436,14 @@ void tcp_init_buffer_space(struct sock *sk) if (tp->window_clamp >= maxwin) { tp->window_clamp = maxwin; - if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) + if (tcp_app_win && maxwin > 4 * tp->advmss) tp->window_clamp = max(maxwin - - (maxwin >> sysctl_tcp_app_win), + (maxwin >> tcp_app_win), 4 * tp->advmss); } /* Force reservation of one segment. */ - if (sysctl_tcp_app_win && + if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); @@ -610,7 +596,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * <prev RTT . ><current RTT .. ><next RTT .... > */ - if (sysctl_tcp_moderate_rcvbuf && + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvwin, rcvmem, rcvbuf; @@ -634,7 +620,7 @@ void tcp_rcv_space_adjust(struct sock *sk) } rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) + while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); @@ -781,15 +767,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -/* Set the sk_pacing_rate to allow proper sizing of TSO packets. - * Note: TCP stack does not yet implement pacing. - * FQ packet scheduler can be used to implement cheap but effective - * TCP pacing, to smooth the burst on large writes when packets - * in flight is significantly lower than cwnd (or rwin) - */ -int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; -int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; - static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -807,9 +784,9 @@ static void tcp_update_pacing_rate(struct sock *sk) * end of slow start and should slow down. */ if (tp->snd_cwnd < tp->snd_ssthresh / 2) - rate *= sysctl_tcp_pacing_ss_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; else - rate *= sysctl_tcp_pacing_ca_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); @@ -891,7 +868,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, return; if (metric > tp->reordering) { - tp->reordering = min(sysctl_tcp_max_reordering, metric); + tp->reordering = min(sock_net(sk)->ipv4.sysctl_tcp_max_reordering, metric); #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", @@ -1143,6 +1120,7 @@ struct tcp_sacktag_state { u64 last_sackt; struct rate_sample *rate; int flag; + unsigned int mss_now; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1192,7 +1170,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, if (pkt_len >= skb->len && !in_sack) return 0; - err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); + err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1289,13 +1268,13 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ -static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *prev = tcp_write_queue_prev(sk, skb); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ @@ -1364,8 +1343,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); @@ -1415,9 +1393,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto fallback; /* Can only happen with delayed DSACK + discard craziness */ - if (unlikely(skb == tcp_write_queue_head(sk))) + prev = skb_rb_prev(skb); + if (!prev) goto fallback; - prev = tcp_write_queue_prev(sk, skb); if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; @@ -1496,18 +1474,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) + if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ - if (prev == tcp_write_queue_tail(sk)) + skb = skb_rb_next(prev); + if (!skb) goto out; - skb = tcp_write_queue_next(sk, prev); if (!skb_can_shift(skb) || - (skb == tcp_send_head(sk)) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; @@ -1515,7 +1492,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), + len, mss, 0); } out: @@ -1539,13 +1517,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; - if (skb == tcp_send_head(sk)) - break; - /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; @@ -1594,6 +1569,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1605,23 +1582,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, return skb; } -/* Avoid all extra work that is being done by sacktag while walking in - * a normal way - */ +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, + struct tcp_sacktag_state *state, + u32 seq) +{ + struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; + struct sk_buff *skb; + int unack_bytes; + + while (*p) { + parent = *p; + skb = rb_to_skb(parent); + if (before(seq, TCP_SKB_CB(skb)->seq)) { + p = &parent->rb_left; + continue; + } + if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { + p = &parent->rb_right; + continue; + } + + state->fack_count = 0; + unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; + if (state->mss_now && unack_bytes > 0) + state->fack_count = unack_bytes / state->mss_now; + + return skb; + } + return NULL; +} + static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, struct tcp_sacktag_state *state, u32 skip_to_seq) { - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) - break; + if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) + return skb; - state->fack_count += tcp_skb_pcount(skb); - } - return skb; + return tcp_sacktag_bsearch(sk, state, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, @@ -1743,8 +1741,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } } - skb = tcp_write_queue_head(sk); + state->mss_now = tcp_current_mss(sk); state->fack_count = 0; + skb = NULL; i = 0; if (!tp->sacked_out) { @@ -1968,7 +1967,7 @@ void tcp_enter_loss(struct sock *sk) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); @@ -1977,10 +1976,7 @@ void tcp_enter_loss(struct sock *sk) } tcp_clear_all_retrans_hints(tp); - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - + skb_rbtree_walk_from(skb) { mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || is_reneg); if (mark_lost) @@ -2014,7 +2010,7 @@ void tcp_enter_loss(struct sock *sk) * falsely raise the receive window, which results in repeated * timeouts and stop-and-go behavior. */ - tp->frto = sysctl_tcp_frto && + tp->frto = net->ipv4.sysctl_tcp_frto && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } @@ -2206,20 +2202,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); - if (tp->lost_skb_hint) { - skb = tp->lost_skb_hint; - cnt = tp->lost_cnt_hint; + skb = tp->lost_skb_hint; + if (skb) { /* Head already handled? */ - if (mark_head && skb != tcp_write_queue_head(sk)) + if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) return; + cnt = tp->lost_cnt_hint; } else { - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); cnt = 0; } - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk_from(skb) { /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -2243,7 +2237,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) /* If needed, chop off the prefix to mark as lost. */ lost = (packets - oldcnt) * mss; if (lost < skb->len && - tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2327,16 +2322,16 @@ static bool tcp_any_retrans_done(const struct sock *sk) if (tp->retrans_out) return true; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) return true; return false; } -#if FASTRETRANS_DEBUG > 1 static void DBGUNDO(struct sock *sk, const char *msg) { +#if FASTRETRANS_DEBUG > 1 struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -2358,10 +2353,8 @@ static void DBGUNDO(struct sock *sk, const char *msg) tp->packets_out); } #endif -} -#else -#define DBGUNDO(x...) do { } while (0) #endif +} static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { @@ -2370,9 +2363,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (unmark_loss) { struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; @@ -2617,9 +2608,7 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss && !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { @@ -2713,7 +2702,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, * is updated in tcp_ack()). Otherwise fall back to * the conventional recovery. */ - if (tcp_send_head(sk) && + if (!tcp_write_queue_empty(sk) && after(tcp_wnd_end(tp), tp->snd_nxt)) { *rexmit = REXMIT_NEW; return; @@ -2775,7 +2764,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) struct tcp_sock *tp = tcp_sk(sk); /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk); @@ -2805,9 +2794,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) + if (!tp->sacked_out && tp->fackets_out) tp->fackets_out = 0; /* Now state machine starts. @@ -2874,6 +2863,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ + /* fall through */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -2914,8 +2904,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { + u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); - u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); @@ -3057,8 +3047,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. If our packet @@ -3074,11 +3067,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; + struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; - struct sk_buff *skb; u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; @@ -3086,7 +3079,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = 0; - while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; @@ -3104,8 +3097,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = false; } else { - /* Speedup tcp_unlink_write_queue() and next loop */ - prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } @@ -3157,12 +3148,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!fully_acked) break; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) @@ -3254,12 +3245,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, static void tcp_ack_probe(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *head = tcp_send_head(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Was it a usable window open? */ - - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + if (!head) + return; + if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -3379,7 +3372,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); - if (tcp_send_head(sk)) + if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { @@ -3400,7 +3393,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, if (*last_oow_ack_time) { s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } @@ -3436,10 +3429,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); u32 count, now; /* First check our per-socket dupack rate limit. */ - if (__tcp_oow_rate_limited(sock_net(sk), + if (__tcp_oow_rate_limited(net, LINUX_MIB_TCPACKSKIPPEDCHALLENGE, &tp->last_oow_ack_time)) return; @@ -3447,16 +3441,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { - u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; + u32 half = (ack_limit + 1) >> 1; challenge_timestamp = now; - WRITE_ONCE(challenge_count, half + - prandom_u32_max(sysctl_tcp_challenge_ack_limit)); + WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); } count = READ_ONCE(challenge_count); if (count > 0) { WRITE_ONCE(challenge_count, count - 1); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } } @@ -3564,8 +3558,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; - /* We very likely will need to access write queue head. */ - prefetchw(sk->sk_write_queue.next); + /* We very likely will need to access rtx queue. */ + prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. @@ -3679,8 +3673,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tcp_send_head(sk)) - tcp_ack_probe(sk); + tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3722,6 +3715,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } +static void smc_parse_options(const struct tcphdr *th, + struct tcp_options_received *opt_rx, + const unsigned char *ptr, + int opsize) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (th->syn && !(opsize & 1) && + opsize >= TCPOLEN_EXP_SMC_BASE && + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + opt_rx->smc_ok = 1; + } +#endif +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3829,6 +3837,9 @@ void tcp_parse_options(const struct net *net, tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); + else + smc_parse_options(th, opt_rx, ptr, + opsize); break; } @@ -3996,6 +4007,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ void tcp_reset(struct sock *sk) { + trace_tcp_receive_reset(sk); + /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -4118,7 +4131,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4153,7 +4166,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -4269,11 +4282,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } -enum tcp_queue { - OOO_QUEUE, - RCV_QUEUE, -}; - /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket @@ -4289,7 +4297,6 @@ enum tcp_queue { * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, - enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4314,10 +4321,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; - if (dest == OOO_QUEUE) - TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; - else - to->tstamp = from->tstamp; + to->tstamp = from->tstamp; } return true; @@ -4342,7 +4346,7 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_first(&tp->out_of_order_queue); while (p) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4354,9 +4358,6 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - /* Replace tstamp which was stomped by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4368,8 +4369,7 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, - tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4410,7 +4410,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node **p, *q, *parent; + struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; @@ -4423,10 +4423,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Stash tstamp to avoid being stomped on by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - TCP_SKB_CB(skb)->swtstamp = skb->tstamp; - /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4454,7 +4450,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); @@ -4473,7 +4469,7 @@ coalesce_done: parent = NULL; while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; @@ -4505,7 +4501,7 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } @@ -4518,9 +4514,7 @@ insert: merge_right: /* Remove other segments covered by skb. */ - while ((q = rb_next(&skb->rbnode)) != NULL) { - skb1 = rb_entry(q, struct sk_buff, rbnode); - + while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4535,7 +4529,7 @@ merge_right: tcp_drop(sk, skb1); } /* If there is no skb after us, we are the last_skb ! */ - if (!q) + if (!skb1) tp->ooo_last_skb = skb; add_sack: @@ -4557,7 +4551,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, RCV_QUEUE, tail, + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { @@ -4721,7 +4715,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; - return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); + return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4742,7 +4736,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -4750,7 +4744,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else @@ -4797,7 +4791,7 @@ restart: * overlaps to the next one. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && - (tcp_win_from_space(skb->truesize) > skb->len || + (tcp_win_from_space(sk, skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; break; @@ -4869,26 +4863,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; - struct rb_node *p; u32 start, end; - p = rb_first(&tp->out_of_order_queue); - skb = rb_entry_safe(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { - p = rb_last(&tp->out_of_order_queue); - /* Note: This is possible p is NULL here. We do not - * use rb_entry_safe(), as ooo_last_skb is valid only - * if rbtree is not empty. - */ - tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; for (head = skb;;) { - skb = tcp_skb_next(skb, NULL); + skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. @@ -4931,14 +4918,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); + tcp_drop(sk, rb_to_skb(node)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; node = prev; } while (node); - tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + tp->ooo_last_skb = rb_to_skb(prev); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection @@ -5113,7 +5100,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); - if (ptr && !sysctl_tcp_stdurg) + if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) ptr--; ptr += ntohl(th->seq); @@ -5533,20 +5520,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) security_inet_conn_established(sk, skb); } - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -5560,7 +5540,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; @@ -5595,9 +5575,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ - tcp_for_write_queue_from(data, sk) { - if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data, 1)) + skb_rbtree_walk_from(data) { + if (__tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); @@ -5615,6 +5594,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return false; } +static void smc_check_reset_syn(struct tcp_sock *tp) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && !tp->rx_opt.smc_ok) + tp->syn_smc = 0; + } +#endif +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5710,10 +5699,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sysctl_tcp_fack) + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(tp); - tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -5722,6 +5710,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. */ tp->copied_seq = tp->rcv_nxt; + smc_check_reset_syn(tp); + smp_mb(); tcp_finish_connect(sk, skb); @@ -5939,15 +5929,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (req) { inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); } else { - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); - - tcp_mtup_init(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; - tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5967,19 +5960,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (req) { - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); @@ -6076,6 +6056,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; + /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, @@ -6184,6 +6165,9 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); +#if IS_ENABLED(CONFIG_SMC) + ireq->smc_ok = rx_opt->smc_ok; +#endif } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, @@ -6359,7 +6343,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5b027c69cbc5..0162c577bb9c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,8 @@ #include <crypto/hash.h> #include <linux/scatterlist.h> +#include <trace/events/tcp.h> + #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -480,7 +482,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); BUG_ON(!skb); tcp_mstamp_refresh(tp); @@ -701,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) + if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); @@ -1783,8 +1787,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb); break; @@ -1864,6 +1869,8 @@ void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + trace_tcp_destroy_sock(sk); + tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); @@ -1896,6 +1903,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_fastopen_destroy_cipher(sk); tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); @@ -2476,6 +2484,36 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_early_retrans = 3; + net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; + net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ + net->ipv4.sysctl_tcp_retrans_collapse = 1; + net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_dsack = 1; + net->ipv4.sysctl_tcp_app_win = 31; + net->ipv4.sysctl_tcp_adv_win_scale = 1; + net->ipv4.sysctl_tcp_frto = 2; + net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + /* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ + net->ipv4.sysctl_tcp_tso_win_divisor = 3; + /* Default TSQ limit of four TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* rfc5961 challenge ack rate limiting */ + net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; + net->ipv4.sysctl_tcp_min_tso_segs = 2; + net->ipv4.sysctl_tcp_min_rtt_wlen = 300; + net->ipv4.sysctl_tcp_autocorking = 1; + net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; + net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; + net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; + + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return 0; fail: @@ -2486,7 +2524,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0f0d740f6c8b..9d5ddebfd831 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -21,8 +21,6 @@ #include <net/tcp.h> #include <net/genetlink.h> -int sysctl_tcp_nometrics_save __read_mostly; - static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash); @@ -331,7 +329,7 @@ void tcp_update_metrics(struct sock *sk) int m; sk_dst_confirm(sk); - if (sysctl_tcp_nometrics_save || !dst) + if (net->ipv4.sysctl_tcp_nometrics_save || !dst) return; rcu_read_lock(); @@ -893,10 +891,14 @@ static void tcp_metrics_flush_all(struct net *net) for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp; + bool match; + spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { - if (net_eq(tm_net(tm), net)) { + match = net ? net_eq(tm_net(tm), net) : + !atomic_read(&tm_net(tm)->count); + if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); } else { @@ -1019,14 +1021,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) return 0; } -static void __net_exit tcp_net_metrics_exit(struct net *net) +static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { - tcp_metrics_flush_all(net); + tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, - .exit = tcp_net_metrics_exit, + .init = tcp_net_metrics_init, + .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..3c65c1a3f944 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -23,13 +23,12 @@ #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/workqueue.h> +#include <linux/static_key.h> #include <net/tcp.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/busy_poll.h> -int sysctl_tcp_abort_on_overflow __read_mostly; - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -180,7 +179,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if (sysctl_tcp_rfc1337 == 0) { + if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; @@ -298,8 +297,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()) - BUG(); + BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); } } while (0); #endif @@ -371,7 +369,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(full_space, + tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, @@ -417,6 +415,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +static void smc_check_reset_syn_req(struct tcp_sock *oldtp, + struct request_sock *req, + struct tcp_sock *newtp) +{ +#if IS_ENABLED(CONFIG_SMC) + struct inet_request_sock *ireq; + + if (static_branch_unlikely(&tcp_have_smc)) { + ireq = inet_rsk(req); + if (oldtp->syn_smc && !ireq->smc_ok) + newtp->syn_smc = 0; + } +#endif +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -434,6 +447,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; @@ -446,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); @@ -493,7 +510,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sysctl_tcp_fack) + if (sock_net(sk)->ipv4.sysctl_tcp_fack) tcp_enable_fack(newtp); } newtp->window_clamp = req->rsk_window_clamp; @@ -764,7 +781,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: - if (!sysctl_tcp_abort_on_overflow) { + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; } diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 125fc1450b01..0529e29e2941 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -39,7 +39,7 @@ * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8 * nv_rtt_factor RTT averaging factor - * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping @@ -61,7 +61,7 @@ static int nv_min_cwnd __read_mostly = 2; static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ -static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ static int nv_cwnd_growth_rate_neg __read_mostly = 8; static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ static int nv_dec_eval_min_calls __read_mostly = 60; @@ -101,6 +101,11 @@ struct tcpnv { u32 nv_last_rtt; /* last rtt */ u32 nv_min_rtt; /* active min rtt. Used to determine slope */ u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_base_rtt; /* If non-zero it represents the threshold for + * congestion */ + u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is + * set to 80% of nv_base_rtt. It helps reduce + * unfairness between flows */ u32 nv_rtt_max_rate; /* max rate seen during current RTT */ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives * acking beyond nv_rtt_start_seq */ @@ -132,9 +137,24 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) static void tcpnv_init(struct sock *sk) { struct tcpnv *ca = inet_csk_ca(sk); + int base_rtt; tcpnv_reset(ca, sk); + /* See if base_rtt is available from socket_ops bpf program. + * It is meant to be used in environments, such as communication + * within a datacenter, where we have reasonable estimates of + * RTTs + */ + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + if (base_rtt > 0) { + ca->nv_base_rtt = base_rtt; + ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ + } else { + ca->nv_base_rtt = 0; + ca->nv_lower_bound_rtt = 0; + } + ca->nv_allow_cwnd_growth = 1; ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; ca->nv_min_rtt = NV_INIT_RTT; @@ -144,6 +164,19 @@ static void tcpnv_init(struct sock *sk) ca->cwnd_growth_factor = 0; } +/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) + * bounds to RTT. + */ +inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) +{ + if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) + return ca->nv_lower_bound_rtt; + else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) + return ca->nv_base_rtt; + else + return val; +} + static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -265,6 +298,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) if (ca->nv_eval_call_cnt < 255) ca->nv_eval_call_cnt++; + /* Apply bounds to rtt. Only used to update min_rtt */ + avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); + /* update min rtt if necessary */ if (avg_rtt < ca->nv_min_rtt) ca->nv_min_rtt = avg_rtt; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 478909f4694d..b98b2f7f07e7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,40 +41,25 @@ #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> +#include <linux/static_key.h> -/* People can turn this off for buggy TCP's found in printers etc. */ -int sysctl_tcp_retrans_collapse __read_mostly = 1; - -/* People can turn this on to work with those rare, broken TCPs that - * interpret the window field as a signed quantity. - */ -int sysctl_tcp_workaround_signed_windows __read_mostly = 0; - -/* Default TSQ limit of four TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 262144; - -/* This limits the percentage of the congestion window which we - * will allow a single TSO frame to consume. Building TSO frames - * which are too large can cause TCP streams to be bursty. - */ -int sysctl_tcp_tso_win_divisor __read_mostly = 3; - -/* By default, RFC2861 behavior. */ -int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +#include <trace/events/tcp.h> static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_write_queue); + tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -203,7 +188,7 @@ u32 tcp_default_init_rwnd(u32 mss) * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ -void tcp_select_initial_window(int __space, __u32 mss, +void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) @@ -227,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ - if (sysctl_tcp_workaround_signed_windows) + if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; @@ -287,7 +272,8 @@ static u16 tcp_select_window(struct sock *sk) /* Make sure we do not exceed the maximum possible * scaled window. */ - if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) + if (!tp->rx_opt.rcv_wscale && + sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -418,6 +404,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) +#define OPTION_SMC (1 << 9) + +static void smc_options_write(__be32 *ptr, u16 *options) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (unlikely(OPTION_SMC & *options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } + } +#endif +} struct tcp_out_options { u16 options; /* bit field of OPTION_* */ @@ -536,6 +538,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (len + 3) >> 2; } + + smc_options_write(ptr, &options); +} + +static void smc_set_option(const struct tcp_sock *tp, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif +} + +static void smc_set_option_cond(const struct tcp_sock *tp, + const struct inet_request_sock *ireq, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && ireq->smc_ok) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif } /* Compute TCP options for SYN packets. This is not the final @@ -603,11 +640,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + smc_set_option(tp, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct request_sock *req, +static unsigned int tcp_synack_options(const struct sock *sk, + struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, @@ -663,6 +703,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, } } + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -973,6 +1015,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1005,10 +1053,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1129,7 +1181,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1241,12 +1293,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->eor = 0; } +/* Insert buff after skb on the write or rtx queue of sk. */ +static void tcp_insert_write_queue_after(struct sk_buff *skb, + struct sk_buff *buff, + struct sock *sk, + enum tcp_queue tcp_queue) +{ + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + __skb_queue_after(&sk->sk_write_queue, skb, buff); + else + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); @@ -1329,7 +1394,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); + if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -1607,7 +1674,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if (sysctl_tcp_slow_start_after_idle && + if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); @@ -1616,10 +1683,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) - * 3) no more data to send (null tcp_send_head ) + * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ - if (!tcp_send_head(sk) && sk->sk_socket && + if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); @@ -1694,7 +1761,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; return tso_segs ? : - tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); + tcp_tso_autosize(sk, mss_now, + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ @@ -1815,7 +1883,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; @@ -1824,7 +1893,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now, gfp); + return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) @@ -1860,7 +1929,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); return 0; } @@ -1910,7 +1979,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + win_divisor = ACCESS_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -1930,8 +1999,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - head = tcp_write_queue_head(sk); - + /* TODO : use tsorted_sent_queue ? */ + head = tcp_rtx_queue_head(sk); + if (!head) + goto send_now; age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) @@ -2146,17 +2217,17 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int limit; limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); - limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); + limit = min_t(u32, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { - /* Always send the 1st or 2nd skb in write queue. + /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ - if (skb == sk->sk_write_queue.next || - skb->prev == sk->sk_write_queue.next) + if (tcp_rtx_queue_empty(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); @@ -2207,7 +2278,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ - if (tcp_write_queue_empty(sk)) + if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); @@ -2263,7 +2334,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2302,7 +2373,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, limit, mss_now, gfp))) break; if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) @@ -2342,7 +2414,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return !tp->packets_out && tcp_send_head(sk); + return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2350,6 +2422,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, rto_delta_us; + int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. @@ -2357,16 +2430,17 @@ bool tcp_schedule_loss_probe(struct sock *sk) if (tp->fastopen_rsk) return false; + early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || icsk->icsk_ca_state != TCP_CA_Open) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - tcp_send_head(sk)) + !tcp_write_queue_empty(sk)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2419,18 +2493,14 @@ void tcp_send_loss_probe(struct sock *sk) int mss = tcp_current_mss(sk); skb = tcp_send_head(sk); - if (skb) { - if (tcp_snd_wnd_test(tp, skb, mss)) { - pcount = tp->packets_out; - tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - if (tp->packets_out > pcount) - goto probe_sent; - goto rearm_timer; - } - skb = tcp_write_queue_prev(sk, skb); - } else { - skb = tcp_write_queue_tail(sk); + if (skb && tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; } + skb = skb_rb_last(&sk->tcp_rtx_queue); /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) @@ -2448,10 +2518,11 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_next(sk, skb); + skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) @@ -2651,7 +2722,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + struct sk_buff *next_skb = skb_rb_next(skb); int skb_size, next_skb_size; skb_size = skb->len; @@ -2668,8 +2739,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_replace(sk, next_skb, skb); - tcp_unlink_write_queue(next_skb, sk); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2697,7 +2766,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); - sk_wmem_free_skb(sk, next_skb); + tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } @@ -2708,8 +2777,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) return false; if (skb_cloned(skb)) return false; - if (skb == tcp_send_head(sk)) - return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2727,12 +2794,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, struct sk_buff *skb = to, *tmp; bool first = true; - if (!sysctl_tcp_retrans_collapse) + if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; - tcp_for_write_queue_from_safe(skb, tmp, sk) { + skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; @@ -2807,7 +2874,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) len = cur_mss * segs; if (skb->len > len) { - if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, + cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone(skb, GFP_ATOMIC)) @@ -2841,11 +2909,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + if (!err) { - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); tcp_rate_skb_sent(sk, skb); } } else { @@ -2854,6 +2925,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } @@ -2897,29 +2969,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - struct sk_buff *hole = NULL; u32 max_segs; int mib_idx; if (!tp->packets_out) return; - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - } else { - skb = tcp_write_queue_head(sk); - } - + rtx_head = tcp_rtx_queue_head(sk); + skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { __u8 sacked; int segs; - if (skb == tcp_send_head(sk)) - break; - if (tcp_pacing_check(sk)) break; @@ -2964,7 +3028,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk) && + if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -3006,12 +3070,15 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { + if (!tskb && tcp_under_memory_pressure(sk)) + tskb = skb_rb_last(&sk->tcp_rtx_queue); + + if (tskb) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (!tcp_send_head(sk)) { + if (tcp_write_queue_empty(sk)) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have @@ -3028,6 +3095,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ @@ -3064,6 +3132,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + /* skb of trace_tcp_send_reset() keeps the skb that caused RST, + * skb here is different to the troublesome skb, so use NULL + */ + trace_tcp_send_reset(sk, NULL); } /* Send a crossed SYN-ACK during socket establishment. @@ -3076,20 +3149,24 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { - pr_debug("%s: wrong queue state\n", __func__); + pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; - tcp_unlink_write_queue(skb, sk); + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); - __tcp_add_write_queue_head(sk, nskb); - sk_wmem_free_skb(sk, skb); + tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk->sk_wmem_queued += nskb->truesize; sk_mem_charge(sk, nskb->truesize); skb = nskb; @@ -3166,8 +3243,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + - sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3268,7 +3345,7 @@ static void tcp_connect_init(struct sock *sk) if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, @@ -3307,7 +3384,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); tp->write_seq = tcb->end_seq; @@ -3355,6 +3431,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) int copied = copy_from_iter(skb_put(syn_data, space), space, &fo->data->msg_iter); if (unlikely(!copied)) { + tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } @@ -3385,12 +3462,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); + tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - /* data was not sent, this is our new send_head */ - sk->sk_send_head = syn_data; + /* data was not sent, put it in write_queue */ + __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: @@ -3433,6 +3511,7 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3627,7 +3706,8 @@ int tcp_write_wakeup(struct sock *sk, int mib) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); @@ -3657,7 +3737,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; @@ -3698,6 +3778,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) tcp_sk(sk)->total_retrans++; + trace_tcp_retransmit_synack(sk, req); } return res; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index be8ef1e5dfef..ac3e9c6d3a3d 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -2,8 +2,6 @@ #include <linux/tcp.h> #include <net/tcp.h> -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; - static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -46,7 +44,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; @@ -59,45 +57,31 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); - tcp_for_write_queue(skb, sk) { + list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, + tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + s32 remaining; - if (skb == tcp_send_head(sk)) - break; - - /* Skip ones already (s)acked */ - if (!after(scb->end_seq, tp->snd_una) || - scb->sacked & TCPCB_SACKED_ACKED) + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, - tp->rack.end_seq, scb->end_seq)) { - /* Step 3 in draft-cheng-tcpm-rack-00.txt: - * A packet is lost if its elapsed time is beyond - * the recent RTT plus the reordering window. - */ - u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, - skb->skb_mstamp); - s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - - if (remaining < 0) { - tcp_rack_mark_skb_lost(sk, skb); - continue; - } - - /* Skip ones marked lost but not yet retransmitted */ - if ((scb->sacked & TCPCB_LOST) && - !(scb->sacked & TCPCB_SACKED_RETRANS)) - continue; + if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) + break; + /* A packet is lost if it has not been s/acked beyond + * the recent RTT plus the reordering window. + */ + remaining = tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); + } else { /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - - } else if (!(scb->sacked & TCPCB_RETRANS)) { - /* Original data are sent sequentially so stop early - * b/c the rest are all sent after rack_sent - */ - break; } } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 655dd8d7f064..035a1ef1f2d8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,8 +22,6 @@ #include <linux/gfp.h> #include <net/tcp.h> -int sysctl_tcp_thin_linear_timeouts __read_mostly; - /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. @@ -156,8 +154,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) - start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return false; + start_ts = tcp_skb_timestamp(head); + } if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -283,15 +286,17 @@ out: * * Returns: Nothing (void) */ -static void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { - inet_csk(sk)->icsk_ack.blocked = 1; + icsk->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) @@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data) static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; u32 start_ts; - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; return; } @@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + start_ts = tcp_skb_timestamp(skb); if (!start_ts) - tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > jiffies_to_msecs(icsk->icsk_user_timeout)) @@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - WARN_ON(tcp_write_queue_empty(sk)); + WARN_ON(tcp_rtx_queue_empty(sk)); tp->tlp_high_seq = 0; @@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); + tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { + if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ @@ -514,7 +520,7 @@ out_reset_timer: * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && - (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; @@ -570,9 +576,11 @@ out: sk_mem_reclaim(sk); } -static void tcp_write_timer(unsigned long data) +static void tcp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { @@ -607,9 +615,9 @@ void tcp_set_keepalive(struct sock *sk, int val) EXPORT_SYMBOL_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (struct timer_list *t) { - struct sock *sk = (struct sock *) data; + struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; @@ -647,7 +655,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || tcp_send_head(sk)) + if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 218cfcc77650..ee113ff15fd0 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { - return min(tp->snd_ssthresh, tp->snd_cwnd-1); + return min(tp->snd_ssthresh, tp->snd_cwnd); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ebfbccae62fd..a6699af05539 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1209,8 +1209,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && - !skb_queue_empty(&up->reader_queue)) + if (size < (sk->sk_rcvbuf >> 2)) return; } else { size += up->forward_deficit; |