Diffstat (limited to 'net/ipv4')
57 files changed, 1287 insertions, 810 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ce44fb7d498..e31108e5ef79 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -944,6 +944,8 @@ const struct proto_ops inet_stream_ops = { .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .sendmsg_locked = tcp_sendmsg_locked, + .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, @@ -1594,6 +1596,9 @@ static const struct net_protocol igmp_protocol = { }; #endif +/* thinking of making this const? Don't. + * early_demux can change based on sysctl. + */ static struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, .early_demux_handler = tcp_v4_early_demux, @@ -1604,6 +1609,9 @@ static struct net_protocol tcp_protocol = { .icmp_strict_tag_validation = 1, }; +/* thinking of making this const? Don't. + * early_demux can change based on sysctl. + */ static struct net_protocol udp_protocol = { .early_demux = udp_v4_early_demux, .early_demux_handler = udp_v4_early_demux, @@ -1723,6 +1731,13 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif + /* Some igmp sysctl, whose values are always used */ + net->ipv4.sysctl_igmp_max_memberships = 20; + net->ipv4.sysctl_igmp_max_msf = 10; + /* IGMP reports for link-local multicast groups are enabled by default */ + net->ipv4.sysctl_igmp_llm_reports = 1; + net->ipv4.sysctl_igmp_qrv = 2; + return 0; } @@ -1763,6 +1778,11 @@ static const struct net_offload ipip_offload = { }, }; +static int __init ipip_offload_init(void) +{ + return inet_add_offload(&ipip_offload, IPPROTO_IPIP); +} + static int __init ipv4_offload_init(void) { /* @@ -1772,9 +1792,10 @@ static int __init ipv4_offload_init(void) pr_crit("%s: Cannot add UDP protocol offload\n", __func__); if (tcpv4_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); + if (ipip_offload_init() < 0) + pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); dev_add_offload(&ip_packet_offload); - inet_add_offload(&ipip_offload, IPPROTO_IPIP); return 0; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 8b52179ddc6e..7c45b8896709 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -171,7 +171,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index c4c6e1969ed0..2ae8f54cb321 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1523,9 +1523,17 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { - if (optptr[0] == IPOPT_CIPSO) + switch (optptr[0]) { + case IPOPT_CIPSO: return optptr; - taglen = optptr[1]; + case IPOPT_END: + return NULL; + case IPOPT_NOOP: + taglen = 1; + break; + default: + taglen = optptr[1]; + } optlen -= taglen; optptr += taglen; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 38d9af9b917c..d7adc0616599 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2491,9 +2491,9 @@ void __init devinet_init(void) rtnl_af_register(&inet_af_ops); - rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); - rtnl_register(PF_INET, RTM_DELADDR, 
inet_rtm_deladdr, NULL, NULL); - rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); + rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0); + rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0); rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, - inet_netconf_dump_devconf, NULL); + inet_netconf_dump_devconf, 0); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 0cbee0a666ff..b00e4a43b4dc 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -258,7 +258,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * esp_output_udp_encap(x, skb, esp); if (!skb_cloned(skb)) { - if (tailen <= skb_availroom(skb)) { + if (tailen <= skb_tailroom(skb)) { nfrags = 1; trailer = skb; tail = skb_tail_pointer(trailer); @@ -292,8 +292,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * kunmap_atomic(vaddr); - spin_unlock_bh(&x->lock); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -301,6 +299,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * skb_shinfo(skb)->nr_frags = ++nfrags; pfrag->offset = pfrag->offset + allocsize; + + spin_unlock_bh(&x->lock); + nfrags++; skb->len += tailen; @@ -381,7 +382,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; if (!esp->inplace) { int allocsize; @@ -392,7 +393,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * spin_lock_bh(&x->lock); if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { spin_unlock_bh(&x->lock); - goto error; + goto error_free; } skb_shinfo(skb)->nr_frags = 1; @@ -409,7 +410,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -442,8 +443,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * if (sg != dsg) esp_ssg_unref(x, tmp); - kfree(tmp); +error_free: + kfree(tmp); error: return err; } @@ -499,18 +501,59 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) return esp_output_tail(x, skb, &esp); } +static inline int esp_remove_trailer(struct sk_buff *skb) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); + struct crypto_aead *aead = x->data; + int alen, hlen, elen; + int padlen, trimlen; + __wsum csumdiff; + u8 nexthdr[2]; + int ret; + + alen = crypto_aead_authsize(aead); + hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + elen = skb->len - hlen; + + if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) { + ret = xo->proto; + goto out; + } + + if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) + BUG(); + + ret = -EINVAL; + padlen = nexthdr[0]; + if (padlen + 2 + alen >= elen) { + net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n", + padlen + 2, elen - alen); + goto out; + } + + trimlen = alen + padlen + 2; + if (skb->ip_summed == CHECKSUM_COMPLETE) { + csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0); + skb->csum = csum_block_sub(skb->csum, csumdiff, + skb->len - trimlen); + } + pskb_trim(skb, skb->len - trimlen); + + ret = nexthdr[1]; + +out: 
+ return ret; +} + int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; - int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); - int elen = skb->len - hlen; int ihl; - u8 nexthdr[2]; - int padlen; if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) kfree(ESP_SKB_CB(skb)->tmp); @@ -518,16 +561,10 @@ int esp_input_done2(struct sk_buff *skb, int err) if (unlikely(err)) goto out; - if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) - BUG(); - - err = -EINVAL; - padlen = nexthdr[0]; - if (padlen + 2 + alen >= elen) + err = esp_remove_trailer(skb); + if (unlikely(err < 0)) goto out; - /* ... check padding bits here. Silly. :-) */ - iph = ip_hdr(skb); ihl = iph->ihl * 4; @@ -568,15 +605,12 @@ int esp_input_done2(struct sk_buff *skb, int err) skb->ip_summed = CHECKSUM_UNNECESSARY; } - pskb_trim(skb, skb->len - alen - padlen - 2); - __skb_pull(skb, hlen); + skb_pull_rcsum(skb, hlen); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); - err = nexthdr[1]; - /* RFC4303: Drop dummy packets without any error */ if (err == IPPROTO_NONE) err = -EINVAL; @@ -695,8 +729,10 @@ skip_cow: sg_init_table(sg, nfrags); err = skb_to_sgvec(skb, sg, 0, skb->len); - if (unlikely(err < 0)) + if (unlikely(err < 0)) { + kfree(tmp); goto out; + } skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index e0666016a764..f8b918c766b0 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -182,11 +182,13 @@ out: static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; + struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; - skb->ip_summed = CHECKSUM_NONE; + if (!(xo->flags & CRYPTO_DONE)) + skb->ip_summed = CHECKSUM_NONE; return esp_input_done2(skb, 0); } @@ -257,7 +259,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); err = esp_output_tail(x, skb, &esp); - if (err < 0) + if (err) return err; secpath_reset(skb); @@ -303,3 +305,4 @@ module_init(esp4_offload_init); module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); +MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 044d2a159a3c..37819ab4cc74 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1247,22 +1247,28 @@ static int __net_init ip_fib_net_init(struct net *net) int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; - net->ipv4.fib_seq = 0; + err = fib4_notifier_init(net); + if (err) + return err; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); - if (!net->ipv4.fib_table_hash) - return -ENOMEM; + if (!net->ipv4.fib_table_hash) { + err = -ENOMEM; + goto err_table_hash_alloc; + } err = fib4_rules_init(net); if (err < 0) - goto fail; + goto err_rules_init; return 0; -fail: +err_rules_init: kfree(net->ipv4.fib_table_hash); +err_table_hash_alloc: + fib4_notifier_exit(net); return err; } @@ -1292,6 +1298,7 @@ static void ip_fib_net_exit(struct net *net) 
#endif rtnl_unlock(); kfree(net->ipv4.fib_table_hash); + fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) @@ -1341,7 +1348,7 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0); } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 769ab87ebc4b..5b2af19cfb5b 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -32,6 +32,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); +bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index e0714d975947..cfd420b0572c 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -1,86 +1,73 @@ #include <linux/rtnetlink.h> #include <linux/notifier.h> -#include <linux/rcupdate.h> +#include <linux/socket.h> #include <linux/kernel.h> +#include <linux/export.h> #include <net/net_namespace.h> +#include <net/fib_notifier.h> #include <net/netns/ipv4.h> #include <net/ip_fib.h> -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -int call_fib_notifier(struct notifier_block *nb, struct net *net, - enum fib_event_type event_type, - struct fib_notifier_info *info) +int call_fib4_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) { - info->net = net; - return nb->notifier_call(nb, event_type, info); + info->family = AF_INET; + return call_fib_notifier(nb, net, event_type, info); } -int call_fib_notifiers(struct net *net, enum fib_event_type event_type, - struct fib_notifier_info *info) +int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) { + ASSERT_RTNL(); + + info->family = AF_INET; net->ipv4.fib_seq++; - info->net = net; - return atomic_notifier_call_chain(&fib_chain, event_type, info); + return call_fib_notifiers(net, event_type, info); } -static unsigned int fib_seq_sum(void) +static unsigned int fib4_seq_read(struct net *net) { - unsigned int fib_seq = 0; - struct net *net; - - rtnl_lock(); - for_each_net(net) - fib_seq += net->ipv4.fib_seq; - rtnl_unlock(); + ASSERT_RTNL(); - return fib_seq; + return net->ipv4.fib_seq + fib4_rules_seq_read(net); } -static bool fib_dump_is_consistent(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb), - unsigned int fib_seq) +static int fib4_dump(struct net *net, struct notifier_block *nb) { - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) - return true; - atomic_notifier_chain_unregister(&fib_chain, nb); - if (cb) - cb(nb); - return false; + int err; + + err = fib4_rules_dump(net, nb); + if (err) + return err; + + fib_notify(net, nb); + + return 0; } -#define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void 
(*cb)(struct notifier_block *nb)) -{ - int retries = 0; +static const struct fib_notifier_ops fib4_notifier_ops_template = { + .family = AF_INET, + .fib_seq_read = fib4_seq_read, + .fib_dump = fib4_dump, + .owner = THIS_MODULE, +}; - do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; +int __net_init fib4_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; - /* Mutex semantics guarantee that every change done to - * FIB tries before we read the change sequence counter - * is now visible to us. - */ - rcu_read_lock(); - for_each_net_rcu(net) { - fib_rules_notify(net, nb); - fib_notify(net, nb); - } - rcu_read_unlock(); + net->ipv4.fib_seq = 0; - if (fib_dump_is_consistent(nb, cb, fib_seq)) - return 0; - } while (++retries < FIB_DUMP_MAX_RETRIES); + ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.notifier_ops = ops; - return -EBUSY; + return 0; } -EXPORT_SYMBOL(register_fib_notifier); -int unregister_fib_notifier(struct notifier_block *nb) +void __net_exit fib4_notifier_exit(struct net *net) { - return atomic_notifier_chain_unregister(&fib_chain, nb); + fib_notifier_ops_unregister(net->ipv4.notifier_ops); } -EXPORT_SYMBOL(unregister_fib_notifier); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 778ecf977eb2..35d646a62ad4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -68,6 +68,16 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); +int fib4_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, AF_INET); +} + +unsigned int fib4_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, AF_INET); +} + int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { @@ -185,38 +195,6 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } -static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, - enum fib_event_type event_type, - struct fib_rule *rule) -{ - struct fib_rule_notifier_info info = { - .rule = rule, - }; - - return call_fib_notifier(nb, net, event_type, &info.info); -} - -static int call_fib_rule_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib_rule *rule) -{ - struct fib_rule_notifier_info info = { - .rule = rule, - }; - - return call_fib_notifiers(net, event_type, &info.info); -} - -/* Called with rcu_read_lock() */ -void fib_rules_notify(struct net *net, struct notifier_block *nb) -{ - struct fib_rules_ops *ops = net->ipv4.rules_ops; - struct fib_rule *rule; - - list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule); -} - static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { FRA_GENERIC_POLICY, [FRA_FLOW] = { .type = NLA_U32 }, @@ -273,7 +251,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule); err = 0; errout: @@ -295,7 +272,6 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule); errout: return err; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 222100103808..57a5d48acee8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -44,6 +44,7 @@ #include <net/netlink.h> 
#include <net/nexthop.h> #include <net/lwtunnel.h> +#include <net/fib_notifier.h> #include "fib_lookup.h" @@ -219,7 +220,7 @@ static void free_fib_info_rcu(struct rcu_head *head) } endfor_nexthops(fi); m = fi->fib_metrics; - if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) kfree(m); kfree(fi); } @@ -695,6 +696,40 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, return 0; } +bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) +{ + struct nlattr *nla; + int remaining; + + if (!cfg->fc_mx) + return true; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return false; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + bool ecn_ca = false; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + } else { + val = nla_get_u32(nla); + } + + if (fi->fib_metrics->metrics[type - 1] != val) + return false; + } + + return true; +} + /* * Picture @@ -1083,15 +1118,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (!fi) goto failure; - fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); - if (!fi->fib_metrics) - goto failure; - atomic_set(&fi->fib_metrics->refcnt, 1); - } else + if (unlikely(!fi->fib_metrics)) { + kfree(fi); + return ERR_PTR(err); + } + refcount_set(&fi->fib_metrics->refcnt, 1); + } else { fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; - + } + fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; @@ -1342,6 +1379,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) rtm->rtm_flags |= RTNH_F_DEAD; } + if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) @@ -1449,14 +1488,14 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) break; - return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type, - &info.info); + return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type, + &info.info); case FIB_EVENT_NH_DEL: - if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) || (fib_nh->nh_flags & RTNH_F_DEAD)) - return call_fib_notifiers(dev_net(fib_nh->nh_dev), - event_type, &info.info); + return call_fib4_notifiers(dev_net(fib_nh->nh_dev), + event_type, &info.info); default: break; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 64668c69dda6..c636650a6a70 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -81,6 +81,7 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" @@ -97,7 +98,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = type, .tb_id = tb_id, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -113,7 +114,7 @@ static int 
call_fib_entry_notifiers(struct net *net, .type = type, .tb_id = tb_id, }; - return call_fib_notifiers(net, event_type, &info.info); + return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 @@ -1562,7 +1563,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi, extack) == 0) { + fib_nh_match(cfg, fi, extack) == 0 && + fib_metrics_match(cfg, fi)) { fa_to_delete = fa; break; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 8e0257d01200..1540db65241a 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -450,6 +450,7 @@ out_unlock: out: NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, &grc); + skb->remcsum_offload = 0; return pp; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c2be26b98b5f..681e33998e03 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -412,7 +412,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) int type = icmp_param->data.icmph.type; int code = icmp_param->data.icmph.code; - if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) + if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; /* Needed by both icmp_global_allow and icmp_xmit_lock */ @@ -694,7 +694,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) + if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in)) goto out_unlock; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 28f14afd0dd3..ab183af0b5b6 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct net_device *dev = skb->dev; + struct in_device *in_dev; int len = skb->len; bool dropped = true; + if (netif_is_l3_master(dev)) { + dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); + if (!dev) + goto drop; + } + + in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; @@ -2549,7 +2557,8 @@ done: /* * check if a multicast source filter allows delivery for a given <src,dst,intf> */ -int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) +int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, + int dif, int sdif) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *pmc; @@ -2564,7 +2573,8 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) rcu_read_lock(); for_each_pmc_rcu(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && - pmc->multi.imr_ifindex == dif) + (pmc->multi.imr_ifindex == dif || + (sdif && pmc->multi.imr_ifindex == sdif))) break; } ret = inet->mc_all; @@ -2974,12 +2984,6 @@ static int __net_init igmp_net_init(struct net *net) goto out_sock; } - /* Sysctl initialization */ - net->ipv4.sysctl_igmp_max_memberships = 20; - net->ipv4.sysctl_igmp_max_msf = 10; - /* IGMP reports for link-local multicast groups are enabled by default */ - net->ipv4.sysctl_igmp_llm_reports = 1; - net->ipv4.sysctl_igmp_qrv = 2; return 0; out_sock: diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3828b3a805cd..c9c35b61a027 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -93,8 +93,17 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) } 
EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); -static size_t inet_sk_attr_size(void) +static size_t inet_sk_attr_size(struct sock *sk, + const struct inet_diag_req_v2 *req, + bool net_admin) { + const struct inet_diag_handler *handler; + size_t aux = 0; + + handler = inet_diag_table[req->sdiag_protocol]; + if (handler && handler->idiag_get_aux_size) + aux = handler->idiag_get_aux_size(sk, net_admin); + return nla_total_size(sizeof(struct tcp_info)) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ @@ -105,6 +114,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + nla_total_size(TCP_CA_NAME_MAX) + nla_total_size(sizeof(struct tcpvegas_info)) + + aux + 64; } @@ -260,6 +270,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, handler->idiag_get_info(sk, r, info); + if (ext & (1 << (INET_DIAG_INFO - 1)) && handler->idiag_get_aux) + if (handler->idiag_get_aux(sk, net_admin, skb) < 0) + goto errout; + if (sk->sk_state < TCP_TIME_WAIT) { union tcp_cc_info info; size_t sz = 0; @@ -274,6 +288,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } + if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) { + u32 classid = 0; + +#ifdef CONFIG_SOCK_CGROUP_DATA + classid = sock_cgroup_classid(&sk->sk_cgrp_data); +#endif + + if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) + goto errout; + } + out: nlmsg_end(skb, nlh); return 0; @@ -438,6 +463,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, const struct nlmsghdr *nlh, const struct inet_diag_req_v2 *req) { + bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); struct net *net = sock_net(in_skb->sk); struct sk_buff *rep; struct sock *sk; @@ -447,7 +473,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, if (IS_ERR(sk)) return PTR_ERR(sk); - rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL); + rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; @@ -456,8 +482,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, - netlink_net_capable(in_skb, CAP_NET_ADMIN)); + nlh->nlmsg_seq, 0, nlh, net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2e3389d614d1..597bb4cfe805 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif, bool exact_dif) + const int dif, const int sdif, bool exact_dif) { int score = -1; struct inet_sock *inet = inet_sk(sk); @@ -185,9 +185,13 @@ static inline int compute_score(struct sock *sk, struct net *net, score += 4; } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if && dev_match) + score += 4; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -208,7 +212,7 @@ struct sock *__inet_lookup_listener(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, - const int dif) + const int dif, const int sdif) { unsigned int hash = 
inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -218,7 +222,8 @@ struct sock *__inet_lookup_listener(struct net *net, u32 phash = 0; sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif, exact_dif); + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -268,7 +273,7 @@ struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, - const int dif) + const int dif, const int sdif) { INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); @@ -286,11 +291,12 @@ begin: if (sk->sk_hash != hash) continue; if (likely(INET_MATCH(sk, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!INET_MATCH(sk, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, + dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -321,9 +327,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; + struct net *net = sock_net(sk); + int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - struct net *net = sock_net(sk); unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); @@ -339,7 +346,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, continue; if (likely(INET_MATCH(sk2, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 337ad41bb80a..e7eb590c86ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -102,15 +102,18 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, struct rb_node **parent_p, struct rb_node ***pp_p) { - struct rb_node **pp, *parent; + struct rb_node **pp, *parent, *next; struct inet_peer *p; pp = &base->rb_root.rb_node; parent = NULL; - while (*pp) { + while (1) { int cmp; - parent = rcu_dereference_raw(*pp); + next = rcu_dereference_raw(*pp); + if (!next) + break; + parent = next; p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7a7829e839c2..0162fb955b33 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -48,6 +48,7 @@ #include <net/rtnetlink.h> #include <net/gre.h> #include <net/dst_metadata.h> +#include <net/erspan.h> /* Problems & solutions @@ -112,9 +113,12 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); +static void erspan_build_header(struct sk_buff *skb, + __be32 id, u32 index, bool truncate); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; +static unsigned int erspan_net_id __read_mostly; static void ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) @@ -246,6 +250,81 @@ 
static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } +static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, + int gre_hdr_len) +{ + struct net *net = dev_net(skb->dev); + struct metadata_dst *tun_dst = NULL; + struct ip_tunnel_net *itn; + struct ip_tunnel *tunnel; + struct erspanhdr *ershdr; + const struct iphdr *iph; + __be32 session_id; + __be32 index; + int len; + + itn = net_generic(net, erspan_net_id); + len = gre_hdr_len + sizeof(*ershdr); + + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + + iph = ip_hdr(skb); + ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); + + /* The original GRE header does not have key field, + * Use ERSPAN 10-bit session ID as key. + */ + session_id = cpu_to_be32(ntohs(ershdr->session_id)); + tpi->key = session_id; + index = ershdr->md.index; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_KEY, + iph->saddr, iph->daddr, tpi->key); + + if (tunnel) { + if (__iptunnel_pull_header(skb, + gre_hdr_len + sizeof(*ershdr), + htons(ETH_P_TEB), + false, false) < 0) + goto drop; + + if (tunnel->collect_md) { + struct ip_tunnel_info *info; + struct erspan_metadata *md; + __be64 tun_id; + __be16 flags; + + tpi->flags |= TUNNEL_KEY; + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ip_tun_rx_dst(skb, flags, + tun_id, sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + md = ip_tunnel_info_opts(&tun_dst->u.tun_info); + if (!md) + return PACKET_REJECT; + + md->index = index; + info = &tun_dst->u.tun_info; + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + info->options_len = sizeof(*md); + } else { + tunnel->index = ntohl(index); + } + + skb_reset_mac_header(skb); + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + return PACKET_RCVD; + } +drop: + kfree_skb(skb); + return PACKET_RCVD; +} + static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { @@ -328,6 +407,11 @@ static int gre_rcv(struct sk_buff *skb) if (hdr_len < 0) goto drop; + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) + return 0; + } + if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; @@ -376,39 +460,33 @@ static struct rtable *gre_get_rt(struct sk_buff *skb, return ip_route_output_key(net, fl); } -static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, - __be16 proto) +static struct rtable *prepare_fb_xmit(struct sk_buff *skb, + struct net_device *dev, + struct flowi4 *fl, + int tunnel_hlen) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct rtable *rt = NULL; - struct flowi4 fl; int min_headroom; - int tunnel_hlen; - __be16 df, flags; bool use_cache; int err; tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET)) - goto err_free_skb; - key = &tun_info->key; use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) - rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr); if (!rt) { - rt = gre_get_rt(skb, dev, &fl, key); + rt = gre_get_rt(skb, dev, fl, key); if (IS_ERR(rt)) - goto err_free_skb; + goto err_free_skb; if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, - fl.saddr); + fl->saddr); } - tunnel_hlen = gre_calc_hlen(key->tun_flags); - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct 
iphdr); if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { @@ -420,6 +498,37 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (unlikely(err)) goto err_free_rt; } + return rt; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return NULL; +} + +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) +{ + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct rtable *rt = NULL; + struct flowi4 fl; + int tunnel_hlen; + __be16 df, flags; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + tunnel_hlen = gre_calc_hlen(key->tun_flags); + + rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); + if (!rt) + return; /* Push Tunnel header. */ if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) @@ -442,6 +551,64 @@ err_free_skb: dev->stats.tx_dropped++; } +static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + struct rtable *rt = NULL; + bool truncate = false; + struct flowi4 fl; + int tunnel_hlen; + __be16 df; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + + /* ERSPAN has fixed 8 byte GRE header */ + tunnel_hlen = 8 + sizeof(struct erspanhdr); + + rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); + if (!rt) + return; + + if (gre_handle_offloads(skb, false)) + goto err_free_rt; + + if (skb->len > dev->mtu) { + pskb_trim(skb, dev->mtu); + truncate = true; + } + + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt; + + erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), + ntohl(md->index), truncate); + + gre_build_header(skb, 8, TUNNEL_SEQ, + htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + + iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, + key->tos, key->ttl, df, false); + return; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; +} + static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); @@ -503,6 +670,86 @@ free_skb: return NETDEV_TX_OK; } +static inline u8 tos_to_cos(u8 tos) +{ + u8 dscp, cos; + + dscp = tos >> 2; + cos = dscp >> 3; + return cos; +} + +static void erspan_build_header(struct sk_buff *skb, + __be32 id, u32 index, bool truncate) +{ + struct iphdr *iphdr = ip_hdr(skb); + struct ethhdr *eth = eth_hdr(skb); + enum erspan_encap_type enc_type; + struct erspanhdr *ershdr; + struct qtag_prefix { + __be16 eth_type; + __be16 tci; + } *qp; + u16 vlan_tci = 0; + + enc_type = ERSPAN_ENCAP_NOVLAN; + + /* If mirrored packet has vlan tag, extract tci and + * perserve vlan header in the mirrored frame. 
+ */ + if (eth->h_proto == htons(ETH_P_8021Q)) { + qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); + vlan_tci = ntohs(qp->tci); + enc_type = ERSPAN_ENCAP_INFRAME; + } + + skb_push(skb, sizeof(*ershdr)); + ershdr = (struct erspanhdr *)skb->data; + memset(ershdr, 0, sizeof(*ershdr)); + + ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | + (ERSPAN_VERSION << VER_OFFSET)); + ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | + ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | + (enc_type << EN_OFFSET & EN_MASK) | + ((truncate << T_OFFSET) & T_MASK)); + ershdr->md.index = htonl(index & INDEX_MASK); +} + +static netdev_tx_t erspan_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + bool truncate = false; + + if (tunnel->collect_md) { + erspan_fb_xmit(skb, dev, skb->protocol); + return NETDEV_TX_OK; + } + + if (gre_handle_offloads(skb, false)) + goto free_skb; + + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; + + if (skb->len > dev->mtu) { + pskb_trim(skb, dev->mtu); + truncate = true; + } + + /* Push ERSPAN header */ + erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + tunnel->parms.o_flags &= ~TUNNEL_KEY; + __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); + return NETDEV_TX_OK; + +free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -828,6 +1075,42 @@ out: return ipgre_tunnel_validate(tb, data, extack); } +static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + __be16 flags = 0; + int ret; + + if (!data) + return 0; + + ret = ipgre_tap_validate(tb, data, extack); + if (ret) + return ret; + + /* ERSPAN should only have GRE sequence and key flag */ + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* ERSPAN Session ID only has 10-bit. Since we reuse + * 32-bit key field as ID, check it's range. 
+ */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + return 0; +} + static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], @@ -892,6 +1175,13 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + return 0; } @@ -949,6 +1239,36 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; +static int erspan_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen; + + tunnel->tun_hlen = 8; + tunnel->parms.iph.protocol = IPPROTO_GRE; + t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + + dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; + dev->mtu = ETH_DATA_LEN - t_hlen - 4; + dev->features |= GRE_FEATURES; + dev->hw_features |= GRE_FEATURES; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + + return ip_tunnel_init(dev); +} + +static const struct net_device_ops erspan_netdev_ops = { + .ndo_init = erspan_tunnel_init, + .ndo_uninit = ip_tunnel_uninit, + .ndo_start_xmit = erspan_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_fill_metadata_dst = gre_fill_metadata_dst, +}; + static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); @@ -1041,6 +1361,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + 0; } @@ -1083,12 +1405,25 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } + if (t->index) + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + return 0; nla_put_failure: return -EMSGSIZE; } +static void erspan_setup(struct net_device *dev) +{ + ether_setup(dev); + dev->netdev_ops = &erspan_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + ip_tunnel_setup(dev, erspan_net_id); +} + static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_LINK] = { .type = NLA_U32 }, [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, @@ -1107,6 +1442,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { @@ -1139,6 +1475,21 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .get_link_net = ip_tunnel_get_link_net, }; +static struct rtnl_link_ops erspan_link_ops __read_mostly = { + .kind = "erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = erspan_setup, + .validate = erspan_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .dellink = ip_tunnel_dellink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, + .get_link_net = ip_tunnel_get_link_net, +}; + struct 
net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { @@ -1202,6 +1553,26 @@ static struct pernet_operations ipgre_tap_net_ops = { .size = sizeof(struct ip_tunnel_net), }; +static int __net_init erspan_init_net(struct net *net) +{ + return ip_tunnel_init_net(net, erspan_net_id, + &erspan_link_ops, "erspan0"); +} + +static void __net_exit erspan_exit_net(struct net *net) +{ + struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); + + ip_tunnel_delete_net(itn, &erspan_link_ops); +} + +static struct pernet_operations erspan_net_ops = { + .init = erspan_init_net, + .exit = erspan_exit_net, + .id = &erspan_net_id, + .size = sizeof(struct ip_tunnel_net), +}; + static int __init ipgre_init(void) { int err; @@ -1214,7 +1585,11 @@ static int __init ipgre_init(void) err = register_pernet_device(&ipgre_tap_net_ops); if (err < 0) - goto pnet_tap_faied; + goto pnet_tap_failed; + + err = register_pernet_device(&erspan_net_ops); + if (err < 0) + goto pnet_erspan_failed; err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { @@ -1230,15 +1605,23 @@ static int __init ipgre_init(void) if (err < 0) goto tap_ops_failed; + err = rtnl_link_register(&erspan_link_ops); + if (err < 0) + goto erspan_link_failed; + return 0; +erspan_link_failed: + rtnl_link_unregister(&ipgre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: + unregister_pernet_device(&erspan_net_ops); +pnet_erspan_failed: unregister_pernet_device(&ipgre_tap_net_ops); -pnet_tap_faied: +pnet_tap_failed: unregister_pernet_device(&ipgre_net_ops); return err; } @@ -1247,9 +1630,11 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); + rtnl_link_unregister(&erspan_link_ops); gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); + unregister_pernet_device(&erspan_net_ops); } module_init(ipgre_init); @@ -1257,5 +1642,7 @@ module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); +MODULE_ALIAS_RTNL_LINK("erspan"); MODULE_ALIAS_NETDEV("gre0"); MODULE_ALIAS_NETDEV("gretap0"); +MODULE_ALIAS_NETDEV("erspan0"); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 93157f2f4758..525ae88d1e58 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -86,8 +86,8 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, * NOTE: dopt cannot point to skb. 
*/ -int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, - const struct ip_options *sopt) +int __ip_options_echo(struct net *net, struct ip_options *dopt, + struct sk_buff *skb, const struct ip_options *sopt) { unsigned char *sptr, *dptr; int soffset, doffset; @@ -140,7 +140,7 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, __be32 addr; memcpy(&addr, dptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { + if (inet_addr_type(net, addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } @@ -174,9 +174,6 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, doffset -= 4; } if (doffset > 3) { - __be32 daddr = fib_compute_spec_dst(skb); - - memcpy(&start[doffset-1], &daddr, 4); dopt->faddr = faddr; dptr[0] = start[0]; dptr[1] = doffset+3; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b631ec685d77..e8e675be60ec 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1223,15 +1223,11 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, cork->length += size; while (size > 0) { - if (skb_is_gso(skb)) { - len = size; - } else { + /* Check if the remaining data fits into current packet. */ + len = mtu - skb->len; + if (len < size) + len = maxfraglen - skb->len; - /* Check if the remaining data fits into current packet. */ - len = mtu - skb->len; - if (len < size) - len = maxfraglen - skb->len; - } if (len <= 0) { struct sk_buff *skb_prev; int alloclen; @@ -1525,7 +1521,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, int err; int oif; - if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) + if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; ipc.addr = daddr; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ecc4b4a2413e..e558e4f9597b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -80,7 +80,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) } -static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) +static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg, + struct sk_buff *skb) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; @@ -88,7 +89,7 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) if (IPCB(skb)->opt.optlen == 0) return; - if (ip_options_echo(opt, skb)) { + if (ip_options_echo(net, opt, skb)) { msg->msg_flags |= MSG_CTRUNC; return; } @@ -204,7 +205,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, } if (flags & IP_CMSG_RETOPTS) { - ip_cmsg_recv_retopts(msg, skb); + ip_cmsg_recv_retopts(sock_net(sk), msg, skb); flags &= ~IP_CMSG_RETOPTS; if (!flags) @@ -1206,6 +1207,7 @@ e_inval: void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); + bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags); bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || ipv6_sk_rxinfo(sk); @@ -1219,7 +1221,7 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) * (e.g., process binds socket to eth0 for Tx which is * redirected to loopback in the rtable/dst). 
*/ - if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX) + if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX || l3slave) pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); @@ -1227,14 +1229,7 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) pktinfo->ipi_ifindex = 0; pktinfo->ipi_spec_dst.s_addr = 0; } - /* We need to keep the dst for __ip_options_echo() - * We could restrict the test to opt.ts_needtime || opt.srr, - * but the following is good enough as IP options are not often used. - */ - if (unlikely(IPCB(skb)->opt.optlen)) - skb_dst_force(skb); - else - skb_dst_drop(skb); + skb_dst_drop(skb); } int ip_setsockopt(struct sock *sk, int level, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 06863ea3fc5b..c9b3e6e069ae 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3114,14 +3114,14 @@ int __init ip_mr_init(void) } #endif rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, - ipmr_rtm_getroute, ipmr_rtm_dumproute, NULL); + ipmr_rtm_getroute, ipmr_rtm_dumproute, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE, - ipmr_rtm_route, NULL, NULL); + ipmr_rtm_route, NULL, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, - ipmr_rtm_route, NULL, NULL); + ipmr_rtm_route, NULL, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK, - NULL, ipmr_rtm_dumplink, NULL); + NULL, ipmr_rtm_dumplink, 0); return 0; #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index cf520d30cb94..e04457198f93 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -268,14 +268,14 @@ unsigned int arpt_do_table(struct sk_buff *skb, acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); - /* Target might have changed stuff. */ - arp = arp_hdr(skb); - - if (verdict == XT_CONTINUE) + if (verdict == XT_CONTINUE) { + /* Target might have changed stuff. */ + arp = arp_hdr(skb); e = arpt_next_entry(e); - else + } else { /* Verdict */ break; + } } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2aea896f5708..ce1d97579ce8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -352,13 +352,14 @@ ipt_do_table(struct sk_buff *skb, acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); - /* Target might have changed stuff. */ - ip = ip_hdr(skb); - if (verdict == XT_CONTINUE) + if (verdict == XT_CONTINUE) { + /* Target might have changed stuff. */ + ip = ip_hdr(skb); e = ipt_next_entry(e); - else + } else { /* Verdict */ break; + } } while (!acpar.hotdrop); xt_write_recseq_end(addend); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 6637e8b37ee2..17b4ca562944 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -117,7 +117,8 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c) * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. 
*/ #ifdef CONFIG_PROC_FS - proc_remove(c->pde); + if (cn->procdir) + proc_remove(c->pde); #endif return; } @@ -815,6 +816,7 @@ static void clusterip_net_exit(struct net *net) #ifdef CONFIG_PROC_FS struct clusterip_net *cn = net_generic(net, clusterip_net_id); proc_remove(cn->procdir); + cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 43eb6567b3a0..127153f1ed8a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -206,12 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), - SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED), - SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG), - SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE), - SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED), SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), - SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), @@ -230,14 +225,12 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), - SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), - SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index b0bb5d0a30bd..33b70bfd1122 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -122,7 +122,8 @@ void raw_unhash_sk(struct sock *sk) EXPORT_SYMBOL_GPL(raw_unhash_sk); struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, - unsigned short num, __be32 raddr, __be32 laddr, int dif) + unsigned short num, __be32 raddr, __be32 laddr, + int dif, int sdif) { sk_for_each_from(sk) { struct inet_sock *inet = inet_sk(sk); @@ -130,7 +131,8 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif)) goto found; /* gotcha */ } sk = NULL; @@ -171,6 +173,7 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) */ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { + int sdif = inet_sdif(skb); struct sock *sk; struct hlist_head *head; int delivered = 0; @@ -184,13 +187,13 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) net = 
dev_net(skb->dev); sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex); + skb->dev->ifindex, sdif); while (sk) { delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, - skb->dev->ifindex)) { + skb->dev->ifindex, sdif)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ @@ -199,7 +202,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) } sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex); + skb->dev->ifindex, sdif); } out: read_unlock(&raw_v4_hashinfo.lock); @@ -297,12 +300,15 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) read_lock(&raw_v4_hashinfo.lock); raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); if (raw_sk) { + int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); + iph = (const struct iphdr *)skb->data; net = dev_net(skb->dev); while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, iph->daddr, iph->saddr, - skb->dev->ifindex)) != NULL) { + dif, sdif)) != NULL) { raw_err(raw_sk, skb, info); raw_sk = sk_next(raw_sk); iph = (const struct iphdr *)skb->data; diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index e1a51ca68d23..c200065ef9a5 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -46,13 +46,13 @@ static struct sock *raw_lookup(struct net *net, struct sock *from, sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol, r->id.idiag_dst[0], r->id.idiag_src[0], - r->id.idiag_if); + r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, (const struct in6_addr *)r->id.idiag_src, (const struct in6_addr *)r->id.idiag_dst, - r->id.idiag_if); + r->id.idiag_if, 0); #endif return sk; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0383e66f59bc..94d4cd2d5ea4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1267,7 +1267,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) if (mtu) return mtu; - mtu = dst->dev->mtu; + mtu = READ_ONCE(dst->dev->mtu); if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { if (rt->rt_uses_gateway && mtu > 576) @@ -1398,7 +1398,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; - if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) kfree(p); if (!list_empty(&rt->rt_uncached)) { @@ -1456,7 +1456,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); if (fi->fib_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; - atomic_inc(&fi->fib_metrics->refcnt); + refcount_inc(&fi->fib_metrics->refcnt); } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; @@ -2236,7 +2236,7 @@ add: if (!rth) return ERR_PTR(-ENOBUFS); - rth->rt_iif = orig_oif ? : 0; + rth->rt_iif = orig_oif; if (res->table) rth->rt_table_id = res->table->tb_id; @@ -2439,6 +2439,12 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, /* L3 master device is the loopback for that domain */ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? 
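
The sdif argument threaded through __raw_v4_lookup(), raw_v4_input(), raw_icmp_error() and raw_diag above is what lets a raw socket bound to an L3 master device match traffic that actually ingressed on one of the VRF's slave interfaces. From userspace this is the plain bind-to-device idiom; the VRF name below is hypothetical and the calls need CAP_NET_RAW plus CAP_NET_ADMIN:

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

static int open_icmp_raw_in_vrf(void)
{
        static const char dev[] = "vrf-blue";   /* hypothetical VRF device */
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

        if (fd < 0)
                return -1;
        /* Bind to the L3 master; the sdif-aware lookup then also delivers
         * ICMP that arrived on the VRF's enslaved ports. */
        if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, dev, sizeof(dev)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}
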
: net->loopback_dev; + + /* make sure orig_oif points to fib result device even + * though packet rx/tx happens over loopback or l3mdev + */ + orig_oif = FIB_RES_OIF(*res); + fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; @@ -2750,26 +2756,34 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); + else + skb_dst_set(skb, &rt->dst); } if (err) goto errout_free; - skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = rt->rt_table_id; - if (rtm->rtm_flags & RTM_F_FIB_MATCH) + if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + if (!res.fi) { + err = fib_props[res.type].error; + if (!err) + err = -EHOSTUNREACH; + goto errout_free; + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, table_id, rt->rt_type, res.prefix, res.prefixlen, fl4.flowi4_tos, res.fi, 0); - else + } else { err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); + } if (err < 0) goto errout_free; @@ -3067,7 +3081,8 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 03ad8778c395..b1bb1b3a1082 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - ireq->opt = tcp_v4_save_options(skb); + ireq->opt = tcp_v4_save_options(sock_net(sk), skb); if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9bf809726066..0d3c038d7b04 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,6 +45,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +/* obsolete */ +static int sysctl_tcp_low_latency __read_mostly; + /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..5091402720ab 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,6 +269,7 @@ #include <linux/err.h> #include <linux/time.h> #include <linux/slab.h> +#include <linux/errqueue.h> #include <net/icmp.h> #include <net/inet_common.h> @@ -388,6 +389,19 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) return period; } +static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) +{ + u32 rate = READ_ONCE(tp->rate_delivered); + u32 intv = READ_ONCE(tp->rate_interval_us); + u64 rate64 = 0; + + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + } + return rate64; +} + /* Address-family independent initialization for a tcp_sock. 
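
tcp_compute_delivery_rate() above only factors out arithmetic tcp_get_info() was already doing: recently delivered bytes over the sampling interval, rate64 = rate_delivered * mss_cache * USEC_PER_SEC / rate_interval_us, in bytes per second. The result is visible from userspace through tcp_info; a sketch, assuming uapi headers new enough to expose tcpi_delivery_rate:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>          /* struct tcp_info with tcpi_delivery_rate */

static void print_delivery_rate(int tcp_fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        memset(&info, 0, sizeof(info));
        if (getsockopt(tcp_fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                printf("delivery rate: %llu B/s%s\n",
                       (unsigned long long)info.tcpi_delivery_rate,
                       info.tcpi_delivery_rate_app_limited ? " (app limited)" : "");
}
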
* * NOTE: A lot of things set to zero explicitly by call to @@ -400,7 +414,6 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; @@ -1034,23 +1047,29 @@ out_err: } EXPORT_SYMBOL_GPL(do_tcp_sendpages); -int tcp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) { - ssize_t res; - if (!(sk->sk_route_caps & NETIF_F_SG) || !sk_check_csum_caps(sk)) - return sock_no_sendpage(sk->sk_socket, page, offset, size, - flags); - - lock_sock(sk); + return sock_no_sendpage_locked(sk, page, offset, size, flags); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ - res = do_tcp_sendpages(sk, page, offset, size, flags); + return do_tcp_sendpages(sk, page, offset, size, flags); +} +EXPORT_SYMBOL_GPL(tcp_sendpage_locked); + +int tcp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + int ret; + + lock_sock(sk); + ret = tcp_sendpage_locked(sk, page, offset, size, flags); release_sock(sk); - return res; + + return ret; } EXPORT_SYMBOL(tcp_sendpage); @@ -1144,9 +1163,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return err; } -int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; @@ -1155,9 +1175,25 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) bool sg; long timeo; - lock_sock(sk); - flags = msg->msg_flags; + + if (flags & MSG_ZEROCOPY && size) { + if (sk->sk_state != TCP_ESTABLISHED) { + err = -EINVAL; + goto out_err; + } + + skb = tcp_send_head(sk) ? 
tcp_write_queue_tail(sk) : NULL; + uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); + if (!uarg) { + err = -ENOBUFS; + goto out_err; + } + + if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG)) + uarg->zerocopy = 0; + } + if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); if (err == -EINPROGRESS && copied_syn > 0) @@ -1281,7 +1317,7 @@ new_segment: err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); if (err) goto do_fault; - } else { + } else if (!uarg || !uarg->zerocopy) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1319,6 +1355,13 @@ new_segment: page_ref_inc(pfrag->page); } pfrag->offset += copy; + } else { + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + if (err == -EMSGSIZE || err == -EEXIST) + goto new_segment; + if (err < 0) + goto do_error; + copy = err; } if (!copied) @@ -1365,7 +1408,7 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: - release_sock(sk); + sock_zerocopy_put(uarg); return copied + copied_syn; do_fault: @@ -1382,6 +1425,7 @@ do_error: if (copied + copied_syn) goto out; out_err: + sock_zerocopy_put_abort(uarg); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && @@ -1389,9 +1433,20 @@ out_err: sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } - release_sock(sk); return err; } +EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); + +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int ret; + + lock_sock(sk); + ret = tcp_sendmsg_locked(sk, msg, size); + release_sock(sk); + + return ret; +} EXPORT_SYMBOL(tcp_sendmsg); /* @@ -1525,20 +1580,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -static void tcp_prequeue_process(struct sock *sk) -{ - struct sk_buff *skb; - struct tcp_sock *tp = tcp_sk(sk); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - /* Clear memory counter. 
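
The MSG_ZEROCOPY branch added to tcp_sendmsg_locked() above is opted into with SO_ZEROCOPY and per-call MSG_ZEROCOPY; completion ranges come back on the error queue tagged SO_EE_ORIGIN_ZEROCOPY. A trimmed sketch, assuming uapi headers that already define those constants; a real caller would poll() for the error queue, must leave the buffer untouched until its range completes, and the kernel may still fall back to copying (e.g. no SG or checksum offload on the route):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/errqueue.h>     /* struct sock_extended_err */

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY 60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0x4000000
#endif
#ifndef SO_EE_ORIGIN_ZEROCOPY
#define SO_EE_ORIGIN_ZEROCOPY 5
#endif

static void send_zerocopy(int tcp_fd, const void *buf, size_t len)
{
        char control[256];
        struct msghdr msg = { .msg_control = control,
                              .msg_controllen = sizeof(control) };
        struct cmsghdr *cm;
        int one = 1;

        setsockopt(tcp_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
        send(tcp_fd, buf, len, MSG_ZEROCOPY);

        /* Completion: sends numbered [ee_info, ee_data] are done and their
         * pages may be reused. */
        if (recvmsg(tcp_fd, &msg, MSG_ERRQUEUE) < 0)
                return;
        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                struct sock_extended_err serr;

                if (cm->cmsg_level != SOL_IP || cm->cmsg_type != IP_RECVERR)
                        continue;
                memcpy(&serr, CMSG_DATA(cm), sizeof(serr));
                if (serr.ee_origin == SO_EE_ORIGIN_ZEROCOPY)
                        printf("zerocopy sends %u..%u completed\n",
                               serr.ee_info, serr.ee_data);
        }
}
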
*/ - tp->ucopy.memory = 0; -} - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1652,6 +1693,61 @@ int tcp_peek_len(struct socket *sock) } EXPORT_SYMBOL(tcp_peek_len); +static void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec(skb->tstamp); + else + tss->ts[0] = (struct timespec) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec) {0}; +} + +/* Similar to __sock_recv_timestamp, but does not require an skb */ +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping *tss) +{ + struct timeval tv; + bool has_timestamping = false; + + if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { + if (sock_flag(sk, SOCK_RCVTSTAMP)) { + if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, + sizeof(tss->ts[0]), &tss->ts[0]); + } else { + tv.tv_sec = tss->ts[0].tv_sec; + tv.tv_usec = tss->ts[0].tv_nsec / 1000; + + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(tv), &tv); + } + } + + if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) + has_timestamping = true; + else + tss->ts[0] = (struct timespec) {0}; + } + + if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { + if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) + has_timestamping = true; + else + tss->ts[2] = (struct timespec) {0}; + } + + if (has_timestamping) { + tss->ts[1] = (struct timespec) {0}; + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, + sizeof(*tss), tss); + } +} + /* * This routine copies from a sock struct into the user buffer. * @@ -1671,9 +1767,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int err; int target; /* Read at least this many bytes */ long timeo; - struct task_struct *user_recv = NULL; struct sk_buff *skb, *last; u32 urg_hole = 0; + struct scm_timestamping tss; + bool has_tss = false; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -1806,51 +1903,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, tcp_cleanup_rbuf(sk, copied); - if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { - /* Install new reader */ - if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { - user_recv = current; - tp->ucopy.task = user_recv; - tp->ucopy.msg = msg; - } - - tp->ucopy.len = len; - - WARN_ON(tp->copied_seq != tp->rcv_nxt && - !(flags & (MSG_PEEK | MSG_TRUNC))); - - /* Ugly... If prequeue is not empty, we have to - * process it before releasing socket, otherwise - * order will be broken at second iteration. - * More elegant solution is required!!! - * - * Look: we have the following (pseudo)queues: - * - * 1. packets in flight - * 2. backlog - * 3. prequeue - * 4. receive_queue - * - * Each queue can be processed only if the next ones - * are empty. At this point we have empty receive_queue. - * But prequeue _can_ be not empty after 2nd iteration, - * when we jumped to start of loop because backlog - * processing added something to receive_queue. - * We cannot release_sock(), because backlog contains - * packets arrived _after_ prequeued ones. - * - * Shortly, algorithm is clear --- to process all - * the queues in order. We could make it more directly, - * requeueing packets from backlog to prequeue, if - * is not empty. It is more elegant, but eats cycles, - * unfortunately. 
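
tcp_update_recv_tstamps() and tcp_recv_timestamp() above finally deliver receive timestamps on a TCP stream as SCM_TIMESTAMPING control messages, with ts[0] carrying the software stamp and ts[2] the raw hardware stamp, matching how they are filled in. The userspace side is the usual SO_TIMESTAMPING sequence; a sketch only, and the option has to be set before the data arrives:

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>   /* SOF_TIMESTAMPING_* flags */

#ifndef SCM_TIMESTAMPING
#define SCM_TIMESTAMPING SO_TIMESTAMPING
#endif

static void read_with_rx_tstamp(int tcp_fd)
{
        char data[4096], cbuf[CMSG_SPACE(3 * sizeof(struct timespec))];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
        struct cmsghdr *cm;

        setsockopt(tcp_fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
        if (recvmsg(tcp_fd, &msg, 0) <= 0)
                return;

        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level == SOL_SOCKET &&
                    cm->cmsg_type == SCM_TIMESTAMPING) {
                        struct timespec ts[3];  /* [0] sw, [2] raw hw */

                        memcpy(ts, CMSG_DATA(cm), sizeof(ts));
                        printf("sw rx tstamp %lld.%09ld\n",
                               (long long)ts[0].tv_sec, ts[0].tv_nsec);
                }
        }
}
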
- */ - if (!skb_queue_empty(&tp->ucopy.prequeue)) - goto do_prequeue; - - /* __ Set realtime policy in scheduler __ */ - } - if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1859,31 +1911,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, sk_wait_data(sk, &timeo, last); } - if (user_recv) { - int chunk; - - /* __ Restore normal policy in scheduler __ */ - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); - len -= chunk; - copied += chunk; - } - - if (tp->rcv_nxt == tp->copied_seq && - !skb_queue_empty(&tp->ucopy.prequeue)) { -do_prequeue: - tcp_prequeue_process(sk); - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", @@ -1941,6 +1968,10 @@ skip_copy: if (used + offset < skb->len) continue; + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, &tss); + has_tss = true; + } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) @@ -1955,29 +1986,13 @@ skip_copy: break; } while (len > 0); - if (user_recv) { - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - int chunk; - - tp->ucopy.len = copied > 0 ? len : 0; - - tcp_prequeue_process(sk); - - if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - } - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ + if (has_tss) + tcp_recv_timestamp(msg, sk, &tss); + /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); @@ -2481,7 +2496,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true); + err = tcp_set_congestion_control(sk, name, true, true); release_sock(sk); return err; } @@ -2823,7 +2838,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now, intv; + u32 now; u64 rate64; bool slow; u32 rate; @@ -2922,13 +2937,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 
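
do_tcp_setsockopt() above now passes an explicit reinit argument down to tcp_set_congestion_control(), whose new signature appears in the tcp_cong.c hunk below; the user-visible knob is unchanged. For completeness, setting and reading back the congestion control by name:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>        /* TCP_CONGESTION */

static int set_cc(int tcp_fd, const char *name)
{
        char buf[16] = { 0 };
        socklen_t len = sizeof(buf);

        if (setsockopt(tcp_fd, IPPROTO_TCP, TCP_CONGESTION,
                       name, strlen(name)) < 0)
                return -1;
        if (getsockopt(tcp_fd, IPPROTO_TCP, TCP_CONGESTION, buf, &len) == 0)
                printf("congestion control now: %.*s\n", (int)len, buf);
        return 0;
}
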
1 : 0; - rate = READ_ONCE(tp->rate_delivered); - intv = READ_ONCE(tp->rate_interval_us); - if (rate && intv) { - rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; - do_div(rate64, intv); + rate64 = tcp_compute_delivery_rate(tp); + if (rate64) info->tcpi_delivery_rate = rate64; - } unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2938,8 +2949,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; + u64 rate64; + u32 rate; - stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + + 3 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -2954,6 +2969,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); + + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); + + rate64 = tcp_compute_delivery_rate(tp); + nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); + + nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); + nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); + nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); + + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); + nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); return stats; } diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 609965f0e298..fc3614377413 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -49,7 +49,6 @@ MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wma struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ - u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 epoch_start; /* beginning of an epoch */ @@ -72,7 +71,6 @@ static void bictcp_init(struct sock *sk) struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); - ca->loss_cwnd = 0; if (initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; @@ -172,22 +170,12 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) else ca->last_max_cwnd = tp->snd_cwnd; - ca->loss_cwnd = tp->snd_cwnd; - if (tp->snd_cwnd <= low_window) return max(tp->snd_cwnd >> 1U, 2U); else return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static u32 bictcp_undo_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct bictcp *ca = inet_csk_ca(sk); - - return max(tp->snd_cwnd, ca->loss_cwnd); -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -214,7 +202,7 @@ static struct tcp_congestion_ops bictcp __read_mostly = { .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, - .undo_cwnd = bictcp_undo_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 50a0f3e51d5b..66ac69f7bd19 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -85,7 +85,6 @@ struct cdg { u8 state; u8 delack; u32 rtt_seq; - u32 undo_cwnd; u32 shadow_wnd; u16 backoff_cnt; u16 sample_cnt; @@ -330,8 +329,6 @@ static u32 tcp_cdg_ssthresh(struct sock *sk) struct cdg 
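
The extra attributes packed into tcp_get_timestamping_opt_stats() travel as a flat run of netlink attributes (TCP_NLA_*) inside the SCM_TIMESTAMPING_OPT_STATS control message when SOF_TIMESTAMPING_OPT_STATS is requested. A hedged parsing sketch; it assumes the layout shown above and uapi headers that define the new TCP_NLA_* values:

#include <stdio.h>
#include <string.h>
#include <linux/netlink.h>      /* struct nlattr, NLA_HDRLEN, NLA_ALIGN */
#include <linux/tcp.h>          /* TCP_NLA_* */

static void dump_opt_stats(const void *buf, size_t len)
{
        const unsigned char *p = buf;

        while (len >= NLA_HDRLEN) {
                struct nlattr na;
                size_t step;

                memcpy(&na, p, sizeof(na));
                if (na.nla_len < NLA_HDRLEN || na.nla_len > len)
                        break;

                if (na.nla_type == TCP_NLA_DELIVERY_RATE &&
                    na.nla_len >= NLA_HDRLEN + 8) {          /* u64 payload */
                        unsigned long long rate;

                        memcpy(&rate, p + NLA_HDRLEN, sizeof(rate));
                        printf("delivery rate %llu B/s\n", rate);
                } else if (na.nla_type == TCP_NLA_SND_CWND &&
                           na.nla_len >= NLA_HDRLEN + 4) {   /* u32 payload */
                        unsigned int cwnd;

                        memcpy(&cwnd, p + NLA_HDRLEN, sizeof(cwnd));
                        printf("snd_cwnd %u packets\n", cwnd);
                }

                step = NLA_ALIGN(na.nla_len);
                if (step >= len)
                        break;
                p += step;
                len -= step;
        }
}
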
*ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - ca->undo_cwnd = tp->snd_cwnd; - if (ca->state == CDG_BACKOFF) return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); @@ -344,13 +341,6 @@ static u32 tcp_cdg_ssthresh(struct sock *sk) return max(2U, tp->snd_cwnd >> 1); } -static u32 tcp_cdg_undo_cwnd(struct sock *sk) -{ - struct cdg *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd); -} - static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) { struct cdg *ca = inet_csk_ca(sk); @@ -403,7 +393,7 @@ struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, - .undo_cwnd = tcp_cdg_undo_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, .ssthresh = tcp_cdg_ssthresh, .release = tcp_cdg_release, .init = tcp_cdg_init, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index fde983f6376b..2f26124fd160 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_dontxmit(sk); } -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca) +static void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -338,7 +338,7 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -360,9 +360,18 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) if (!ca) { err = -ENOENT; } else if (!load) { - icsk->icsk_ca_ops = ca; - if (!try_module_get(ca->owner)) + const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; + + if (try_module_get(ca->owner)) { + if (reinit) { + tcp_reinit_congestion_control(sk, ca); + } else { + icsk->icsk_ca_ops = ca; + module_put(old_ca->owner); + } + } else { err = -EBUSY; + } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { err = -EPERM; @@ -456,7 +465,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd, tp->snd_ssthresh << 1); + return max(tp->snd_cwnd, tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 57ae5b5ae643..78bfadfcf342 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -83,7 +83,6 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ - u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ @@ -142,7 +141,6 @@ static void bictcp_init(struct sock *sk) struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); - ca->loss_cwnd = 0; if (hystart) bictcp_hystart_reset(sk); @@ -366,18 +364,9 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) else ca->last_max_cwnd = tp->snd_cwnd; - ca->loss_cwnd = tp->snd_cwnd; - return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static u32 bictcp_undo_cwnd(struct sock *sk) -{ - struct bictcp *ca 
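
With tcp_reno_undo_cwnd() now returning max(snd_cwnd, prior_cwnd), and prior_cwnd saved at loss time in the tcp_input.c hunk further down, the per-algorithm loss_cwnd bookkeeping being deleted in bic, cdg, cubic, highspeed, illinois, nv, scalable, veno and yeah has no remaining purpose. A hypothetical out-of-tree module sketch showing the resulting minimal ops set (all "toy" names are invented):

#include <linux/module.h>
#include <net/tcp.h>

static u32 toy_ssthresh(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        /* Halve cwnd on loss; no private loss_cwnd is needed for undo. */
        return max(tp->snd_cwnd >> 1U, 2U);
}

static struct tcp_congestion_ops toy_cc __read_mostly = {
        .ssthresh       = toy_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .undo_cwnd      = tcp_reno_undo_cwnd,   /* max(snd_cwnd, prior_cwnd) */
        .owner          = THIS_MODULE,
        .name           = "toy",
};

static int __init toy_cc_init(void)
{
        return tcp_register_congestion_control(&toy_cc);
}

static void __exit toy_cc_exit(void)
{
        tcp_unregister_congestion_control(&toy_cc);
}

module_init(toy_cc_init);
module_exit(toy_cc_exit);
MODULE_LICENSE("GPL");
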
= inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) { @@ -470,7 +459,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, - .undo_cwnd = bictcp_undo_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = bictcp_cwnd_event, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a748c74aa8b7..abbf0edcf6c2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -16,6 +16,7 @@ #include <linux/tcp.h> +#include <net/netlink.h> #include <net/tcp.h> static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -36,6 +37,100 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, tcp_get_info(sk, info); } +#ifdef CONFIG_TCP_MD5SIG +static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info, + const struct tcp_md5sig_key *key) +{ + info->tcpm_family = key->family; + info->tcpm_prefixlen = key->prefixlen; + info->tcpm_keylen = key->keylen; + memcpy(info->tcpm_key, key->key, key->keylen); + + if (key->family == AF_INET) + info->tcpm_addr[0] = key->addr.a4.s_addr; + #if IS_ENABLED(CONFIG_IPV6) + else if (key->family == AF_INET6) + memcpy(&info->tcpm_addr, &key->addr.a6, + sizeof(info->tcpm_addr)); + #endif +} + +static int tcp_diag_put_md5sig(struct sk_buff *skb, + const struct tcp_md5sig_info *md5sig) +{ + const struct tcp_md5sig_key *key; + struct tcp_diag_md5sig *info; + struct nlattr *attr; + int md5sig_count = 0; + + hlist_for_each_entry_rcu(key, &md5sig->head, node) + md5sig_count++; + if (md5sig_count == 0) + return 0; + + attr = nla_reserve(skb, INET_DIAG_MD5SIG, + md5sig_count * sizeof(struct tcp_diag_md5sig)); + if (!attr) + return -EMSGSIZE; + + info = nla_data(attr); + memset(info, 0, md5sig_count * sizeof(struct tcp_diag_md5sig)); + hlist_for_each_entry_rcu(key, &md5sig->head, node) { + tcp_diag_md5sig_fill(info++, key); + if (--md5sig_count == 0) + break; + } + + return 0; +} +#endif + +static int tcp_diag_get_aux(struct sock *sk, bool net_admin, + struct sk_buff *skb) +{ +#ifdef CONFIG_TCP_MD5SIG + if (net_admin) { + struct tcp_md5sig_info *md5sig; + int err = 0; + + rcu_read_lock(); + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); + if (md5sig) + err = tcp_diag_put_md5sig(skb, md5sig); + rcu_read_unlock(); + if (err < 0) + return err; + } +#endif + + return 0; +} + +static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) +{ + size_t size = 0; + +#ifdef CONFIG_TCP_MD5SIG + if (net_admin && sk_fullsock(sk)) { + const struct tcp_md5sig_info *md5sig; + const struct tcp_md5sig_key *key; + size_t md5sig_count = 0; + + rcu_read_lock(); + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); + if (md5sig) { + hlist_for_each_entry_rcu(key, &md5sig->head, node) + md5sig_count++; + } + rcu_read_unlock(); + size += nla_total_size(md5sig_count * + sizeof(struct tcp_diag_md5sig)); + } +#endif + + return size; +} + static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { @@ -68,13 +163,15 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler tcp_diag_handler = { - .dump = tcp_diag_dump, - .dump_one = tcp_diag_dump_one, - .idiag_get_info = tcp_diag_get_info, - .idiag_type = IPPROTO_TCP, - .idiag_info_size = sizeof(struct tcp_info), + .dump = tcp_diag_dump, + 
.dump_one = tcp_diag_dump_one, + .idiag_get_info = tcp_diag_get_info, + .idiag_get_aux = tcp_diag_get_aux, + .idiag_get_aux_size = tcp_diag_get_aux_size, + .idiag_type = IPPROTO_TCP, + .idiag_info_size = sizeof(struct tcp_info), #ifdef CONFIG_INET_DIAG_DESTROY - .destroy = tcp_diag_destroy, + .destroy = tcp_diag_destroy, #endif }; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index ce9c7fef200f..e3c33220c418 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -171,7 +171,6 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, - struct dst_entry *dst, struct request_sock *req) { struct tcp_sock *tp; @@ -278,8 +277,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk) */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst) + struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; @@ -312,7 +310,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, * data in SYN_RECV state. */ fastopen: - child = tcp_fastopen_create_child(sk, skb, dst, req); + child = tcp_fastopen_create_child(sk, skb, req); if (child) { foc->len = -1; NET_INC_STATS(sock_net(sk), diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6d9879e93648..d1c33c91eadc 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -94,7 +94,6 @@ static const struct hstcp_aimd_val { struct hstcp { u32 ai; - u32 loss_cwnd; }; static void hstcp_init(struct sock *sk) @@ -153,22 +152,14 @@ static u32 hstcp_ssthresh(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - ca->loss_cwnd = tp->snd_cwnd; /* Do multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); } -static u32 hstcp_cwnd_undo(struct sock *sk) -{ - const struct hstcp *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, - .undo_cwnd = hstcp_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = hstcp_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 3eb78cde6ff0..082d479462fa 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -66,7 +66,6 @@ static inline void htcp_reset(struct htcp *ca) static u32 htcp_cwnd_undo(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); if (ca->undo_last_cong) { @@ -76,7 +75,7 @@ static u32 htcp_cwnd_undo(struct sock *sk) ca->undo_last_cong = 0; } - return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); + return tcp_reno_undo_cwnd(sk); } static inline void measure_rtt(struct sock *sk, u32 srtt) diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 60352ff4f5a8..7c843578f233 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -48,7 +48,6 @@ struct illinois { u32 end_seq; /* right edge of current RTT */ u32 alpha; /* Additive increase */ u32 beta; /* Muliplicative decrease */ - u32 loss_cwnd; /* cwnd on loss */ u16 acked; /* # packets acked by current ACK */ u8 rtt_above; /* average rtt has gone above threshold */ u8 rtt_low; /* # of rtts measurements below threshold */ @@ -297,18 +296,10 @@ static u32 
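
The new idiag_get_aux/idiag_get_aux_size hooks let tcp_diag append a single INET_DIAG_MD5SIG attribute holding an array of struct tcp_diag_md5sig entries to dumps from CAP_NET_ADMIN requesters. On the consumer side of the sock_diag reply, that attribute unpacks as in this sketch (assumes uapi headers that already carry the new struct and attribute id):

#include <stdio.h>
#include <string.h>
#include <linux/inet_diag.h>    /* INET_DIAG_MD5SIG */
#include <linux/tcp.h>          /* struct tcp_diag_md5sig */

static void dump_md5_keys(const void *payload, size_t payload_len)
{
        size_t i, n = payload_len / sizeof(struct tcp_diag_md5sig);

        for (i = 0; i < n; i++) {
                struct tcp_diag_md5sig key;

                memcpy(&key, (const char *)payload + i * sizeof(key),
                       sizeof(key));
                printf("md5 key: family %u prefixlen %u keylen %u\n",
                       (unsigned int)key.tcpm_family,
                       (unsigned int)key.tcpm_prefixlen,
                       (unsigned int)key.tcpm_keylen);
        }
}
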
tcp_illinois_ssthresh(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); - ca->loss_cwnd = tp->snd_cwnd; /* Multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); } -static u32 tcp_illinois_cwnd_undo(struct sock *sk) -{ - const struct illinois *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - /* Extract info for Tcp socket info provided via netlink. */ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) @@ -336,7 +327,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, static struct tcp_congestion_ops tcp_illinois __read_mostly = { .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, - .undo_cwnd = tcp_illinois_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2920e0cb09f8..c5d7656beeee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ @@ -1951,6 +1952,7 @@ void tcp_enter_loss(struct sock *sk) !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->prior_cwnd = tp->snd_cwnd; tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); @@ -2520,8 +2522,8 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) return; /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || - (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_jiffies32; } @@ -3004,21 +3006,24 @@ void tcp_rearm_rto(struct sock *sk) /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { - struct sk_buff *skb = tcp_write_queue_head(sk); - u64 rto_time_stamp = skb->skb_mstamp + - jiffies_to_usecs(rto); - s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ - if (delta_us > 0) - rto = usecs_to_jiffies(delta_us); + rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); } } +/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ +static void tcp_set_xmit_timer(struct sock *sk) +{ + if (!tcp_schedule_loss_probe(sk)) + tcp_rearm_rto(sk); +} + /* If we get here, the whole TSO packet has not been acked. 
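
tcp_rearm_rto() above and the reworked tcp_schedule_loss_probe() in the tcp_output.c hunk further down now share one notion of "time left until the conventional RTO" via a tcp_rto_delta_us() helper that this series adds to include/net/tcp.h. Judging from the open-coded arithmetic it replaces here, it presumably amounts to the following reconstruction (the _sketch suffix marks an editorial guess, not the real helper):

#include <net/tcp.h>

/* Microseconds until the head-of-queue skb's conventional RTO would fire;
 * negative once it is already overdue. */
static inline s64 tcp_rto_delta_us_sketch(const struct sock *sk)
{
        const struct sk_buff *skb = tcp_write_queue_head(sk);
        u32 rto = inet_csk(sk)->icsk_rto;
        u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);

        return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
}
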
*/ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3180,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_rtt_us, sack->rate); if (flag & FLAG_ACKED) { - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@ -3208,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. */ - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ } if (icsk->icsk_ca_ops->pkts_acked) { @@ -3580,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) - tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0; @@ -3647,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + /* If needed, reset TLP/RTO timer; RACK may later override this. */ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } - if (tp->tlp_high_seq) - tcp_process_tlp_ack(sk, ack, flag); if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - if (icsk->icsk_pending == ICSK_TIME_RETRANS) - tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, sack_state.rate); @@ -4264,9 +4268,15 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } +enum tcp_queue { + OOO_QUEUE, + RCV_QUEUE, +}; + /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket + * @dest: destination queue * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean @@ -4278,6 +4288,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, + enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4299,6 +4310,15 @@ static bool tcp_try_coalesce(struct sock *sk, TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; + + if (TCP_SKB_CB(from)->has_rxtstamp) { + TCP_SKB_CB(to)->has_rxtstamp = true; + if (dest == OOO_QUEUE) + TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; + else + to->tstamp = from->tstamp; + } + return true; } @@ -4333,6 +4353,9 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); + /* Replace tstamp which was stomped by rbnode */ + if (TCP_SKB_CB(skb)->has_rxtstamp) + skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4344,7 +4367,8 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - 
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, + tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4398,6 +4422,10 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } + /* Stash tstamp to avoid being stomped on by rbnode */ + if (TCP_SKB_CB(skb)->has_rxtstamp) + TCP_SKB_CB(skb)->swtstamp = skb->tstamp; + /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4425,7 +4453,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { + if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); @@ -4475,7 +4504,8 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; @@ -4526,7 +4556,8 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; + tcp_try_coalesce(sk, RCV_QUEUE, tail, + skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { __skb_queue_tail(&sk->sk_receive_queue, skb); @@ -4588,8 +4619,8 @@ err: static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - bool fragstolen = false; - int eaten = -1; + bool fragstolen; + int eaten; if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); @@ -4611,32 +4642,13 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto out_of_window; /* Ok. In sequence. In window. 
*/ - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && - sock_owned_by_user(sk) && !tp->urg_data) { - int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); - - __set_current_state(TASK_RUNNING); - - if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - eaten = (chunk == skb->len); - tcp_rcv_space_adjust(sk); - } - } - - if (eaten <= 0) { queue_and_out: - if (eaten < 0) { - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - } + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); @@ -5186,26 +5198,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t } } -static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int err; - - if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); - else - err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); - - if (!err) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - } - - return err; -} - /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same @@ -5358,8 +5350,9 @@ discard: * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); tcp_mstamp_refresh(tp); @@ -5445,56 +5438,28 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, int eaten = 0; bool fragstolen = false; - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); - - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + - TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - __skb_pull(skb, tcp_header_len); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; - } - } - if (!eaten) { - if (tcp_checksum_complete(skb)) - goto csum_error; + if (tcp_checksum_complete(skb)) + goto csum_error; - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. 
+ * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(sk, skb); + tcp_rcv_rtt_measure_ts(sk, skb); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); - /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); - } + /* Bulk data transfer: receiver */ + eaten = tcp_queue_rcv(sk, skb, tcp_header_len, + &fragstolen); tcp_event_data_recv(sk, skb); @@ -5588,7 +5553,6 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) __tcp_fast_path_on(tp, tp->snd_wnd); else tp->pred_flags = 0; - } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@ -6303,9 +6267,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; - struct dst_entry *dst = NULL; struct request_sock *req; bool want_cookie = false; + struct dst_entry *dst; struct flowi fl; /* TW buckets are converted to open requests without @@ -6355,6 +6319,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (tmp_opt.tstamp_ok) tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); + dst = af_ops->route_req(sk, &fl, req); + if (!dst) + goto drop_and_free; + if (!want_cookie && !isn) { /* Kill the following clause, if you dislike this way. */ if (!net->ipv4.sysctl_tcp_syncookies && @@ -6375,11 +6343,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, isn = af_ops->init_seq(skb); } - if (!dst) { - dst = af_ops->route_req(sk, &fl, req); - if (!dst) - goto drop_and_free; - } tcp_ecn_create_request(req, skb, sk, dst); @@ -6395,7 +6358,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a20e7f03d5f7..a63486afa7a7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,8 +85,6 @@ #include <crypto/hash.h> #include <linux/scatterlist.h> -int sysctl_tcp_low_latency __read_mostly; - #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -385,7 +383,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb)); + inet_iif(icmp_skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; @@ -661,7 +659,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, - ntohs(th->source), inet_iif(skb)); + ntohs(th->source), inet_iif(skb), + tcp_v4_sdif(skb)); /* don't send rst if it can't find key */ if (!sk1) goto out; @@ -1269,7 +1268,7 @@ static void tcp_v4_init_req(struct request_sock *req, sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->opt = tcp_v4_save_options(skb); + ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 
@@ -1458,7 +1457,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); return 0; } @@ -1525,7 +1524,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), - skb->skb_iif); + skb->skb_iif, inet_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; @@ -1541,61 +1540,6 @@ void tcp_v4_early_demux(struct sk_buff *skb) } } -/* Packet is added to VJ-style prequeue for processing in process - * context, if a reader task is waiting. Apparently, this exciting - * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) - * failed somewhere. Latency? Burstiness? Well, at least now we will - * see, why it failed. 8)8) --ANK - * - */ -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (sysctl_tcp_low_latency || !tp->ucopy.task) - return false; - - if (skb->len <= tcp_hdrlen(skb) && - skb_queue_len(&tp->ucopy.prequeue) == 0) - return false; - - /* Before escaping RCU protected region, we need to take care of skb - * dst. Prequeue is only enabled for established sockets. - * For such sockets, we might need the skb dst only to set sk->sk_rx_dst - * Instead of doing full sk_rx_dst validity here, let's perform - * an optimistic check. - */ - if (likely(sk->sk_rx_dst)) - skb_dst_drop(skb); - else - skb_dst_force_safe(skb); - - __skb_queue_tail(&tp->ucopy.prequeue, skb); - tp->ucopy.memory += skb->truesize; - if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || - tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - struct sk_buff *skb1; - - BUG_ON(sock_owned_by_user(sk)); - __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, - skb_queue_len(&tp->ucopy.prequeue)); - - while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb1); - - tp->ucopy.memory = 0; - } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { - wake_up_interruptible_sync_poll(sk_sleep(sk), - POLLIN | POLLRDNORM | POLLRDBAND); - if (!inet_csk_ack_scheduled(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * tcp_rto_min(sk)) / 4, - TCP_RTO_MAX); - } - return true; -} -EXPORT_SYMBOL(tcp_prequeue); - bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; @@ -1645,6 +1589,7 @@ EXPORT_SYMBOL(tcp_filter); int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + int sdif = inet_sdif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1692,10 +1637,12 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, - th->dest, &refcounted); + th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -1722,6 +1669,8 @@ process: */ sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1729,8 +1678,6 @@ process: } if (nsk == sk) { reqsk_put(req); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; @@ -1770,8 +1717,7 @@ process: 
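
tcp_v4_rcv() and tcp_v4_early_demux() now feed inet_sdif(skb) into the listener, established and timewait lookups, so a TCP socket bound to an L3 master device is found for segments that ingressed on one of the VRF's slaves; this is the stream counterpart of the raw-socket change earlier in this series. The userspace setup is a sketch of the same bind-to-device idiom (VRF name hypothetical):

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

static int listen_in_vrf(unsigned short port)
{
        static const char dev[] = "vrf-blue";   /* hypothetical VRF device */
        struct sockaddr_in sa = {
                .sin_family = AF_INET,
                .sin_port = htons(port),
                .sin_addr = { .s_addr = htonl(INADDR_ANY) },
        };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, dev, sizeof(dev)) < 0 ||
            bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0 ||
            listen(fd, 128) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}
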
tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); + ret = tcp_v4_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } @@ -1824,7 +1770,8 @@ do_time_wait: __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, - inet_iif(skb)); + inet_iif(skb), + sdif); if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; @@ -1936,9 +1883,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif - /* Clean prequeue, it must be empty really */ - __skb_queue_purge(&tp->ucopy.prequeue); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0ff83c1637d8..188a6f31356d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,7 +445,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - tcp_prequeue_init(newtp); INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 6d650ed3cb59..1ff73982e28c 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -86,7 +86,6 @@ struct tcpnv { * < 0 => less than 1 packet/RTT */ u8 available8; u16 available16; - u32 loss_cwnd; /* cwnd at last loss */ u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ nv_reset:1, /* whether to reset values */ nv_catchup:1; /* whether we are growing because @@ -121,7 +120,6 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); ca->nv_reset = 0; - ca->loss_cwnd = 0; ca->nv_no_cong_cnt = 0; ca->nv_rtt_cnt = 0; ca->nv_last_rtt = 0; @@ -177,19 +175,10 @@ static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcpnv_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct tcpnv *ca = inet_csk_ca(sk); - ca->loss_cwnd = tp->snd_cwnd; return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); } -static u32 tcpnv_undo_cwnd(struct sock *sk) -{ - struct tcpnv *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - static void tcpnv_state(struct sock *sk, u8 new_state) { struct tcpnv *ca = inet_csk_ca(sk); @@ -446,7 +435,7 @@ static struct tcp_congestion_ops tcpnv __read_mostly = { .ssthresh = tcpnv_recalc_ssthresh, .cong_avoid = tcpnv_cong_avoid, .set_state = tcpnv_state, - .undo_cwnd = tcpnv_undo_cwnd, + .undo_cwnd = tcp_reno_undo_cwnd, .pkts_acked = tcpnv_acked, .get_info = tcpnv_get_info, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 886d874775df..5b6690d05abb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2202,9 +2202,10 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; + enum tcp_chrono old = tp->chrono_type; - if (tp->chrono_type > TCP_CHRONO_UNSPEC) - tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; + if (old > TCP_CHRONO_UNSPEC) + tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } @@ -2376,23 +2377,14 @@ bool tcp_schedule_loss_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 timeout, tlp_time_stamp, rto_time_stamp; + u32 timeout, rto_delta_us; - /* No consecutive loss probes. 
*/ - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { - tcp_rearm_rto(sk); - return false; - } /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (tp->fastopen_rsk) return false; - /* TLP is only scheduled when next timer event is RTO. */ - if (icsk->icsk_pending != ICSK_TIME_RETRANS) - return false; - /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ @@ -2419,14 +2411,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) timeout = TCP_TIMEOUT_INIT; } - /* If RTO is shorter, just schedule TLP in its place. */ - tlp_time_stamp = tcp_jiffies32 + timeout; - rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; - if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { - s32 delta = rto_time_stamp - tcp_jiffies32; - if (delta > 0) - timeout = delta; - } + /* If the RTO formula yields an earlier time, then use that time. */ + rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ + if (rto_delta_us > 0) + timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); @@ -3451,6 +3439,10 @@ int tcp_connect(struct sock *sk) int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + tcp_connect_init(sk); if (unlikely(tp->repair)) { diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f6c50af24a64..697f4c67b2e3 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -105,8 +105,9 @@ static inline int tcp_probe_avail(void) * Note: arguments must match tcp_rcv_established()! */ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); @@ -145,7 +146,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, BUG(); } - p->length = skb->len; + p->length = len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; p->snd_cwnd = tp->snd_cwnd; diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index f2123075ce6e..addc122f8818 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,10 +15,6 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -struct scalable { - u32 loss_cwnd; -}; - static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -36,23 +32,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct scalable *ca = inet_csk_ca(sk); - - ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } -static u32 tcp_scalable_cwnd_undo(struct sock *sk) -{ - const struct scalable *ca = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); -} - static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, - .undo_cwnd = tcp_scalable_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_scalable_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c0feeeef962a..655dd8d7f064 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk) /* Called 
with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); sk_mem_reclaim_partial(sk); @@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk) } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - struct sk_buff *skb; - - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - tp->ucopy.memory = 0; - } - if (inet_csk_ack_scheduled(sk)) { if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ @@ -652,7 +640,8 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + if (!sock_flag(sk, SOCK_KEEPOPEN) || + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out; elapsed = keepalive_time_when(tp); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 2417f55374c5..6bb9e14c710a 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -122,14 +122,14 @@ int tcp_set_ulp(struct sock *sk, const char *name) ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) - err = -ENOENT; - else - err = ulp_ops->init(sk); + return -ENOENT; - if (err) - goto out; + err = ulp_ops->init(sk); + if (err) { + module_put(ulp_ops->owner); + return err; + } icsk->icsk_ulp_ops = ulp_ops; - out: - return err; + return 0; } diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 76005d4b8dfc..6fcf482d611b 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -30,7 +30,6 @@ struct veno { u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ u32 inc; /* decide whether to increase cwnd */ u32 diff; /* calculate the diff rate */ - u32 loss_cwnd; /* cwnd when loss occured */ }; /* There are several situations when we must "re-start" Veno: @@ -194,7 +193,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); - veno->loss_cwnd = tp->snd_cwnd; if (veno->diff < beta) /* in "non-congestive state", cut cwnd by 1/5 */ return max(tp->snd_cwnd * 4 / 5, 2U); @@ -203,17 +201,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } -static u32 tcp_veno_cwnd_undo(struct sock *sk) -{ - const struct veno *veno = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd); -} - static struct tcp_congestion_ops tcp_veno __read_mostly = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, - .undo_cwnd = tcp_veno_cwnd_undo, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_veno_cong_avoid, .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index e6ff99c4bd3b..96e829b2e2fc 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -37,7 +37,6 @@ struct yeah { u32 fast_count; u32 pkts_acked; - u32 loss_cwnd; }; static void tcp_yeah_init(struct sock *sk) @@ -220,22 +219,14 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); - yeah->loss_cwnd = tp->snd_cwnd; return max_t(int, tp->snd_cwnd - reduction, 2); } -static u32 tcp_yeah_cwnd_undo(struct sock *sk) -{ - const struct yeah *yeah = inet_csk_ca(sk); - - return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd); -} - static struct tcp_congestion_ops tcp_yeah __read_mostly = { .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, - .undo_cwnd = tcp_yeah_cwnd_undo, + 
.undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_yeah_cong_avoid, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b057653ceca9..f900cdd0fbfb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -380,8 +380,8 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum, int dif, - bool exact_dif) + __be32 daddr, unsigned short hnum, + int dif, int sdif, bool exact_dif) { int score; struct inet_sock *inet; @@ -413,10 +413,15 @@ static int compute_score(struct sock *sk, struct net *net, } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if && dev_match) + score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; return score; @@ -436,10 +441,11 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned int hnum, int dif, bool exact_dif, - struct udp_hslot *hslot2, - struct sk_buff *skb) + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, bool exact_dif, + struct udp_hslot *hslot2, + struct sk_buff *skb) { struct sock *sk, *result; int score, badness, matches = 0, reuseport = 0; @@ -449,7 +455,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, exact_dif); + daddr, hnum, dif, sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -477,8 +483,8 @@ static struct sock *udp4_lib_lookup2(struct net *net, * harder than this. 
-DaveM */ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, - __be16 sport, __be32 daddr, __be16 dport, - int dif, struct udp_table *udptable, struct sk_buff *skb) + __be16 sport, __be32 daddr, __be16 dport, int dif, + int sdif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; unsigned short hnum = ntohs(dport); @@ -496,7 +502,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto begin; result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; @@ -511,7 +517,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto begin; result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } return result; @@ -521,7 +527,7 @@ begin: badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, exact_dif); + daddr, hnum, dif, sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -554,7 +560,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - udptable, skb); + inet_sdif(skb), udptable, skb); } struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, @@ -576,7 +582,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, struct sock *sk; sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, - dif, &udp_table, NULL); + dif, 0, &udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -587,7 +593,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup); static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -597,9 +603,10 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif)) return false; - if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) + if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; return true; } @@ -628,8 +635,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, udptable, - NULL); + iph->saddr, uh->source, skb->dev->ifindex, 0, + udptable, NULL); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -802,7 +809,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check_tx) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; @@ -1163,34 +1170,36 @@ out: return ret; } -#if BITS_PER_LONG == 64 +#define UDP_SKB_IS_STATELESS 0x80000000 + static void udp_set_dev_scratch(struct sk_buff *skb) { - struct udp_dev_scratch *scratch; + struct udp_dev_scratch *scratch = udp_skb_scratch(skb); BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); - scratch 
= (struct udp_dev_scratch *)&skb->dev_scratch; - scratch->truesize = skb->truesize; + scratch->_tsize_state = skb->truesize; +#if BITS_PER_LONG == 64 scratch->len = skb->len; scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); +#endif + /* all head states execept sp (dst, sk, nf) are always cleared by + * udp_rcv() and we need to preserve secpath, if present, to eventually + * process IP_CMSG_PASSSEC at recvmsg() time + */ + if (likely(!skb_sec_path(skb))) + scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } static int udp_skb_truesize(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize; -} -#else -static void udp_set_dev_scratch(struct sk_buff *skb) -{ - skb->dev_scratch = skb->truesize; + return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; } -static int udp_skb_truesize(struct sk_buff *skb) +static bool udp_skb_has_head_state(struct sk_buff *skb) { - return skb->dev_scratch; + return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS); } -#endif /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, int size, int partial, @@ -1388,10 +1397,10 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } - /* we cleared the head states previously only if the skb lacks any IP - * options, see __udp_queue_rcv_skb(). + /* In the more common cases we cleared the head states previously, + * see __udp_queue_rcv_skb(). */ - if (unlikely(IPCB(skb)->opt.optlen > 0)) + if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); consume_stateless_skb(skb); } @@ -1576,7 +1585,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; @@ -1784,13 +1794,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } - /* At recvmsg() time we need skb->dst to process IP options-related - * cmsg, elsewhere can we clear all pending head states while they are - * hot in the cache - */ - if (likely(IPCB(skb)->opt.optlen == 0)) - skb_release_head_state(skb); - rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); @@ -1930,15 +1933,18 @@ drop: /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ -static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = xchg(&sk->sk_rx_dst, dst); dst_release(old); + return old != dst; } + return false; } +EXPORT_SYMBOL(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. 
@@ -1957,6 +1963,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); unsigned int offset = offsetof(typeof(*sk), sk_node); int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -1971,7 +1978,7 @@ start_lookup: sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, hnum)) continue; if (!first) { @@ -2161,7 +2168,7 @@ drop: static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif) + int dif, int sdif) { struct sock *sk, *result; unsigned short hnum = ntohs(loc_port); @@ -2175,7 +2182,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, result = NULL; sk_for_each_rcu(sk, &hslot->head) { if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, - rmt_port, rmt_addr, dif, hnum)) { + rmt_port, rmt_addr, dif, sdif, hnum)) { if (result) return NULL; result = sk; @@ -2192,7 +2199,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif) + int dif, int sdif) { unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); @@ -2204,7 +2211,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (INET_MATCH(sk, net, acookie, rmt_addr, - loc_addr, ports, dif)) + loc_addr, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; @@ -2220,6 +2227,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); int ours; /* validate the packet */ @@ -2245,10 +2253,11 @@ void udp_v4_early_demux(struct sk_buff *skb) } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, - uh->source, iph->saddr, dif); + uh->source, iph->saddr, + dif, sdif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, - uh->source, iph->saddr, dif); + uh->source, iph->saddr, dif, sdif); } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 4515836d2a3a..d0390d844ac8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -45,7 +45,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, @@ -53,7 +53,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -182,7 +182,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == 
AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && @@ -190,7 +190,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, @@ -198,7 +198,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); } #endif else { diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 58bd39fb14b4..6539ff15e9a3 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -82,7 +82,8 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!dev->netdev_ops->ndo_udp_tunnel_add) + if (!dev->netdev_ops->ndo_udp_tunnel_add || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; ti.type = type; @@ -93,6 +94,24 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); +void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + if (!dev->netdev_ops->ndo_udp_tunnel_del || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); + /* Notify netdevs that UDP port started listening */ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) { @@ -109,6 +128,8 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) for_each_netdev_rcu(net, dev) { if (!dev->netdev_ops->ndo_udp_tunnel_add) continue; + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + continue; dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); } rcu_read_unlock(); @@ -131,6 +152,8 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) for_each_netdev_rcu(net, dev) { if (!dev->netdev_ops->ndo_udp_tunnel_del) continue; + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + continue; dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); } rcu_read_unlock(); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 4aefb149fe0a..d7bf0b041885 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -20,7 +20,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct rtable *rt; @@ -28,6 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); + fl4->flowi4_mark = mark; if (saddr) fl4->saddr = saddr->a4; @@ -42,20 +44,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr); + return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark); } static int 
xfrm4_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH;
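
Note on the udp_set_dev_scratch()/udp_skb_truesize() hunk above: the new UDP_SKB_IS_STATELESS bit (0x80000000) is packed into the same 32-bit scratch word as the skb truesize, presumably safe because truesize stays well below 2^31. The following is a minimal user-space sketch of that bit-packing, not kernel code; every name ending in _demo is hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define UDP_SKB_IS_STATELESS 0x80000000u

/* stand-in for the per-skb dev_scratch word used by the real code */
static uint32_t tsize_state_demo;

static void set_scratch_demo(uint32_t truesize, bool has_secpath)
{
	tsize_state_demo = truesize;
	/* only skbs without a secpath may drop their head state early */
	if (!has_secpath)
		tsize_state_demo |= UDP_SKB_IS_STATELESS;
}

static uint32_t scratch_truesize_demo(void)
{
	/* mask the flag bit back out to recover the stored truesize */
	return tsize_state_demo & ~UDP_SKB_IS_STATELESS;
}

static bool scratch_has_head_state_demo(void)
{
	return !(tsize_state_demo & UDP_SKB_IS_STATELESS);
}

int main(void)
{
	set_scratch_demo(2048, false);
	printf("truesize=%u head_state=%d\n",
	       (unsigned)scratch_truesize_demo(),
	       scratch_has_head_state_demo());
	return 0;
}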
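
Note on the tcp_schedule_loss_probe() hunk above: the probe delay is now simply capped by the time remaining until the already-armed RTO, so a loss probe can never fire later than the retransmission timer would. A small sketch of that clamp under assumed, hypothetical user-space names (this is not the kernel helper itself):

#include <stdint.h>
#include <stdio.h>

static uint32_t min_u32(uint32_t a, uint32_t b)
{
	return a < b ? a : b;
}

/* tlp_timeout: the 2*SRTT-derived probe delay;
 * rto_delta: time left until the pending RTO fires (<= 0: already due).
 */
static uint32_t clamp_probe_timeout_demo(uint32_t tlp_timeout, int64_t rto_delta)
{
	if (rto_delta > 0)
		tlp_timeout = min_u32(tlp_timeout, (uint32_t)rto_delta);
	return tlp_timeout;
}

int main(void)
{
	/* probe wanted in 40 ticks, but the RTO is only 25 ticks away */
	printf("%u\n", (unsigned)clamp_probe_timeout_demo(40, 25));	/* 25 */
	/* RTO is 100 ticks away, so the probe delay wins */
	printf("%u\n", (unsigned)clamp_probe_timeout_demo(40, 100));	/* 40 */
	return 0;
}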
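
Note on the udp_sk_rx_dst_set() hunk above: the cached receive route is swapped in with an unconditional exchange, and the function now reports whether the cache actually changed. Below is a user-space analogue of that pattern using C11 atomics; dst_demo and the other names are hypothetical stand-ins, not kernel APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct dst_demo { int id; };

static _Atomic(struct dst_demo *) cached_dst_demo;

static bool rx_dst_set_demo(struct dst_demo *dst)
{
	struct dst_demo *old;

	/* unconditional swap, analogous to xchg() in the kernel code */
	old = atomic_exchange(&cached_dst_demo, dst);
	/* the caller would release its reference on 'old' here */
	return old != dst;	/* true if the cached entry changed */
}

int main(void)
{
	struct dst_demo a = { 1 }, b = { 2 };

	printf("%d\n", rx_dst_set_demo(&a));	/* 1: cache changed */
	printf("%d\n", rx_dst_set_demo(&a));	/* 0: same dst again */
	printf("%d\n", rx_dst_set_demo(&b));	/* 1: changed again */
	return 0;
}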
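
Note on the sdif plumbing through the UDP lookup hunks above (compute_score(), __udp4_lib_lookup(), the mcast helpers): with an L3 master device, a socket bound to the VRF master (sdif) must match as well as one bound to the ingress device (dif). A simplified sketch of just that device check, with hypothetical names and without the scoring details:

#include <stdbool.h>
#include <stdio.h>

static bool bound_dev_match_demo(int bound_dev_if, int dif, int sdif)
{
	if (!bound_dev_if)	/* socket not bound to any device */
		return true;
	/* bound socket matches the ingress device or its VRF master */
	return bound_dev_if == dif || bound_dev_if == sdif;
}

int main(void)
{
	/* socket bound to VRF master 10, packet arrived on slave dev 3 */
	printf("%d\n", bound_dev_match_demo(10, 3, 10));	/* 1 */
	/* socket bound to device 7, packet from an unrelated device */
	printf("%d\n", bound_dev_match_demo(7, 3, 10));		/* 0 */
	return 0;
}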