diff options
Diffstat (limited to 'net/ipv4')
72 files changed, 1531 insertions, 1328 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 89aacb630a53..c29809f765dc 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ + tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1d0c3adb6f34..5c5db6636704 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,7 +119,7 @@ #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif -#include <net/vrf.h> +#include <net/l3mdev.h> /* The inetsw table contains everything that inet_create needs to @@ -219,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog) * shutdown() (rather than close()). */ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && - !inet_csk(sk)->icsk_accept_queue.fastopenq) { + !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) - err = fastopen_init_queue(sk, backlog); + fastopen_queue_tune(sk, backlog); else if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2) != 0) - err = fastopen_init_queue(sk, + fastopen_queue_tune(sk, ((uint)sysctl_tcp_fastopen) >> 16); - else - err = 0; - if (err) - goto out; tcp_fastopen_init_key_once(true); } @@ -261,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ @@ -450,7 +449,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - tb_id = vrf_dev_table_ifindex(net, sk->sk_bound_dev_if) ? : tb_id; + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too @@ -1043,22 +1042,16 @@ void inet_register_protosw(struct inet_protosw *p) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ - answer = NULL; last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); - /* Check only the non-wild match. */ - if (INET_PROTOSW_PERMANENT & answer->flags) { - if (protocol == answer->protocol) - break; - last_perm = lh; - } - - answer = NULL; + if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) + break; + if (protocol == answer->protocol) + goto out_permanent; + last_perm = lh; } - if (answer) - goto out_permanent; /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0c9c3482e419..59b3e0e8fd51 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -624,14 +624,20 @@ out: } EXPORT_SYMBOL(arp_create); +static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} + /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ - NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb, - NULL, skb->dev, dev_queue_xmit_sk); + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, + dev_net(skb->dev), NULL, skb, NULL, skb->dev, + arp_xmit_finish); } EXPORT_SYMBOL(arp_xmit); @@ -639,7 +645,7 @@ EXPORT_SYMBOL(arp_xmit); * Process an arp request. */ -static int arp_process(struct sock *sk, struct sk_buff *skb) +static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -651,7 +657,6 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) u16 dev_type = dev->type; int addr_type; struct neighbour *n; - struct net *net = dev_net(dev); struct dst_entry *reply_dst = NULL; bool is_garp = false; @@ -872,7 +877,7 @@ out_free_dst: static void parp_redo(struct sk_buff *skb) { - arp_process(NULL, skb); + arp_process(dev_net(skb->dev), NULL, skb); } @@ -905,8 +910,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb, - dev, NULL, arp_process); + return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, + dev_net(dev), NULL, skb, dev, NULL, + arp_process); consumeskb: consume_skb(skb); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 2d9cb1748f81..cebd9d31e65a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1644,7 +1644,8 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } -static size_t inet_get_link_af_size(const struct net_device *dev) +static size_t inet_get_link_af_size(const struct net_device *dev, + u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); @@ -1654,7 +1655,8 @@ static size_t inet_get_link_af_size(const struct net_device *dev) return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } -static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, + u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; @@ -2397,4 +2399,3 @@ void __init devinet_init(void) rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, inet_netconf_dump_devconf, NULL); } - diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 457b2cd75b85..473447593060 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -45,7 +45,7 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/xfrm.h> -#include <net/vrf.h> +#include <net/l3mdev.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -255,7 +255,7 @@ EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } @@ -268,7 +268,7 @@ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { - u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } @@ -332,7 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = vrf_master_ifindex_rcu(dev); + fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev); if (!fl4.flowi4_iif) fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; @@ -367,7 +367,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (nh->nh_dev == dev) { dev_match = true; break; - } else if (vrf_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { dev_match = true; break; } @@ -804,7 +804,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); - u32 tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev); + u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -867,9 +867,10 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { - fib_magic(RTM_NEWROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - prefix, ifa->ifa_prefixlen, prim); + if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) + fib_magic(RTM_NEWROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + prefix, ifa->ifa_prefixlen, prim); /* Add network specific broadcasts, when it takes a sense */ if (ifa->ifa_prefixlen < 31) { @@ -914,9 +915,10 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { - fib_magic(RTM_DELROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); + if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); subnet = 1; } @@ -1153,6 +1155,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_changeupper_info *info; struct in_device *in_dev; struct net *net = dev_net(dev); unsigned int flags; @@ -1191,6 +1194,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo case NETDEV_CHANGEMTU: rt_cache_flush(net); break; + case NETDEV_CHANGEUPPER: + info = ptr; + /* flush all routes if dev is linked to or unlinked from + * an L3 master device (e.g., VRF) + */ + if (info->upper_dev && netif_is_l3_master(info->upper_dev)) + fib_disable_ip(dev, NETDEV_DOWN, true); + break; } return NOTIFY_DONE; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e966f8599b4a..d97268e8ff10 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -57,8 +57,7 @@ static unsigned int fib_info_cnt; static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH - -static DEFINE_SPINLOCK(fib_multipath_lock); +u32 fib_multipath_secret __read_mostly; #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -532,7 +531,67 @@ errout: return ret; } -#endif +static void fib_rebalance(struct fib_info *fi) +{ + int total; + int w; + struct in_device *in_dev; + + if (fi->fib_nhs < 2) + return; + + total = 0; + for_nexthops(fi) { + if (nh->nh_flags & RTNH_F_DEAD) + continue; + + in_dev = __in_dev_get_rtnl(nh->nh_dev); + + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN) + continue; + + total += nh->nh_weight; + } endfor_nexthops(fi); + + w = 0; + change_nexthops(fi) { + int upper_bound; + + in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev); + + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + upper_bound = -1; + } else if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { + upper_bound = -1; + } else { + w += nexthop_nh->nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, + total) - 1; + } + + atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); + } endfor_nexthops(fi); + + net_get_random_once(&fib_multipath_secret, + sizeof(fib_multipath_secret)); +} + +static inline void fib_add_weight(struct fib_info *fi, + const struct fib_nh *nh) +{ + fi->fib_weight += nh->nh_weight; +} + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define fib_rebalance(fi) do { } while (0) +#define fib_add_weight(fi, nh) do { } while (0) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, @@ -864,14 +923,21 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fib_prefsrc != cfg->fc_dst) { u32 tb_id = cfg->fc_table; + int rc; if (tb_id == RT_TABLE_MAIN) tb_id = RT_TABLE_LOCAL; - if (inet_addr_type_table(cfg->fc_nlinfo.nl_net, - fib_prefsrc, tb_id) != RTN_LOCAL) { - return false; + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, tb_id); + + if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, RT_TABLE_LOCAL); } + + if (rc != RTN_LOCAL) + return false; } return true; } @@ -1094,8 +1160,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) + fib_rebalance(fi); + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -1322,12 +1391,6 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; break; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nexthop_nh->nh_power; - nexthop_nh->nh_power = 0; - spin_unlock_bh(&fib_multipath_lock); -#endif dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1350,6 +1413,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) } ret++; } + + fib_rebalance(fi); } return ret; @@ -1479,20 +1544,15 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) !__in_dev_get_rtnl(dev)) continue; alive++; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~nh_flags; - spin_unlock_bh(&fib_multipath_lock); -#else nexthop_nh->nh_flags &= ~nh_flags; -#endif } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~nh_flags; ret++; } + + fib_rebalance(fi); } return ret; @@ -1500,62 +1560,41 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) #ifdef CONFIG_IP_ROUTE_MULTIPATH -/* - * The algorithm is suboptimal, but it provides really - * fair weighted route distribution. - */ -void fib_select_multipath(struct fib_result *res) +void fib_select_multipath(struct fib_result *res, int hash) { struct fib_info *fi = res->fi; - struct in_device *in_dev; - int w; - - spin_lock_bh(&fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); - if (nexthop_nh->nh_flags & RTNH_F_DEAD) - continue; - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - nexthop_nh->nh_flags & RTNH_F_LINKDOWN) - continue; - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } endfor_nexthops(fi); - fi->fib_power = power; - if (power <= 0) { - spin_unlock_bh(&fib_multipath_lock); - /* Race condition: route has just become dead. */ - res->nh_sel = 0; - return; - } - } - - /* w should be random number [0..fi->fib_power-1], - * it is pretty bad approximation. - */ - - w = jiffies % fi->fib_power; + for_nexthops(fi) { + if (hash > atomic_read(&nh->nh_upper_bound)) + continue; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && - nexthop_nh->nh_power) { - w -= nexthop_nh->nh_power; - if (w <= 0) { - nexthop_nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&fib_multipath_lock); - return; - } - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); /* Race condition: route has just become dead. */ res->nh_sel = 0; - spin_unlock_bh(&fib_multipath_lock); } #endif + +void fib_select_path(struct net *net, struct fib_result *res, + struct flowi4 *fl4, int mp_hash) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (mp_hash < 0) + mp_hash = get_hash_from_flowi4(fl4) >> 1; + + fib_select_multipath(res, mp_hash); + } + else +#endif + if (!res->prefixlen && + res->table->tb_num_default > 1 && + res->type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(fl4, res); + + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, *res); +} +EXPORT_SYMBOL_GPL(fib_select_path); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index e0fcbbbcfe54..bd903fe0f750 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -24,6 +24,7 @@ struct fou { u16 type; struct udp_offload udp_offloads; struct list_head list; + struct rcu_head rcu; }; #define FOU_F_REMCSUM_NOPARTIAL BIT(0) @@ -417,7 +418,7 @@ static void fou_release(struct fou *fou) list_del(&fou->list); udp_tunnel_sock_release(sock); - kfree(fou); + kfree_rcu(fou, rcu); } static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e5eb8ac4089d..36e26977c908 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -96,7 +96,7 @@ #include <net/xfrm.h> #include <net/inet_common.h> #include <net/ip_fib.h> -#include <net/vrf.h> +#include <net/l3mdev.h> /* * Build xmit assembly blocks @@ -309,7 +309,7 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, rc = false; if (icmp_global_allow()) { - int vif = vrf_master_ifindex(dst->dev); + int vif = l3mdev_master_ifindex(dst->dev); struct inet_peer *peer; peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; - fl4.flowi4_oif = vrf_master_ifindex(skb->dev); + fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -440,6 +440,22 @@ out_unlock: icmp_xmit_unlock(sk); } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* Source and destination is swapped. See ip_multipath_icmp_hash */ +static int icmp_multipath_hash_skb(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + return fib_multipath_hash(iph->daddr, iph->saddr); +} + +#else + +#define icmp_multipath_hash_skb(skb) (-1) + +#endif + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -461,10 +477,11 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev); + fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key_hash(net, fl4, + icmp_multipath_hash_skb(skb_in)); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d38b8b61eaee..05e4cba14162 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb) pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return ip_local_out(skb); + return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -739,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(unsigned long data) @@ -2126,7 +2126,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); - if (!in_dev) { + if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } @@ -2147,7 +2147,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - ip_mc_dec_group(in_dev, group); + if (in_dev) + ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); @@ -2392,11 +2393,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct ip_sf_socklist *psl; struct net *net = sock_net(sk); + ASSERT_RTNL(); + if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); - imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; @@ -2417,7 +2418,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, goto done; msf->imsf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); - rtnl_unlock(); if (!psl) { len = 0; count = 0; @@ -2436,7 +2436,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, return -EFAULT; return 0; done: - rtnl_unlock(); return err; } @@ -2450,6 +2449,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; + ASSERT_RTNL(); + psin = (struct sockaddr_in *)&gsf->gf_group; if (psin->sin_family != AF_INET) return -EINVAL; @@ -2457,8 +2458,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); - err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { @@ -2470,7 +2469,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, goto done; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); - rtnl_unlock(); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; @@ -2490,7 +2488,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, } return 0; done: - rtnl_unlock(); return err; } @@ -2569,7 +2566,7 @@ void ip_mc_drop_socket(struct sock *sk) } /* called with rcu_read_lock() */ -int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 61b45a17fc73..46b9c887bede 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -330,14 +330,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - req = reqsk_queue_remove(queue); + req = reqsk_queue_remove(queue, sk); newsk = req->sk; - sk_acceptq_removed(sk); if (sk->sk_protocol == IPPROTO_TCP && - tcp_rsk(req)->tfo_listener && - queue->fastopenq) { - spin_lock_bh(&queue->fastopenq->lock); + tcp_rsk(req)->tfo_listener) { + spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to @@ -348,7 +346,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) req->sk = NULL; req = NULL; } - spin_unlock_bh(&queue->fastopenq->lock); + spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); @@ -408,7 +406,7 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) } EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -struct dst_entry *inet_csk_route_req(struct sock *sk, +struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { @@ -439,7 +437,7 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_req); -struct dst_entry *inet_csk_route_child_sock(struct sock *sk, +struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { @@ -478,65 +476,12 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); -} - #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) true #endif -/* Note: this is temporary : - * req sock will no longer be in listener hash table -*/ -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet_csk_search_req); - -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); -} -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); - /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; @@ -563,7 +508,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, req->num_timeout >= rskq_defer_accept - 1; } -int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) +int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { int err = req->rsk_ops->rtx_syn_ack(parent, req); @@ -573,27 +518,20 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -/* return true if req was found in the syn_table[] */ +/* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct request_sock **prev; - struct listen_sock *lopt; + struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; bool found = false; - spin_lock(&queue->syn_wait_lock); - lopt = queue->listen_opt; - if (lopt) { - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; - } - } + if (sk_hashed(req_to_sk(req))) { + spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); + + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + spin_unlock(lock); } - spin_unlock(&queue->syn_wait_lock); if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; @@ -608,21 +546,25 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); +void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) +{ + inet_csk_reqsk_queue_drop(sk, req); + reqsk_put(req); +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); + static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct listen_sock *lopt = queue->listen_opt; int qlen, expire = 0, resend = 0; int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) { - reqsk_put(req); - return; - } + if (sk_state_load(sk_listener) != TCP_LISTEN) + goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; thresh = max_retries; @@ -643,9 +585,9 @@ static void reqsk_timer_handler(unsigned long data) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ - qlen = listen_sock_qlen(lopt); - if (qlen >> (lopt->max_qlen_log - 1)) { - int young = listen_sock_young(lopt) << 1; + qlen = reqsk_queue_len(queue); + if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { if (qlen < young) @@ -667,41 +609,40 @@ static void reqsk_timer_handler(unsigned long data) unsigned long timeo; if (req->num_timeout++ == 0) - atomic_inc(&lopt->young_dec); + atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; } - inet_csk_reqsk_queue_drop(sk_listener, req); - reqsk_put(req); +drop: + inet_csk_reqsk_queue_drop_and_put(sk_listener, req); } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout) +static void reqsk_queue_hash_req(struct request_sock *req, + unsigned long timeout) { - struct listen_sock *lopt = queue->listen_opt; - req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); mod_timer_pinned(&req->rsk_timer, jiffies + timeout); - req->rsk_hash = hash; + inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); - atomic_set(&req->rsk_refcnt, 2); + atomic_set(&req->rsk_refcnt, 2 + 1); +} - spin_lock(&queue->syn_wait_lock); - req->dl_next = lopt->syn_table[hash]; - lopt->syn_table[hash] = req; - spin_unlock(&queue->syn_wait_lock); +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + reqsk_queue_hash_req(req, timeout); + inet_csk_reqsk_queue_added(sk); } -EXPORT_SYMBOL(reqsk_queue_hash_req); +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone @@ -792,16 +733,14 @@ void inet_csk_prepare_forced_close(struct sock *sk) } EXPORT_SYMBOL(inet_csk_prepare_forced_close); -int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +int inet_csk_listen_start(struct sock *sk, int backlog) { - struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + struct inet_sock *inet = inet_sk(sk); - if (rc != 0) - return rc; + reqsk_queue_alloc(&icsk->icsk_accept_queue); - sk->sk_max_ack_backlog = 0; + sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); @@ -810,7 +749,7 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk->sk_state = TCP_LISTEN; + sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); @@ -821,11 +760,76 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) } sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); +static void inet_child_forget(struct sock *sk, struct request_sock *req, + struct sock *child) +{ + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + percpu_counter_inc(sk->sk_prot->orphan_count); + + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { + BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(sk != req->rsk_listener); + + /* Paranoid, to prevent race condition if + * an inbound pkt destined for child is + * blocked by sock lock in tcp_v4_rcv(). + * Also to satisfy an assertion in + * tcp_v4_destroy_sock(). + */ + tcp_sk(child)->fastopen_rsk = NULL; + } + inet_csk_destroy_sock(child); + reqsk_put(req); +} + +void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, + struct sock *child) +{ + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + + spin_lock(&queue->rskq_lock); + if (unlikely(sk->sk_state != TCP_LISTEN)) { + inet_child_forget(sk, req, child); + } else { + req->sk = child; + req->dl_next = NULL; + if (queue->rskq_accept_head == NULL) + queue->rskq_accept_head = req; + else + queue->rskq_accept_tail->dl_next = req; + queue->rskq_accept_tail = req; + sk_acceptq_added(sk); + } + spin_unlock(&queue->rskq_lock); +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_add); + +struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, + struct request_sock *req, bool own_req) +{ + if (own_req) { + inet_csk_reqsk_queue_drop(sk, req); + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + inet_csk_reqsk_queue_add(sk, req, child); + /* Warning: caller must not call reqsk_put(req); + * child stole last reference on it. + */ + return child; + } + /* Too bad, another child took ownership of the request, undo. */ + bh_unlock_sock(child); + sock_put(child); + return NULL; +} +EXPORT_SYMBOL(inet_csk_complete_hashdance); + /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. @@ -834,11 +838,7 @@ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct request_sock *acc_req; - struct request_sock *req; - - /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(queue); + struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -848,57 +848,34 @@ void inet_csk_listen_stop(struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(queue); - - while ((req = acc_req) != NULL) { + while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk; - acc_req = req->dl_next; - local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); - sk->sk_prot->disconnect(child, O_NONBLOCK); - - sock_orphan(child); - - percpu_counter_inc(sk->sk_prot->orphan_count); - - if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { - BUG_ON(tcp_sk(child)->fastopen_rsk != req); - BUG_ON(sk != req->rsk_listener); - - /* Paranoid, to prevent race condition if - * an inbound pkt destined for child is - * blocked by sock lock in tcp_v4_rcv(). - * Also to satisfy an assertion in - * tcp_v4_destroy_sock(). - */ - tcp_sk(child)->fastopen_rsk = NULL; - } - inet_csk_destroy_sock(child); - + inet_child_forget(sk, req, child); bh_unlock_sock(child); local_bh_enable(); sock_put(child); - sk_acceptq_removed(sk); - reqsk_put(req); + cond_resched(); } - if (queue->fastopenq) { + if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ - spin_lock_bh(&queue->fastopenq->lock); - acc_req = queue->fastopenq->rskq_rst_head; - queue->fastopenq->rskq_rst_head = NULL; - spin_unlock_bh(&queue->fastopenq->lock); - while ((req = acc_req) != NULL) { - acc_req = req->dl_next; + spin_lock_bh(&queue->fastopenq.lock); + req = queue->fastopenq.rskq_rst_head; + queue->fastopenq.rskq_rst_head = NULL; + spin_unlock_bh(&queue->fastopenq.lock); + while (req != NULL) { + next = req->dl_next; reqsk_put(req); + req = next; } } - WARN_ON(sk->sk_ack_backlog); + WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c3b1f3a0f4cf..ab9f8a66615d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -730,91 +730,21 @@ static void twsk_build_assert(void) #endif } -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_entry entry; - int j, s_j, reqnum, s_reqnum; - struct listen_sock *lopt; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !listen_sock_qlen(lopt)) - goto out; - - if (bc) { - entry.sport = inet->inet_num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, req_to_sk(req)); - entry.dport = ntohs(ireq->ir_rmt_port); - - if (!inet_diag_bc_run(bc, &entry)) - continue; - } - - err = inet_req_diag_fill(req_to_sk(req), skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; + u32 idiag_states = r->idiag_states; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & TCPF_LISTEN)) goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { @@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || + if (r->id.idiag_dport || cb->args[3] > 0) - goto syn_recv; - - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { - spin_unlock_bh(&ilb->lock); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -879,7 +799,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { @@ -906,7 +826,7 @@ skip_listen_ht: goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; - if (!(r->idiag_states & (1 << state))) + if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index d0a7c0319e3d..fe144dae7372 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -209,12 +209,6 @@ int inet_frags_init(struct inet_frags *f) } EXPORT_SYMBOL(inet_frags_init); -void inet_frags_init_net(struct netns_frags *nf) -{ - init_frag_mem_limit(nf); -} -EXPORT_SYMBOL(inet_frags_init_net); - void inet_frags_fini(struct inet_frags *f) { cancel_work_sync(&f->frags_work); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 89120196a949..ccc5980797fc 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -126,7 +126,7 @@ void inet_put_port(struct sock *sk) } EXPORT_SYMBOL(inet_put_port); -int __inet_inherit_port(struct sock *sk, struct sock *child) +int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; unsigned short port = inet_sk(child)->inet_num; @@ -137,6 +137,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; + if (unlikely(!tb)) { + spin_unlock(&head->lock); + return -ENOENT; + } if (tb->port != port) { /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption @@ -185,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; } return score; } @@ -398,14 +404,18 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +/* insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT + */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; + bool ret = true; - WARN_ON(!sk_unhashed(sk)); + WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); @@ -413,24 +423,41 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); - __sk_nulls_add_node_rcu(sk, list); if (osk) { - WARN_ON(sk->sk_hash != osk->sk_hash); - sk_nulls_del_node_init_rcu(osk); + WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); + ret = sk_nulls_del_node_init_rcu(osk); } + if (ret) + __sk_nulls_add_node_rcu(sk, list); spin_unlock(lock); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return ret; +} + +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +{ + bool ok = inet_ehash_insert(sk, osk); + + if (ok) { + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + } else { + percpu_counter_inc(sk->sk_prot->orphan_count); + sk->sk_state = TCP_CLOSE; + sock_set_flag(sk, SOCK_DEAD); + inet_csk_destroy_sock(sk); + } + return ok; } -EXPORT_SYMBOL_GPL(__inet_hash_nolisten); +EXPORT_SYMBOL_GPL(inet_ehash_nolisten); void __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; - if (sk->sk_state != TCP_LISTEN) - return __inet_hash_nolisten(sk, osk); - + if (sk->sk_state != TCP_LISTEN) { + inet_ehash_nolisten(sk, osk); + return; + } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -551,7 +578,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - __inet_hash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw); } if (tw) inet_twsk_bind_unhash(tw, hinfo); @@ -568,7 +595,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 2d3aa408fbdc..da0d7ce85844 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -61,18 +61,18 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) } -static int ip_forward_finish(struct sock *sk, struct sk_buff *skb) +static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); skb_sender_cpu_clear(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } int ip_forward(struct sk_buff *skb) @@ -81,6 +81,7 @@ int ip_forward(struct sk_buff *skb) struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); + struct net *net; /* that should never happen */ if (skb->pkt_type != PACKET_HOST) @@ -99,6 +100,7 @@ int ip_forward(struct sk_buff *skb) return NET_RX_SUCCESS; skb_forward_csum(skb); + net = dev_net(skb->dev); /* * According to the RFC, we must first decrease the TTL field. If @@ -119,7 +121,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { - IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto drop; @@ -143,8 +145,9 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, - skb->dev, rt->dst.dev, ip_forward_finish); + return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, + net, NULL, skb, skb->dev, rt->dst.dev, + ip_forward_finish); sr_failed: /* @@ -155,7 +158,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fa7f15305f9a..1fe55ae81781 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -48,7 +48,7 @@ #include <linux/inet.h> #include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> -#include <net/vrf.h> +#include <net/l3mdev.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -78,7 +78,7 @@ struct ipq { u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* VRF device index */ + int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -654,11 +654,10 @@ out_fail: } /* Process an incoming IP datagram fragment. */ -int ip_defrag(struct sk_buff *skb, u32 user) +int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; - int vif = vrf_master_ifindex_rcu(dev); - struct net *net = dev_net(dev); + int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); @@ -683,7 +682,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_defrag); -struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; @@ -712,7 +711,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - if (ip_defrag(skb, user)) + if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } @@ -840,6 +839,8 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -863,9 +864,13 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + res = inet_frags_init_net(&net->ipv4.frags); + if (res) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res) + inet_frags_uninit_net(&net->ipv4.frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f4fc8a77aaa7..b1209b63381f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -157,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; @@ -167,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb) if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), dev_net(dev))) { + net_eq(sock_net(sk), net)) { if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) + if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { @@ -188,10 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb) +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); - __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); @@ -248,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb) /* * Reassemble IP fragments. */ + struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) + if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } - return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, + net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } @@ -311,7 +311,7 @@ drop: int sysctl_ip_early_demux __read_mostly = 1; EXPORT_SYMBOL(sysctl_ip_early_demux); -static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; @@ -337,8 +337,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); + NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER); goto drop; } } @@ -359,11 +358,9 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, - skb->len); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, - skb->len); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); @@ -378,6 +375,7 @@ drop: int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct iphdr *iph; + struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -387,11 +385,12 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); + net = dev_net(dev); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto out; } @@ -417,7 +416,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); - IP_ADD_STATS_BH(dev_net(dev), + IP_ADD_STATS_BH(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); @@ -431,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -441,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -453,14 +452,14 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, - dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, ip_rcv_finish); csum_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0138fada0951..4233cbe47052 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,9 +83,10 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); -static int ip_fragment(struct sock *sk, struct sk_buff *skb, - unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)); +static int +ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct net *, struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) @@ -95,32 +96,28 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL, - skb_dst(skb)->dev, dst_output_sk); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb_dst(skb)->dev, + dst_output); } -int __ip_local_out(struct sk_buff *skb) -{ - return __ip_local_out_sk(skb->sk, skb); -} - -int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; - err = __ip_local_out(skb); + err = __ip_local_out(net, sk, skb); if (likely(err == 1)) - err = dst_output_sk(sk, skb); + err = dst_output(net, sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out_sk); +EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -135,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * Add an ip header to a skbuff and send it out. * */ -int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, +int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. */ @@ -149,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->dst)) - iph->frag_off = htons(IP_DF); - else - iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(sock_net(sk), skb, sk); + if (ip_dont_fragment(sk, &rt->dst)) { + iph->frag_off = htons(IP_DF); + iph->id = 0; + } else { + iph->frag_off = 0; + __ip_select_ident(net, iph, 1); + } if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -168,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, skb->mark = sk->sk_mark; /* Send it out. */ - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; @@ -182,9 +182,9 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -220,8 +220,8 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, - unsigned int mtu) +static int ip_finish_output_gso(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -230,7 +230,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || skb_gso_network_seglen(skb) <= mtu) - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. * @@ -253,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, int err; segs->next = NULL; - err = ip_fragment(sk, segs, mtu, ip_finish_output2); + err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, return ret; } -static int ip_finish_output(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; @@ -271,20 +271,20 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb, mtu); + return ip_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) - return ip_fragment(sk, skb, mtu, ip_finish_output2); + return ip_fragment(net, sk, skb, mtu, ip_finish_output2); - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); } -int ip_mc_output(struct sock *sk, struct sk_buff *skb) +int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -292,7 +292,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -320,7 +320,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, - sk, newskb, NULL, newskb->dev, + net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); } @@ -335,26 +335,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb, - NULL, newskb->dev, dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, newskb, NULL, newskb->dev, + dev_loopback_xmit); } - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL, - skb->dev, ip_finish_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb->dev, + ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sock *sk, struct sk_buff *skb) +int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -377,6 +379,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; @@ -407,7 +410,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), fl4, sk, + rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, @@ -444,20 +447,20 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_segs(sock_net(sk), skb, sk, + ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - res = ip_local_out(skb); + res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; no_route: rcu_read_unlock(); - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -486,29 +489,26 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -static int ip_fragment(struct sock *sk, struct sk_buff *skb, +static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)) + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); if ((iph->frag_off & htons(IP_DF)) == 0) - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); } /* @@ -518,8 +518,8 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb, * single device frame, and queue such a frame for sending. */ -int ip_do_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -533,6 +533,11 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, dev = rt->dst.dev; + /* for offloaded checksums cleanup checksum before fragmentation */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto fail; + /* * Point into the IP datagram header. */ @@ -621,10 +626,10 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, ip_send_check(iph); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -634,7 +639,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, } if (err == 0) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } @@ -643,7 +648,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, kfree_skb(frag); frag = skb; } - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: @@ -657,9 +662,6 @@ slow_path_clean: } slow_path: - /* for offloaded checksums cleanup checksum before fragmentation */ - if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) - goto fail; iph = ip_hdr(skb); left = skb->len - hlen; /* Space per frame */ @@ -761,19 +763,19 @@ slow_path: ip_send_check(iph); - err = output(sk, skb2); + err = output(net, sk, skb2); if (err) goto fail; - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } consume_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; } EXPORT_SYMBOL(ip_do_fragment); @@ -911,6 +913,7 @@ static int __ip_append_data(struct sock *sk, if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & NETIF_F_V4_CSUM && + !(flags & MSG_MORE) && !exthdrlen) csummode = CHECKSUM_PARTIAL; @@ -1434,7 +1437,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; - err = ip_local_out(skb); + err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); @@ -1561,7 +1564,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - if (!oif && netif_index_is_vrf(net, skb->skb_iif)) + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; flowi4_init_output(&fl4, oif, @@ -1596,7 +1599,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c3c359ad66e3..5f73a7c03e27 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1251,11 +1251,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt); * the _received_ ones. The set sets the _sent_ ones. */ +static bool getsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IP_MSFILTER: + case MCAST_MSFILTER: + return true; + } + return false; +} + static int do_ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); - int val; + bool needs_rtnl = getsockopt_needs_rtnl(optname); + int val, err = 0; int len; if (level != SOL_IP) @@ -1269,6 +1280,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (len < 0) return -EINVAL; + if (needs_rtnl) + rtnl_lock(); lock_sock(sk); switch (optname) { @@ -1386,39 +1399,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MSFILTER: { struct ip_msfilter msf; - int err; if (len < IP_MSFILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; + err = -EINVAL; + goto out; } if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; + err = -EFAULT; + goto out; } err = ip_mc_msfget(sk, &msf, (struct ip_msfilter __user *)optval, optlen); - release_sock(sk); - return err; + goto out; } case MCAST_MSFILTER: { struct group_filter gsf; - int err; if (len < GROUP_FILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; + err = -EINVAL; + goto out; } if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; + err = -EFAULT; + goto out; } err = ip_mc_gsfget(sk, &gsf, (struct group_filter __user *)optval, optlen); - release_sock(sk); - return err; + goto out; } case IP_MULTICAST_ALL: val = inet->mc_all; @@ -1485,6 +1494,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; } return 0; + +out: + release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); + return err; } int ip_getsockopt(struct sock *sk, int level, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 84dce6a92f93..6cb9009c3d96 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -53,6 +53,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __u8 tos, __u8 ttl, __be16 df, bool xnet) { int pkt_len = skb->len - skb_inner_network_offset(skb); + struct net *net = dev_net(rt->dst.dev); struct iphdr *iph; int err; @@ -76,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(dev_net(rt->dst.dev), iph, - skb_shinfo(skb)->gso_segs ?: 1); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); - err = ip_local_out_sk(sk, skb); + err = ip_local_out(net, sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 0c152087ca15..4d8f0b698777 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; - err = dst_output(skb); + err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = skb->len; iptunnel_xmit_stats(err, &dev->stats, dev->tstats); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index ed4ef09c2136..0bc7412d9e14 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -146,6 +146,10 @@ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ /* vendor class identifier */ static char vendor_class_identifier[253] __initdata; +#if defined(CONFIG_IP_PNP_DHCP) +static char dhcp_client_identifier[253] __initdata; +#endif + /* Persistent data: */ static int ic_proto_used; /* Protocol used, if any */ @@ -728,6 +732,16 @@ ic_dhcp_init_options(u8 *options) memcpy(e, vendor_class_identifier, len); e += len; } + len = strlen(dhcp_client_identifier + 1); + /* the minimum length of identifier is 2, include 1 byte type, + * and can not be larger than the length of options + */ + if (len >= 1 && len < 312 - (e - options) - 1) { + *e++ = 61; + *e++ = len + 1; + memcpy(e, dhcp_client_identifier, len + 1); + e += len + 1; + } } *e++ = 255; /* End of the list */ @@ -1557,8 +1571,24 @@ static int __init ic_proto_name(char *name) return 0; } #ifdef CONFIG_IP_PNP_DHCP - else if (!strcmp(name, "dhcp")) { + else if (!strncmp(name, "dhcp", 4)) { + char *client_id; + ic_proto_enabled &= ~IC_RARP; + client_id = strstr(name, "dhcp,"); + if (client_id) { + char *v; + + client_id = client_id + 5; + v = strchr(client_id, ','); + if (!v) + return 1; + *v = 0; + if (kstrtou8(client_id, 0, dhcp_client_identifier)) + DBG("DHCP: Invalid client identifier type\n"); + strncpy(dhcp_client_identifier + 1, v + 1, 251); + *v = ','; + } return 1; } #endif diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index f34c31defafe..a09fb0dec725 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -253,9 +253,6 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) p.i_key = p.o_key = 0; p.i_flags = p.o_flags = 0; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - err = ip_tunnel_ioctl(dev, &p, cmd); if (err) return err; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8e8203d5c520..c3a38353f5dc 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -134,7 +134,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); -static void mroute_clean_tables(struct mr_table *mrt); +static void mroute_clean_tables(struct mr_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -350,7 +350,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, true); kfree(mrt); } @@ -441,10 +441,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -540,10 +536,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -1208,7 +1200,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt) +static void mroute_clean_tables(struct mr_table *mrt, bool all) { int i; LIST_HEAD(list); @@ -1217,8 +1209,9 @@ static void mroute_clean_tables(struct mr_table *mrt) /* Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif_table[i].flags & VIFF_STATIC)) - vif_delete(mrt, i, 0, &list); + if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) + continue; + vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); @@ -1226,7 +1219,7 @@ static void mroute_clean_tables(struct mr_table *mrt) for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (c->mfc_flags & MFC_STATIC) + if (!all && (c->mfc_flags & MFC_STATIC)) continue; list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); @@ -1261,7 +1254,7 @@ static void mrtsock_destruct(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, false); } } rtnl_unlock(); @@ -1678,17 +1671,18 @@ static void ip_encap(struct net *net, struct sk_buff *skb, nf_reset(skb); } -static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb) +static inline int ipmr_forward_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } /* @@ -1745,7 +1739,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * to blackhole. */ - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } @@ -1787,8 +1781,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, - skb->dev, dev, + NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, + net, NULL, skb, skb->dev, dev, ipmr_forward_finish); return; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 61eafc9b4545..c3776ff6749f 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,9 +17,8 @@ #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) +int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type) { - struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; @@ -104,7 +103,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb, } } -static int nf_ip_reroute(struct sk_buff *skb, +static int nf_ip_reroute(struct net *net, struct sk_buff *skb, const struct nf_queue_entry *entry) { const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -116,7 +115,7 @@ static int nf_ip_reroute(struct sk_buff *skb, skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(skb, RTN_UNSPEC); + return ip_route_me_harder(net, skb, RTN_UNSPEC); } return 0; } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a35584176535..c187c60e3e0c 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -60,6 +60,7 @@ config NFT_REJECT_IPV4 config NFT_DUP_IPV4 tristate "IPv4 nf_tables packet duplication support" + depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 help This module enables IPv4 packet duplication support for nf_tables. diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 8f87fc38ccde..11dccba474b7 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -186,7 +186,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", indev, arpinfo->iniface, - arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":""); + arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : ""); return 0; } @@ -195,7 +195,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, arpinfo->outiface, - arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":""); + arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : ""); return 0; } @@ -247,10 +247,10 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) } unsigned int arpt_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; @@ -285,6 +285,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, */ e = get_entry(table_base, private->hook_entry[hook]); + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.hooknum = hook; @@ -467,7 +468,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } - next: +next: duprintf("Finished chain %u\n", hook); } return 1; @@ -631,7 +632,7 @@ static inline void cleanup_entry(struct arpt_entry *e) * newinfo). */ static int translate_table(struct xt_table_info *newinfo, void *entry0, - const struct arpt_replace *repl) + const struct arpt_replace *repl) { struct arpt_entry *iter; unsigned int i; @@ -891,7 +892,7 @@ static int compat_table_info(const struct xt_table_info *info, #endif static int get_info(struct net *net, void __user *user, - const int *len, int compat) + const int *len, int compat) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1068,7 +1069,7 @@ static int __do_replace(struct net *net, const char *name, } static int do_replace(struct net *net, const void __user *user, - unsigned int len) + unsigned int len) { int ret; struct arpt_replace tmp; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 93876d03120c..1897ee160920 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -27,13 +27,10 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c */ static unsigned int -arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +arptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return arpt_do_table(skb, ops->hooknum, state, - net->ipv4.arptable_filter); + return arpt_do_table(skb, state, state->net->ipv4.arptable_filter); } static struct nf_hook_ops *arpfilter_ops __read_mostly; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b0a86e73451c..b99affad6ba1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -102,7 +102,7 @@ ip_packet_match(const struct iphdr *ip, if (FWINV(ret != 0, IPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", indev, ipinfo->iniface, - ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":""); + ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : ""); return false; } @@ -111,7 +111,7 @@ ip_packet_match(const struct iphdr *ip, if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, ipinfo->outiface, - ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":""); + ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : ""); return false; } @@ -120,7 +120,7 @@ ip_packet_match(const struct iphdr *ip, FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { dprintf("Packet protocol %hi does not match %hi.%s\n", ip->protocol, ipinfo->proto, - ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); + ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : ""); return false; } @@ -246,7 +246,8 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, return 0; } -static void trace_packet(const struct sk_buff *skb, +static void trace_packet(struct net *net, + const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, @@ -258,7 +259,6 @@ static void trace_packet(const struct sk_buff *skb, const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; - struct net *net = dev_net(in ? in : out); root = get_entry(private->entries, private->hook_entry[hook]); @@ -285,10 +285,10 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ipt_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; /* Initializing verdict to NF_DROP keeps gcc happy. */ @@ -315,6 +315,7 @@ ipt_do_table(struct sk_buff *skb, acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.family = NFPROTO_IPV4; @@ -378,8 +379,8 @@ ipt_do_table(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, state->in, state->out, - table->name, private, e); + trace_packet(state->net, skb, hook, state->in, + state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { @@ -430,8 +431,8 @@ ipt_do_table(struct sk_buff *skb, } while (!acpar.hotdrop); pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); - xt_write_recseq_end(addend); - local_bh_enable(); + xt_write_recseq_end(addend); + local_bh_enable(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -483,7 +484,7 @@ mark_source_chains(const struct xt_table_info *newinfo, unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -548,7 +549,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } - next: +next: duprintf("Finished chain %u\n", hook); } return 1; @@ -803,7 +804,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net) newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, - const struct ipt_replace *repl) + const struct ipt_replace *repl) { struct ipt_entry *iter; unsigned int i; @@ -1077,7 +1078,7 @@ static int compat_table_info(const struct xt_table_info *info, #endif static int get_info(struct net *net, void __user *user, - const int *len, int compat) + const int *len, int compat) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1303,7 +1304,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) static int do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) + unsigned int len, int compat) { unsigned int i; struct xt_counters_info tmp; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 45cb16a6a4a3..4a9e6db9df8d 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -492,14 +492,14 @@ static void arp_print(struct arp_payload *payload) { #define HBUFFERLEN 30 char hbuffer[HBUFFERLEN]; - int j,k; + int j, k; - for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) { + for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); - hbuffer[k++]=':'; + hbuffer[k++] = ':'; } - hbuffer[--k]='\0'; + hbuffer[--k] = '\0'; pr_debug("src %pI4@%s, dst %pI4\n", &payload->src_ip, hbuffer, &payload->dst_ip); @@ -507,14 +507,14 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(const struct nf_hook_ops *ops, +arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; - struct net *net = dev_net(state->in ? state->in : state->out); + struct net *net = state->net; /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 87907d4bd259..1d16c0f28df0 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(skb, hook); + nf_send_reset(par->net, skb, hook); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 95ea633e8356..5fdc556514ba 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -39,11 +39,14 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) } static void -synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, +synproxy_send_tcp(const struct synproxy_net *snet, + const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { + struct net *net = nf_ct_net(snet->tmpl); + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; @@ -51,7 +54,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { @@ -60,7 +63,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, nf_conntrack_get(nfct); } - ip_local_out(nskb); + ip_local_out(net, nskb->sk, nskb); return; free_nskb: @@ -68,7 +71,8 @@ free_nskb: } static void -synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, +synproxy_send_client_synack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; @@ -104,7 +108,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } @@ -148,7 +152,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } @@ -188,7 +192,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void @@ -226,8 +230,8 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, - niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); } static bool @@ -258,7 +262,7 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_net *snet = synproxy_pernet(par->net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -287,7 +291,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(skb, th, &opts); + synproxy_send_client_synack(snet, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { @@ -299,11 +303,11 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, +static unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out)); + struct synproxy_net *snet = synproxy_pernet(nhs->net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; @@ -433,14 +437,12 @@ static struct xt_target synproxy_tg4_reg __read_mostly = { static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { { .hook = ipv4_synproxy_hook, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, { .hook = ipv4_synproxy_hook, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index 14a2aa8b8a14..a787d07f6cb7 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -25,7 +25,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) bool r; pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); - r=(spi >= min && spi <= max) ^ invert; + r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index c4ffc9de1654..78cc64eddfc1 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -32,12 +32,11 @@ static __be32 rpfilter_get_saddr(__be32 addr) return addr; } -static bool rpfilter_lookup_reverse(struct flowi4 *fl4, +static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; bool dev_match; - struct net *net = dev_net(dev); int ret __maybe_unused; if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) @@ -96,7 +95,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_tos = RT_TOS(iph->tos); flow.flowi4_scope = RT_SCOPE_UNIVERSE; - return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert; + return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index a0f3beca52d2..397ef2dd133e 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -33,19 +33,16 @@ static const struct xt_table packet_filter = { }; static unsigned int -iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter); + return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 62cbb8c5f4a8..ba5d392a13c4 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -39,7 +39,6 @@ static const struct xt_table packet_mangler = { static unsigned int ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) { - struct net_device *out = state->out; unsigned int ret; const struct iphdr *iph; u_int8_t tos; @@ -59,8 +58,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state, - dev_net(out)->ipv4.iptable_mangle); + ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -69,7 +67,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -80,18 +78,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* The work comes in here from netfilter.c. */ static unsigned int -iptable_mangle_hook(const struct nf_hook_ops *ops, +iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (ops->hooknum == NF_INET_LOCAL_OUT) + if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state); - if (ops->hooknum == NF_INET_POST_ROUTING) - return ipt_do_table(skb, ops->hooknum, state, - dev_net(state->out)->ipv4.iptable_mangle); + if (state->hook == NF_INET_POST_ROUTING) + return ipt_do_table(skb, state, + state->net->ipv4.iptable_mangle); /* PREROUTING/INPUT/FORWARD: */ - return ipt_do_table(skb, ops->hooknum, state, - dev_net(state->in)->ipv4.iptable_mangle); + return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 0d4d9cdf98a4..ae2cd2752046 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -28,49 +28,46 @@ static const struct xt_table nf_nat_ipv4_table = { .af = NFPROTO_IPV4, }; -static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table); + return ipt_do_table(skb, state, state->net->ipv4.nat_table); } -static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_in, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, @@ -78,7 +75,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* After packet filtering, change source */ { .hook = iptable_nat_ipv4_out, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, @@ -86,7 +82,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_local_fn, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, @@ -94,7 +89,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* After packet filtering, change source */ { .hook = iptable_nat_ipv4_fn, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 0356e6da4bb7..1ba02811acb0 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -20,19 +20,16 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw); + return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 4bce3980ccd9..c2e23d5e9cd4 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -37,20 +37,16 @@ static const struct xt_table security_table = { }; static unsigned int -iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* Somebody is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, - net->ipv4.iptable_security); + return ipt_do_table(skb, state, state->net->ipv4.iptable_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -83,7 +79,7 @@ static int __init iptable_security_init(void) int ret; ret = register_pernet_subsys(&iptable_security_net_ops); - if (ret < 0) + if (ret < 0) return ret; sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8a2caaf3940b..461ca926fd39 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv4_helper(const struct nf_hook_ops *ops, +static unsigned int ipv4_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -119,7 +119,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops, ct, ctinfo); } -static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, +static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -143,14 +143,14 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } -static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -158,7 +158,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } /* Connection tracking may drop packets, but never alters them, so @@ -166,42 +166,36 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { { .hook = ipv4_conntrack_in, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_local, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_helper, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_helper, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index cdde3ec496e9..c567e1b5d799 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -30,7 +30,7 @@ static inline struct nf_icmp_net *icmp_pernet(struct net *net) } static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; @@ -144,7 +144,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), - PF_INET, &origtuple)) { + PF_INET, net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 9306ec4fab41..6fb869f646bf 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,14 +22,15 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, + u_int32_t user) { int err; skb_orphan(skb); local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(net, skb, user); local_bh_enable(); if (!err) { @@ -61,15 +62,14 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, return IP_DEFRAG_CONNTRACK_OUT + zone_id; } -static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (sk && (sk->sk_family == PF_INET) && - inet->nodefrag) + if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) && + inet_sk(sk)->nodefrag) return NF_ACCEPT; #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -83,9 +83,9 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = - nf_ct_defrag_user(ops->hooknum, skb); + nf_ct_defrag_user(state->hook, skb); - if (nf_ct_ipv4_gather_frags(skb, user)) + if (nf_ct_ipv4_gather_frags(state->net, skb, user)) return NF_STOLEN; } return NF_ACCEPT; @@ -94,14 +94,12 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, static struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 2d79e6e8d934..ceb187308120 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -23,25 +23,10 @@ #include <net/netfilter/nf_conntrack.h> #endif -static struct net *pick_net(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_NS - const struct dst_entry *dst; - - if (skb->dev != NULL) - return dev_net(skb->dev); - dst = skb_dst(skb); - if (dst != NULL && dst->dev != NULL) - return dev_net(dst->dev); -#endif - return &init_net; -} - -static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw, - int oif) +static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, + const struct in_addr *gw, int oif) { const struct iphdr *iph = ip_hdr(skb); - struct net *net = pick_net(skb); struct rtable *rt; struct flowi4 fl4; @@ -65,7 +50,7 @@ static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw, return true; } -void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, +void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in_addr *gw, int oif) { struct iphdr *iph; @@ -105,9 +90,9 @@ void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, --iph->ttl; ip_send_check(iph); - if (nf_dup_ipv4_route(skb, gw, oif)) { + if (nf_dup_ipv4_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); - ip_local_out(skb); + ip_local_out(net, skb->sk, skb); __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 22f4579b0c2a..5075b7ecd26d 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -255,9 +255,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); unsigned int -nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -266,7 +266,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); /* We never see fragments: conntrack defrags on pre-routing * and local-out, and nf_nat_out protects post-routing. @@ -295,7 +295,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, case IP_CT_RELATED_REPLY: if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) + state->hook)) return NF_DROP; else return NF_ACCEPT; @@ -308,21 +308,21 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(ops, skb, state, ct); + ret = do_chain(priv, skb, state, ct); if (ret != NF_ACCEPT) return ret; - if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) break; - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } @@ -332,11 +332,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); @@ -345,9 +345,9 @@ oif_changed: EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); unsigned int -nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -355,7 +355,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); @@ -365,9 +365,9 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); unsigned int -nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -384,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -396,7 +396,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET); + err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } @@ -407,9 +407,9 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); unsigned int -nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -424,14 +424,14 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -440,7 +440,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET); + err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 657d2307f031..b3ca21b2ba9b 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -45,7 +45,7 @@ static void pptp_nat_expected(struct nf_conn *ct, struct net *net = nf_ct_net(ct); const struct nf_conn *master = ct->master; struct nf_conntrack_expect *other_exp; - struct nf_conntrack_tuple t; + struct nf_conntrack_tuple t = {}; const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; struct nf_nat_range range; diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 7c676671329d..ddb894ac1458 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -1156,7 +1156,7 @@ static int snmp_parse_mangle(unsigned char *msg, } if (obj->type == SNMP_IPADDR) - mangle_address(ctx.begin, ctx.pointer - 4 , map, check); + mangle_address(ctx.begin, ctx.pointer - 4, map, check); kfree(obj->id); kfree(obj); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 3262e41ff76f..c747b2d9eb77 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -99,7 +99,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); /* Send RST reply */ -void nf_send_reset(struct sk_buff *oldskb, int hook) +void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; const struct iphdr *oiph; @@ -129,7 +129,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; /* "Never happens" */ @@ -157,7 +157,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip_local_out(nskb); + ip_local_out(net, nskb->sk, nskb); return; diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 8412268bbad1..9d09d4f59545 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -15,15 +15,15 @@ #include <net/netfilter/nf_tables.h> static unsigned int -nft_do_chain_arp(const struct nf_hook_ops *ops, +nft_do_chain_arp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo(&pkt, ops, skb, state); + nft_set_pktinfo(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } static struct nft_af_info nft_af_arp __read_mostly = { diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index aa180d3a69a5..ca9dc3c46c4f 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -18,18 +18,18 @@ #include <net/ip.h> #include <net/netfilter/nf_tables_ipv4.h> -static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, +static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, +static unsigned int nft_ipv4_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -41,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, return NF_ACCEPT; } - return nft_do_chain_ipv4(ops, skb, state); + return nft_do_chain_ipv4(priv, skb, state); } struct nft_af_info nft_af_ipv4 __read_mostly = { diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index bf5c30ae14e4..f5c66a7a4bf2 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -26,44 +26,44 @@ #include <net/netfilter/nf_nat_l3proto.h> #include <net/ip.h> -static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv4 = { diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index e335b0afdaf3..2375b0a8be46 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -21,7 +21,7 @@ #include <net/route.h> #include <net/ip.h> -static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, +static unsigned int nf_route_table_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -37,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); mark = skb->mark; iph = ip_hdr(skb); @@ -45,7 +45,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, daddr = iph->daddr; tos = iph->tos; - ret = nft_do_chain(&pkt, ops); + ret = nft_do_chain(&pkt, priv); if (ret != NF_DROP && ret != NF_QUEUE) { iph = ip_hdr(skb); @@ -53,7 +53,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, iph->daddr != daddr || skb->mark != mark || iph->tos != tos) - if (ip_route_me_harder(skb, RTN_UNSPEC)) + if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) ret = NF_DROP; } return ret; diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index b45932d43b69..bf855e64fc45 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -30,7 +30,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, }; int oif = regs->data[priv->sreg_dev]; - nf_dup_ipv4(pkt->skb, pkt->ops->hooknum, &gw, oif); + nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); } static int nft_dup_ipv4_init(const struct nft_ctx *ctx, diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 40e414c4ca56..b72ffc58e255 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -26,7 +26,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, &range, pkt->out); } diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index d8d795df9c13..c09d4381427e 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -36,7 +36,7 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, mr.range[0].flags |= priv->flags; regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, - pkt->ops->hooknum); + pkt->hook); } static struct nft_expr_type nft_redir_ipv4_type; diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index b07e58b51158..c24f41c816b3 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,11 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->skb, pkt->ops->hooknum); + nf_send_reset(pkt->net, pkt->skb, pkt->hook); break; default: break; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 561cd4b8fc6e..63e5be0abd86 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -406,13 +406,16 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + skb->transport_header += iphlen; + if (iph->protocol == IPPROTO_ICMP && + length >= iphlen + sizeof(struct icmphdr)) + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); } - if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(net, ((struct icmphdr *) - skb_transport_header(skb))->type); - err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, - NULL, rt->dst.dev, dst_output_sk); + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, rt->dst.dev, + dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -483,6 +486,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; @@ -542,7 +546,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); + err = ip_cmsg_send(net, msg, &ipc, false); if (err) goto out; if (ipc.opt) @@ -597,6 +601,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); + if (!saddr && ipc.oif) + l3mdev_get_saddr(net, ipc.oif, &fl4); + if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -607,7 +614,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c81deb85acb4..85f184e429c6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,7 +112,7 @@ #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> -#include <net/vrf.h> +#include <net/l3mdev.h> #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -847,7 +847,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); - vif = vrf_master_ifindex_rcu(rt->dst.dev); + vif = l3mdev_master_ifindex_rcu(rt->dst.dev); rcu_read_unlock(); net = dev_net(rt->dst.dev); @@ -941,7 +941,7 @@ static int ip_error(struct sk_buff *skb) } peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, - vrf_master_ifindex(skb->dev), 1); + l3mdev_master_ifindex(skb->dev), 1); send = true; if (peer) { @@ -1152,7 +1152,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) +static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1438,12 +1438,34 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, } static struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, bool nopolicy, bool noxfrm, bool will_cache) { - return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | - (nopolicy ? DST_NOPOLICY : 0) | - (noxfrm ? DST_NOXFRM : 0)); + struct rtable *rt; + + rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); + + if (rt) { + rt->rt_genid = rt_genid_ipv4(dev_net(dev)); + rt->rt_flags = flags; + rt->rt_type = type; + rt->rt_is_input = 0; + rt->rt_iif = 0; + rt->rt_pmtu = 0; + rt->rt_gateway = 0; + rt->rt_uses_gateway = 0; + rt->rt_table_id = 0; + INIT_LIST_HEAD(&rt->rt_uncached); + + rt->dst.output = ip_output; + if (flags & RTCF_LOCAL) + rt->dst.input = ip_local_deliver; + } + + return rt; } /* called in rcu_read_lock() section */ @@ -1452,6 +1474,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct rtable *rth; struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; u32 itag = 0; int err; @@ -1464,9 +1487,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(saddr)) - goto e_inval; + if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) + goto e_inval; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) @@ -1477,7 +1499,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (err < 0) goto e_err; } - rth = rt_dst_alloc(dev_net(dev)->loopback_dev, + if (our) + flags |= RTCF_LOCAL; + + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -1486,20 +1511,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; - - rth->rt_genid = rt_genid_ipv4(dev_net(dev)); - rth->rt_flags = RTCF_MULTICAST; - rth->rt_type = RTN_MULTICAST; rth->rt_is_input= 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); - if (our) { - rth->dst.input= ip_local_deliver; - rth->rt_flags |= RTCF_LOCAL; - } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) @@ -1608,7 +1620,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(out_dev->dev, + rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { @@ -1616,19 +1628,12 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); - rth->rt_flags = 0; - rth->rt_type = res->type; rth->rt_is_input = 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res->table) + rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; - rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); if (lwtunnel_output_redirect(rth->dst.lwtstate)) { @@ -1646,6 +1651,48 @@ out: return err; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* To make ICMP packets follow the right flow, the multipath hash is + * calculated from the inner IP addresses in reverse order. + */ +static int ip_multipath_icmp_hash(struct sk_buff *skb) +{ + const struct iphdr *outer_iph = ip_hdr(skb); + struct icmphdr _icmph; + const struct icmphdr *icmph; + struct iphdr _inner_iph; + const struct iphdr *inner_iph; + + if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) + goto standard_hash; + + icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), + &_icmph); + if (!icmph) + goto standard_hash; + + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_REDIRECT && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB) { + goto standard_hash; + } + + inner_iph = skb_header_pointer(skb, + outer_iph->ihl * 4 + sizeof(_icmph), + sizeof(_inner_iph), &_inner_iph); + if (!inner_iph) + goto standard_hash; + + return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr); + +standard_hash: + return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr); +} + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, const struct flowi4 *fl4, @@ -1653,8 +1700,15 @@ static int ip_mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) - fib_select_multipath(res); + if (res->fi && res->fi->fib_nhs > 1) { + int h; + + if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP)) + h = ip_multipath_icmp_hash(skb); + else + h = fib_multipath_hash(saddr, daddr); + fib_select_multipath(res, h); + } #endif /* create a routing cache entry */ @@ -1706,6 +1760,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_source; res.fi = NULL; + res.table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1733,7 +1788,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = vrf_master_ifindex_rcu(dev) ? : dev->ifindex; + fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; @@ -1754,7 +1809,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag); if (err < 0) - goto martian_source_keep_err; + goto martian_source; goto local_input; } @@ -1776,7 +1831,7 @@ brd_input: err = fib_validate_source(skb, saddr, 0, tos, 0, dev, in_dev, &itag); if (err < 0) - goto martian_source_keep_err; + goto martian_source; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; @@ -1796,26 +1851,18 @@ local_input: } } - rth = rt_dst_alloc(net->loopback_dev, + rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; - rth->dst.input= ip_local_deliver; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - - rth->rt_genid = rt_genid_ipv4(net); - rth->rt_flags = flags|RTCF_LOCAL; - rth->rt_type = res.type; rth->rt_is_input = 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res.table) + rth->rt_table_id = res.table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { @@ -1837,6 +1884,7 @@ no_route: RT_CACHE_STAT_INC(in_no_route); res.type = RTN_UNREACHABLE; res.fi = NULL; + res.table = NULL; goto local_input; /* @@ -1859,8 +1907,6 @@ e_nobufs: goto out; martian_source: - err = -EINVAL; -martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } @@ -1988,28 +2034,19 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } add: - rth = rt_dst_alloc(dev_out, + rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), do_cache); if (!rth) return ERR_PTR(-ENOBUFS); - rth->dst.output = ip_output; - - rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); - rth->rt_flags = flags; - rth->rt_type = type; - rth->rt_is_input = 0; rth->rt_iif = orig_oif ? : 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res->table) + rth->rt_table_id = res->table->tb_id; + RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) - rth->dst.input = ip_local_deliver; if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { @@ -2038,7 +2075,8 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, + int mp_hash) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2137,11 +2175,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - if (netif_is_vrf(dev_out) && - !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) { - rth = vrf_dev_get_rth(dev_out); + + rth = l3mdev_get_rtable(dev_out, fl4); + if (rth) goto out; - } } if (!fl4->daddr) { @@ -2159,7 +2196,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif) { + if (fl4->flowi4_oif && + !netif_index_is_l3_master(net, fl4->flowi4_oif)) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2201,18 +2239,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) - fib_select_multipath(&res); - else -#endif - if (!res.prefixlen && - res.table->tb_num_default > 1 && - res.type == RTN_UNICAST && !fl4->flowi4_oif) - fib_select_default(fl4, &res); - - if (!fl4->saddr) - fl4->saddr = FIB_RES_PREFSRC(net, res); + fib_select_path(net, &res, fl4, mp_hash); dev_out = FIB_RES_DEV(res); fl4->flowi4_oif = dev_out->ifindex; @@ -2225,7 +2252,7 @@ out: rcu_read_unlock(); return rth; } -EXPORT_SYMBOL_GPL(__ip_route_output_key); +EXPORT_SYMBOL_GPL(__ip_route_output_key_hash); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { @@ -2277,7 +2304,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard_sk; + new->output = dst_discard_out; new->dev = ort->dst.dev; if (new->dev) @@ -2303,7 +2330,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, - struct sock *sk) + const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); @@ -2319,7 +2346,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, __be32 dst, __be32 src, +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, int event, int nowait, unsigned int flags) { @@ -2339,8 +2366,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = fl4->flowi4_tos; - r->rtm_table = RT_TABLE_MAIN; - if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) + r->rtm_table = table_id; + if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; @@ -2445,6 +2472,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int err; int mark; struct sk_buff *skb; + u32 table_id = RT_TABLE_MAIN; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2480,6 +2508,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + if (netif_index_is_l3_master(net, fl4.flowi4_oif)) + fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; + if (iif) { struct net_device *dev; @@ -2514,7 +2545,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, dst, src, &fl4, skb, + if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) + table_id = rt->rt_table_id; + + err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err < 0) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d70b1f603692..4cbe9f0a4281 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -192,15 +192,11 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); -__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, - __u16 *mssp) +__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - tcp_synq_overflow(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return __cookie_v4_init_sequence(iph, th, mssp); } @@ -225,10 +221,13 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; + bool own_req; - child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + NULL, &own_req); if (child) { atomic_set(&req->rsk_refcnt, 1); + sock_rps_save_rxhash(child, skb); inet_csk_reqsk_queue_add(sk, req, child); } else { reqsk_free(req); @@ -288,6 +287,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +/* On input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. + */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; @@ -326,7 +329,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ if (!req) goto out; @@ -345,7 +348,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->snt_synack.v64 = 0; treq->tfo_listener = false; ireq->ir_iif = sk->sk_bound_dev_if; @@ -381,10 +384,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 894da3a70aff..a0bd7a55193e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,14 +48,14 @@ static void set_local_port_range(struct net *net, int range[2]) { bool same_parity = !((range[0] ^ range[1]) & 1); - write_seqlock(&net->ipv4.ip_local_ports.lock); + write_seqlock_bh(&net->ipv4.ip_local_ports.lock); if (same_parity && !net->ipv4.ip_local_ports.warned) { net->ipv4.ip_local_ports.warned = true; pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); } net->ipv4.ip_local_ports.range[0] = range[0]; net->ipv4.ip_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.ip_local_ports.lock); + write_sequnlock_bh(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_recovery", + .data = &sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "tcp_reordering", .data = &sysctl_tcp_reordering, .maxlen = sizeof(int), @@ -577,6 +584,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_min_rtt_wlen", + .data = &sysctl_tcp_min_rtt_wlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b8b8fa184f75..c82cca18c90f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + tp->rtt_min[0].rtt = ~0U; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -450,11 +451,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + int state; sock_rps_record_flow(sk); sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == TCP_LISTEN) + + state = sk_state_load(sk); + if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -491,14 +495,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLIN | POLLRDNORM | POLLRDHUP; /* Connected or passive Fast Open socket? */ - if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) { + if (state != TCP_SYN_SENT && + (state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -506,9 +510,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - /* Potential race condition. If read of tp below will - * escape above sk->sk_state, we can be illegally awaken - * in SYN_* states. */ if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; @@ -516,8 +517,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after @@ -900,11 +900,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto out_err; } - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; @@ -967,7 +968,8 @@ new_segment: copied += copy; offset += copy; - if (!(size -= copy)) { + size -= copy; + if (!size) { tcp_tx_timestamp(sk, skb); goto out; } @@ -988,7 +990,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1111,7 +1114,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto do_error; } @@ -1129,7 +1133,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1267,7 +1271,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1767,7 +1772,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* __ Restore normal policy in scheduler __ */ - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; @@ -1778,7 +1784,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, do_prequeue: tcp_prequeue_process(sk); - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; @@ -1926,7 +1933,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk->sk_state = state; + sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2230,7 +2237,8 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; - if ((tp->write_seq += tp->max_window + 2) == 0) + tp->write_seq += tp->max_window + 2; + if (tp->write_seq == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; tp->snd_cwnd = 2; @@ -2253,13 +2261,6 @@ int tcp_disconnect(struct sock *sk, int flags) } EXPORT_SYMBOL(tcp_disconnect); -void tcp_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - - kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); -} - static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && @@ -2581,7 +2582,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, TCPF_LISTEN))) { tcp_fastopen_init_key_once(true); - err = fastopen_init_queue(sk, val); + fastopen_queue_tune(sk, val); } else { err = -EINVAL; } @@ -2642,7 +2643,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (sk->sk_type != SOCK_STREAM) return; - info->tcpi_state = sk->sk_state; + info->tcpi_state = sk_state_load(sk); + info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2670,7 +2672,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (sk->sk_state == TCP_LISTEN) { + if (info->tcpi_state == TCP_LISTEN) { info->tcpi_unacked = sk->sk_ack_backlog; info->tcpi_sacked = sk->sk_max_ack_backlog; } else { @@ -2849,10 +2851,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - if (icsk->icsk_accept_queue.fastopenq) - val = icsk->icsk_accept_queue.fastopenq->max_qlen; - else - val = 0; + val = icsk->icsk_accept_queue.fastopenq.max_qlen; break; case TCP_TIMESTAMP: diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 93c4dc3ab23f..882caa4e72bc 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -173,6 +173,10 @@ out: */ if (ca->get_info) memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) @@ -181,6 +185,10 @@ void tcp_init_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } static void tcp_reinit_congestion_control(struct sock *sk, @@ -192,8 +200,8 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); + if (sk->sk_state != TCP_CLOSE) + tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 479f34946177..b31604086edd 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -21,7 +21,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct tcp_info *info = _info; - if (sk->sk_state == TCP_LISTEN) { + if (sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else if (sk->sk_type == SOCK_STREAM) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f9c0fb84e435..55be6ac70cff 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -124,27 +124,29 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } -static bool tcp_fastopen_create_child(struct sock *sk, - struct sk_buff *skb, - struct dst_entry *dst, - struct request_sock *req) +static struct sock *tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; u32 end_seq; + bool own_req; req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + NULL, &own_req); if (!child) - return false; + return NULL; - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); + spin_lock(&queue->fastopenq.lock); + queue->fastopenq.qlen++; + spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created @@ -161,15 +163,13 @@ static bool tcp_fastopen_create_child(struct sock *sk, tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent + * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); - atomic_set(&req->rsk_refcnt, 1); - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); + atomic_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ inet_csk(child)->icsk_af_ops->rebuild_header(child); @@ -178,12 +178,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * Note that IPv6 might also have used skb_get() trick - * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts) - * So we need to eventually get a clone of the packet, - * before inserting it in sk_receive_queue. + /* Queue the data carried in the SYN packet. + * We used to play tricky games with skb_get(). + * With lockless listener, it is a dead end. + * Do not think about it. * * XXX (TFO) - we honor a zero-payload TFO request for now, * (any reason not to?) but no need to queue the skb since @@ -191,12 +189,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, */ end_seq = TCP_SKB_CB(skb)->end_seq; if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2; - - if (unlikely(skb_shared(skb))) - skb2 = skb_clone(skb, GFP_ATOMIC); - else - skb2 = skb_get(skb); + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (likely(skb2)) { skb_dst_drop(skb2); @@ -214,11 +207,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, } } tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; - sk->sk_data_ready(sk); - bh_unlock_sock(child); - sock_put(child); - WARN_ON(!req->sk); - return true; + /* tcp_conn_request() is sending the SYNACK, + * and queues the child into listener accept queue. + */ + return child; } static bool tcp_fastopen_queue_check(struct sock *sk) @@ -235,8 +227,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - if (!fastopenq || fastopenq->max_qlen == 0) + fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq->max_qlen == 0) return false; if (fastopenq->qlen >= fastopenq->max_qlen) { @@ -261,13 +253,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ -bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst) +struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) { struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -276,7 +269,7 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; - return false; + return NULL; } if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) @@ -296,11 +289,12 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, * data in SYN_RECV state. */ fastopen: - if (tcp_fastopen_create_child(sk, skb, dst, req)) { + child = tcp_fastopen_create_child(sk, skb, dst, req); + if (child) { foc->len = -1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - return true; + return child; } NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (foc->len > 0) /* Client presents an invalid cookie */ @@ -308,6 +302,5 @@ fastopen: valid_foc.exp = foc->exp; *foc = valid_foc; - return false; + return NULL; } -EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a8f515bb19c4..2d656eef7f8e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; +int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_thin_dupack __read_mostly; @@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, if (metric > 0) tcp_disable_early_retrans(tp); + tp->rack.reord = 1; } /* This must be called before lost_out is incremented */ @@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) } } -static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, - struct sk_buff *skb) +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); @@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, return !before(start_seq, end_seq - tp->max_window); } -/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "B". Later note: FACK people cheated me again 8), we have to account - * for reordering! Ugly, but should help. - * - * Search retransmitted skbs from write_queue that were sent when snd_nxt was - * less than what is now known to be received by the other end (derived from - * highest SACK block). Also calculate the lowest snd_nxt among the remaining - * retransmitted skbs to avoid some costly processing per ACKs. - */ -static void tcp_mark_lost_retrans(struct sock *sk, int *flag) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int cnt = 0; - u32 new_low_seq = tp->snd_nxt; - u32 received_upto = tcp_highest_sack_seq(tp); - - if (!tcp_is_fack(tp) || !tp->retrans_out || - !after(received_upto, tp->lost_retrans_low) || - icsk->icsk_ca_state != TCP_CA_Recovery) - return; - - tcp_for_write_queue(skb, sk) { - u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; - - if (skb == tcp_send_head(sk)) - break; - if (cnt == tp->retrans_out) - break; - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - continue; - - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) - continue; - - /* TODO: We would like to get rid of tcp_is_fack(tp) only - * constraint here (see above) but figuring out that at - * least tp->reordering SACK blocks reside between ack_seq - * and received_upto is not easy task to do cheaply with - * the available datastructures. - * - * Whether FACK should check here for tp->reordering segs - * in-between one could argue for either way (it would be - * rather simple to implement as we could count fack_count - * during the walk and do tp->fackets_out - fack_count). - */ - if (after(received_upto, ack_seq)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - *flag |= FLAG_LOST_RETRANS; - tcp_skb_mark_lost_uncond_verify(tp, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); - } else { - if (before(ack_seq, new_low_seq)) - new_low_seq = ack_seq; - cnt += tcp_skb_pcount(skb); - } - } - - if (tp->retrans_out) - tp->lost_retrans_low = new_low_seq; -} - static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { + tcp_rack_advance(tp, xmit_time, sacked); + if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, * we do not clear RETRANS, believing @@ -1837,7 +1776,6 @@ advance_sp: ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); - tcp_mark_lost_retrans(sk, &state->flag); tcp_verify_left_out(tp); out: @@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } +static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) +{ + return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + before(tp->rx_opt.rcv_tsecr, when); +} + +/* skb is spurious retransmitted if the returned timestamp echo + * reply is prior to the skb transmission time + */ +static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, + const struct sk_buff *skb) +{ + return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); +} + /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { return !tp->retrans_stamp || - (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp)); + tcp_tsopt_ecr_before(tp, tp->retrans_stamp); } /* Undo procedures. */ @@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && + tcp_rack_mark_lost(sk)) + flag |= FLAG_LOST_RETRANS; + /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: @@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_xmit_retransmit_queue(sk); } +/* Kathleen Nichols' algorithm for tracking the minimum value of + * a data stream over some fixed time interval. (E.g., the minimum + * RTT over the past five minutes.) It uses constant space and constant + * time per update yet almost always delivers the same minimum as an + * implementation that has to keep all the data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of the + * n'th best >= n-1'th best. It also makes sure that the three values + * are widely separated in the time window since that bounds the worse + * case error when that data is monotonically increasing over the window. + * + * Upon getting a new min, we can forget everything earlier because it + * has no value - the new min is <= everything else in the window by + * definition and it's the most recent. So we restart fresh on every new min + * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd + * best. + */ +static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) +{ + const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; + struct rtt_meas *m = tcp_sk(sk)->rtt_min; + struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; + u32 elapsed; + + /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ + if (unlikely(rttm.rtt <= m[0].rtt)) + m[0] = m[1] = m[2] = rttm; + else if (rttm.rtt <= m[1].rtt) + m[1] = m[2] = rttm; + else if (rttm.rtt <= m[2].rtt) + m[2] = rttm; + + elapsed = now - m[0].ts; + if (unlikely(elapsed > wlen)) { + /* Passed entire window without a new min so make 2nd choice + * the new min & 3rd choice the new 2nd. So forth and so on. + */ + m[0] = m[1]; + m[1] = m[2]; + m[2] = rttm; + if (now - m[0].ts > wlen) { + m[0] = m[1]; + m[1] = rttm; + if (now - m[0].ts > wlen) + m[0] = rttm; + } + } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { + /* Passed a quarter of the window without a new min so + * take 2nd choice from the 2nd quarter of the window. + */ + m[2] = m[1] = rttm; + } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { + /* Passed half the window without a new min so take the 3rd + * choice from the last half of the window. + */ + m[2] = rttm; + } +} + static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - long seq_rtt_us, long sack_rtt_us) + long seq_rtt_us, long sack_rtt_us, + long ca_rtt_us) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * Karn's algorithm forbids taking RTT if some retransmitted data * is acked (RFC6298). */ - if (flag & FLAG_RETRANS_DATA_ACKED) - seq_rtt_us = -1L; - if (seq_rtt_us < 0) seq_rtt_us = sack_rtt_us; @@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); - + seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp - + tp->rx_opt.rcv_tsecr); if (seq_rtt_us < 0) return false; + /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is + * always taken together with ACK, SACK, or TS-opts. Any negative + * values will be skipped with the seq_rtt_us < 0 check above. + */ + tcp_update_rtt_min(sk, ca_rtt_us); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); @@ -2953,21 +2974,21 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ -static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) +void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); - long seq_rtt_us = -1L; + long rtt_us = -1L; - if (synack_stamp && !tp->total_retrans) - seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); + if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) { + struct skb_mstamp now; - /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets - * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() - */ - if (!tp->srtt_us) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); + skb_mstamp_get(&now); + rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); + } + + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); } + static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= acked_pcount; + else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, &skb->skb_mstamp, sacked); if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_SACK_RENEGING; skb_mstamp_get(&now); - if (likely(first_ackt.v64)) { + if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); } @@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); } - rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, + ca_rtt_us); if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); @@ -4457,19 +4481,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_buff *skb; + int err = -ENOMEM; + int data_len = 0; bool fragstolen; if (size == 0) return 0; - skb = alloc_skb(size, sk->sk_allocation); + if (size > PAGE_SIZE) { + int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS); + + data_len = npages << PAGE_SHIFT; + size = data_len + (size & ~PAGE_MASK); + } + skb = alloc_skb_with_frags(size - data_len, data_len, + PAGE_ALLOC_COSTLY_ORDER, + &err, sk->sk_allocation); if (!skb) goto err; + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (memcpy_from_msg(skb_put(skb, size), msg, size)) + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); + if (err) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; @@ -4485,7 +4524,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) err_free: kfree_skb(skb); err: - return -ENOMEM; + return err; + } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) @@ -5472,7 +5512,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, } static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5643,6 +5683,7 @@ discard: } tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->copied_seq = tp->rcv_nxt; tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -5698,15 +5739,14 @@ reset_and_undo: * address independent. */ -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; bool acceptable; - u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5750,7 +5790,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5785,15 +5825,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (!acceptable) return 1; + if (!tp->srtt_us) + tcp_synack_rtt_meas(sk, req); + /* Once we leave TCP_SYN_RECV, we no longer need req * so release it. */ if (req) { - synack_stamp = tcp_rsk(req)->snt_synack; tp->total_retrans = req->num_retrans; reqsk_fastopen_remove(sk, req, false); } else { - synack_stamp = tp->lsndtime; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_init_congestion_control(sk); @@ -5816,7 +5857,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_synack_rtt_meas(sk, synack_stamp); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -6023,11 +6063,11 @@ static void tcp_openreq_init(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tcp_rsk(req)->snt_synack = tcp_time_stamp; + skb_mstamp_get(&tcp_rsk(req)->snt_synack); tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; @@ -6043,9 +6083,11 @@ static void tcp_openreq_init(struct request_sock *req, } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk_listener) + struct sock *sk_listener, + bool attach_listener) { - struct request_sock *req = reqsk_alloc(ops, sk_listener); + struct request_sock *req = reqsk_alloc(ops, sk_listener, + attach_listener); if (req) { struct inet_request_sock *ireq = inet_rsk(req); @@ -6065,13 +6107,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(struct sock *sk, +static bool tcp_syn_flood_action(const struct sock *sk, const struct sk_buff *skb, const char *proto) { + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; - struct listen_sock *lopt; #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { @@ -6082,12 +6124,12 @@ static bool tcp_syn_flood_action(struct sock *sk, #endif NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { - lopt->synflood_warned = 1; + if (!queue->synflood_warned && + sysctl_tcp_syncookies != 2 && + xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); - } + return want_cookie; } @@ -6112,16 +6154,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { + struct tcp_fastopen_cookie foc = { .len = -1 }; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; - struct request_sock *req; struct tcp_sock *tp = tcp_sk(sk); + struct sock *fastopen_sk = NULL; struct dst_entry *dst = NULL; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; - bool want_cookie = false, fastopen; + struct request_sock *req; + bool want_cookie = false; struct flowi fl; - struct tcp_fastopen_cookie foc = { .len = -1 }; - int err; - /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is @@ -6145,7 +6186,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - req = inet_reqsk_alloc(rsk_ops, sk); + req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie); if (!req) goto drop; @@ -6228,20 +6269,30 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - + if (!want_cookie) { + tcp_reqsk_record_syn(sk, req, skb); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); + } + if (fastopen_sk) { + af_ops->send_synack(fastopen_sk, dst, &fl, req, + &foc, false); + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, fastopen_sk); + sk->sk_data_ready(sk); + bh_unlock_sock(fastopen_sk); + sock_put(fastopen_sk); + } else { tcp_rsk(req)->tfo_listener = false; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + if (!want_cookie) + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->send_synack(sk, dst, &fl, req, + &foc, !want_cookie); + if (want_cookie) + goto drop_and_free; } - tcp_reqsk_record_syn(sk, req, skb); - + reqsk_put(req); return 0; drop_and_release: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 93898e093d4e..d8841a2f1569 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -324,7 +324,6 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); } else { /* * Still in SYN_RECV, just remove it silently. @@ -332,9 +331,10 @@ void tcp_req_err(struct sock *sk, u32 seq) * created socket, and POSIX does not want network * errors returned from accept(). */ - NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); inet_csk_reqsk_queue_drop(req->rsk_listener, req); + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); } + reqsk_put(req); } EXPORT_SYMBOL(tcp_req_err); @@ -576,7 +576,7 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -795,7 +795,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) inet_twsk_put(tw); } -static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV @@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, 0, @@ -818,11 +818,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, +static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,12 +833,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); - skb_set_queue_mapping(skb, queue_mapping); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->opt); @@ -865,7 +864,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family) { @@ -877,7 +876,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, sock_owned_by_user(sk) || - lockdep_is_held(&sk->sk_lock.slock)); + lockdep_is_held((spinlock_t *)&sk->sk_lock.slock)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) @@ -894,7 +893,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, } EXPORT_SYMBOL(tcp_md5_do_lookup); -struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; @@ -922,7 +921,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk)); + sock_owned_by_user(sk) || + lockdep_is_held(&sk->sk_lock.slock)); if (!md5sig) { md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) @@ -1112,10 +1112,13 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +#endif + /* Called with rcu_read_lock() */ -static bool tcp_v4_inbound_md5_hash(struct sock *sk, +static bool tcp_v4_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG /* * This gets called for each TCP segment that arrives * so we want to be efficient. @@ -1165,10 +1168,12 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, return true; } return false; -} #endif + return false; +} -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, +static void tcp_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); @@ -1179,7 +1184,8 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, ireq->opt = tcp_v4_save_options(skb); } -static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, +static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct flowi *fl, const struct request_sock *req, bool *strict) { @@ -1218,7 +1224,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, .send_synack = tcp_v4_send_synack, - .queue_hash_add = inet_csk_reqsk_queue_hash_add, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1241,9 +1246,11 @@ EXPORT_SYMBOL(tcp_v4_conn_request); * The three way handshake has completed - we got a valid synack - * now create the new socket. */ -struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; struct inet_sock *newinet; @@ -1277,7 +1284,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; - sk_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; @@ -1320,7 +1326,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - __inet_hash_nolisten(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + if (*own_req) + tcp_move_syn(newtp, req); return newsk; @@ -1338,34 +1346,11 @@ put_and_exit: } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - - nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v4_check(sk, skb); #endif @@ -1373,7 +1358,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1404,13 +1389,13 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_cookie_check(sk, skb); + if (!nsk) goto discard; - if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1420,7 +1405,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { + if (tcp_rcv_state_process(sk, skb)) { rsk = sk; goto reset; } @@ -1508,7 +1493,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) if (likely(sk->sk_rx_dst)) skb_dst_drop(skb); else - skb_dst_force(skb); + skb_dst_force_safe(skb); __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; @@ -1590,6 +1575,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; +lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; @@ -1598,6 +1584,33 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard_and_relse; + if (likely(sk->sk_state == TCP_LISTEN)) { + nsk = tcp_check_req(sk, skb, req, false); + } else { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v4_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1606,25 +1619,23 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; -#ifdef CONFIG_TCP_MD5SIG - /* - * We really want to reject the packet as early as possible - * if: - * o We're expecting an MD5'd packet and this is no MD5 tcp option - * o There is an MD5 option and we're not expecting one - */ if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif nf_reset(skb); if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v4_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1639,6 +1650,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret; @@ -1709,8 +1721,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { - dst_hold(dst); + if (dst && dst_hold_safe(dst)) { sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } @@ -1833,35 +1844,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - if (st->state == TCP_SEQ_STATE_OPENREQ) { - struct request_sock *req = cur; - - icsk = inet_csk(st->syn_wait_sk); - req = req->dl_next; - while (1) { - while (req) { - if (req->rsk_ops->family == st->family) { - cur = req; - goto out; - } - req = req->dl_next; - } - if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) - break; -get_req: - req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; - } - sk = sk_nulls_next(st->syn_wait_sk); - st->state = TCP_SEQ_STATE_LISTENING; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } else { - icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) - goto start_req; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - sk = sk_nulls_next(sk); - } + sk = sk_nulls_next(sk); get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) @@ -1871,16 +1854,6 @@ get_sk: goto out; } icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) { -start_req: - st->uid = sock_i_uid(sk); - st->syn_wait_sk = sk; - st->state = TCP_SEQ_STATE_OPENREQ; - st->sbucket = 0; - goto get_req; - } - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2012,7 +1985,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq) void *rc = NULL; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: if (st->bucket >= INET_LHTABLE_SIZE) break; @@ -2071,7 +2043,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { @@ -2096,11 +2067,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) struct tcp_iter_state *st = seq->private; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: - if (v) { - struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); @@ -2154,7 +2120,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct request_sock *req, - struct seq_file *f, int i, kuid_t uid) + struct seq_file *f, int i) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->rsk_timer.expires - jiffies; @@ -2171,7 +2137,8 @@ static void get_openreq4(const struct request_sock *req, 1, /* timers active (only the expire timer) */ jiffies_delta_to_clock_t(delta), req->num_timeout, - from_kuid_munged(seq_user_ns(f), uid), + from_kuid_munged(seq_user_ns(f), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, @@ -2185,12 +2152,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); - struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; + const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); int rx_queue; + int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || @@ -2208,17 +2176,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_expires = jiffies; } - if (sk->sk_state == TCP_LISTEN) + state = sk_state_load(sk); + if (state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else - /* - * because we dont lock socket, we might find a transient negative value + /* Because we don't lock the socket, + * we might find a transient negative value. */ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", - i, src, srcp, dest, destp, sk->sk_state, + i, src, srcp, dest, destp, state, tp->write_seq - tp->snd_una, rx_queue, timer_active, @@ -2232,8 +2201,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sk->sk_state == TCP_LISTEN ? - (fastopenq ? fastopenq->max_qlen : 0) : + state == TCP_LISTEN ? + fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); } @@ -2272,18 +2241,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait4_sock(v, seq, st->num); - else - get_tcp4_sock(v, seq, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: - get_openreq4(v, seq, st->num, st->uid); - break; - } + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait4_sock(v, seq, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + get_openreq4(v, seq, st->num); + else + get_tcp4_sock(v, seq, st->num); out: seq_pad(seq, '\n'); return 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index def765911ff8..ac6b1961ffeb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -361,30 +361,38 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +/* Warning : This function is called without sk_listener being locked. + * Be sure to read socket fields once, as their value could change under us. + */ void tcp_openreq_init_rwin(struct request_sock *req, - struct sock *sk, struct dst_entry *dst) + const struct sock *sk_listener, + const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - __u8 rcv_wscale; + const struct tcp_sock *tp = tcp_sk(sk_listener); + u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); + int full_space = tcp_full_space(sk_listener); int mss = dst_metric_advmss(dst); + u32 window_clamp; + __u8 rcv_wscale; - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + if (user_mss && user_mss < mss) + mss = user_mss; + window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); + if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); @@ -433,7 +441,9 @@ EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) +struct sock *tcp_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); @@ -460,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + newtp->rtt_min[0].rtt = ~0U; newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; @@ -469,7 +480,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; - newtp->lsndtime = treq->snt_synack; + newtp->lsndtime = treq->snt_synack.stamp_jiffies; + newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; @@ -501,9 +513,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (sysctl_tcp_fack) tcp_enable_fack(newtp); } - newtp->window_clamp = req->window_clamp; - newtp->rcv_ssthresh = req->rcv_wnd; - newtp->rcv_wnd = req->rcv_wnd; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; @@ -536,9 +548,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_ecn_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; - - newtp->saved_syn = req->saved_syn; - req->saved_syn = NULL; + newtp->rack.mstamp.v64 = 0; + newtp->rack.advanced = 0; TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } @@ -566,8 +577,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; - - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); + bool own_req; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { @@ -698,7 +708,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); @@ -755,16 +765,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); if (!child) goto listen_overflow; - inet_csk_reqsk_queue_drop(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); - /* Warning: caller must not call reqsk_put(req); - * child stole last reference on it. - */ - return child; + sock_rps_save_rxhash(child, skb); + tcp_synack_rtt_meas(child, req); + return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: if (!sysctl_tcp_abort_on_overflow) { @@ -811,8 +819,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), - skb->len); + ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3dbee0d83b15..9bfc39ff2285 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -357,14 +357,10 @@ static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) } static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, - struct sock *sk) +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) { + if (inet_rsk(req)->ecn_ok) th->ece = 1; - if (tcp_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); - } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to @@ -612,12 +608,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct sock *sk, - struct request_sock *req, - unsigned int mss, struct sk_buff *skb, - struct tcp_out_options *opts, - const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) +static unsigned int tcp_synack_options(struct request_sock *req, + unsigned int mss, struct sk_buff *skb, + struct tcp_out_options *opts, + const struct tcp_md5sig_key *md5, + struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -1827,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, /* Ok, it looks like it is advisable to defer. */ - if (cong_win < send_win && cong_win < skb->len) + if (cong_win < send_win && cong_win <= skb->len) *is_cwnd_limited = true; return true; @@ -2060,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { - is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -2142,6 +2136,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); tcp_cwnd_validate(sk, is_cwnd_limited); return false; } @@ -2165,7 +2160,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (sk->sk_state == TCP_SYN_RECV) + if (tp->fastopen_rsk) return false; /* TLP is only scheduled when next timer event is RTO. */ @@ -2175,7 +2170,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; @@ -2184,9 +2179,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account - * for delayed ack when there's one outstanding packet. + * for delayed ack when there's one outstanding packet. If no RTT + * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1; + timeout = rtt << 1 ? : TCP_TIMEOUT_INIT; if (tp->packets_out == 1) timeout = max_t(u32, timeout, (rtt + (rtt >> 1) + TCP_DELACK_MAX)); @@ -2659,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) net_dbg_ratelimited("retrans_out leaked\n"); } #endif - if (!tp->retrans_out) - tp->lost_retrans_low = tp->snd_nxt; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); @@ -2668,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, - * see tcp_input.c tcp_sacktag_write_queue(). - */ - TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } @@ -2949,20 +2939,22 @@ int tcp_send_synack(struct sock *sk) * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. */ -struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, +struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { - struct tcp_out_options opts; struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th; - struct sk_buff *skb; + const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *md5 = NULL; + struct tcp_out_options opts; + struct sk_buff *skb; int tcp_header_size; + struct tcphdr *th; + u16 user_mss; int mss; - skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2970,11 +2962,21 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); + if (attach_req) { + skb_set_owner_w(skb, req_to_sk(req)); + } else { + /* sk is a const pointer, because we want to express multiple + * cpu might call us concurrently. + * sk->sk_wmem_alloc in an atomic, we can promote to rw. + */ + skb_set_owner_w(skb, (struct sock *)sk); + } skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && user_mss < mss) + mss = user_mss; memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES @@ -2988,8 +2990,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, rcu_read_lock(); md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif - tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc) + sizeof(*th); + skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2998,7 +3001,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th, sk); + tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is @@ -3012,8 +3015,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts); + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); + tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); @@ -3147,7 +3150,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, err = 0, copied; + int syn_loss = 0, space, err = 0; unsigned long last_syn_loss = 0; struct sk_buff *syn_data; @@ -3185,17 +3188,18 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto fallback; syn_data->ip_summed = CHECKSUM_PARTIAL; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); - copied = copy_from_iter(skb_put(syn_data, space), space, - &fo->data->msg_iter); - if (unlikely(!copied)) { - kfree_skb(syn_data); - goto fallback; - } - if (copied != space) { - skb_trim(syn_data, copied); - space = copied; + if (space) { + int copied = copy_from_iter(skb_put(syn_data, space), space, + &fo->data->msg_iter); + if (unlikely(!copied)) { + kfree_skb(syn_data); + goto fallback; + } + if (copied != space) { + skb_trim(syn_data, copied); + space = copied; + } } - /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; @@ -3500,13 +3504,14 @@ void tcp_send_probe0(struct sock *sk) TCP_RTO_MAX); } -int tcp_rtx_synack(struct sock *sk, struct request_sock *req) +int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + tcp_rsk(req)->txhash = net_tx_rndhash(); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c new file mode 100644 index 000000000000..5353085fd0b2 --- /dev/null +++ b/net/ipv4/tcp_recovery.c @@ -0,0 +1,109 @@ +#include <linux/tcp.h> +#include <net/tcp.h> + +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; + +/* Marks a packet lost, if some packet sent later has been (s)acked. + * The underlying idea is similar to the traditional dupthresh and FACK + * but they look at different metrics: + * + * dupthresh: 3 OOO packets delivered (packet count) + * FACK: sequence delta to highest sacked sequence (sequence space) + * RACK: sent time delta to the latest delivered packet (time domain) + * + * The advantage of RACK is it applies to both original and retransmitted + * packet and therefore is robust against tail losses. Another advantage + * is being more resilient to reordering by simply allowing some + * "settling delay", instead of tweaking the dupthresh. + * + * The current version is only used after recovery starts but can be + * easily extended to detect the first loss. + */ +int tcp_rack_mark_lost(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + u32 reo_wnd, prior_retrans = tp->retrans_out; + + if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) + return 0; + + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + + /* To be more reordering resilient, allow min_rtt/4 settling delay + * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed + * RTT because reordering is often a path property and less related + * to queuing or delayed ACKs. + * + * TODO: measure and adapt to the observed reordering delay, and + * use a timer to retransmit like the delayed early retransmit. + */ + reo_wnd = 1000; + if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + + tcp_for_write_queue(skb, sk) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + + if (skb == tcp_send_head(sk)) + break; + + /* Skip ones already (s)acked */ + if (!after(scb->end_seq, tp->snd_una) || + scb->sacked & TCPCB_SACKED_ACKED) + continue; + + if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + + if (skb_mstamp_us_delta(&tp->rack.mstamp, + &skb->skb_mstamp) <= reo_wnd) + continue; + + /* skb is lost if packet sent later is sacked */ + tcp_skb_mark_lost_uncond_verify(tp, skb); + if (scb->sacked & TCPCB_SACKED_RETRANS) { + scb->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSTRETRANSMIT); + } + } else if (!(scb->sacked & TCPCB_RETRANS)) { + /* Original data are sent sequentially so stop early + * b/c the rest are all sent after rack_sent + */ + break; + } + } + return prior_retrans - tp->retrans_out; +} + +/* Record the most recently (re)sent time among the (s)acked packets */ +void tcp_rack_advance(struct tcp_sock *tp, + const struct skb_mstamp *xmit_time, u8 sacked) +{ + if (tp->rack.mstamp.v64 && + !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + return; + + if (sacked & TCPCB_RETRANS) { + struct skb_mstamp now; + + /* If the sacked packet was retransmitted, it's ambiguous + * whether the retransmission or the original (or the prior + * retransmission) was sacked. + * + * If the original is lost, there is no ambiguity. Otherwise + * we assume the original can be delayed up to aRTT + min_rtt. + * the aRTT term is bounded by the fast recovery or timeout, + * so it's at least one RTT (i.e., retransmission is at least + * an RTT later). + */ + skb_mstamp_get(&now); + if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + return; + } + + tp->rack.mstamp = *xmit_time; + tp->rack.advanced = 1; +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7149ebc820c7..193ba1fa8a9a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) } /* Calculate maximal number or retries on an orphaned socket. */ -static int tcp_orphan_retries(struct sock *sk, int alive) +static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = sysctl_tcp_orphan_retries; /* May be zero. */ @@ -168,7 +168,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data) + if (tp->syn_data && icsk->icsk_retransmits == 1) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } @@ -176,6 +176,18 @@ static int tcp_write_timeout(struct sock *sk) syn_set = true; } else { if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + /* Some middle-boxes may black-hole Fast Open _after_ + * the handshake. Therefore we conservatively disable + * Fast Open on this path on recurring timeouts with + * few or zero bytes acked after Fast Open. + */ + if (tp->syn_data_acked && + tp->bytes_acked <= tp->rx_opt.mss_clamp) { + tcp_fastopen_cache_set(sk, 0, NULL, true, 0); + if (icsk->icsk_retransmits == sysctl_tcp_retries1) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -184,7 +196,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -298,7 +310,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f7d1d5e19e95..0c7b0e61b917 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -100,7 +100,6 @@ #include <linux/slab.h> #include <net/tcp_states.h> #include <linux/skbuff.h> -#include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/net_namespace.h> @@ -375,7 +374,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; return score; } @@ -419,6 +419,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } @@ -1017,30 +1020,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - /* unconnected socket. If output device is enslaved to a VRF - * device lookup source address from VRF table. This mimics - * behavior of ip_route_connect{_init}. - */ - if (netif_index_is_vrf(net, ipc.oif)) { - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, - RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC | - FLOWI_FLAG_SKIP_NH_OIF), - faddr, saddr, dport, - inet->inet_sport); - - rt = ip_route_output_flow(net, fl4, sk); - if (!IS_ERR(rt)) { - saddr = fl4->saddr; - ip_rt_put(rt); - } - } - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport); + if (!saddr && ipc.oif) + l3mdev_get_saddr(net, ipc.oif, fl4); + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 60b032f58ccc..62e1e72db461 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -22,7 +22,8 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } -static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb) +static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); @@ -52,8 +53,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) iph->tot_len = htons(skb->len); ip_send_check(iph); - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 41a261355662..7ee6518afa86 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -82,24 +82,25 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) return xfrm_output(sk, skb); } -static int __xfrm4_output(struct sock *sk, struct sk_buff *skb) +static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; #ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif return x->outer_mode->afinfo->output_finish(sk, skb); } -int xfrm4_output(struct sock *sk, struct sk_buff *skb) +int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, skb_dst(skb)->dev, __xfrm4_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb_dst(skb)->dev, + __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 126ff9020bad..7b0edb37a115 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,7 +15,7 @@ #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> -#include <net/vrf.h> +#include <net/l3mdev.h> static struct xfrm_policy_afinfo xfrm4_policy_afinfo; @@ -97,6 +97,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; + xdst->u.rt.rt_table_id = rt->rt_table_id; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; @@ -110,10 +111,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; - if (skb_dst(skb)) { - oif = vrf_master_ifindex(skb_dst(skb)->dev) ? - : skb_dst(skb)->dev->ifindex; - } + if (skb_dst(skb)) + oif = l3mdev_fib_oif(skb_dst(skb)->dev); memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; @@ -128,7 +127,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) case IPPROTO_DCCP: if (xprth + 4 < skb->data || pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ports = (__be16 *)xprth; + __be16 *ports; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ports = (__be16 *)xprth; fl4->fl4_sport = ports[!!reverse]; fl4->fl4_dport = ports[!reverse]; @@ -136,8 +138,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) break; case IPPROTO_ICMP: - if (pskb_may_pull(skb, xprth + 2 - skb->data)) { - u8 *icmp = xprth; + if (xprth + 2 < skb->data || + pskb_may_pull(skb, xprth + 2 - skb->data)) { + u8 *icmp; + + xprth = skb_network_header(skb) + iph->ihl * 4; + icmp = xprth; fl4->fl4_icmp_type = icmp[0]; fl4->fl4_icmp_code = icmp[1]; @@ -145,33 +151,50 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) break; case IPPROTO_ESP: - if (pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be32 *ehdr = (__be32 *)xprth; + if (xprth + 4 < skb->data || + pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be32 *ehdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ehdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ehdr[0]; } break; case IPPROTO_AH: - if (pskb_may_pull(skb, xprth + 8 - skb->data)) { - __be32 *ah_hdr = (__be32 *)xprth; + if (xprth + 8 < skb->data || + pskb_may_pull(skb, xprth + 8 - skb->data)) { + __be32 *ah_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ah_hdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ah_hdr[1]; } break; case IPPROTO_COMP: - if (pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ipcomp_hdr = (__be16 *)xprth; + if (xprth + 4 < skb->data || + pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be16 *ipcomp_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ipcomp_hdr = (__be16 *)xprth; fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); } break; case IPPROTO_GRE: - if (pskb_may_pull(skb, xprth + 12 - skb->data)) { - __be16 *greflags = (__be16 *)xprth; - __be32 *gre_hdr = (__be32 *)xprth; + if (xprth + 12 < skb->data || + pskb_may_pull(skb, xprth + 12 - skb->data)) { + __be16 *greflags; + __be32 *gre_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + greflags = (__be16 *)xprth; + gre_hdr = (__be32 *)xprth; if (greflags[0] & GRE_KEY) { if (greflags[0] & GRE_CSUM) @@ -245,7 +268,7 @@ static struct dst_ops xfrm4_dst_ops_template = { .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = 32768, + .gc_thresh = INT_MAX, }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |