summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c33
-rw-r--r--net/ipv4/arp.c20
-rw-r--r--net/ipv4/devinet.c7
-rw-r--r--net/ipv4/fib_frontend.c35
-rw-r--r--net/ipv4/fib_semantics.c175
-rw-r--r--net/ipv4/fou.c3
-rw-r--r--net/ipv4/icmp.c27
-rw-r--r--net/ipv4/igmp.c23
-rw-r--r--net/ipv4/inet_connection_sock.c273
-rw-r--r--net/ipv4/inet_diag.c96
-rw-r--r--net/ipv4/inet_fragment.c6
-rw-r--r--net/ipv4/inet_hashtables.c53
-rw-r--r--net/ipv4/ip_forward.c19
-rw-r--r--net/ipv4/ip_fragment.c25
-rw-r--r--net/ipv4/ip_input.c47
-rw-r--r--net/ipv4/ip_output.c148
-rw-r--r--net/ipv4/ip_sockglue.c45
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/ip_vti.c2
-rw-r--r--net/ipv4/ipconfig.c32
-rw-r--r--net/ipv4/ipip.c3
-rw-r--r--net/ipv4/ipmr.c38
-rw-r--r--net/ipv4/netfilter.c7
-rw-r--r--net/ipv4/netfilter/Kconfig1
-rw-r--r--net/ipv4/netfilter/arp_tables.c15
-rw-r--r--net/ipv4/netfilter/arptable_filter.c7
-rw-r--r--net/ipv4/netfilter/ip_tables.c31
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c12
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c2
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c32
-rw-r--r--net/ipv4/netfilter/ipt_ah.c2
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c5
-rw-r--r--net/ipv4/netfilter/iptable_filter.c9
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c19
-rw-r--r--net/ipv4/netfilter/iptable_nat.c26
-rw-r--r--net/ipv4/netfilter/iptable_raw.c9
-rw-r--r--net/ipv4/netfilter/iptable_security.c12
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c18
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c4
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c18
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c25
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c44
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c6
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c10
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c22
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c5
-rw-r--r--net/ipv4/raw.c21
-rw-r--r--net/ipv4/route.c216
-rw-r--r--net/ipv4/syncookies.c23
-rw-r--r--net/ipv4/sysctl_net_ipv4.c18
-rw-r--r--net/ipv4/tcp.c67
-rw-r--r--net/ipv4/tcp_cong.c12
-rw-r--r--net/ipv4/tcp_diag.c2
-rw-r--r--net/ipv4/tcp_fastopen.c75
-rw-r--r--net/ipv4/tcp_input.c303
-rw-r--r--net/ipv4/tcp_ipv4.c227
-rw-r--r--net/ipv4/tcp_minisocks.c71
-rw-r--r--net/ipv4/tcp_output.c107
-rw-r--r--net/ipv4/tcp_recovery.c109
-rw-r--r--net/ipv4/tcp_timer.c20
-rw-r--r--net/ipv4/udp.c29
-rw-r--r--net/ipv4/xfrm4_input.c7
-rw-r--r--net/ipv4/xfrm4_output.c11
-rw-r--r--net/ipv4/xfrm4_policy.c59
72 files changed, 1531 insertions, 1328 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 89aacb630a53..c29809f765dc 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1d0c3adb6f34..5c5db6636704 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -119,7 +119,7 @@
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
-#include <net/vrf.h>
+#include <net/l3mdev.h>
/* The inetsw table contains everything that inet_create needs to
@@ -219,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog)
* shutdown() (rather than close()).
*/
if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
- !inet_csk(sk)->icsk_accept_queue.fastopenq) {
+ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
- err = fastopen_init_queue(sk, backlog);
+ fastopen_queue_tune(sk, backlog);
else if ((sysctl_tcp_fastopen &
TFO_SERVER_WO_SOCKOPT2) != 0)
- err = fastopen_init_queue(sk,
+ fastopen_queue_tune(sk,
((uint)sysctl_tcp_fastopen) >> 16);
- else
- err = 0;
- if (err)
- goto out;
tcp_fastopen_init_key_once(true);
}
@@ -261,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -450,7 +449,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- tb_id = vrf_dev_table_ifindex(net, sk->sk_bound_dev_if) ? : tb_id;
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
/* Not specified by any standard per-se, however it breaks too
@@ -1043,22 +1042,16 @@ void inet_register_protosw(struct inet_protosw *p)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
- answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
-
/* Check only the non-wild match. */
- if (INET_PROTOSW_PERMANENT & answer->flags) {
- if (protocol == answer->protocol)
- break;
- last_perm = lh;
- }
-
- answer = NULL;
+ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
+ break;
+ if (protocol == answer->protocol)
+ goto out_permanent;
+ last_perm = lh;
}
- if (answer)
- goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 0c9c3482e419..59b3e0e8fd51 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -624,14 +624,20 @@ out:
}
EXPORT_SYMBOL(arp_create);
+static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+
/*
* Send an arp packet.
*/
void arp_xmit(struct sk_buff *skb)
{
/* Send it off, maybe filter it using firewalling first. */
- NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb,
- NULL, skb->dev, dev_queue_xmit_sk);
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
+ dev_net(skb->dev), NULL, skb, NULL, skb->dev,
+ arp_xmit_finish);
}
EXPORT_SYMBOL(arp_xmit);
@@ -639,7 +645,7 @@ EXPORT_SYMBOL(arp_xmit);
* Process an arp request.
*/
-static int arp_process(struct sock *sk, struct sk_buff *skb)
+static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -651,7 +657,6 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
u16 dev_type = dev->type;
int addr_type;
struct neighbour *n;
- struct net *net = dev_net(dev);
struct dst_entry *reply_dst = NULL;
bool is_garp = false;
@@ -872,7 +877,7 @@ out_free_dst:
static void parp_redo(struct sk_buff *skb)
{
- arp_process(NULL, skb);
+ arp_process(dev_net(skb->dev), NULL, skb);
}
@@ -905,8 +910,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
- return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb,
- dev, NULL, arp_process);
+ return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
+ dev_net(dev), NULL, skb, dev, NULL,
+ arp_process);
consumeskb:
consume_skb(skb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 2d9cb1748f81..cebd9d31e65a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1644,7 +1644,8 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
}
-static size_t inet_get_link_af_size(const struct net_device *dev)
+static size_t inet_get_link_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
@@ -1654,7 +1655,8 @@ static size_t inet_get_link_af_size(const struct net_device *dev)
return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
}
-static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
struct nlattr *nla;
@@ -2397,4 +2399,3 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
inet_netconf_dump_devconf, NULL);
}
-
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 457b2cd75b85..473447593060 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -45,7 +45,7 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
#include <trace/events/fib.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -255,7 +255,7 @@ EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
- u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
return __inet_dev_addr_type(net, dev, addr, rt_table);
}
@@ -268,7 +268,7 @@ unsigned int inet_addr_type_dev_table(struct net *net,
const struct net_device *dev,
__be32 addr)
{
- u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
return __inet_dev_addr_type(net, NULL, addr, rt_table);
}
@@ -332,7 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
bool dev_match;
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = vrf_master_ifindex_rcu(dev);
+ fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
if (!fl4.flowi4_iif)
fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
@@ -367,7 +367,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
if (nh->nh_dev == dev) {
dev_match = true;
break;
- } else if (vrf_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+ } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
dev_match = true;
break;
}
@@ -804,7 +804,7 @@ out:
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
- u32 tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev);
+ u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
@@ -867,9 +867,10 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- prefix, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
@@ -914,9 +915,10 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
}
} else if (!ipv4_is_zeronet(any) &&
(any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_DELROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- any, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
subnet = 1;
}
@@ -1153,6 +1155,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_changeupper_info *info;
struct in_device *in_dev;
struct net *net = dev_net(dev);
unsigned int flags;
@@ -1191,6 +1194,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
case NETDEV_CHANGEMTU:
rt_cache_flush(net);
break;
+ case NETDEV_CHANGEUPPER:
+ info = ptr;
+ /* flush all routes if dev is linked to or unlinked from
+ * an L3 master device (e.g., VRF)
+ */
+ if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ fib_disable_ip(dev, NETDEV_DOWN, true);
+ break;
}
return NOTIFY_DONE;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e966f8599b4a..d97268e8ff10 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,8 +57,7 @@ static unsigned int fib_info_cnt;
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-static DEFINE_SPINLOCK(fib_multipath_lock);
+u32 fib_multipath_secret __read_mostly;
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
@@ -532,7 +531,67 @@ errout:
return ret;
}
-#endif
+static void fib_rebalance(struct fib_info *fi)
+{
+ int total;
+ int w;
+ struct in_device *in_dev;
+
+ if (fi->fib_nhs < 2)
+ return;
+
+ total = 0;
+ for_nexthops(fi) {
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+
+ in_dev = __in_dev_get_rtnl(nh->nh_dev);
+
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nh->nh_flags & RTNH_F_LINKDOWN)
+ continue;
+
+ total += nh->nh_weight;
+ } endfor_nexthops(fi);
+
+ w = 0;
+ change_nexthops(fi) {
+ int upper_bound;
+
+ in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
+
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+ upper_bound = -1;
+ } else if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
+ upper_bound = -1;
+ } else {
+ w += nexthop_nh->nh_weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+ total) - 1;
+ }
+
+ atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
+ } endfor_nexthops(fi);
+
+ net_get_random_once(&fib_multipath_secret,
+ sizeof(fib_multipath_secret));
+}
+
+static inline void fib_add_weight(struct fib_info *fi,
+ const struct fib_nh *nh)
+{
+ fi->fib_weight += nh->nh_weight;
+}
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define fib_rebalance(fi) do { } while (0)
+#define fib_add_weight(fi, nh) do { } while (0)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
static int fib_encap_match(struct net *net, u16 encap_type,
struct nlattr *encap,
@@ -864,14 +923,21 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
fib_prefsrc != cfg->fc_dst) {
u32 tb_id = cfg->fc_table;
+ int rc;
if (tb_id == RT_TABLE_MAIN)
tb_id = RT_TABLE_LOCAL;
- if (inet_addr_type_table(cfg->fc_nlinfo.nl_net,
- fib_prefsrc, tb_id) != RTN_LOCAL) {
- return false;
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, tb_id);
+
+ if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, RT_TABLE_LOCAL);
}
+
+ if (rc != RTN_LOCAL)
+ return false;
}
return true;
}
@@ -1094,8 +1160,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
+ fib_add_weight(fi, nexthop_nh);
} endfor_nexthops(fi)
+ fib_rebalance(fi);
+
link_it:
ofi = fib_find_info(fi);
if (ofi) {
@@ -1322,12 +1391,6 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
break;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- fi->fib_power -= nexthop_nh->nh_power;
- nexthop_nh->nh_power = 0;
- spin_unlock_bh(&fib_multipath_lock);
-#endif
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1350,6 +1413,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
}
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
@@ -1479,20 +1544,15 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
!__in_dev_get_rtnl(dev))
continue;
alive++;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- nexthop_nh->nh_power = 0;
- nexthop_nh->nh_flags &= ~nh_flags;
- spin_unlock_bh(&fib_multipath_lock);
-#else
nexthop_nh->nh_flags &= ~nh_flags;
-#endif
} endfor_nexthops(fi)
if (alive > 0) {
fi->fib_flags &= ~nh_flags;
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
@@ -1500,62 +1560,41 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+void fib_select_multipath(struct fib_result *res, int hash)
{
struct fib_info *fi = res->fi;
- struct in_device *in_dev;
- int w;
-
- spin_lock_bh(&fib_multipath_lock);
- if (fi->fib_power <= 0) {
- int power = 0;
- change_nexthops(fi) {
- in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);
- if (nexthop_nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (in_dev &&
- IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
- nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
- continue;
- power += nexthop_nh->nh_weight;
- nexthop_nh->nh_power = nexthop_nh->nh_weight;
- } endfor_nexthops(fi);
- fi->fib_power = power;
- if (power <= 0) {
- spin_unlock_bh(&fib_multipath_lock);
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
- return;
- }
- }
-
- /* w should be random number [0..fi->fib_power-1],
- * it is pretty bad approximation.
- */
-
- w = jiffies % fi->fib_power;
+ for_nexthops(fi) {
+ if (hash > atomic_read(&nh->nh_upper_bound))
+ continue;
- change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
- nexthop_nh->nh_power) {
- w -= nexthop_nh->nh_power;
- if (w <= 0) {
- nexthop_nh->nh_power--;
- fi->fib_power--;
- res->nh_sel = nhsel;
- spin_unlock_bh(&fib_multipath_lock);
- return;
- }
- }
+ res->nh_sel = nhsel;
+ return;
} endfor_nexthops(fi);
/* Race condition: route has just become dead. */
res->nh_sel = 0;
- spin_unlock_bh(&fib_multipath_lock);
}
#endif
+
+void fib_select_path(struct net *net, struct fib_result *res,
+ struct flowi4 *fl4, int mp_hash)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (mp_hash < 0)
+ mp_hash = get_hash_from_flowi4(fl4) >> 1;
+
+ fib_select_multipath(res, mp_hash);
+ }
+ else
+#endif
+ if (!res->prefixlen &&
+ res->table->tb_num_default > 1 &&
+ res->type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(fl4, res);
+
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, *res);
+}
+EXPORT_SYMBOL_GPL(fib_select_path);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index e0fcbbbcfe54..bd903fe0f750 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -24,6 +24,7 @@ struct fou {
u16 type;
struct udp_offload udp_offloads;
struct list_head list;
+ struct rcu_head rcu;
};
#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
@@ -417,7 +418,7 @@ static void fou_release(struct fou *fou)
list_del(&fou->list);
udp_tunnel_sock_release(sock);
- kfree(fou);
+ kfree_rcu(fou, rcu);
}
static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e5eb8ac4089d..36e26977c908 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -96,7 +96,7 @@
#include <net/xfrm.h>
#include <net/inet_common.h>
#include <net/ip_fib.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
/*
* Build xmit assembly blocks
@@ -309,7 +309,7 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
rc = false;
if (icmp_global_allow()) {
- int vif = vrf_master_ifindex(dst->dev);
+ int vif = l3mdev_master_ifindex(dst->dev);
struct inet_peer *peer;
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
@@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
- fl4.flowi4_oif = vrf_master_ifindex(skb->dev);
+ fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
@@ -440,6 +440,22 @@ out_unlock:
icmp_xmit_unlock(sk);
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Source and destination is swapped. See ip_multipath_icmp_hash */
+static int icmp_multipath_hash_skb(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return fib_multipath_hash(iph->daddr, iph->saddr);
+}
+
+#else
+
+#define icmp_multipath_hash_skb(skb) (-1)
+
+#endif
+
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -461,10 +477,11 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
- fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev);
+ fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
- rt = __ip_route_output_key(net, fl4);
+ rt = __ip_route_output_key_hash(net, fl4,
+ icmp_multipath_hash_skb(skb_in));
if (IS_ERR(rt))
return rt;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d38b8b61eaee..05e4cba14162 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return ip_local_out(skb);
+ return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -739,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
ih->group = group;
ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
static void igmp_gq_timer_expire(unsigned long data)
@@ -2126,7 +2126,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
ASSERT_RTNL();
in_dev = ip_mc_find_dev(net, imr);
- if (!in_dev) {
+ if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
ret = -ENODEV;
goto out;
}
@@ -2147,7 +2147,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
*imlp = iml->next_rcu;
- ip_mc_dec_group(in_dev, group);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, group);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
@@ -2392,11 +2393,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
struct ip_sf_socklist *psl;
struct net *net = sock_net(sk);
+ ASSERT_RTNL();
+
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
@@ -2417,7 +2418,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
goto done;
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
- rtnl_unlock();
if (!psl) {
len = 0;
count = 0;
@@ -2436,7 +2436,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
return -EFAULT;
return 0;
done:
- rtnl_unlock();
return err;
}
@@ -2450,6 +2449,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ ASSERT_RTNL();
+
psin = (struct sockaddr_in *)&gsf->gf_group;
if (psin->sin_family != AF_INET)
return -EINVAL;
@@ -2457,8 +2458,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
err = -EADDRNOTAVAIL;
for_each_pmc_rtnl(inet, pmc) {
@@ -2470,7 +2469,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
goto done;
gsf->gf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
- rtnl_unlock();
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
@@ -2490,7 +2488,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
}
return 0;
done:
- rtnl_unlock();
return err;
}
@@ -2569,7 +2566,7 @@ void ip_mc_drop_socket(struct sock *sk)
}
/* called with rcu_read_lock() */
-int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
{
struct ip_mc_list *im;
struct ip_mc_list __rcu **mc_hash;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 61b45a17fc73..46b9c887bede 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -330,14 +330,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error)
goto out_err;
}
- req = reqsk_queue_remove(queue);
+ req = reqsk_queue_remove(queue, sk);
newsk = req->sk;
- sk_acceptq_removed(sk);
if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_rsk(req)->tfo_listener &&
- queue->fastopenq) {
- spin_lock_bh(&queue->fastopenq->lock);
+ tcp_rsk(req)->tfo_listener) {
+ spin_lock_bh(&queue->fastopenq.lock);
if (tcp_rsk(req)->tfo_listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
@@ -348,7 +346,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
req->sk = NULL;
req = NULL;
}
- spin_unlock_bh(&queue->fastopenq->lock);
+ spin_unlock_bh(&queue->fastopenq.lock);
}
out:
release_sock(sk);
@@ -408,7 +406,7 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
-struct dst_entry *inet_csk_route_req(struct sock *sk,
+struct dst_entry *inet_csk_route_req(const struct sock *sk,
struct flowi4 *fl4,
const struct request_sock *req)
{
@@ -439,7 +437,7 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);
-struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *newsk,
const struct request_sock *req)
{
@@ -478,65 +476,12 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
- const u32 rnd, const u32 synq_hsize)
-{
- return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
-}
-
#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) true
#endif
-/* Note: this is temporary :
- * req sock will no longer be in listener hash table
-*/
-struct request_sock *inet_csk_search_req(struct sock *sk,
- const __be16 rport,
- const __be32 raddr,
- const __be32 laddr)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req;
- u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
- lopt->nr_table_entries);
-
- spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
- for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- if (ireq->ir_rmt_port == rport &&
- ireq->ir_rmt_addr == raddr &&
- ireq->ir_loc_addr == laddr &&
- AF_INET_FAMILY(req->rsk_ops->family)) {
- atomic_inc(&req->rsk_refcnt);
- WARN_ON(req->sk);
- break;
- }
- }
- spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return req;
-}
-EXPORT_SYMBOL_GPL(inet_csk_search_req);
-
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- unsigned long timeout)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
- inet_rsk(req)->ir_rmt_port,
- lopt->hash_rnd, lopt->nr_table_entries);
-
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
- inet_csk_reqsk_queue_added(sk, timeout);
-}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
-
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;
@@ -563,7 +508,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
req->num_timeout >= rskq_defer_accept - 1;
}
-int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
int err = req->rsk_ops->rtx_syn_ack(parent, req);
@@ -573,27 +518,20 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
-/* return true if req was found in the syn_table[] */
+/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue,
struct request_sock *req)
{
- struct request_sock **prev;
- struct listen_sock *lopt;
+ struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
bool found = false;
- spin_lock(&queue->syn_wait_lock);
- lopt = queue->listen_opt;
- if (lopt) {
- for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
- prev = &(*prev)->dl_next) {
- if (*prev == req) {
- *prev = req->dl_next;
- found = true;
- break;
- }
- }
+ if (sk_hashed(req_to_sk(req))) {
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
+
+ spin_lock(lock);
+ found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
+ spin_unlock(lock);
}
- spin_unlock(&queue->syn_wait_lock);
if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
reqsk_put(req);
return found;
@@ -608,21 +546,25 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
+{
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_put(req);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+
static void reqsk_timer_handler(unsigned long data)
{
struct request_sock *req = (struct request_sock *)data;
struct sock *sk_listener = req->rsk_listener;
struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct listen_sock *lopt = queue->listen_opt;
int qlen, expire = 0, resend = 0;
int max_retries, thresh;
u8 defer_accept;
- if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
- reqsk_put(req);
- return;
- }
+ if (sk_state_load(sk_listener) != TCP_LISTEN)
+ goto drop;
max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
thresh = max_retries;
@@ -643,9 +585,9 @@ static void reqsk_timer_handler(unsigned long data)
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
- qlen = listen_sock_qlen(lopt);
- if (qlen >> (lopt->max_qlen_log - 1)) {
- int young = listen_sock_young(lopt) << 1;
+ qlen = reqsk_queue_len(queue);
+ if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
+ int young = reqsk_queue_len_young(queue) << 1;
while (thresh > 2) {
if (qlen < young)
@@ -667,41 +609,40 @@ static void reqsk_timer_handler(unsigned long data)
unsigned long timeo;
if (req->num_timeout++ == 0)
- atomic_inc(&lopt->young_dec);
+ atomic_dec(&queue->young);
timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
return;
}
- inet_csk_reqsk_queue_drop(sk_listener, req);
- reqsk_put(req);
+drop:
+ inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}
-void reqsk_queue_hash_req(struct request_sock_queue *queue,
- u32 hash, struct request_sock *req,
- unsigned long timeout)
+static void reqsk_queue_hash_req(struct request_sock *req,
+ unsigned long timeout)
{
- struct listen_sock *lopt = queue->listen_opt;
-
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
- req->rsk_hash = hash;
+ inet_ehash_insert(req_to_sk(req), NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
- atomic_set(&req->rsk_refcnt, 2);
+ atomic_set(&req->rsk_refcnt, 2 + 1);
+}
- spin_lock(&queue->syn_wait_lock);
- req->dl_next = lopt->syn_table[hash];
- lopt->syn_table[hash] = req;
- spin_unlock(&queue->syn_wait_lock);
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+ unsigned long timeout)
+{
+ reqsk_queue_hash_req(req, timeout);
+ inet_csk_reqsk_queue_added(sk);
}
-EXPORT_SYMBOL(reqsk_queue_hash_req);
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
@@ -792,16 +733,14 @@ void inet_csk_prepare_forced_close(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
-int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+int inet_csk_listen_start(struct sock *sk, int backlog)
{
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+ struct inet_sock *inet = inet_sk(sk);
- if (rc != 0)
- return rc;
+ reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = 0;
+ sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
@@ -810,7 +749,7 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
- sk->sk_state = TCP_LISTEN;
+ sk_state_store(sk, TCP_LISTEN);
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);
@@ -821,11 +760,76 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
}
sk->sk_state = TCP_CLOSE;
- __reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+static void inet_child_forget(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != req->rsk_listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ }
+ inet_csk_destroy_sock(child);
+ reqsk_put(req);
+}
+
+void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+
+ spin_lock(&queue->rskq_lock);
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_child_forget(sk, req, child);
+ } else {
+ req->sk = child;
+ req->dl_next = NULL;
+ if (queue->rskq_accept_head == NULL)
+ queue->rskq_accept_head = req;
+ else
+ queue->rskq_accept_tail->dl_next = req;
+ queue->rskq_accept_tail = req;
+ sk_acceptq_added(sk);
+ }
+ spin_unlock(&queue->rskq_lock);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
+
+struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
+ struct request_sock *req, bool own_req)
+{
+ if (own_req) {
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ inet_csk_reqsk_queue_add(sk, req, child);
+ /* Warning: caller must not call reqsk_put(req);
+ * child stole last reference on it.
+ */
+ return child;
+ }
+ /* Too bad, another child took ownership of the request, undo. */
+ bh_unlock_sock(child);
+ sock_put(child);
+ return NULL;
+}
+EXPORT_SYMBOL(inet_csk_complete_hashdance);
+
/*
* This routine closes sockets which have been at least partially
* opened, but not yet accepted.
@@ -834,11 +838,7 @@ void inet_csk_listen_stop(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct request_sock *acc_req;
- struct request_sock *req;
-
- /* make all the listen_opt local to us */
- acc_req = reqsk_queue_yank_acceptq(queue);
+ struct request_sock *next, *req;
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
@@ -848,57 +848,34 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
- reqsk_queue_destroy(queue);
-
- while ((req = acc_req) != NULL) {
+ while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
struct sock *child = req->sk;
- acc_req = req->dl_next;
-
local_bh_disable();
bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child));
sock_hold(child);
- sk->sk_prot->disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- percpu_counter_inc(sk->sk_prot->orphan_count);
-
- if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
- BUG_ON(tcp_sk(child)->fastopen_rsk != req);
- BUG_ON(sk != req->rsk_listener);
-
- /* Paranoid, to prevent race condition if
- * an inbound pkt destined for child is
- * blocked by sock lock in tcp_v4_rcv().
- * Also to satisfy an assertion in
- * tcp_v4_destroy_sock().
- */
- tcp_sk(child)->fastopen_rsk = NULL;
- }
- inet_csk_destroy_sock(child);
-
+ inet_child_forget(sk, req, child);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
- sk_acceptq_removed(sk);
- reqsk_put(req);
+ cond_resched();
}
- if (queue->fastopenq) {
+ if (queue->fastopenq.rskq_rst_head) {
/* Free all the reqs queued in rskq_rst_head. */
- spin_lock_bh(&queue->fastopenq->lock);
- acc_req = queue->fastopenq->rskq_rst_head;
- queue->fastopenq->rskq_rst_head = NULL;
- spin_unlock_bh(&queue->fastopenq->lock);
- while ((req = acc_req) != NULL) {
- acc_req = req->dl_next;
+ spin_lock_bh(&queue->fastopenq.lock);
+ req = queue->fastopenq.rskq_rst_head;
+ queue->fastopenq.rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq.lock);
+ while (req != NULL) {
+ next = req->dl_next;
reqsk_put(req);
+ req = next;
}
}
- WARN_ON(sk->sk_ack_backlog);
+ WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c3b1f3a0f4cf..ab9f8a66615d 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -730,91 +730,21 @@ static void twsk_build_assert(void)
#endif
}
-static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct inet_diag_entry entry;
- int j, s_j, reqnum, s_reqnum;
- struct listen_sock *lopt;
- int err = 0;
-
- s_j = cb->args[3];
- s_reqnum = cb->args[4];
-
- if (s_j > 0)
- s_j--;
-
- entry.family = sk->sk_family;
-
- spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- lopt = icsk->icsk_accept_queue.listen_opt;
- if (!lopt || !listen_sock_qlen(lopt))
- goto out;
-
- if (bc) {
- entry.sport = inet->inet_num;
- entry.userlocks = sk->sk_userlocks;
- }
-
- for (j = s_j; j < lopt->nr_table_entries; j++) {
- struct request_sock *req, *head = lopt->syn_table[j];
-
- reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- if (reqnum < s_reqnum)
- continue;
- if (r->id.idiag_dport != ireq->ir_rmt_port &&
- r->id.idiag_dport)
- continue;
-
- if (bc) {
- /* Note: entry.sport and entry.userlocks are already set */
- entry_fill_addrs(&entry, req_to_sk(req));
- entry.dport = ntohs(ireq->ir_rmt_port);
-
- if (!inet_diag_bc_run(bc, &entry))
- continue;
- }
-
- err = inet_req_diag_fill(req_to_sk(req), skb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI, cb->nlh);
- if (err < 0) {
- cb->args[3] = j + 1;
- cb->args[4] = reqnum;
- goto out;
- }
- }
-
- s_reqnum = 0;
- }
-
-out:
- spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return err;
-}
-
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
+ u32 idiag_states = r->idiag_states;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1];
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
- if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ if (!(idiag_states & TCPF_LISTEN))
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
@@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (!(r->idiag_states & TCPF_LISTEN) ||
- r->id.idiag_dport ||
+ if (r->id.idiag_dport ||
cb->args[3] > 0)
- goto syn_recv;
-
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
- spin_unlock_bh(&ilb->lock);
- goto done;
- }
-
-syn_recv:
- if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen;
- if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -879,7 +799,7 @@ skip_listen_ht:
s_i = num = s_num = 0;
}
- if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+ if (!(idiag_states & ~TCPF_LISTEN))
goto out;
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
@@ -906,7 +826,7 @@ skip_listen_ht:
goto next_normal;
state = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_substate : sk->sk_state;
- if (!(r->idiag_states & (1 << state)))
+ if (!(idiag_states & (1 << state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index d0a7c0319e3d..fe144dae7372 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -209,12 +209,6 @@ int inet_frags_init(struct inet_frags *f)
}
EXPORT_SYMBOL(inet_frags_init);
-void inet_frags_init_net(struct netns_frags *nf)
-{
- init_frag_mem_limit(nf);
-}
-EXPORT_SYMBOL(inet_frags_init_net);
-
void inet_frags_fini(struct inet_frags *f)
{
cancel_work_sync(&f->frags_work);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 89120196a949..ccc5980797fc 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -126,7 +126,7 @@ void inet_put_port(struct sock *sk)
}
EXPORT_SYMBOL(inet_put_port);
-int __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
unsigned short port = inet_sk(child)->inet_num;
@@ -137,6 +137,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
+ if (unlikely(!tb)) {
+ spin_unlock(&head->lock);
+ return -ENOENT;
+ }
if (tb->port != port) {
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
@@ -185,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
}
return score;
}
@@ -398,14 +404,18 @@ static u32 inet_sk_port_offset(const struct sock *sk)
inet->inet_dport);
}
-void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
+/* insert a socket into ehash, and eventually remove another one
+ * (The another one can be a SYN_RECV or TIMEWAIT
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
spinlock_t *lock;
+ bool ret = true;
- WARN_ON(!sk_unhashed(sk));
+ WARN_ON_ONCE(!sk_unhashed(sk));
sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
@@ -413,24 +423,41 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock(lock);
- __sk_nulls_add_node_rcu(sk, list);
if (osk) {
- WARN_ON(sk->sk_hash != osk->sk_hash);
- sk_nulls_del_node_init_rcu(osk);
+ WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+ ret = sk_nulls_del_node_init_rcu(osk);
}
+ if (ret)
+ __sk_nulls_add_node_rcu(sk, list);
spin_unlock(lock);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ return ret;
+}
+
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
+{
+ bool ok = inet_ehash_insert(sk, osk);
+
+ if (ok) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ } else {
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ sk->sk_state = TCP_CLOSE;
+ sock_set_flag(sk, SOCK_DEAD);
+ inet_csk_destroy_sock(sk);
+ }
+ return ok;
}
-EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
+EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
void __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
- if (sk->sk_state != TCP_LISTEN)
- return __inet_hash_nolisten(sk, osk);
-
+ if (sk->sk_state != TCP_LISTEN) {
+ inet_ehash_nolisten(sk, osk);
+ return;
+ }
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -551,7 +578,7 @@ ok:
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- __inet_hash_nolisten(sk, (struct sock *)tw);
+ inet_ehash_nolisten(sk, (struct sock *)tw);
}
if (tw)
inet_twsk_bind_unhash(tw, hinfo);
@@ -568,7 +595,7 @@ ok:
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash_nolisten(sk, NULL);
+ inet_ehash_nolisten(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 2d3aa408fbdc..da0d7ce85844 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -61,18 +61,18 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
}
-static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
skb_sender_cpu_clear(skb);
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
int ip_forward(struct sk_buff *skb)
@@ -81,6 +81,7 @@ int ip_forward(struct sk_buff *skb)
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
+ struct net *net;
/* that should never happen */
if (skb->pkt_type != PACKET_HOST)
@@ -99,6 +100,7 @@ int ip_forward(struct sk_buff *skb)
return NET_RX_SUCCESS;
skb_forward_csum(skb);
+ net = dev_net(skb->dev);
/*
* According to the RFC, we must first decrease the TTL field. If
@@ -119,7 +121,7 @@ int ip_forward(struct sk_buff *skb)
IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
if (ip_exceeds_mtu(skb, mtu)) {
- IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
goto drop;
@@ -143,8 +145,9 @@ int ip_forward(struct sk_buff *skb)
skb->priority = rt_tos2priority(iph->tos);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
- skb->dev, rt->dst.dev, ip_forward_finish);
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, rt->dst.dev,
+ ip_forward_finish);
sr_failed:
/*
@@ -155,7 +158,7 @@ sr_failed:
too_many_hops:
/* Tell the sender its packet died... */
- IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fa7f15305f9a..1fe55ae81781 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -48,7 +48,7 @@
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
#include <net/inet_ecn.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -78,7 +78,7 @@ struct ipq {
u8 ecn; /* RFC3168 support */
u16 max_df_size; /* largest frag with DF set seen */
int iif;
- int vif; /* VRF device index */
+ int vif; /* L3 master device index */
unsigned int rid;
struct inet_peer *peer;
};
@@ -654,11 +654,10 @@ out_fail:
}
/* Process an incoming IP datagram fragment. */
-int ip_defrag(struct sk_buff *skb, u32 user)
+int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
- int vif = vrf_master_ifindex_rcu(dev);
- struct net *net = dev_net(dev);
+ int vif = l3mdev_master_ifindex_rcu(dev);
struct ipq *qp;
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
@@ -683,7 +682,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_defrag);
-struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct iphdr iph;
int netoff;
@@ -712,7 +711,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
if (pskb_trim_rcsum(skb, netoff + len))
return skb;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- if (ip_defrag(skb, user))
+ if (ip_defrag(net, skb, user))
return NULL;
skb_clear_hash(skb);
}
@@ -840,6 +839,8 @@ static void __init ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
+ int res;
+
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -863,9 +864,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
*/
net->ipv4.frags.timeout = IP_FRAG_TIME;
- inet_frags_init_net(&net->ipv4.frags);
-
- return ip4_frags_ns_ctl_register(net);
+ res = inet_frags_init_net(&net->ipv4.frags);
+ if (res)
+ return res;
+ res = ip4_frags_ns_ctl_register(net);
+ if (res)
+ inet_frags_uninit_net(&net->ipv4.frags);
+ return res;
}
static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f4fc8a77aaa7..b1209b63381f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -157,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
u8 protocol = ip_hdr(skb)->protocol;
struct sock *last = NULL;
struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
@@ -167,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb)
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
- net_eq(sock_net(sk), dev_net(dev))) {
+ net_eq(sock_net(sk), net)) {
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
+ if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
return true;
}
if (last) {
@@ -188,10 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
-
__skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();
@@ -248,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb)
/*
* Reassemble IP fragments.
*/
+ struct net *net = dev_net(skb->dev);
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
+ if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
- return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb,
- skb->dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
+ net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
@@ -311,7 +311,7 @@ drop:
int sysctl_ip_early_demux __read_mostly = 1;
EXPORT_SYMBOL(sysctl_ip_early_demux);
-static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
@@ -337,8 +337,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EXDEV)
- NET_INC_STATS_BH(dev_net(skb->dev),
- LINUX_MIB_IPRPFILTER);
+ NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
}
@@ -359,11 +358,9 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
- skb->len);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
- skb->len);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
return dst_input(skb);
@@ -378,6 +375,7 @@ drop:
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
const struct iphdr *iph;
+ struct net *net;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
@@ -387,11 +385,12 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto drop;
- IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
+ net = dev_net(dev);
+ IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
goto out;
}
@@ -417,7 +416,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
- IP_ADD_STATS_BH(dev_net(dev),
+ IP_ADD_STATS_BH(net,
IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
@@ -431,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
len = ntohs(iph->tot_len);
if (skb->len < len) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
@@ -441,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -453,14 +452,14 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
- dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
ip_rcv_finish);
csum_error:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0138fada0951..4233cbe47052 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -83,9 +83,10 @@
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
-static int ip_fragment(struct sock *sk, struct sk_buff *skb,
- unsigned int mtu,
- int (*output)(struct sock *, struct sk_buff *));
+static int
+ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *));
/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
@@ -95,32 +96,28 @@ void ip_send_check(struct iphdr *iph)
}
EXPORT_SYMBOL(ip_send_check);
-static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
- skb_dst(skb)->dev, dst_output_sk);
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
}
-int __ip_local_out(struct sk_buff *skb)
-{
- return __ip_local_out_sk(skb->sk, skb);
-}
-
-int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
- err = __ip_local_out(skb);
+ err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
- err = dst_output_sk(sk, skb);
+ err = dst_output(net, sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip_local_out_sk);
+EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
@@ -135,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
* Add an ip header to a skbuff and send it out.
*
*/
-int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct iphdr *iph;
/* Build the IP header. */
@@ -149,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->version = 4;
iph->ihl = 5;
iph->tos = inet->tos;
- if (ip_dont_fragment(sk, &rt->dst))
- iph->frag_off = htons(IP_DF);
- else
- iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- ip_select_ident(sock_net(sk), skb, sk);
+ if (ip_dont_fragment(sk, &rt->dst)) {
+ iph->frag_off = htons(IP_DF);
+ iph->id = 0;
+ } else {
+ iph->frag_off = 0;
+ __ip_select_ident(net, iph, 1);
+ }
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
@@ -168,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
skb->mark = sk->sk_mark;
/* Send it out. */
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
-static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
@@ -182,9 +182,9 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
@@ -220,8 +220,8 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
return -EINVAL;
}
-static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
- unsigned int mtu)
+static int ip_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
@@ -230,7 +230,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
skb_gso_network_seglen(skb) <= mtu)
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
*
@@ -253,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
int err;
segs->next = NULL;
- err = ip_fragment(sk, segs, mtu, ip_finish_output2);
+ err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
@@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
return ret;
}
-static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
@@ -271,20 +271,20 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
- return ip_finish_output_gso(sk, skb, mtu);
+ return ip_finish_output_gso(net, sk, skb, mtu);
if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
- return ip_fragment(sk, skb, mtu, ip_finish_output2);
+ return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
}
-int ip_mc_output(struct sock *sk, struct sk_buff *skb)
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct net_device *dev = rt->dst.dev;
@@ -292,7 +292,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
/*
* If the indicated interface is up and running, send the packet.
*/
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
@@ -320,7 +320,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- sk, newskb, NULL, newskb->dev,
+ net, sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
}
@@ -335,26 +335,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
- NULL, newskb->dev, dev_loopback_xmit);
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, newskb, NULL, newskb->dev,
+ dev_loopback_xmit);
}
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
- skb->dev, ip_finish_output,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
+ ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sock *sk, struct sk_buff *skb)
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
- NULL, dev,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -377,6 +379,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
@@ -407,7 +410,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
@@ -444,20 +447,20 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_segs(sock_net(sk), skb, sk,
+ ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- res = ip_local_out(skb);
+ res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
@@ -486,29 +489,26 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
-static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
unsigned int mtu,
- int (*output)(struct sock *, struct sk_buff *))
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph = ip_hdr(skb);
if ((iph->frag_off & htons(IP_DF)) == 0)
- return ip_do_fragment(sk, skb, output);
+ return ip_do_fragment(net, sk, skb, output);
if (unlikely(!skb->ignore_df ||
(IPCB(skb)->frag_max_size &&
IPCB(skb)->frag_max_size > mtu))) {
- struct rtable *rt = skb_rtable(skb);
- struct net_device *dev = rt->dst.dev;
-
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
kfree_skb(skb);
return -EMSGSIZE;
}
- return ip_do_fragment(sk, skb, output);
+ return ip_do_fragment(net, sk, skb, output);
}
/*
@@ -518,8 +518,8 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb,
* single device frame, and queue such a frame for sending.
*/
-int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *))
+int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
int ptr;
@@ -533,6 +533,11 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
dev = rt->dst.dev;
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto fail;
+
/*
* Point into the IP datagram header.
*/
@@ -621,10 +626,10 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
ip_send_check(iph);
}
- err = output(sk, skb);
+ err = output(net, sk, skb);
if (!err)
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
if (err || !frag)
break;
@@ -634,7 +639,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
}
if (err == 0) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return 0;
}
@@ -643,7 +648,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
kfree_skb(frag);
frag = skb;
}
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
slow_path_clean:
@@ -657,9 +662,6 @@ slow_path_clean:
}
slow_path:
- /* for offloaded checksums cleanup checksum before fragmentation */
- if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
- goto fail;
iph = ip_hdr(skb);
left = skb->len - hlen; /* Space per frame */
@@ -761,19 +763,19 @@ slow_path:
ip_send_check(iph);
- err = output(sk, skb2);
+ err = output(net, sk, skb2);
if (err)
goto fail;
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
}
consume_skb(skb);
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
return err;
fail:
kfree_skb(skb);
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
}
EXPORT_SYMBOL(ip_do_fragment);
@@ -911,6 +913,7 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & NETIF_F_V4_CSUM &&
+ !(flags & MSG_MORE) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
@@ -1434,7 +1437,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb)
{
int err;
- err = ip_local_out(skb);
+ err = ip_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
@@ -1561,7 +1564,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
}
oif = arg->bound_dev_if;
- if (!oif && netif_index_is_vrf(net, skb->skb_iif))
+ if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
oif = skb->skb_iif;
flowi4_init_output(&fl4, oif,
@@ -1596,7 +1599,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
out:
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c3c359ad66e3..5f73a7c03e27 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1251,11 +1251,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
* the _received_ ones. The set sets the _sent_ ones.
*/
+static bool getsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_MSFILTER:
+ case MCAST_MSFILTER:
+ return true;
+ }
+ return false;
+}
+
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
- int val;
+ bool needs_rtnl = getsockopt_needs_rtnl(optname);
+ int val, err = 0;
int len;
if (level != SOL_IP)
@@ -1269,6 +1280,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (len < 0)
return -EINVAL;
+ if (needs_rtnl)
+ rtnl_lock();
lock_sock(sk);
switch (optname) {
@@ -1386,39 +1399,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MSFILTER:
{
struct ip_msfilter msf;
- int err;
if (len < IP_MSFILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
err = ip_mc_msfget(sk, &msf,
(struct ip_msfilter __user *)optval, optlen);
- release_sock(sk);
- return err;
+ goto out;
}
case MCAST_MSFILTER:
{
struct group_filter gsf;
- int err;
if (len < GROUP_FILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
err = ip_mc_gsfget(sk, &gsf,
(struct group_filter __user *)optval,
optlen);
- release_sock(sk);
- return err;
+ goto out;
}
case IP_MULTICAST_ALL:
val = inet->mc_all;
@@ -1485,6 +1494,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
}
return 0;
+
+out:
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return err;
}
int ip_getsockopt(struct sock *sk, int level,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 84dce6a92f93..6cb9009c3d96 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -53,6 +53,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
{
int pkt_len = skb->len - skb_inner_network_offset(skb);
+ struct net *net = dev_net(rt->dst.dev);
struct iphdr *iph;
int err;
@@ -76,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- __ip_select_ident(dev_net(rt->dst.dev), iph,
- skb_shinfo(skb)->gso_segs ?: 1);
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
- err = ip_local_out_sk(sk, skb);
+ err = ip_local_out(net, sk, skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
return pkt_len;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 0c152087ca15..4d8f0b698777 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
- err = dst_output(skb);
+ err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
err = skb->len;
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index ed4ef09c2136..0bc7412d9e14 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -146,6 +146,10 @@ u8 root_server_path[256] = { 0, }; /* Path to mount as root */
/* vendor class identifier */
static char vendor_class_identifier[253] __initdata;
+#if defined(CONFIG_IP_PNP_DHCP)
+static char dhcp_client_identifier[253] __initdata;
+#endif
+
/* Persistent data: */
static int ic_proto_used; /* Protocol used, if any */
@@ -728,6 +732,16 @@ ic_dhcp_init_options(u8 *options)
memcpy(e, vendor_class_identifier, len);
e += len;
}
+ len = strlen(dhcp_client_identifier + 1);
+ /* the minimum length of identifier is 2, include 1 byte type,
+ * and can not be larger than the length of options
+ */
+ if (len >= 1 && len < 312 - (e - options) - 1) {
+ *e++ = 61;
+ *e++ = len + 1;
+ memcpy(e, dhcp_client_identifier, len + 1);
+ e += len + 1;
+ }
}
*e++ = 255; /* End of the list */
@@ -1557,8 +1571,24 @@ static int __init ic_proto_name(char *name)
return 0;
}
#ifdef CONFIG_IP_PNP_DHCP
- else if (!strcmp(name, "dhcp")) {
+ else if (!strncmp(name, "dhcp", 4)) {
+ char *client_id;
+
ic_proto_enabled &= ~IC_RARP;
+ client_id = strstr(name, "dhcp,");
+ if (client_id) {
+ char *v;
+
+ client_id = client_id + 5;
+ v = strchr(client_id, ',');
+ if (!v)
+ return 1;
+ *v = 0;
+ if (kstrtou8(client_id, 0, dhcp_client_identifier))
+ DBG("DHCP: Invalid client identifier type\n");
+ strncpy(dhcp_client_identifier + 1, v + 1, 251);
+ *v = ',';
+ }
return 1;
}
#endif
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index f34c31defafe..a09fb0dec725 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -253,9 +253,6 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
p.i_key = p.o_key = 0;
p.i_flags = p.o_flags = 0;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
err = ip_tunnel_ioctl(dev, &p, cmd);
if (err)
return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8e8203d5c520..c3a38353f5dc 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -134,7 +134,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mfc_cache *c, struct rtmsg *rtm);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
-static void mroute_clean_tables(struct mr_table *mrt);
+static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -350,7 +350,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
static void ipmr_free_table(struct mr_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, true);
kfree(mrt);
}
@@ -441,10 +441,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
@@ -540,10 +536,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
@@ -1208,7 +1200,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
* Close the multicast socket, and clear the vif tables etc
*/
-static void mroute_clean_tables(struct mr_table *mrt)
+static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
int i;
LIST_HEAD(list);
@@ -1217,8 +1209,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
/* Shut down all active vif entries */
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif_table[i].flags & VIFF_STATIC))
- vif_delete(mrt, i, 0, &list);
+ if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
+ continue;
+ vif_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
@@ -1226,7 +1219,7 @@ static void mroute_clean_tables(struct mr_table *mrt)
for (i = 0; i < MFC_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (c->mfc_flags & MFC_STATIC)
+ if (!all && (c->mfc_flags & MFC_STATIC))
continue;
list_del_rcu(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
@@ -1261,7 +1254,7 @@ static void mrtsock_destruct(struct sock *sk)
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, false);
}
}
rtnl_unlock();
@@ -1678,17 +1671,18 @@ static void ip_encap(struct net *net, struct sk_buff *skb,
nf_reset(skb);
}
-static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb)
+static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
/*
@@ -1745,7 +1739,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* to blackhole.
*/
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
ip_rt_put(rt);
goto out_free;
}
@@ -1787,8 +1781,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
- skb->dev, dev,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+ net, NULL, skb, skb->dev, dev,
ipmr_forward_finish);
return;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 61eafc9b4545..c3776ff6749f 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,9 +17,8 @@
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
+int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi4 fl4 = {};
@@ -104,7 +103,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb,
}
}
-static int nf_ip_reroute(struct sk_buff *skb,
+static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
const struct nf_queue_entry *entry)
{
const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
@@ -116,7 +115,7 @@ static int nf_ip_reroute(struct sk_buff *skb,
skb->mark == rt_info->mark &&
iph->daddr == rt_info->daddr &&
iph->saddr == rt_info->saddr))
- return ip_route_me_harder(skb, RTN_UNSPEC);
+ return ip_route_me_harder(net, skb, RTN_UNSPEC);
}
return 0;
}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a35584176535..c187c60e3e0c 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -60,6 +60,7 @@ config NFT_REJECT_IPV4
config NFT_DUP_IPV4
tristate "IPv4 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DUP_IPV4
help
This module enables IPv4 packet duplication support for nf_tables.
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 8f87fc38ccde..11dccba474b7 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -186,7 +186,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, arpinfo->iniface,
- arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : "");
return 0;
}
@@ -195,7 +195,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, arpinfo->outiface,
- arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : "");
return 0;
}
@@ -247,10 +247,10 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
}
unsigned int arpt_do_table(struct sk_buff *skb,
- unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table)
{
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
unsigned int verdict = NF_DROP;
const struct arphdr *arp;
@@ -285,6 +285,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
*/
e = get_entry(table_base, private->hook_entry[hook]);
+ acpar.net = state->net;
acpar.in = state->in;
acpar.out = state->out;
acpar.hooknum = hook;
@@ -467,7 +468,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -631,7 +632,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
* newinfo).
*/
static int translate_table(struct xt_table_info *newinfo, void *entry0,
- const struct arpt_replace *repl)
+ const struct arpt_replace *repl)
{
struct arpt_entry *iter;
unsigned int i;
@@ -891,7 +892,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1068,7 +1069,7 @@ static int __do_replace(struct net *net, const char *name,
}
static int do_replace(struct net *net, const void __user *user,
- unsigned int len)
+ unsigned int len)
{
int ret;
struct arpt_replace tmp;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 93876d03120c..1897ee160920 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -27,13 +27,10 @@ static const struct xt_table packet_filter = {
/* The work comes in here from netfilter.c */
static unsigned int
-arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+arptable_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net = dev_net(state->in ? state->in : state->out);
-
- return arpt_do_table(skb, ops->hooknum, state,
- net->ipv4.arptable_filter);
+ return arpt_do_table(skb, state, state->net->ipv4.arptable_filter);
}
static struct nf_hook_ops *arpfilter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b0a86e73451c..b99affad6ba1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -102,7 +102,7 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, ipinfo->iniface,
- ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+ ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : "");
return false;
}
@@ -111,7 +111,7 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, ipinfo->outiface,
- ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+ ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : "");
return false;
}
@@ -120,7 +120,7 @@ ip_packet_match(const struct iphdr *ip,
FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
dprintf("Packet protocol %hi does not match %hi.%s\n",
ip->protocol, ipinfo->proto,
- ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+ ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : "");
return false;
}
@@ -246,7 +246,8 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
return 0;
}
-static void trace_packet(const struct sk_buff *skb,
+static void trace_packet(struct net *net,
+ const struct sk_buff *skb,
unsigned int hook,
const struct net_device *in,
const struct net_device *out,
@@ -258,7 +259,6 @@ static void trace_packet(const struct sk_buff *skb,
const char *hookname, *chainname, *comment;
const struct ipt_entry *iter;
unsigned int rulenum = 0;
- struct net *net = dev_net(in ? in : out);
root = get_entry(private->entries, private->hook_entry[hook]);
@@ -285,10 +285,10 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
ipt_do_table(struct sk_buff *skb,
- unsigned int hook,
const struct nf_hook_state *state,
struct xt_table *table)
{
+ unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
@@ -315,6 +315,7 @@ ipt_do_table(struct sk_buff *skb,
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
+ acpar.net = state->net;
acpar.in = state->in;
acpar.out = state->out;
acpar.family = NFPROTO_IPV4;
@@ -378,8 +379,8 @@ ipt_do_table(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
- trace_packet(skb, hook, state->in, state->out,
- table->name, private, e);
+ trace_packet(state->net, skb, hook, state->in,
+ state->out, table->name, private, e);
#endif
/* Standard target? */
if (!t->u.kernel.target->target) {
@@ -430,8 +431,8 @@ ipt_do_table(struct sk_buff *skb,
} while (!acpar.hotdrop);
pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
- xt_write_recseq_end(addend);
- local_bh_enable();
+ xt_write_recseq_end(addend);
+ local_bh_enable();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -483,7 +484,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- XT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -548,7 +549,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -803,7 +804,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
newinfo) */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
- const struct ipt_replace *repl)
+ const struct ipt_replace *repl)
{
struct ipt_entry *iter;
unsigned int i;
@@ -1077,7 +1078,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1303,7 +1304,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
static int
do_add_counters(struct net *net, const void __user *user,
- unsigned int len, int compat)
+ unsigned int len, int compat)
{
unsigned int i;
struct xt_counters_info tmp;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 45cb16a6a4a3..4a9e6db9df8d 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -492,14 +492,14 @@ static void arp_print(struct arp_payload *payload)
{
#define HBUFFERLEN 30
char hbuffer[HBUFFERLEN];
- int j,k;
+ int j, k;
- for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) {
hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
- hbuffer[k++]=':';
+ hbuffer[k++] = ':';
}
- hbuffer[--k]='\0';
+ hbuffer[--k] = '\0';
pr_debug("src %pI4@%s, dst %pI4\n",
&payload->src_ip, hbuffer, &payload->dst_ip);
@@ -507,14 +507,14 @@ static void arp_print(struct arp_payload *payload)
#endif
static unsigned int
-arp_mangle(const struct nf_hook_ops *ops,
+arp_mangle(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
- struct net *net = dev_net(state->in ? state->in : state->out);
+ struct net *net = state->net;
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 87907d4bd259..1d16c0f28df0 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(skb, hook);
+ nf_send_reset(par->net, skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 95ea633e8356..5fdc556514ba 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -39,11 +39,14 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
}
static void
-synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+synproxy_send_tcp(const struct synproxy_net *snet,
+ const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct iphdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
+ struct net *net = nf_ct_net(snet->tmpl);
+
nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
nskb->ip_summed = CHECKSUM_PARTIAL;
nskb->csum_start = (unsigned char *)nth - nskb->head;
@@ -51,7 +54,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
skb_dst_set_noref(nskb, skb_dst(skb));
nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
if (nfct) {
@@ -60,7 +63,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
nf_conntrack_get(nfct);
}
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
free_nskb:
@@ -68,7 +71,8 @@ free_nskb:
}
static void
-synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+synproxy_send_client_synack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
@@ -104,7 +108,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
@@ -148,7 +152,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
@@ -188,7 +192,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
static void
@@ -226,8 +230,8 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
}
static bool
@@ -258,7 +262,7 @@ static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+ struct synproxy_net *snet = synproxy_pernet(par->net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
@@ -287,7 +291,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(skb, th, &opts);
+ synproxy_send_client_synack(snet, skb, th, &opts);
return NF_DROP;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
@@ -299,11 +303,11 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
+static unsigned int ipv4_synproxy_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
- struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
+ struct synproxy_net *snet = synproxy_pernet(nhs->net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
@@ -433,14 +437,12 @@ static struct xt_target synproxy_tg4_reg __read_mostly = {
static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
{
.hook = ipv4_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
{
.hook = ipv4_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 14a2aa8b8a14..a787d07f6cb7 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -25,7 +25,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
bool r;
pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
invert ? '!' : ' ', min, spi, max);
- r=(spi >= min && spi <= max) ^ invert;
+ r = (spi >= min && spi <= max) ^ invert;
pr_debug(" result %s\n", r ? "PASS" : "FAILED");
return r;
}
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index c4ffc9de1654..78cc64eddfc1 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -32,12 +32,11 @@ static __be32 rpfilter_get_saddr(__be32 addr)
return addr;
}
-static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
+static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
const struct net_device *dev, u8 flags)
{
struct fib_result res;
bool dev_match;
- struct net *net = dev_net(dev);
int ret __maybe_unused;
if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
@@ -96,7 +95,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.flowi4_tos = RT_TOS(iph->tos);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
- return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index a0f3beca52d2..397ef2dd133e 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -33,19 +33,16 @@ static const struct xt_table packet_filter = {
};
static unsigned int
-iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_filter_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
}
static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 62cbb8c5f4a8..ba5d392a13c4 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -39,7 +39,6 @@ static const struct xt_table packet_mangler = {
static unsigned int
ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
{
- struct net_device *out = state->out;
unsigned int ret;
const struct iphdr *iph;
u_int8_t tos;
@@ -59,8 +58,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
daddr = iph->daddr;
tos = iph->tos;
- ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state,
- dev_net(out)->ipv4.iptable_mangle);
+ ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
/* Reroute for ANY change. */
if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
@@ -69,7 +67,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -80,18 +78,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_mangle_hook(const struct nf_hook_ops *ops,
+iptable_mangle_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (ops->hooknum == NF_INET_LOCAL_OUT)
+ if (state->hook == NF_INET_LOCAL_OUT)
return ipt_mangle_out(skb, state);
- if (ops->hooknum == NF_INET_POST_ROUTING)
- return ipt_do_table(skb, ops->hooknum, state,
- dev_net(state->out)->ipv4.iptable_mangle);
+ if (state->hook == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, state,
+ state->net->ipv4.iptable_mangle);
/* PREROUTING/INPUT/FORWARD: */
- return ipt_do_table(skb, ops->hooknum, state,
- dev_net(state->in)->ipv4.iptable_mangle);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 0d4d9cdf98a4..ae2cd2752046 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -28,49 +28,46 @@ static const struct xt_table nf_nat_ipv4_table = {
.af = NFPROTO_IPV4,
};
-static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
-
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table);
+ return ipt_do_table(skb, state, state->net->ipv4.nat_table);
}
-static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain);
}
-static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+static unsigned int iptable_nat_ipv4_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain);
+ return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
}
static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
@@ -78,7 +75,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = iptable_nat_ipv4_out,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
@@ -86,7 +82,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_local_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
@@ -94,7 +89,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = iptable_nat_ipv4_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 0356e6da4bb7..1ba02811acb0 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -20,19 +20,16 @@ static const struct xt_table packet_raw = {
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_raw_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_raw);
}
static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 4bce3980ccd9..c2e23d5e9cd4 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -37,20 +37,16 @@ static const struct xt_table security_table = {
};
static unsigned int
-iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+iptable_security_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- const struct net *net;
-
- if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ if (state->hook == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* Somebody is playing with raw sockets. */
return NF_ACCEPT;
- net = dev_net(state->in ? state->in : state->out);
- return ipt_do_table(skb, ops->hooknum, state,
- net->ipv4.iptable_security);
+ return ipt_do_table(skb, state, state->net->ipv4.iptable_security);
}
static struct nf_hook_ops *sectbl_ops __read_mostly;
@@ -83,7 +79,7 @@ static int __init iptable_security_init(void)
int ret;
ret = register_pernet_subsys(&iptable_security_net_ops);
- if (ret < 0)
+ if (ret < 0)
return ret;
sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 8a2caaf3940b..461ca926fd39 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
return NF_ACCEPT;
}
-static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
+static unsigned int ipv4_helper(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -119,7 +119,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
ct, ctinfo);
}
-static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
+static unsigned int ipv4_confirm(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -143,14 +143,14 @@ out:
return nf_conntrack_confirm(skb);
}
-static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
-static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_local(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -158,7 +158,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb);
+ return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
/* Connection tracking may drop packets, but never alters them, so
@@ -166,42 +166,36 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
{
.hook = ipv4_conntrack_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_conntrack_local,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = ipv4_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index cdde3ec496e9..c567e1b5d799 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -30,7 +30,7 @@ static inline struct nf_icmp_net *icmp_pernet(struct net *net)
}
static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct nf_conntrack_tuple *tuple)
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct icmphdr *hp;
struct icmphdr _hdr;
@@ -144,7 +144,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
if (!nf_ct_get_tuplepr(skb,
skb_network_offset(skb) + ip_hdrlen(skb)
+ sizeof(struct icmphdr),
- PF_INET, &origtuple)) {
+ PF_INET, net, &origtuple)) {
pr_debug("icmp_error_message: failed to get tuple\n");
return -NF_ACCEPT;
}
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 9306ec4fab41..6fb869f646bf 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -22,14 +22,15 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
+ u_int32_t user)
{
int err;
skb_orphan(skb);
local_bh_disable();
- err = ip_defrag(skb, user);
+ err = ip_defrag(net, skb, user);
local_bh_enable();
if (!err) {
@@ -61,15 +62,14 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
return IP_DEFRAG_CONNTRACK_OUT + zone_id;
}
-static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
+static unsigned int ipv4_conntrack_defrag(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (sk && (sk->sk_family == PF_INET) &&
- inet->nodefrag)
+ if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
+ inet_sk(sk)->nodefrag)
return NF_ACCEPT;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -83,9 +83,9 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
/* Gather fragments. */
if (ip_is_fragment(ip_hdr(skb))) {
enum ip_defrag_users user =
- nf_ct_defrag_user(ops->hooknum, skb);
+ nf_ct_defrag_user(state->hook, skb);
- if (nf_ct_ipv4_gather_frags(skb, user))
+ if (nf_ct_ipv4_gather_frags(state->net, skb, user))
return NF_STOLEN;
}
return NF_ACCEPT;
@@ -94,14 +94,12 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
static struct nf_hook_ops ipv4_defrag_ops[] = {
{
.hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index 2d79e6e8d934..ceb187308120 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -23,25 +23,10 @@
#include <net/netfilter/nf_conntrack.h>
#endif
-static struct net *pick_net(struct sk_buff *skb)
-{
-#ifdef CONFIG_NET_NS
- const struct dst_entry *dst;
-
- if (skb->dev != NULL)
- return dev_net(skb->dev);
- dst = skb_dst(skb);
- if (dst != NULL && dst->dev != NULL)
- return dev_net(dst->dev);
-#endif
- return &init_net;
-}
-
-static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw,
- int oif)
+static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
+ const struct in_addr *gw, int oif)
{
const struct iphdr *iph = ip_hdr(skb);
- struct net *net = pick_net(skb);
struct rtable *rt;
struct flowi4 fl4;
@@ -65,7 +50,7 @@ static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw,
return true;
}
-void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum,
+void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
const struct in_addr *gw, int oif)
{
struct iphdr *iph;
@@ -105,9 +90,9 @@ void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum,
--iph->ttl;
ip_send_check(iph);
- if (nf_dup_ipv4_route(skb, gw, oif)) {
+ if (nf_dup_ipv4_route(net, skb, gw, oif)) {
__this_cpu_write(nf_skb_duplicated, true);
- ip_local_out(skb);
+ ip_local_out(net, skb->sk, skb);
__this_cpu_write(nf_skb_duplicated, false);
} else {
kfree_skb(skb);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 22f4579b0c2a..5075b7ecd26d 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -255,9 +255,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
unsigned int
-nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -266,7 +266,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
/* We never see fragments: conntrack defrags on pre-routing
* and local-out, and nf_nat_out protects post-routing.
@@ -295,7 +295,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
case IP_CT_RELATED_REPLY:
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- ops->hooknum))
+ state->hook))
return NF_DROP;
else
return NF_ACCEPT;
@@ -308,21 +308,21 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- ret = do_chain(ops, skb, state, ct);
+ ret = do_chain(priv, skb, state, ct);
if (ret != NF_ACCEPT)
return ret;
- if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
+ if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
break;
- ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ ret = nf_nat_alloc_null_binding(ct, state->hook);
if (ret != NF_ACCEPT)
return ret;
} else {
pr_debug("Already setup manip %s for ct %p\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat,
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat,
state->out))
goto oif_changed;
}
@@ -332,11 +332,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
/* ESTABLISHED */
NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
ctinfo == IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+ return nf_nat_packet(ct, ctinfo, state->hook, skb);
oif_changed:
nf_ct_kill_acct(ct, ctinfo, skb);
@@ -345,9 +345,9 @@ oif_changed:
EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
unsigned int
-nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -355,7 +355,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
@@ -365,9 +365,9 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
unsigned int
-nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -384,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -396,7 +396,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
(ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.src.u.all !=
ct->tuplehash[!dir].tuple.dst.u.all)) {
- err = nf_xfrm_me_harder(skb, AF_INET);
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -407,9 +407,9 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
unsigned int
-nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
- unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
@@ -424,14 +424,14 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -440,7 +440,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
ct->tuplehash[dir].tuple.dst.u.all !=
ct->tuplehash[!dir].tuple.src.u.all) {
- err = nf_xfrm_me_harder(skb, AF_INET);
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 657d2307f031..b3ca21b2ba9b 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -45,7 +45,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
struct net *net = nf_ct_net(ct);
const struct nf_conn *master = ct->master;
struct nf_conntrack_expect *other_exp;
- struct nf_conntrack_tuple t;
+ struct nf_conntrack_tuple t = {};
const struct nf_ct_pptp_master *ct_pptp_info;
const struct nf_nat_pptp *nat_pptp_info;
struct nf_nat_range range;
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 7c676671329d..ddb894ac1458 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1156,7 +1156,7 @@ static int snmp_parse_mangle(unsigned char *msg,
}
if (obj->type == SNMP_IPADDR)
- mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
+ mangle_address(ctx.begin, ctx.pointer - 4, map, check);
kfree(obj->id);
kfree(obj);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 3262e41ff76f..c747b2d9eb77 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -99,7 +99,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
/* Send RST reply */
-void nf_send_reset(struct sk_buff *oldskb, int hook)
+void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
const struct iphdr *oiph;
@@ -129,7 +129,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
/* "Never happens" */
@@ -157,7 +157,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
dev_queue_xmit(nskb);
} else
#endif
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 8412268bbad1..9d09d4f59545 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -15,15 +15,15 @@
#include <net/netfilter/nf_tables.h>
static unsigned int
-nft_do_chain_arp(const struct nf_hook_ops *ops,
+nft_do_chain_arp(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo(&pkt, ops, skb, state);
+ nft_set_pktinfo(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
static struct nft_af_info nft_af_arp __read_mostly = {
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index aa180d3a69a5..ca9dc3c46c4f 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -18,18 +18,18 @@
#include <net/ip.h>
#include <net/netfilter/nf_tables_ipv4.h>
-static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+static unsigned int nft_do_chain_ipv4(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+static unsigned int nft_ipv4_output(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -41,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
return NF_ACCEPT;
}
- return nft_do_chain_ipv4(ops, skb, state);
+ return nft_do_chain_ipv4(priv, skb, state);
}
struct nft_af_info nft_af_ipv4 __read_mostly = {
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index bf5c30ae14e4..f5c66a7a4bf2 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -26,44 +26,44 @@
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/ip.h>
-static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_do_chain(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct)
{
struct nft_pktinfo pkt;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
- return nft_do_chain(&pkt, ops);
+ return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_out(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain);
}
-static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+static unsigned int nft_nat_ipv4_local_fn(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain);
+ return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
}
static const struct nf_chain_type nft_chain_nat_ipv4 = {
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index e335b0afdaf3..2375b0a8be46 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -21,7 +21,7 @@
#include <net/route.h>
#include <net/ip.h>
-static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+static unsigned int nf_route_table_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -37,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ nft_set_pktinfo_ipv4(&pkt, skb, state);
mark = skb->mark;
iph = ip_hdr(skb);
@@ -45,7 +45,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
daddr = iph->daddr;
tos = iph->tos;
- ret = nft_do_chain(&pkt, ops);
+ ret = nft_do_chain(&pkt, priv);
if (ret != NF_DROP && ret != NF_QUEUE) {
iph = ip_hdr(skb);
@@ -53,7 +53,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos)
- if (ip_route_me_harder(skb, RTN_UNSPEC))
+ if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
ret = NF_DROP;
}
return ret;
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index b45932d43b69..bf855e64fc45 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -30,7 +30,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr,
};
int oif = regs->data[priv->sreg_dev];
- nf_dup_ipv4(pkt->skb, pkt->ops->hooknum, &gw, oif);
+ nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif);
}
static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 40e414c4ca56..b72ffc58e255 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -26,7 +26,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
&range, pkt->out);
}
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index d8d795df9c13..c09d4381427e 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -36,7 +36,7 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
mr.range[0].flags |= priv->flags;
regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
- pkt->ops->hooknum);
+ pkt->hook);
}
static struct nft_expr_type nft_redir_ipv4_type;
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index b07e58b51158..c24f41c816b3 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,11 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach(pkt->skb, priv->icmp_code,
- pkt->ops->hooknum);
+ nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ nf_send_reset(pkt->net, pkt->skb, pkt->hook);
break;
default:
break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 561cd4b8fc6e..63e5be0abd86 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -406,13 +406,16 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
ip_select_ident(net, skb, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ skb->transport_header += iphlen;
+ if (iph->protocol == IPPROTO_ICMP &&
+ length >= iphlen + sizeof(struct icmphdr))
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
}
- if (iph->protocol == IPPROTO_ICMP)
- icmp_out_count(net, ((struct icmphdr *)
- skb_transport_header(skb))->type);
- err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb,
- NULL, rt->dst.dev, dst_output_sk);
+ err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+ net, sk, skb, NULL, rt->dst.dev,
+ dst_output);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -483,6 +486,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
struct flowi4 fl4;
@@ -542,7 +546,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ err = ip_cmsg_send(net, msg, &ipc, false);
if (err)
goto out;
if (ipc.opt)
@@ -597,6 +601,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
+ if (!saddr && ipc.oif)
+ l3mdev_get_saddr(net, ipc.oif, &fl4);
+
if (!inet->hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
@@ -607,7 +614,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
- rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c81deb85acb4..85f184e429c6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,7 +112,7 @@
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -847,7 +847,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
return;
}
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
- vif = vrf_master_ifindex_rcu(rt->dst.dev);
+ vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
rcu_read_unlock();
net = dev_net(rt->dst.dev);
@@ -941,7 +941,7 @@ static int ip_error(struct sk_buff *skb)
}
peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
- vrf_master_ifindex(skb->dev), 1);
+ l3mdev_master_ifindex(skb->dev), 1);
send = true;
if (peer) {
@@ -1152,7 +1152,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
dst_set_expires(&rt->dst, 0);
}
-static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
+static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
pr_debug("%s: %pI4 -> %pI4, %s\n",
__func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1438,12 +1438,34 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
+ unsigned int flags, u16 type,
bool nopolicy, bool noxfrm, bool will_cache)
{
- return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
- (nopolicy ? DST_NOPOLICY : 0) |
- (noxfrm ? DST_NOXFRM : 0));
+ struct rtable *rt;
+
+ rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0));
+
+ if (rt) {
+ rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+ rt->rt_flags = flags;
+ rt->rt_type = type;
+ rt->rt_is_input = 0;
+ rt->rt_iif = 0;
+ rt->rt_pmtu = 0;
+ rt->rt_gateway = 0;
+ rt->rt_uses_gateway = 0;
+ rt->rt_table_id = 0;
+ INIT_LIST_HEAD(&rt->rt_uncached);
+
+ rt->dst.output = ip_output;
+ if (flags & RTCF_LOCAL)
+ rt->dst.input = ip_local_deliver;
+ }
+
+ return rt;
}
/* called in rcu_read_lock() section */
@@ -1452,6 +1474,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct rtable *rth;
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ unsigned int flags = RTCF_MULTICAST;
u32 itag = 0;
int err;
@@ -1464,9 +1487,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
skb->protocol != htons(ETH_P_IP))
goto e_inval;
- if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
- if (ipv4_is_loopback(saddr))
- goto e_inval;
+ if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
+ goto e_inval;
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
@@ -1477,7 +1499,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (err < 0)
goto e_err;
}
- rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
+ if (our)
+ flags |= RTCF_LOCAL;
+
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
if (!rth)
goto e_nobufs;
@@ -1486,20 +1511,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->dst.tclassid = itag;
#endif
rth->dst.output = ip_rt_bug;
-
- rth->rt_genid = rt_genid_ipv4(dev_net(dev));
- rth->rt_flags = RTCF_MULTICAST;
- rth->rt_type = RTN_MULTICAST;
rth->rt_is_input= 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
- if (our) {
- rth->dst.input= ip_local_deliver;
- rth->rt_flags |= RTCF_LOCAL;
- }
#ifdef CONFIG_IP_MROUTE
if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
@@ -1608,7 +1620,7 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
- rth = rt_dst_alloc(out_dev->dev,
+ rth = rt_dst_alloc(out_dev->dev, 0, res->type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
if (!rth) {
@@ -1616,19 +1628,12 @@ static int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
- rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
- rth->rt_flags = 0;
- rth->rt_type = res->type;
rth->rt_is_input = 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res->table)
+ rth->rt_table_id = res->table->tb_id;
RT_CACHE_STAT_INC(in_slow_tot);
rth->dst.input = ip_forward;
- rth->dst.output = ip_output;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
@@ -1646,6 +1651,48 @@ out:
return err;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* To make ICMP packets follow the right flow, the multipath hash is
+ * calculated from the inner IP addresses in reverse order.
+ */
+static int ip_multipath_icmp_hash(struct sk_buff *skb)
+{
+ const struct iphdr *outer_iph = ip_hdr(skb);
+ struct icmphdr _icmph;
+ const struct icmphdr *icmph;
+ struct iphdr _inner_iph;
+ const struct iphdr *inner_iph;
+
+ if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+ goto standard_hash;
+
+ icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+ &_icmph);
+ if (!icmph)
+ goto standard_hash;
+
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_REDIRECT &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB) {
+ goto standard_hash;
+ }
+
+ inner_iph = skb_header_pointer(skb,
+ outer_iph->ihl * 4 + sizeof(_icmph),
+ sizeof(_inner_iph), &_inner_iph);
+ if (!inner_iph)
+ goto standard_hash;
+
+ return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+
+standard_hash:
+ return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
@@ -1653,8 +1700,15 @@ static int ip_mkroute_input(struct sk_buff *skb,
__be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1)
- fib_select_multipath(res);
+ if (res->fi && res->fi->fib_nhs > 1) {
+ int h;
+
+ if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
+ h = ip_multipath_icmp_hash(skb);
+ else
+ h = fib_multipath_hash(saddr, daddr);
+ fib_select_multipath(res, h);
+ }
#endif
/* create a routing cache entry */
@@ -1706,6 +1760,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto martian_source;
res.fi = NULL;
+ res.table = NULL;
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
@@ -1733,7 +1788,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
* Now we are ready to route packet.
*/
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = vrf_master_ifindex_rcu(dev) ? : dev->ifindex;
+ fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@ -1754,7 +1809,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
err = fib_validate_source(skb, saddr, daddr, tos,
0, dev, in_dev, &itag);
if (err < 0)
- goto martian_source_keep_err;
+ goto martian_source;
goto local_input;
}
@@ -1776,7 +1831,7 @@ brd_input:
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
in_dev, &itag);
if (err < 0)
- goto martian_source_keep_err;
+ goto martian_source;
}
flags |= RTCF_BROADCAST;
res.type = RTN_BROADCAST;
@@ -1796,26 +1851,18 @@ local_input:
}
}
- rth = rt_dst_alloc(net->loopback_dev,
+ rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
- rth->dst.input= ip_local_deliver;
rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
-
- rth->rt_genid = rt_genid_ipv4(net);
- rth->rt_flags = flags|RTCF_LOCAL;
- rth->rt_type = res.type;
rth->rt_is_input = 1;
- rth->rt_iif = 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res.table)
+ rth->rt_table_id = res.table->tb_id;
RT_CACHE_STAT_INC(in_slow_tot);
if (res.type == RTN_UNREACHABLE) {
@@ -1837,6 +1884,7 @@ no_route:
RT_CACHE_STAT_INC(in_no_route);
res.type = RTN_UNREACHABLE;
res.fi = NULL;
+ res.table = NULL;
goto local_input;
/*
@@ -1859,8 +1907,6 @@ e_nobufs:
goto out;
martian_source:
- err = -EINVAL;
-martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
goto out;
}
@@ -1988,28 +2034,19 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
}
add:
- rth = rt_dst_alloc(dev_out,
+ rth = rt_dst_alloc(dev_out, flags, type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(in_dev, NOXFRM),
do_cache);
if (!rth)
return ERR_PTR(-ENOBUFS);
- rth->dst.output = ip_output;
-
- rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
- rth->rt_flags = flags;
- rth->rt_type = type;
- rth->rt_is_input = 0;
rth->rt_iif = orig_oif ? : 0;
- rth->rt_pmtu = 0;
- rth->rt_gateway = 0;
- rth->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rth->rt_uncached);
+ if (res->table)
+ rth->rt_table_id = res->table->tb_id;
+
RT_CACHE_STAT_INC(out_slow_tot);
- if (flags & RTCF_LOCAL)
- rth->dst.input = ip_local_deliver;
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
@@ -2038,7 +2075,8 @@ add:
* Major route resolver routine.
*/
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+ int mp_hash)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
@@ -2137,11 +2175,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
- if (netif_is_vrf(dev_out) &&
- !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
- rth = vrf_dev_get_rth(dev_out);
+
+ rth = l3mdev_get_rtable(dev_out, fl4);
+ if (rth)
goto out;
- }
}
if (!fl4->daddr) {
@@ -2159,7 +2196,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
if (err) {
res.fi = NULL;
res.table = NULL;
- if (fl4->flowi4_oif) {
+ if (fl4->flowi4_oif &&
+ !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2201,18 +2239,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto make_route;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res);
- else
-#endif
- if (!res.prefixlen &&
- res.table->tb_num_default > 1 &&
- res.type == RTN_UNICAST && !fl4->flowi4_oif)
- fib_select_default(fl4, &res);
-
- if (!fl4->saddr)
- fl4->saddr = FIB_RES_PREFSRC(net, res);
+ fib_select_path(net, &res, fl4, mp_hash);
dev_out = FIB_RES_DEV(res);
fl4->flowi4_oif = dev_out->ifindex;
@@ -2225,7 +2252,7 @@ out:
rcu_read_unlock();
return rth;
}
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
+EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
@@ -2277,7 +2304,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard_sk;
+ new->output = dst_discard_out;
new->dev = ort->dst.dev;
if (new->dev)
@@ -2303,7 +2330,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
}
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
- struct sock *sk)
+ const struct sock *sk)
{
struct rtable *rt = __ip_route_output_key(net, flp4);
@@ -2319,7 +2346,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
-static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
u32 seq, int event, int nowait, unsigned int flags)
{
@@ -2339,8 +2366,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
r->rtm_tos = fl4->flowi4_tos;
- r->rtm_table = RT_TABLE_MAIN;
- if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+ r->rtm_table = table_id;
+ if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
r->rtm_type = rt->rt_type;
r->rtm_scope = RT_SCOPE_UNIVERSE;
@@ -2445,6 +2472,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int err;
int mark;
struct sk_buff *skb;
+ u32 table_id = RT_TABLE_MAIN;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2480,6 +2508,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ if (netif_index_is_l3_master(net, fl4.flowi4_oif))
+ fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
+
if (iif) {
struct net_device *dev;
@@ -2514,7 +2545,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- err = rt_fill_info(net, dst, src, &fl4, skb,
+ if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
+ table_id = rt->rt_table_id;
+
+ err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
RTM_NEWROUTE, 0, 0);
if (err < 0)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index d70b1f603692..4cbe9f0a4281 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -192,15 +192,11 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
-__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
- __u16 *mssp)
+__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
{
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- tcp_synq_overflow(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
return __cookie_v4_init_sequence(iph, th, mssp);
}
@@ -225,10 +221,13 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
+ bool own_req;
- child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+ NULL, &own_req);
if (child) {
atomic_set(&req->rsk_refcnt, 1);
+ sock_rps_save_rxhash(child, skb);
inet_csk_reqsk_queue_add(sk, req, child);
} else {
reqsk_free(req);
@@ -288,6 +287,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
}
EXPORT_SYMBOL(cookie_ecn_ok);
+/* On input, sk is a listener.
+ * Output is listener if incoming packet would not create a child
+ * NULL if memory could not be allocated.
+ */
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
@@ -326,7 +329,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
if (!req)
goto out;
@@ -345,7 +348,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
- treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+ treq->snt_synack.v64 = 0;
treq->tfo_listener = false;
ireq->ir_iif = sk->sk_bound_dev_if;
@@ -381,10 +384,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
/* Try to redo what tcp_v4_send_synack did. */
- req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
- &req->rcv_wnd, &req->window_clamp,
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 894da3a70aff..a0bd7a55193e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -48,14 +48,14 @@ static void set_local_port_range(struct net *net, int range[2])
{
bool same_parity = !((range[0] ^ range[1]) & 1);
- write_seqlock(&net->ipv4.ip_local_ports.lock);
+ write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
if (same_parity && !net->ipv4.ip_local_ports.warned) {
net->ipv4.ip_local_ports.warned = true;
pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
}
net->ipv4.ip_local_ports.range[0] = range[0];
net->ipv4.ip_local_ports.range[1] = range[1];
- write_sequnlock(&net->ipv4.ip_local_ports.lock);
+ write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_recovery",
+ .data = &sysctl_tcp_recovery,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_reordering",
.data = &sysctl_tcp_reordering,
.maxlen = sizeof(int),
@@ -577,6 +584,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_min_rtt_wlen",
+ .data = &sysctl_tcp_min_rtt_wlen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8b8fa184f75..c82cca18c90f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ tp->rtt_min[0].rtt = ~0U;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -450,11 +451,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
unsigned int mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ int state;
sock_rps_record_flow(sk);
sock_poll_wait(file, sk_sleep(sk), wait);
- if (sk->sk_state == TCP_LISTEN)
+
+ state = sk_state_load(sk);
+ if (state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
@@ -491,14 +495,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
* blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+ if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= POLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLIN | POLLRDNORM | POLLRDHUP;
/* Connected or passive Fast Open socket? */
- if (sk->sk_state != TCP_SYN_SENT &&
- (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ if (state != TCP_SYN_SENT &&
+ (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
if (tp->urg_seq == tp->copied_seq &&
@@ -506,9 +510,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
tp->urg_data)
target++;
- /* Potential race condition. If read of tp below will
- * escape above sk->sk_state, we can be illegally awaken
- * in SYN_* states. */
if (tp->rcv_nxt - tp->copied_seq >= target)
mask |= POLLIN | POLLRDNORM;
@@ -516,8 +517,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
- set_bit(SOCK_ASYNC_NOSPACE,
- &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
/* Race breaker. If space is freed after
@@ -900,11 +900,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto out_err;
}
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
copied = 0;
@@ -967,7 +968,8 @@ new_segment:
copied += copy;
offset += copy;
- if (!(size -= copy)) {
+ size -= copy;
+ if (!size) {
tcp_tx_timestamp(sk, skb);
goto out;
}
@@ -988,7 +990,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1111,7 +1114,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto do_error;
}
@@ -1129,7 +1133,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
/* This should be in poll */
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1267,7 +1271,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1767,7 +1772,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* __ Restore normal policy in scheduler __ */
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
@@ -1778,7 +1784,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
do_prequeue:
tcp_prequeue_process(sk);
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
@@ -1926,7 +1933,7 @@ void tcp_set_state(struct sock *sk, int state)
/* Change state AFTER socket is unhashed to avoid closed
* socket sitting in hash tables.
*/
- sk->sk_state = state;
+ sk_state_store(sk, state);
#ifdef STATE_TRACE
SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
@@ -2230,7 +2237,8 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
- if ((tp->write_seq += tp->max_window + 2) == 0)
+ tp->write_seq += tp->max_window + 2;
+ if (tp->write_seq == 0)
tp->write_seq = 1;
icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
@@ -2253,13 +2261,6 @@ int tcp_disconnect(struct sock *sk, int flags)
}
EXPORT_SYMBOL(tcp_disconnect);
-void tcp_sock_destruct(struct sock *sk)
-{
- inet_sock_destruct(sk);
-
- kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
-}
-
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
@@ -2581,7 +2582,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
TCPF_LISTEN))) {
tcp_fastopen_init_key_once(true);
- err = fastopen_init_queue(sk, val);
+ fastopen_queue_tune(sk, val);
} else {
err = -EINVAL;
}
@@ -2642,7 +2643,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (sk->sk_type != SOCK_STREAM)
return;
- info->tcpi_state = sk->sk_state;
+ info->tcpi_state = sk_state_load(sk);
+
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
@@ -2670,7 +2672,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
- if (sk->sk_state == TCP_LISTEN) {
+ if (info->tcpi_state == TCP_LISTEN) {
info->tcpi_unacked = sk->sk_ack_backlog;
info->tcpi_sacked = sk->sk_max_ack_backlog;
} else {
@@ -2849,10 +2851,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_FASTOPEN:
- if (icsk->icsk_accept_queue.fastopenq)
- val = icsk->icsk_accept_queue.fastopenq->max_qlen;
- else
- val = 0;
+ val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
case TCP_TIMESTAMP:
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 93c4dc3ab23f..882caa4e72bc 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -173,6 +173,10 @@ out:
*/
if (ca->get_info)
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ if (ca->flags & TCP_CONG_NEEDS_ECN)
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
}
void tcp_init_congestion_control(struct sock *sk)
@@ -181,6 +185,10 @@ void tcp_init_congestion_control(struct sock *sk)
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
}
static void tcp_reinit_congestion_control(struct sock *sk,
@@ -192,8 +200,8 @@ static void tcp_reinit_congestion_control(struct sock *sk,
icsk->icsk_ca_ops = ca;
icsk->icsk_ca_setsockopt = 1;
- if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
- icsk->icsk_ca_ops->init(sk);
+ if (sk->sk_state != TCP_CLOSE)
+ tcp_init_congestion_control(sk);
}
/* Manage refcounts on socket close. */
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 479f34946177..b31604086edd 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -21,7 +21,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
{
struct tcp_info *info = _info;
- if (sk->sk_state == TCP_LISTEN) {
+ if (sk_state_load(sk) == TCP_LISTEN) {
r->idiag_rqueue = sk->sk_ack_backlog;
r->idiag_wqueue = sk->sk_max_ack_backlog;
} else if (sk->sk_type == SOCK_STREAM) {
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f9c0fb84e435..55be6ac70cff 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -124,27 +124,29 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
return false;
}
-static bool tcp_fastopen_create_child(struct sock *sk,
- struct sk_buff *skb,
- struct dst_entry *dst,
- struct request_sock *req)
+static struct sock *tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct dst_entry *dst,
+ struct request_sock *req)
{
struct tcp_sock *tp;
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
struct sock *child;
u32 end_seq;
+ bool own_req;
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ NULL, &own_req);
if (!child)
- return false;
+ return NULL;
- spin_lock(&queue->fastopenq->lock);
- queue->fastopenq->qlen++;
- spin_unlock(&queue->fastopenq->lock);
+ spin_lock(&queue->fastopenq.lock);
+ queue->fastopenq.qlen++;
+ spin_unlock(&queue->fastopenq.lock);
/* Initialize the child socket. Have to fix some values to take
* into account the child is a Fast Open socket and is created
@@ -161,15 +163,13 @@ static bool tcp_fastopen_create_child(struct sock *sk,
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
/* Activate the retrans timer so that SYNACK can be retransmitted.
- * The request socket is not added to the SYN table of the parent
+ * The request socket is not added to the ehash
* because it's been added to the accept queue directly.
*/
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
- atomic_set(&req->rsk_refcnt, 1);
- /* Add the child socket directly into the accept queue */
- inet_csk_reqsk_queue_add(sk, req, child);
+ atomic_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
inet_csk(child)->icsk_af_ops->rebuild_header(child);
@@ -178,12 +178,10 @@ static bool tcp_fastopen_create_child(struct sock *sk,
tcp_init_metrics(child);
tcp_init_buffer_space(child);
- /* Queue the data carried in the SYN packet. We need to first
- * bump skb's refcnt because the caller will attempt to free it.
- * Note that IPv6 might also have used skb_get() trick
- * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts)
- * So we need to eventually get a clone of the packet,
- * before inserting it in sk_receive_queue.
+ /* Queue the data carried in the SYN packet.
+ * We used to play tricky games with skb_get().
+ * With lockless listener, it is a dead end.
+ * Do not think about it.
*
* XXX (TFO) - we honor a zero-payload TFO request for now,
* (any reason not to?) but no need to queue the skb since
@@ -191,12 +189,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
*/
end_seq = TCP_SKB_CB(skb)->end_seq;
if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
- struct sk_buff *skb2;
-
- if (unlikely(skb_shared(skb)))
- skb2 = skb_clone(skb, GFP_ATOMIC);
- else
- skb2 = skb_get(skb);
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (likely(skb2)) {
skb_dst_drop(skb2);
@@ -214,11 +207,10 @@ static bool tcp_fastopen_create_child(struct sock *sk,
}
}
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
- sk->sk_data_ready(sk);
- bh_unlock_sock(child);
- sock_put(child);
- WARN_ON(!req->sk);
- return true;
+ /* tcp_conn_request() is sending the SYNACK,
+ * and queues the child into listener accept queue.
+ */
+ return child;
}
static bool tcp_fastopen_queue_check(struct sock *sk)
@@ -235,8 +227,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* between qlen overflow causing Fast Open to be disabled
* temporarily vs a server not supporting Fast Open at all.
*/
- fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
- if (!fastopenq || fastopenq->max_qlen == 0)
+ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (fastopenq->max_qlen == 0)
return false;
if (fastopenq->qlen >= fastopenq->max_qlen) {
@@ -261,13 +253,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* may be updated and return the client in the SYN-ACK later. E.g., Fast Open
* cookie request (foc->len == 0).
*/
-bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst)
+struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst)
{
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+ struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
@@ -276,7 +269,7 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
(syn_data || foc->len >= 0) &&
tcp_fastopen_queue_check(sk))) {
foc->len = -1;
- return false;
+ return NULL;
}
if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
@@ -296,11 +289,12 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
* data in SYN_RECV state.
*/
fastopen:
- if (tcp_fastopen_create_child(sk, skb, dst, req)) {
+ child = tcp_fastopen_create_child(sk, skb, dst, req);
+ if (child) {
foc->len = -1;
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVE);
- return true;
+ return child;
}
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else if (foc->len > 0) /* Client presents an invalid cookie */
@@ -308,6 +302,5 @@ fastopen:
valid_foc.exp = foc->exp;
*foc = valid_foc;
- return false;
+ return NULL;
}
-EXPORT_SYMBOL(tcp_try_fastopen);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a8f515bb19c4..2d656eef7f8e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
+int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_thin_dupack __read_mostly;
@@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
if (metric > 0)
tcp_disable_early_retrans(tp);
+ tp->rack.reord = 1;
}
/* This must be called before lost_out is incremented */
@@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
}
}
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
- struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
@@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
return !before(start_seq, end_seq - tp->max_window);
}
-/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "B". Later note: FACK people cheated me again 8), we have to account
- * for reordering! Ugly, but should help.
- *
- * Search retransmitted skbs from write_queue that were sent when snd_nxt was
- * less than what is now known to be received by the other end (derived from
- * highest SACK block). Also calculate the lowest snd_nxt among the remaining
- * retransmitted skbs to avoid some costly processing per ACKs.
- */
-static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt = 0;
- u32 new_low_seq = tp->snd_nxt;
- u32 received_upto = tcp_highest_sack_seq(tp);
-
- if (!tcp_is_fack(tp) || !tp->retrans_out ||
- !after(received_upto, tp->lost_retrans_low) ||
- icsk->icsk_ca_state != TCP_CA_Recovery)
- return;
-
- tcp_for_write_queue(skb, sk) {
- u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
-
- if (skb == tcp_send_head(sk))
- break;
- if (cnt == tp->retrans_out)
- break;
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- continue;
-
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
- continue;
-
- /* TODO: We would like to get rid of tcp_is_fack(tp) only
- * constraint here (see above) but figuring out that at
- * least tp->reordering SACK blocks reside between ack_seq
- * and received_upto is not easy task to do cheaply with
- * the available datastructures.
- *
- * Whether FACK should check here for tp->reordering segs
- * in-between one could argue for either way (it would be
- * rather simple to implement as we could count fack_count
- * during the walk and do tp->fackets_out - fack_count).
- */
- if (after(received_upto, ack_seq)) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- *flag |= FLAG_LOST_RETRANS;
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
- } else {
- if (before(ack_seq, new_low_seq))
- new_low_seq = ack_seq;
- cnt += tcp_skb_pcount(skb);
- }
- }
-
- if (tp->retrans_out)
- tp->lost_retrans_low = new_low_seq;
-}
-
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una)
@@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
+ tcp_rack_advance(tp, xmit_time, sacked);
+
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
@@ -1837,7 +1776,6 @@ advance_sp:
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
- tcp_mark_lost_retrans(sk, &state->flag);
tcp_verify_left_out(tp);
out:
@@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+ return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ before(tp->rx_opt.rcv_tsecr, when);
+}
+
+/* skb is spurious retransmitted if the returned timestamp echo
+ * reply is prior to the skb transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+ tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
- (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+ tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}
/* Undo procedures. */
@@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
}
+ /* Use RACK to detect loss */
+ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+ tcp_rack_mark_lost(sk))
+ flag |= FLAG_LOST_RETRANS;
+
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_xmit_retransmit_queue(sk);
}
+/* Kathleen Nichols' algorithm for tracking the minimum value of
+ * a data stream over some fixed time interval. (E.g., the minimum
+ * RTT over the past five minutes.) It uses constant space and constant
+ * time per update yet almost always delivers the same minimum as an
+ * implementation that has to keep all the data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of the
+ * n'th best >= n-1'th best. It also makes sure that the three values
+ * are widely separated in the time window since that bounds the worse
+ * case error when that data is monotonically increasing over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because it
+ * has no value - the new min is <= everything else in the window by
+ * definition and it's the most recent. So we restart fresh on every new min
+ * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
+ * best.
+ */
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+{
+ const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
+ struct rtt_meas *m = tcp_sk(sk)->rtt_min;
+ struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+ u32 elapsed;
+
+ /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
+ if (unlikely(rttm.rtt <= m[0].rtt))
+ m[0] = m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[1].rtt)
+ m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[2].rtt)
+ m[2] = rttm;
+
+ elapsed = now - m[0].ts;
+ if (unlikely(elapsed > wlen)) {
+ /* Passed entire window without a new min so make 2nd choice
+ * the new min & 3rd choice the new 2nd. So forth and so on.
+ */
+ m[0] = m[1];
+ m[1] = m[2];
+ m[2] = rttm;
+ if (now - m[0].ts > wlen) {
+ m[0] = m[1];
+ m[1] = rttm;
+ if (now - m[0].ts > wlen)
+ m[0] = rttm;
+ }
+ } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
+ /* Passed a quarter of the window without a new min so
+ * take 2nd choice from the 2nd quarter of the window.
+ */
+ m[2] = m[1] = rttm;
+ } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
+ /* Passed half the window without a new min so take the 3rd
+ * choice from the last half of the window.
+ */
+ m[2] = rttm;
+ }
+}
+
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
- long seq_rtt_us, long sack_rtt_us)
+ long seq_rtt_us, long sack_rtt_us,
+ long ca_rtt_us)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* Karn's algorithm forbids taking RTT if some retransmitted data
* is acked (RFC6298).
*/
- if (flag & FLAG_RETRANS_DATA_ACKED)
- seq_rtt_us = -1L;
-
if (seq_rtt_us < 0)
seq_rtt_us = sack_rtt_us;
@@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED)
- seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-
+ seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
+ tp->rx_opt.rcv_tsecr);
if (seq_rtt_us < 0)
return false;
+ /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+ * always taken together with ACK, SACK, or TS-opts. Any negative
+ * values will be skipped with the seq_rtt_us < 0 check above.
+ */
+ tcp_update_rtt_min(sk, ca_rtt_us);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
@@ -2953,21 +2974,21 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
}
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
+void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
- struct tcp_sock *tp = tcp_sk(sk);
- long seq_rtt_us = -1L;
+ long rtt_us = -1L;
- if (synack_stamp && !tp->total_retrans)
- seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
+ if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) {
+ struct skb_mstamp now;
- /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
- * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
- */
- if (!tp->srtt_us)
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
+ skb_mstamp_get(&now);
+ rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
+ }
+
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
}
+
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
+ else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_SACK_RENEGING;
skb_mstamp_get(&now);
- if (likely(first_ackt.v64)) {
+ if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
}
@@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
}
- rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+ ca_rtt_us);
if (flag & FLAG_ACKED) {
tcp_rearm_rto(sk);
@@ -4457,19 +4481,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_buff *skb;
+ int err = -ENOMEM;
+ int data_len = 0;
bool fragstolen;
if (size == 0)
return 0;
- skb = alloc_skb(size, sk->sk_allocation);
+ if (size > PAGE_SIZE) {
+ int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
+
+ data_len = npages << PAGE_SHIFT;
+ size = data_len + (size & ~PAGE_MASK);
+ }
+ skb = alloc_skb_with_frags(size - data_len, data_len,
+ PAGE_ALLOC_COSTLY_ORDER,
+ &err, sk->sk_allocation);
if (!skb)
goto err;
+ skb_put(skb, size - data_len);
+ skb->data_len = data_len;
+ skb->len = size;
+
if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto err_free;
- if (memcpy_from_msg(skb_put(skb, size), msg, size))
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+ if (err)
goto err_free;
TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
@@ -4485,7 +4524,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
err_free:
kfree_skb(skb);
err:
- return -ENOMEM;
+ return err;
+
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
@@ -5472,7 +5512,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
}
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -5643,6 +5683,7 @@ discard:
}
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->copied_seq = tp->rcv_nxt;
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -5698,15 +5739,14 @@ reset_and_undo:
* address independent.
*/
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;
bool acceptable;
- u32 synack_stamp;
tp->rx_opt.saw_tstamp = 0;
@@ -5750,7 +5790,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
case TCP_SYN_SENT:
- queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -5785,15 +5825,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (!acceptable)
return 1;
+ if (!tp->srtt_us)
+ tcp_synack_rtt_meas(sk, req);
+
/* Once we leave TCP_SYN_RECV, we no longer need req
* so release it.
*/
if (req) {
- synack_stamp = tcp_rsk(req)->snt_synack;
tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
- synack_stamp = tp->lsndtime;
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_congestion_control(sk);
@@ -5816,7 +5857,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_synack_rtt_meas(sk, synack_stamp);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -6023,11 +6063,11 @@ static void tcp_openreq_init(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
- req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ skb_mstamp_get(&tcp_rsk(req)->snt_synack);
tcp_rsk(req)->last_oow_ack_time = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
@@ -6043,9 +6083,11 @@ static void tcp_openreq_init(struct request_sock *req,
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
- struct sock *sk_listener)
+ struct sock *sk_listener,
+ bool attach_listener)
{
- struct request_sock *req = reqsk_alloc(ops, sk_listener);
+ struct request_sock *req = reqsk_alloc(ops, sk_listener,
+ attach_listener);
if (req) {
struct inet_request_sock *ireq = inet_rsk(req);
@@ -6065,13 +6107,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(struct sock *sk,
+static bool tcp_syn_flood_action(const struct sock *sk,
const struct sk_buff *skb,
const char *proto)
{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
bool want_cookie = false;
- struct listen_sock *lopt;
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
@@ -6082,12 +6124,12 @@ static bool tcp_syn_flood_action(struct sock *sk,
#endif
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
- if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
- lopt->synflood_warned = 1;
+ if (!queue->synflood_warned &&
+ sysctl_tcp_syncookies != 2 &&
+ xchg(&queue->synflood_warned, 1) == 0)
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
- }
+
return want_cookie;
}
@@ -6112,16 +6154,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
{
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
- struct request_sock *req;
struct tcp_sock *tp = tcp_sk(sk);
+ struct sock *fastopen_sk = NULL;
struct dst_entry *dst = NULL;
- __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
- bool want_cookie = false, fastopen;
+ struct request_sock *req;
+ bool want_cookie = false;
struct flowi fl;
- struct tcp_fastopen_cookie foc = { .len = -1 };
- int err;
-
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
@@ -6145,7 +6186,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
- req = inet_reqsk_alloc(rsk_ops, sk);
+ req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
goto drop;
@@ -6228,20 +6269,30 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
- fastopen = !want_cookie &&
- tcp_try_fastopen(sk, skb, req, &foc, dst);
- err = af_ops->send_synack(sk, dst, &fl, req,
- skb_get_queue_mapping(skb), &foc);
- if (!fastopen) {
- if (err || want_cookie)
- goto drop_and_free;
-
+ if (!want_cookie) {
+ tcp_reqsk_record_syn(sk, req, skb);
+ fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+ }
+ if (fastopen_sk) {
+ af_ops->send_synack(fastopen_sk, dst, &fl, req,
+ &foc, false);
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(fastopen_sk);
+ sock_put(fastopen_sk);
+ } else {
tcp_rsk(req)->tfo_listener = false;
- af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ if (!want_cookie)
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ af_ops->send_synack(sk, dst, &fl, req,
+ &foc, !want_cookie);
+ if (want_cookie)
+ goto drop_and_free;
}
- tcp_reqsk_record_syn(sk, req, skb);
-
+ reqsk_put(req);
return 0;
drop_and_release:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 93898e093d4e..d8841a2f1569 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -324,7 +324,6 @@ void tcp_req_err(struct sock *sk, u32 seq)
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- reqsk_put(req);
} else {
/*
* Still in SYN_RECV, just remove it silently.
@@ -332,9 +331,10 @@ void tcp_req_err(struct sock *sk, u32 seq)
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
}
+ reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
@@ -576,7 +576,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
@@ -795,7 +795,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
inet_twsk_put(tw);
}
-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
@@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
0,
@@ -818,11 +818,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
* This still operates on a request_sock only, not on a big
* socket.
*/
-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
- u16 queue_mapping,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -833,12 +833,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc);
+ skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- skb_set_queue_mapping(skb, queue_mapping);
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
ireq->opt);
@@ -865,7 +864,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
*/
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
+struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
const union tcp_md5_addr *addr,
int family)
{
@@ -877,7 +876,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
sock_owned_by_user(sk) ||
- lockdep_is_held(&sk->sk_lock.slock));
+ lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
if (!md5sig)
return NULL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -894,7 +893,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
-struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
+struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
const union tcp_md5_addr *addr;
@@ -922,7 +921,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk));
+ sock_owned_by_user(sk) ||
+ lockdep_is_held(&sk->sk_lock.slock));
if (!md5sig) {
md5sig = kmalloc(sizeof(*md5sig), gfp);
if (!md5sig)
@@ -1112,10 +1112,13 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+#endif
+
/* Called with rcu_read_lock() */
-static bool tcp_v4_inbound_md5_hash(struct sock *sk,
+static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
const struct sk_buff *skb)
{
+#ifdef CONFIG_TCP_MD5SIG
/*
* This gets called for each TCP segment that arrives
* so we want to be efficient.
@@ -1165,10 +1168,12 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk,
return true;
}
return false;
-}
#endif
+ return false;
+}
-static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
+static void tcp_v4_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
@@ -1179,7 +1184,8 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
ireq->opt = tcp_v4_save_options(skb);
}
-static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+ struct flowi *fl,
const struct request_sock *req,
bool *strict)
{
@@ -1218,7 +1224,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.route_req = tcp_v4_route_req,
.init_seq = tcp_v4_init_sequence,
.send_synack = tcp_v4_send_synack,
- .queue_hash_add = inet_csk_reqsk_queue_hash_add,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1241,9 +1246,11 @@ EXPORT_SYMBOL(tcp_v4_conn_request);
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
-struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst)
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
@@ -1277,7 +1284,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- sk_set_txhash(newsk);
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = newtp->write_seq ^ jiffies;
@@ -1320,7 +1326,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- __inet_hash_nolisten(newsk, NULL);
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ if (*own_req)
+ tcp_move_syn(newtp, req);
return newsk;
@@ -1338,34 +1346,11 @@ put_and_exit:
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
+#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
- const struct iphdr *iph = ip_hdr(skb);
- struct request_sock *req;
- struct sock *nsk;
-
- req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
- if (req) {
- nsk = tcp_check_req(sk, skb, req, false);
- if (!nsk || nsk == sk)
- reqsk_put(req);
- return nsk;
- }
-
- nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
- th->source, iph->daddr, th->dest, inet_iif(skb));
-
- if (nsk) {
- if (nsk->sk_state != TCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-#ifdef CONFIG_SYN_COOKIES
if (!th->syn)
sk = cookie_v4_check(sk, skb);
#endif
@@ -1373,7 +1358,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
}
/* The socket must have it's spinlock held when we get
- * here.
+ * here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
@@ -1404,13 +1389,13 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
- struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+ struct sock *nsk = tcp_v4_cookie_check(sk, skb);
+
if (!nsk)
goto discard;
-
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
- sk_mark_napi_id(sk, skb);
+ sk_mark_napi_id(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
@@ -1420,7 +1405,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
} else
sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
+ if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
@@ -1508,7 +1493,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
if (likely(sk->sk_rx_dst))
skb_dst_drop(skb);
else
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
@@ -1590,6 +1575,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
+lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
@@ -1598,6 +1584,33 @@ process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ struct sock *nsk = NULL;
+
+ sk = req->rsk_listener;
+ if (tcp_v4_inbound_md5_hash(sk, skb))
+ goto discard_and_relse;
+ if (likely(sk->sk_state == TCP_LISTEN)) {
+ nsk = tcp_check_req(sk, skb, req, false);
+ } else {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ if (!nsk) {
+ reqsk_put(req);
+ goto discard_it;
+ }
+ if (nsk == sk) {
+ sock_hold(sk);
+ reqsk_put(req);
+ } else if (tcp_child_process(sk, nsk, skb)) {
+ tcp_v4_send_reset(nsk, skb);
+ goto discard_it;
+ } else {
+ return 0;
+ }
+ }
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
@@ -1606,25 +1619,23 @@ process:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * We really want to reject the packet as early as possible
- * if:
- * o We're expecting an MD5'd packet and this is no MD5 tcp option
- * o There is an MD5 option and we're not expecting one
- */
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard_and_relse;
-#endif
nf_reset(skb);
if (sk_filter(sk, skb))
goto discard_and_relse;
- sk_incoming_cpu_update(sk);
skb->dev = NULL;
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = tcp_v4_do_rcv(sk, skb);
+ goto put_and_return;
+ }
+
+ sk_incoming_cpu_update(sk);
+
bh_lock_sock_nested(sk);
tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0;
@@ -1639,6 +1650,7 @@ process:
}
bh_unlock_sock(sk);
+put_and_return:
sock_put(sk);
return ret;
@@ -1709,8 +1721,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- if (dst) {
- dst_hold(dst);
+ if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
@@ -1833,35 +1844,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- if (st->state == TCP_SEQ_STATE_OPENREQ) {
- struct request_sock *req = cur;
-
- icsk = inet_csk(st->syn_wait_sk);
- req = req->dl_next;
- while (1) {
- while (req) {
- if (req->rsk_ops->family == st->family) {
- cur = req;
- goto out;
- }
- req = req->dl_next;
- }
- if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
- break;
-get_req:
- req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
- }
- sk = sk_nulls_next(st->syn_wait_sk);
- st->state = TCP_SEQ_STATE_LISTENING;
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- } else {
- icsk = inet_csk(sk);
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue))
- goto start_req;
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- sk = sk_nulls_next(sk);
- }
+ sk = sk_nulls_next(sk);
get_sk:
sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net))
@@ -1871,16 +1854,6 @@ get_sk:
goto out;
}
icsk = inet_csk(sk);
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
-start_req:
- st->uid = sock_i_uid(sk);
- st->syn_wait_sk = sk;
- st->state = TCP_SEQ_STATE_OPENREQ;
- st->sbucket = 0;
- goto get_req;
- }
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
@@ -2012,7 +1985,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
void *rc = NULL;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
if (st->bucket >= INET_LHTABLE_SIZE)
break;
@@ -2071,7 +2043,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
@@ -2096,11 +2067,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
struct tcp_iter_state *st = seq->private;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
- if (v) {
- struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- }
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
@@ -2154,7 +2120,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
- struct seq_file *f, int i, kuid_t uid)
+ struct seq_file *f, int i)
{
const struct inet_request_sock *ireq = inet_rsk(req);
long delta = req->rsk_timer.expires - jiffies;
@@ -2171,7 +2137,8 @@ static void get_openreq4(const struct request_sock *req,
1, /* timers active (only the expire timer) */
jiffies_delta_to_clock_t(delta),
req->num_timeout,
- from_kuid_munged(seq_user_ns(f), uid),
+ from_kuid_munged(seq_user_ns(f),
+ sock_i_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0,
@@ -2185,12 +2152,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct inet_sock *inet = inet_sk(sk);
- struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+ const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
__be32 dest = inet->inet_daddr;
__be32 src = inet->inet_rcv_saddr;
__u16 destp = ntohs(inet->inet_dport);
__u16 srcp = ntohs(inet->inet_sport);
int rx_queue;
+ int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
@@ -2208,17 +2176,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
timer_expires = jiffies;
}
- if (sk->sk_state == TCP_LISTEN)
+ state = sk_state_load(sk);
+ if (state == TCP_LISTEN)
rx_queue = sk->sk_ack_backlog;
else
- /*
- * because we dont lock socket, we might find a transient negative value
+ /* Because we don't lock the socket,
+ * we might find a transient negative value.
*/
rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
- i, src, srcp, dest, destp, sk->sk_state,
+ i, src, srcp, dest, destp, state,
tp->write_seq - tp->snd_una,
rx_queue,
timer_active,
@@ -2232,8 +2201,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
- sk->sk_state == TCP_LISTEN ?
- (fastopenq ? fastopenq->max_qlen : 0) :
+ state == TCP_LISTEN ?
+ fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
@@ -2272,18 +2241,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
}
st = seq->private;
- switch (st->state) {
- case TCP_SEQ_STATE_LISTENING:
- case TCP_SEQ_STATE_ESTABLISHED:
- if (sk->sk_state == TCP_TIME_WAIT)
- get_timewait4_sock(v, seq, st->num);
- else
- get_tcp4_sock(v, seq, st->num);
- break;
- case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(v, seq, st->num, st->uid);
- break;
- }
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ get_openreq4(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
out:
seq_pad(seq, '\n');
return 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index def765911ff8..ac6b1961ffeb 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -361,30 +361,38 @@ void tcp_twsk_destructor(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+/* Warning : This function is called without sk_listener being locked.
+ * Be sure to read socket fields once, as their value could change under us.
+ */
void tcp_openreq_init_rwin(struct request_sock *req,
- struct sock *sk, struct dst_entry *dst)
+ const struct sock *sk_listener,
+ const struct dst_entry *dst)
{
struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_sock *tp = tcp_sk(sk);
- __u8 rcv_wscale;
+ const struct tcp_sock *tp = tcp_sk(sk_listener);
+ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ int full_space = tcp_full_space(sk_listener);
int mss = dst_metric_advmss(dst);
+ u32 window_clamp;
+ __u8 rcv_wscale;
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
- mss = tp->rx_opt.user_mss;
+ if (user_mss && user_mss < mss)
+ mss = user_mss;
+ window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
- req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
/* limit the window selection if the user enforce a smaller rx buffer */
- if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
- (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
- req->window_clamp = tcp_full_space(sk);
+ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
/* tcp_full_space because it is guaranteed to be the first packet */
- tcp_select_initial_window(tcp_full_space(sk),
+ tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
+ &req->rsk_rcv_wnd,
+ &req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
@@ -433,7 +441,9 @@ EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
* Actually, we could lots of memory writes here. tp of listening
* socket contains all necessary default parameters.
*/
-struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
+struct sock *tcp_create_openreq_child(const struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
@@ -460,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ newtp->rtt_min[0].rtt = ~0U;
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -469,7 +480,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
- newtp->lsndtime = treq->snt_synack;
+ newtp->lsndtime = treq->snt_synack.stamp_jiffies;
+ newsk->sk_txhash = treq->txhash;
newtp->last_oow_ack_time = 0;
newtp->total_retrans = req->num_retrans;
@@ -501,9 +513,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
if (sysctl_tcp_fack)
tcp_enable_fack(newtp);
}
- newtp->window_clamp = req->window_clamp;
- newtp->rcv_ssthresh = req->rcv_wnd;
- newtp->rcv_wnd = req->rcv_wnd;
+ newtp->window_clamp = req->rsk_window_clamp;
+ newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
@@ -536,9 +548,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0;
-
- newtp->saved_syn = req->saved_syn;
- req->saved_syn = NULL;
+ newtp->rack.mstamp.v64 = 0;
+ newtp->rack.advanced = 0;
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
@@ -566,8 +577,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
bool paws_reject = false;
-
- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+ bool own_req;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
@@ -698,7 +708,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
+ tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(sk, skb, req);
@@ -755,16 +765,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*/
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ req, &own_req);
if (!child)
goto listen_overflow;
- inet_csk_reqsk_queue_drop(sk, req);
- inet_csk_reqsk_queue_add(sk, req, child);
- /* Warning: caller must not call reqsk_put(req);
- * child stole last reference on it.
- */
- return child;
+ sock_rps_save_rxhash(child, skb);
+ tcp_synack_rtt_meas(child, req);
+ return inet_csk_complete_hashdance(sk, child, req, own_req);
listen_overflow:
if (!sysctl_tcp_abort_on_overflow) {
@@ -811,8 +819,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
int state = child->sk_state;
if (!sock_owned_by_user(child)) {
- ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
- skb->len);
+ ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
parent->sk_data_ready(parent);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3dbee0d83b15..9bfc39ff2285 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -357,14 +357,10 @@ static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
}
static void
-tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
- struct sock *sk)
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
- if (inet_rsk(req)->ecn_ok) {
+ if (inet_rsk(req)->ecn_ok)
th->ece = 1;
- if (tcp_ca_needs_ecn(sk))
- INET_ECN_xmit(sk);
- }
}
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -612,12 +608,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
/* Set up TCP options for SYN-ACKs. */
-static unsigned int tcp_synack_options(struct sock *sk,
- struct request_sock *req,
- unsigned int mss, struct sk_buff *skb,
- struct tcp_out_options *opts,
- const struct tcp_md5sig_key *md5,
- struct tcp_fastopen_cookie *foc)
+static unsigned int tcp_synack_options(struct request_sock *req,
+ unsigned int mss, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ const struct tcp_md5sig_key *md5,
+ struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -1827,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
/* Ok, it looks like it is advisable to defer. */
- if (cong_win < send_win && cong_win < skb->len)
+ if (cong_win < send_win && cong_win <= skb->len)
*is_cwnd_limited = true;
return true;
@@ -2060,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
- is_cwnd_limited = true;
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
@@ -2142,6 +2136,7 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk);
+ is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
@@ -2165,7 +2160,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
- if (sk->sk_state == TCP_SYN_RECV)
+ if (tp->fastopen_rsk)
return false;
/* TLP is only scheduled when next timer event is RTO. */
@@ -2175,7 +2170,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
+ if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
!tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
return false;
@@ -2184,9 +2179,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
return false;
/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
- * for delayed ack when there's one outstanding packet.
+ * for delayed ack when there's one outstanding packet. If no RTT
+ * sample is available then probe after TCP_TIMEOUT_INIT.
*/
- timeout = rtt << 1;
+ timeout = rtt << 1 ? : TCP_TIMEOUT_INIT;
if (tp->packets_out == 1)
timeout = max_t(u32, timeout,
(rtt + (rtt >> 1) + TCP_DELACK_MAX));
@@ -2659,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
- if (!tp->retrans_out)
- tp->lost_retrans_low = tp->snd_nxt;
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
@@ -2668,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- /* snd_nxt is stored to detect loss of retransmitted segment,
- * see tcp_input.c tcp_sacktag_write_queue().
- */
- TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
} else if (err != -EBUSY) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
@@ -2949,20 +2939,22 @@ int tcp_send_synack(struct sock *sk)
* Allocate one skb and build a SYNACK packet.
* @dst is consumed : Caller should not use it again.
*/
-struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
- struct tcp_out_options opts;
struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcphdr *th;
- struct sk_buff *skb;
+ const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *md5 = NULL;
+ struct tcp_out_options opts;
+ struct sk_buff *skb;
int tcp_header_size;
+ struct tcphdr *th;
+ u16 user_mss;
int mss;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -2970,11 +2962,21 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
+ if (attach_req) {
+ skb_set_owner_w(skb, req_to_sk(req));
+ } else {
+ /* sk is a const pointer, because we want to express multiple
+ * cpu might call us concurrently.
+ * sk->sk_wmem_alloc in an atomic, we can promote to rw.
+ */
+ skb_set_owner_w(skb, (struct sock *)sk);
+ }
skb_dst_set(skb, dst);
mss = dst_metric_advmss(dst);
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
- mss = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss && user_mss < mss)
+ mss = user_mss;
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
@@ -2988,8 +2990,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
rcu_read_lock();
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
- tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc) + sizeof(*th);
+ skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
+ sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -2998,7 +3001,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- tcp_ecn_make_synack(req, th, sk);
+ tcp_ecn_make_synack(req, th);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
@@ -3012,8 +3015,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
- th->window = htons(min(req->rcv_wnd, 65535U));
- tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ th->window = htons(min(req->rsk_rcv_wnd, 65535U));
+ tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
@@ -3147,7 +3150,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0, copied;
+ int syn_loss = 0, space, err = 0;
unsigned long last_syn_loss = 0;
struct sk_buff *syn_data;
@@ -3185,17 +3188,18 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
goto fallback;
syn_data->ip_summed = CHECKSUM_PARTIAL;
memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
- copied = copy_from_iter(skb_put(syn_data, space), space,
- &fo->data->msg_iter);
- if (unlikely(!copied)) {
- kfree_skb(syn_data);
- goto fallback;
- }
- if (copied != space) {
- skb_trim(syn_data, copied);
- space = copied;
+ if (space) {
+ int copied = copy_from_iter(skb_put(syn_data, space), space,
+ &fo->data->msg_iter);
+ if (unlikely(!copied)) {
+ kfree_skb(syn_data);
+ goto fallback;
+ }
+ if (copied != space) {
+ skb_trim(syn_data, copied);
+ space = copied;
+ }
}
-
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size)
fo->data = NULL;
@@ -3500,13 +3504,14 @@ void tcp_send_probe0(struct sock *sk)
TCP_RTO_MAX);
}
-int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
struct flowi fl;
int res;
- res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ tcp_rsk(req)->txhash = net_tx_rndhash();
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
if (!res) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
new file mode 100644
index 000000000000..5353085fd0b2
--- /dev/null
+++ b/net/ipv4/tcp_recovery.c
@@ -0,0 +1,109 @@
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+ return 0;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+
+ /* To be more reordering resilient, allow min_rtt/4 settling delay
+ * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
+ * RTT because reordering is often a path property and less related
+ * to queuing or delayed ACKs.
+ *
+ * TODO: measure and adapt to the observed reordering delay, and
+ * use a timer to retransmit like the delayed early retransmit.
+ */
+ reo_wnd = 1000;
+ if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+ tcp_for_write_queue(skb, sk) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (skb == tcp_send_head(sk))
+ break;
+
+ /* Skip ones already (s)acked */
+ if (!after(scb->end_seq, tp->snd_una) ||
+ scb->sacked & TCPCB_SACKED_ACKED)
+ continue;
+
+ if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+ if (skb_mstamp_us_delta(&tp->rack.mstamp,
+ &skb->skb_mstamp) <= reo_wnd)
+ continue;
+
+ /* skb is lost if packet sent later is sacked */
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ if (scb->sacked & TCPCB_SACKED_RETRANS) {
+ scb->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSTRETRANSMIT);
+ }
+ } else if (!(scb->sacked & TCPCB_RETRANS)) {
+ /* Original data are sent sequentially so stop early
+ * b/c the rest are all sent after rack_sent
+ */
+ break;
+ }
+ }
+ return prior_retrans - tp->retrans_out;
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets */
+void tcp_rack_advance(struct tcp_sock *tp,
+ const struct skb_mstamp *xmit_time, u8 sacked)
+{
+ if (tp->rack.mstamp.v64 &&
+ !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+ return;
+
+ if (sacked & TCPCB_RETRANS) {
+ struct skb_mstamp now;
+
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * the aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ skb_mstamp_get(&now);
+ if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+ return;
+ }
+
+ tp->rack.mstamp = *xmit_time;
+ tp->rack.advanced = 1;
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 7149ebc820c7..193ba1fa8a9a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
}
/* Calculate maximal number or retries on an orphaned socket. */
-static int tcp_orphan_retries(struct sock *sk, int alive)
+static int tcp_orphan_retries(struct sock *sk, bool alive)
{
int retries = sysctl_tcp_orphan_retries; /* May be zero. */
@@ -168,7 +168,7 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
if (tp->syn_fastopen || tp->syn_data)
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (tp->syn_data)
+ if (tp->syn_data && icsk->icsk_retransmits == 1)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
@@ -176,6 +176,18 @@ static int tcp_write_timeout(struct sock *sk)
syn_set = true;
} else {
if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+ /* Some middle-boxes may black-hole Fast Open _after_
+ * the handshake. Therefore we conservatively disable
+ * Fast Open on this path on recurring timeouts with
+ * few or zero bytes acked after Fast Open.
+ */
+ if (tp->syn_data_acked &&
+ tp->bytes_acked <= tp->rx_opt.mss_clamp) {
+ tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
+ if (icsk->icsk_retransmits == sysctl_tcp_retries1)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -184,7 +196,7 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = icsk->icsk_rto < TCP_RTO_MAX;
+ const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
@@ -298,7 +310,7 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+ const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
max_probes = tcp_orphan_retries(sk, alive);
if (!alive && icsk->icsk_backoff >= max_probes)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f7d1d5e19e95..0c7b0e61b917 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -100,7 +100,6 @@
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
-#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
@@ -375,7 +374,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
-
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
return score;
}
@@ -419,6 +419,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
+
return score;
}
@@ -1017,30 +1020,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- /* unconnected socket. If output device is enslaved to a VRF
- * device lookup source address from VRF table. This mimics
- * behavior of ip_route_connect{_init}.
- */
- if (netif_index_is_vrf(net, ipc.oif)) {
- flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
- RT_SCOPE_UNIVERSE, sk->sk_protocol,
- (flow_flags | FLOWI_FLAG_VRFSRC |
- FLOWI_FLAG_SKIP_NH_OIF),
- faddr, saddr, dport,
- inet->inet_sport);
-
- rt = ip_route_output_flow(net, fl4, sk);
- if (!IS_ERR(rt)) {
- saddr = fl4->saddr;
- ip_rt_put(rt);
- }
- }
-
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
faddr, saddr, dport, inet->inet_sport);
+ if (!saddr && ipc.oif)
+ l3mdev_get_saddr(net, ipc.oif, fl4);
+
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 60b032f58ccc..62e1e72db461 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -22,7 +22,8 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm4_extract_header(skb);
}
-static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb)
+static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
if (!skb_dst(skb)) {
const struct iphdr *iph = ip_hdr(skb);
@@ -52,8 +53,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
- skb->dev, NULL,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
xfrm4_rcv_encap_finish);
return 0;
}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 41a261355662..7ee6518afa86 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -82,24 +82,25 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
return xfrm_output(sk, skb);
}
-static int __xfrm4_output(struct sock *sk, struct sk_buff *skb)
+static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct xfrm_state *x = skb_dst(skb)->xfrm;
#ifdef CONFIG_NETFILTER
if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output_sk(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
return x->outer_mode->afinfo->output_finish(sk, skb);
}
-int xfrm4_output(struct sock *sk, struct sk_buff *skb)
+int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
- NULL, skb_dst(skb)->dev, __xfrm4_output,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb_dst(skb)->dev,
+ __xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 126ff9020bad..7b0edb37a115 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -15,7 +15,7 @@
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/ip.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
@@ -97,6 +97,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_gateway = rt->rt_gateway;
xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+ xdst->u.rt.rt_table_id = rt->rt_table_id;
INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
return 0;
@@ -110,10 +111,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
struct flowi4 *fl4 = &fl->u.ip4;
int oif = 0;
- if (skb_dst(skb)) {
- oif = vrf_master_ifindex(skb_dst(skb)->dev) ?
- : skb_dst(skb)->dev->ifindex;
- }
+ if (skb_dst(skb))
+ oif = l3mdev_fib_oif(skb_dst(skb)->dev);
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
@@ -128,7 +127,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
case IPPROTO_DCCP:
if (xprth + 4 < skb->data ||
pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ports = (__be16 *)xprth;
+ __be16 *ports;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ports = (__be16 *)xprth;
fl4->fl4_sport = ports[!!reverse];
fl4->fl4_dport = ports[!reverse];
@@ -136,8 +138,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
break;
case IPPROTO_ICMP:
- if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
- u8 *icmp = xprth;
+ if (xprth + 2 < skb->data ||
+ pskb_may_pull(skb, xprth + 2 - skb->data)) {
+ u8 *icmp;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ icmp = xprth;
fl4->fl4_icmp_type = icmp[0];
fl4->fl4_icmp_code = icmp[1];
@@ -145,33 +151,50 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
break;
case IPPROTO_ESP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be32 *ehdr = (__be32 *)xprth;
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be32 *ehdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ehdr = (__be32 *)xprth;
fl4->fl4_ipsec_spi = ehdr[0];
}
break;
case IPPROTO_AH:
- if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
- __be32 *ah_hdr = (__be32 *)xprth;
+ if (xprth + 8 < skb->data ||
+ pskb_may_pull(skb, xprth + 8 - skb->data)) {
+ __be32 *ah_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ah_hdr = (__be32 *)xprth;
fl4->fl4_ipsec_spi = ah_hdr[1];
}
break;
case IPPROTO_COMP:
- if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ipcomp_hdr = (__be16 *)xprth;
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ipcomp_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ ipcomp_hdr = (__be16 *)xprth;
fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
}
break;
case IPPROTO_GRE:
- if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
- __be16 *greflags = (__be16 *)xprth;
- __be32 *gre_hdr = (__be32 *)xprth;
+ if (xprth + 12 < skb->data ||
+ pskb_may_pull(skb, xprth + 12 - skb->data)) {
+ __be16 *greflags;
+ __be32 *gre_hdr;
+
+ xprth = skb_network_header(skb) + iph->ihl * 4;
+ greflags = (__be16 *)xprth;
+ gre_hdr = (__be32 *)xprth;
if (greflags[0] & GRE_KEY) {
if (greflags[0] & GRE_CSUM)
@@ -245,7 +268,7 @@ static struct dst_ops xfrm4_dst_ops_template = {
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = 32768,
+ .gc_thresh = INT_MAX,
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {