summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig55
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c47
-rw-r--r--net/ipv4/ah4.c2
-rw-r--r--net/ipv4/arp.c6
-rw-r--r--net/ipv4/cipso_ipv4.c2
-rw-r--r--net/ipv4/datagram.c1
-rw-r--r--net/ipv4/devinet.c36
-rw-r--r--net/ipv4/fib_frontend.c14
-rw-r--r--net/ipv4/fib_semantics.c8
-rw-r--r--net/ipv4/fib_trie.c2
-rw-r--r--net/ipv4/fou.c514
-rw-r--r--net/ipv4/geneve.c373
-rw-r--r--net/ipv4/gre_demux.c9
-rw-r--r--net/ipv4/gre_offload.c57
-rw-r--r--net/ipv4/icmp.c70
-rw-r--r--net/ipv4/igmp.c37
-rw-r--r--net/ipv4/inet_fragment.c318
-rw-r--r--net/ipv4/inet_hashtables.c2
-rw-r--r--net/ipv4/inetpeer.c21
-rw-r--r--net/ipv4/ip_fragment.c91
-rw-r--r--net/ipv4/ip_gre.c94
-rw-r--r--net/ipv4/ip_options.c6
-rw-r--r--net/ipv4/ip_output.c23
-rw-r--r--net/ipv4/ip_sockglue.c21
-rw-r--r--net/ipv4/ip_tunnel.c148
-rw-r--r--net/ipv4/ip_vti.c56
-rw-r--r--net/ipv4/ipconfig.c8
-rw-r--r--net/ipv4/ipip.c82
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/netfilter/Kconfig168
-rw-r--r--net/ipv4/netfilter/Makefile12
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c2
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c108
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c2
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c498
-rw-r--r--net/ipv4/netfilter/iptable_nat.c233
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c4
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c10
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c149
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c385
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c203
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c153
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c2
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c127
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c157
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c77
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c1
-rw-r--r--net/ipv4/ping.c4
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c9
-rw-r--r--net/ipv4/route.c24
-rw-r--r--net/ipv4/syncookies.c5
-rw-r--r--net/ipv4/sysctl_net_ipv4.c49
-rw-r--r--net/ipv4/tcp.c227
-rw-r--r--net/ipv4/tcp_bic.c11
-rw-r--r--net/ipv4/tcp_cong.c55
-rw-r--r--net/ipv4/tcp_cubic.c18
-rw-r--r--net/ipv4/tcp_dctcp.c344
-rw-r--r--net/ipv4/tcp_diag.c5
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_highspeed.c145
-rw-r--r--net/ipv4/tcp_htcp.c6
-rw-r--r--net/ipv4/tcp_hybla.c1
-rw-r--r--net/ipv4/tcp_illinois.c3
-rw-r--r--net/ipv4/tcp_input.c573
-rw-r--r--net/ipv4/tcp_ipv4.c328
-rw-r--r--net/ipv4/tcp_memcontrol.c4
-rw-r--r--net/ipv4/tcp_metrics.c7
-rw-r--r--net/ipv4/tcp_minisocks.c15
-rw-r--r--net/ipv4/tcp_offload.c90
-rw-r--r--net/ipv4/tcp_output.c162
-rw-r--r--net/ipv4/tcp_probe.c6
-rw-r--r--net/ipv4/tcp_scalable.c2
-rw-r--r--net/ipv4/tcp_timer.c56
-rw-r--r--net/ipv4/tcp_vegas.c6
-rw-r--r--net/ipv4/tcp_veno.c3
-rw-r--r--net/ipv4/tcp_westwood.c35
-rw-r--r--net/ipv4/tcp_yeah.c9
-rw-r--r--net/ipv4/udp.c169
-rw-r--r--net/ipv4/udp_offload.c219
-rw-r--r--net/ipv4/udp_tunnel.c108
-rw-r--r--net/ipv4/xfrm4_protocol.c2
86 files changed, 4601 insertions, 2515 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 05c57f0fcabe..e682b48e0709 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -307,6 +307,35 @@ config NET_IPVTI
the notion of a secure tunnel for IPSEC and then use routing protocol
on top.
+config NET_UDP_TUNNEL
+ tristate
+ select NET_IP_TUNNEL
+ default n
+
+config NET_FOU
+ tristate "IP: Foo (IP protocols) over UDP"
+ select XFRM
+ select NET_UDP_TUNNEL
+ ---help---
+ Foo over UDP allows any IP protocol to be directly encapsulated
+ over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP
+ network mechanisms and optimizations for UDP (such as ECMP
+ and RSS) can be leveraged to provide better service.
+
+config GENEVE
+ tristate "Generic Network Virtualization Encapsulation (Geneve)"
+ depends on INET
+ select NET_UDP_TUNNEL
+ ---help---
+ This allows one to create Geneve virtual interfaces that provide
+ Layer 2 Networks over Layer 3 Networks. Geneve is often used
+ to tunnel virtual network infrastructure in virtualized environments.
+ For more information see:
+ http://tools.ietf.org/html/draft-gross-geneve-01
+
+ To compile this driver as a module, choose M here: the module
+
+
config INET_AH
tristate "IP: AH transformation"
select XFRM_ALGO
@@ -556,6 +585,27 @@ config TCP_CONG_ILLINOIS
For further details see:
http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+config TCP_CONG_DCTCP
+ tristate "DataCenter TCP (DCTCP)"
+ default n
+ ---help---
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
+
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
+
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
+
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -584,9 +634,11 @@ choice
config DEFAULT_WESTWOOD
bool "Westwood" if TCP_CONG_WESTWOOD=y
+ config DEFAULT_DCTCP
+ bool "DCTCP" if TCP_CONG_DCTCP=y
+
config DEFAULT_RENO
bool "Reno"
-
endchoice
endif
@@ -606,6 +658,7 @@ config DEFAULT_TCP_CONG
default "westwood" if DEFAULT_WESTWOOD
default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
+ default "dctcp" if DEFAULT_DCTCP
default "cubic"
config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f032688d20d3..518c04ed666e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,8 +20,10 @@ obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
gre-y := gre_demux.o
+obj-$(CONFIG_NET_FOU) += fou.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
@@ -41,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
@@ -53,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_GENEVE) += geneve.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d156b3c5f363..92db7a69f2b9 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -418,10 +418,6 @@ int inet_release(struct socket *sock)
}
EXPORT_SYMBOL(inet_release);
-/* It is off by default, see below. */
-int sysctl_ip_nonlocal_bind __read_mostly;
-EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
-
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
@@ -461,7 +457,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
- if (!sysctl_ip_nonlocal_bind &&
+ if (!net->ipv4.sysctl_ip_nonlocal_bind &&
!(inet->freebind || inet->transparent) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
@@ -1201,40 +1197,6 @@ int inet_sk_rebuild_header(struct sock *sk)
}
EXPORT_SYMBOL(inet_sk_rebuild_header);
-static int inet_gso_send_check(struct sk_buff *skb)
-{
- const struct net_offload *ops;
- const struct iphdr *iph;
- int proto;
- int ihl;
- int err = -EINVAL;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
- goto out;
-
- iph = ip_hdr(skb);
- ihl = iph->ihl * 4;
- if (ihl < sizeof(*iph))
- goto out;
-
- proto = iph->protocol;
-
- /* Warning: after this point, iph might be no longer valid */
- if (unlikely(!pskb_may_pull(skb, ihl)))
- goto out;
- __skb_pull(skb, ihl);
-
- skb_reset_transport_header(skb);
- err = -EPROTONOSUPPORT;
-
- ops = rcu_dereference(inet_offloads[proto]);
- if (likely(ops && ops->callbacks.gso_send_check))
- err = ops->callbacks.gso_send_check(skb);
-
-out:
- return err;
-}
-
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -1407,6 +1369,9 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
* immediately following this IP hdr.
*/
+ /* Note : No need to call skb_gro_postpull_rcsum() here,
+ * as we already checked checksum over ipv4 header was 0
+ */
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
@@ -1659,7 +1624,6 @@ static int ipv4_proc_init(void);
static struct packet_offload ip_packet_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.callbacks = {
- .gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
@@ -1668,8 +1632,9 @@ static struct packet_offload ip_packet_offload __read_mostly = {
static const struct net_offload ipip_offload = {
.callbacks = {
- .gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
},
};
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index a2afa89513a0..ac9a32ec3ee4 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -505,8 +505,6 @@ static int ah_init_state(struct xfrm_state *x)
ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
- BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
-
if (x->props.flags & XFRM_STATE_ALIGN4)
x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
ahp->icv_trunc_len);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 1a9b99e04465..16acb59d665e 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -953,10 +953,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
{
const struct arphdr *arp;
+ /* do not tweak dropwatch on an ARP we will ignore */
if (dev->flags & IFF_NOARP ||
skb->pkt_type == PACKET_OTHERHOST ||
skb->pkt_type == PACKET_LOOPBACK)
- goto freeskb;
+ goto consumeskb;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
@@ -974,6 +975,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+consumeskb:
+ consume_skb(skb);
+ return 0;
freeskb:
kfree_skb(skb);
out_of_mem:
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 05b708bbdb0d..4715f25dfe03 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -246,7 +246,7 @@ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
* success, negative values on error.
*
*/
-static int cipso_v4_cache_init(void)
+static int __init cipso_v4_cache_init(void)
{
u32 iter;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index a3095fdefbed..90c0e8386116 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
+ inet_set_txhash(sk);
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e9449376b58e..214882e7d6de 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -180,11 +180,12 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
-static void devinet_sysctl_register(struct in_device *idev);
+static int devinet_sysctl_register(struct in_device *idev);
static void devinet_sysctl_unregister(struct in_device *idev);
#else
-static void devinet_sysctl_register(struct in_device *idev)
+static int devinet_sysctl_register(struct in_device *idev)
{
+ return 0;
}
static void devinet_sysctl_unregister(struct in_device *idev)
{
@@ -232,6 +233,7 @@ EXPORT_SYMBOL(in_dev_finish_destroy);
static struct in_device *inetdev_init(struct net_device *dev)
{
struct in_device *in_dev;
+ int err = -ENOMEM;
ASSERT_RTNL();
@@ -252,7 +254,13 @@ static struct in_device *inetdev_init(struct net_device *dev)
/* Account for reference dev->ip_ptr (below) */
in_dev_hold(in_dev);
- devinet_sysctl_register(in_dev);
+ err = devinet_sysctl_register(in_dev);
+ if (err) {
+ in_dev->dead = 1;
+ in_dev_put(in_dev);
+ in_dev = NULL;
+ goto out;
+ }
ip_mc_init_dev(in_dev);
if (dev->flags & IFF_UP)
ip_mc_up(in_dev);
@@ -260,7 +268,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
/* we can receive as soon as ip_ptr is set -- do this last */
rcu_assign_pointer(dev->ip_ptr, in_dev);
out:
- return in_dev;
+ return in_dev ?: ERR_PTR(err);
out_kfree:
kfree(in_dev);
in_dev = NULL;
@@ -1347,8 +1355,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (!in_dev) {
if (event == NETDEV_REGISTER) {
in_dev = inetdev_init(dev);
- if (!in_dev)
- return notifier_from_errno(-ENOMEM);
+ if (IS_ERR(in_dev))
+ return notifier_from_errno(PTR_ERR(in_dev));
if (dev->flags & IFF_LOOPBACK) {
IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
@@ -2182,11 +2190,21 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
kfree(t);
}
-static void devinet_sysctl_register(struct in_device *idev)
+static int devinet_sysctl_register(struct in_device *idev)
{
- neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
- __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
+ int err;
+
+ if (!sysctl_dev_name_is_allowed(idev->dev->name))
+ return -EINVAL;
+
+ err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
+ if (err)
+ return err;
+ err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
&idev->cnf);
+ if (err)
+ neigh_sysctl_unregister(idev->arp_parms);
+ return err;
}
static void devinet_sysctl_unregister(struct in_device *idev)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 255aa9946fe7..23104a3f2924 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -243,7 +243,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
u8 tos, int oif, struct net_device *dev,
int rpf, struct in_device *idev, u32 *itag)
{
- int ret, no_addr, accept_local;
+ int ret, no_addr;
struct fib_result res;
struct flowi4 fl4;
struct net *net;
@@ -258,16 +258,17 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
no_addr = idev->ifa_list == NULL;
- accept_local = IN_DEV_ACCEPT_LOCAL(idev);
fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
net = dev_net(dev);
if (fib_lookup(net, &fl4, &res))
goto last_resort;
- if (res.type != RTN_UNICAST) {
- if (res.type != RTN_LOCAL || !accept_local)
- goto e_inval;
- }
+ if (res.type != RTN_UNICAST &&
+ (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
+ goto e_inval;
+ if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+ (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
+ goto last_resort;
fib_combine_itag(itag, &res);
dev_match = false;
@@ -321,6 +322,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
+ IN_DEV_ACCEPT_LOCAL(idev) &&
(dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
*itag = 0;
return 0;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b10cd43a4722..5b6efb3d2308 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -157,9 +157,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp)
static void free_nh_exceptions(struct fib_nh *nh)
{
- struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+ struct fnhe_hash_bucket *hash;
int i;
+ hash = rcu_dereference_protected(nh->nh_exceptions, 1);
+ if (!hash)
+ return;
for (i = 0; i < FNHE_HASH_SIZE; i++) {
struct fib_nh_exception *fnhe;
@@ -205,8 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
dev_put(nexthop_nh->nh_dev);
- if (nexthop_nh->nh_exceptions)
- free_nh_exceptions(nexthop_nh);
+ free_nh_exceptions(nexthop_nh);
rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5afeb5aa4c7c..e9cb2588e416 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -940,7 +940,7 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
last = li;
}
if (last)
- hlist_add_after_rcu(&last->hlist, &new->hlist);
+ hlist_add_behind_rcu(&new->hlist, &last->hlist);
else
hlist_add_before_rcu(&new->hlist, &li->hlist);
}
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
new file mode 100644
index 000000000000..efa70ad44906
--- /dev/null
+++ b/net/ipv4/fou.c
@@ -0,0 +1,514 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/genetlink.h>
+#include <net/gue.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/xfrm.h>
+#include <uapi/linux/fou.h>
+#include <uapi/linux/genetlink.h>
+
+static DEFINE_SPINLOCK(fou_lock);
+static LIST_HEAD(fou_list);
+
+struct fou {
+ struct socket *sock;
+ u8 protocol;
+ u16 port;
+ struct udp_offload udp_offloads;
+ struct list_head list;
+};
+
+struct fou_cfg {
+ u16 type;
+ u8 protocol;
+ struct udp_port_cfg udp_config;
+};
+
+static inline struct fou *fou_from_sock(struct sock *sk)
+{
+ return sk->sk_user_data;
+}
+
+static int fou_udp_encap_recv_deliver(struct sk_buff *skb,
+ u8 protocol, size_t len)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ /* Remove 'len' bytes from the packet (UDP header and
+ * FOU header if present), modify the protocol to the one
+ * we found, and then call rcv_encap.
+ */
+ iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ __skb_pull(skb, len);
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+ skb_reset_transport_header(skb);
+
+ return -protocol;
+}
+
+static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+
+ if (!fou)
+ return 1;
+
+ return fou_udp_encap_recv_deliver(skb, fou->protocol,
+ sizeof(struct udphdr));
+}
+
+static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+ size_t len;
+ struct guehdr *guehdr;
+ struct udphdr *uh;
+
+ if (!fou)
+ return 1;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, len))
+ goto drop;
+
+ uh = udp_hdr(skb);
+ guehdr = (struct guehdr *)&uh[1];
+
+ len += guehdr->hlen << 2;
+ if (!pskb_may_pull(skb, len))
+ goto drop;
+
+ if (guehdr->version != 0)
+ goto drop;
+
+ if (guehdr->flags) {
+ /* No support yet */
+ goto drop;
+ }
+
+ return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct sk_buff **fou_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ u8 proto = NAPI_GRO_CB(skb)->proto;
+ const struct net_offload **offloads;
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out_unlock;
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return pp;
+}
+
+static int fou_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const struct net_offload *ops;
+ u8 proto = NAPI_GRO_CB(skb)->proto;
+ int err = -ENOSYS;
+ const struct net_offload **offloads;
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ err = ops->callbacks.gro_complete(skb, nhoff);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static struct sk_buff **gue_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ u8 proto;
+ struct guehdr *guehdr;
+ unsigned int hlen, guehlen;
+ unsigned int off;
+ int flush = 1;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*guehdr);
+ guehdr = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ guehdr = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!guehdr))
+ goto out;
+ }
+
+ proto = guehdr->next_hdr;
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_receive))
+ goto out_unlock;
+
+ guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+
+ hlen = off + guehlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ guehdr = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!guehdr))
+ goto out_unlock;
+ }
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ const struct guehdr *guehdr2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ guehdr2 = (struct guehdr *)(p->data + off);
+
+ /* Compare base GUE header to be equal (covers
+ * hlen, version, next_hdr, and flags.
+ */
+ if (guehdr->word != guehdr2->word) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Compare optional fields are the same. */
+ if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
+ guehdr->hlen << 2)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ skb_gro_pull(skb, guehlen);
+
+ /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+ skb_gro_postpull_rcsum(skb, guehdr, guehlen);
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int gue_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const struct net_offload **offloads;
+ struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ unsigned int guehlen;
+ u8 proto;
+ int err = -ENOENT;
+
+ proto = guehdr->next_hdr;
+
+ guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
+
+out_unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+static int fou_add_to_port_list(struct fou *fou)
+{
+ struct fou *fout;
+
+ spin_lock(&fou_lock);
+ list_for_each_entry(fout, &fou_list, list) {
+ if (fou->port == fout->port) {
+ spin_unlock(&fou_lock);
+ return -EALREADY;
+ }
+ }
+
+ list_add(&fou->list, &fou_list);
+ spin_unlock(&fou_lock);
+
+ return 0;
+}
+
+static void fou_release(struct fou *fou)
+{
+ struct socket *sock = fou->sock;
+ struct sock *sk = sock->sk;
+
+ udp_del_offload(&fou->udp_offloads);
+
+ list_del(&fou->list);
+
+ /* Remove hooks into tunnel socket */
+ sk->sk_user_data = NULL;
+
+ sock_release(sock);
+
+ kfree(fou);
+}
+
+static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
+{
+ udp_sk(sk)->encap_rcv = fou_udp_recv;
+ fou->protocol = cfg->protocol;
+ fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
+ fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
+ fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+ fou->udp_offloads.ipproto = cfg->protocol;
+
+ return 0;
+}
+
+static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
+{
+ udp_sk(sk)->encap_rcv = gue_udp_recv;
+ fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
+ fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
+ fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+
+ return 0;
+}
+
+static int fou_create(struct net *net, struct fou_cfg *cfg,
+ struct socket **sockp)
+{
+ struct fou *fou = NULL;
+ int err;
+ struct socket *sock = NULL;
+ struct sock *sk;
+
+ /* Open UDP socket */
+ err = udp_sock_create(net, &cfg->udp_config, &sock);
+ if (err < 0)
+ goto error;
+
+ /* Allocate FOU port structure */
+ fou = kzalloc(sizeof(*fou), GFP_KERNEL);
+ if (!fou) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ sk = sock->sk;
+
+ fou->port = cfg->udp_config.local_udp_port;
+
+ /* Initial for fou type */
+ switch (cfg->type) {
+ case FOU_ENCAP_DIRECT:
+ err = fou_encap_init(sk, fou, cfg);
+ if (err)
+ goto error;
+ break;
+ case FOU_ENCAP_GUE:
+ err = gue_encap_init(sk, fou, cfg);
+ if (err)
+ goto error;
+ break;
+ default:
+ err = -EINVAL;
+ goto error;
+ }
+
+ udp_sk(sk)->encap_type = 1;
+ udp_encap_enable();
+
+ sk->sk_user_data = fou;
+ fou->sock = sock;
+
+ udp_set_convert_csum(sk, true);
+
+ sk->sk_allocation = GFP_ATOMIC;
+
+ if (cfg->udp_config.family == AF_INET) {
+ err = udp_add_offload(&fou->udp_offloads);
+ if (err)
+ goto error;
+ }
+
+ err = fou_add_to_port_list(fou);
+ if (err)
+ goto error;
+
+ if (sockp)
+ *sockp = sock;
+
+ return 0;
+
+error:
+ kfree(fou);
+ if (sock)
+ sock_release(sock);
+
+ return err;
+}
+
+static int fou_destroy(struct net *net, struct fou_cfg *cfg)
+{
+ struct fou *fou;
+ u16 port = cfg->udp_config.local_udp_port;
+ int err = -EINVAL;
+
+ spin_lock(&fou_lock);
+ list_for_each_entry(fou, &fou_list, list) {
+ if (fou->port == port) {
+ udp_del_offload(&fou->udp_offloads);
+ fou_release(fou);
+ err = 0;
+ break;
+ }
+ }
+ spin_unlock(&fou_lock);
+
+ return err;
+}
+
+static struct genl_family fou_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = FOU_GENL_NAME,
+ .version = FOU_GENL_VERSION,
+ .maxattr = FOU_ATTR_MAX,
+ .netnsok = true,
+};
+
+static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
+ [FOU_ATTR_PORT] = { .type = NLA_U16, },
+ [FOU_ATTR_AF] = { .type = NLA_U8, },
+ [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
+ [FOU_ATTR_TYPE] = { .type = NLA_U8, },
+};
+
+static int parse_nl_config(struct genl_info *info,
+ struct fou_cfg *cfg)
+{
+ memset(cfg, 0, sizeof(*cfg));
+
+ cfg->udp_config.family = AF_INET;
+
+ if (info->attrs[FOU_ATTR_AF]) {
+ u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EINVAL;
+
+ cfg->udp_config.family = family;
+ }
+
+ if (info->attrs[FOU_ATTR_PORT]) {
+ u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]);
+
+ cfg->udp_config.local_udp_port = port;
+ }
+
+ if (info->attrs[FOU_ATTR_IPPROTO])
+ cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]);
+
+ if (info->attrs[FOU_ATTR_TYPE])
+ cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
+
+ return 0;
+}
+
+static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct fou_cfg cfg;
+ int err;
+
+ err = parse_nl_config(info, &cfg);
+ if (err)
+ return err;
+
+ return fou_create(&init_net, &cfg, NULL);
+}
+
+static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct fou_cfg cfg;
+
+ parse_nl_config(info, &cfg);
+
+ return fou_destroy(&init_net, &cfg);
+}
+
+static const struct genl_ops fou_nl_ops[] = {
+ {
+ .cmd = FOU_CMD_ADD,
+ .doit = fou_nl_cmd_add_port,
+ .policy = fou_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_DEL,
+ .doit = fou_nl_cmd_rm_port,
+ .policy = fou_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+static int __init fou_init(void)
+{
+ int ret;
+
+ ret = genl_register_family_with_ops(&fou_nl_family,
+ fou_nl_ops);
+
+ return ret;
+}
+
+static void __exit fou_fini(void)
+{
+ struct fou *fou, *next;
+
+ genl_unregister_family(&fou_nl_family);
+
+ /* Close all the FOU sockets */
+
+ spin_lock(&fou_lock);
+ list_for_each_entry_safe(fou, next, &fou_list, list)
+ fou_release(fou);
+ spin_unlock(&fou_lock);
+}
+
+module_init(fou_init);
+module_exit(fou_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
new file mode 100644
index 000000000000..065cd94c640c
--- /dev/null
+++ b/net/ipv4/geneve.c
@@ -0,0 +1,373 @@
+/*
+ * Geneve: Generic Network Virtualization Encapsulation
+ *
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/hash.h>
+#include <linux/ethtool.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/geneve.h>
+#include <net/protocol.h>
+#include <net/udp_tunnel.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#endif
+
+#define PORT_HASH_BITS 8
+#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
+
+/* per-network namespace private data for this module */
+struct geneve_net {
+ struct hlist_head sock_list[PORT_HASH_SIZE];
+ spinlock_t sock_lock; /* Protects sock_list */
+};
+
+static int geneve_net_id;
+
+static struct workqueue_struct *geneve_wq;
+
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+ return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
+static struct hlist_head *gs_head(struct net *net, __be16 port)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+
+ return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
+}
+
+/* Find geneve socket based on network namespace and UDP port */
+static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
+{
+ struct geneve_sock *gs;
+
+ hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) {
+ if (inet_sk(gs->sock->sk)->inet_sport == port)
+ return gs;
+ }
+
+ return NULL;
+}
+
+static void geneve_build_header(struct genevehdr *geneveh,
+ __be16 tun_flags, u8 vni[3],
+ u8 options_len, u8 *options)
+{
+ geneveh->ver = GENEVE_VER;
+ geneveh->opt_len = options_len / 4;
+ geneveh->oam = !!(tun_flags & TUNNEL_OAM);
+ geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+ geneveh->rsvd1 = 0;
+ memcpy(geneveh->vni, vni, 3);
+ geneveh->proto_type = htons(ETH_P_TEB);
+ geneveh->rsvd2 = 0;
+
+ memcpy(geneveh->options, options, options_len);
+}
+
+/* Transmit a fully formated Geneve frame.
+ *
+ * When calling this function. The skb->data should point
+ * to the geneve header which is fully formed.
+ *
+ * This function will add other UDP tunnel headers.
+ */
+int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
+ struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
+ __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+ __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+ bool xnet)
+{
+ struct genevehdr *gnvh;
+ int min_headroom;
+ int err;
+
+ skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx);
+
+ min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
+ + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+
+ err = skb_cow_head(skb, min_headroom);
+ if (unlikely(err))
+ return err;
+
+ if (vlan_tx_tag_present(skb)) {
+ if (unlikely(!__vlan_put_tag(skb,
+ skb->vlan_proto,
+ vlan_tx_tag_get(skb)))) {
+ err = -ENOMEM;
+ return err;
+ }
+ skb->vlan_tci = 0;
+ }
+
+ gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
+ geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
+
+ return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
+ tos, ttl, df, src_port, dst_port, xnet);
+}
+EXPORT_SYMBOL_GPL(geneve_xmit_skb);
+
+static void geneve_notify_add_rx_port(struct geneve_sock *gs)
+{
+ struct sock *sk = gs->sock->sk;
+ sa_family_t sa_family = sk->sk_family;
+ int err;
+
+ if (sa_family == AF_INET) {
+ err = udp_add_offload(&gs->udp_offloads);
+ if (err)
+ pr_warn("geneve: udp_add_offload failed with status %d\n",
+ err);
+ }
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct genevehdr *geneveh;
+ struct geneve_sock *gs;
+ int opts_len;
+
+ /* Need Geneve and inner Ethernet header to be present */
+ if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
+ goto error;
+
+ /* Return packets with reserved bits set */
+ geneveh = geneve_hdr(skb);
+
+ if (unlikely(geneveh->ver != GENEVE_VER))
+ goto error;
+
+ if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
+ goto error;
+
+ opts_len = geneveh->opt_len * 4;
+ if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
+ htons(ETH_P_TEB)))
+ goto drop;
+
+ gs = rcu_dereference_sk_user_data(sk);
+ if (!gs)
+ goto drop;
+
+ gs->rcv(gs, skb);
+ return 0;
+
+drop:
+ /* Consume bad packet */
+ kfree_skb(skb);
+ return 0;
+
+error:
+ /* Let the UDP layer deal with the skb */
+ return 1;
+}
+
+static void geneve_del_work(struct work_struct *work)
+{
+ struct geneve_sock *gs = container_of(work, struct geneve_sock,
+ del_work);
+
+ udp_tunnel_sock_release(gs->sock);
+ kfree_rcu(gs, rcu);
+}
+
+static struct socket *geneve_create_sock(struct net *net, bool ipv6,
+ __be16 port)
+{
+ struct socket *sock;
+ struct udp_port_cfg udp_conf;
+ int err;
+
+ memset(&udp_conf, 0, sizeof(udp_conf));
+
+ if (ipv6) {
+ udp_conf.family = AF_INET6;
+ } else {
+ udp_conf.family = AF_INET;
+ udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+ }
+
+ udp_conf.local_udp_port = port;
+
+ /* Open UDP socket */
+ err = udp_sock_create(net, &udp_conf, &sock);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ return sock;
+}
+
+/* Create new listen socket if needed */
+static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
+ geneve_rcv_t *rcv, void *data,
+ bool ipv6)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ struct geneve_sock *gs;
+ struct socket *sock;
+ struct udp_tunnel_sock_cfg tunnel_cfg;
+
+ gs = kzalloc(sizeof(*gs), GFP_KERNEL);
+ if (!gs)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_WORK(&gs->del_work, geneve_del_work);
+
+ sock = geneve_create_sock(net, ipv6, port);
+ if (IS_ERR(sock)) {
+ kfree(gs);
+ return ERR_CAST(sock);
+ }
+
+ gs->sock = sock;
+ atomic_set(&gs->refcnt, 1);
+ gs->rcv = rcv;
+ gs->rcv_data = data;
+
+ /* Initialize the geneve udp offloads structure */
+ gs->udp_offloads.port = port;
+ gs->udp_offloads.callbacks.gro_receive = NULL;
+ gs->udp_offloads.callbacks.gro_complete = NULL;
+
+ spin_lock(&gn->sock_lock);
+ hlist_add_head_rcu(&gs->hlist, gs_head(net, port));
+ geneve_notify_add_rx_port(gs);
+ spin_unlock(&gn->sock_lock);
+
+ /* Mark socket as an encapsulation socket */
+ tunnel_cfg.sk_user_data = gs;
+ tunnel_cfg.encap_type = 1;
+ tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
+ tunnel_cfg.encap_destroy = NULL;
+ setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
+
+ return gs;
+}
+
+struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
+ geneve_rcv_t *rcv, void *data,
+ bool no_share, bool ipv6)
+{
+ struct geneve_sock *gs;
+
+ gs = geneve_socket_create(net, port, rcv, data, ipv6);
+ if (!IS_ERR(gs))
+ return gs;
+
+ if (no_share) /* Return error if sharing is not allowed. */
+ return ERR_PTR(-EINVAL);
+
+ gs = geneve_find_sock(net, port);
+ if (gs) {
+ if (gs->rcv == rcv)
+ atomic_inc(&gs->refcnt);
+ else
+ gs = ERR_PTR(-EBUSY);
+ } else {
+ gs = ERR_PTR(-EINVAL);
+ }
+
+ return gs;
+}
+EXPORT_SYMBOL_GPL(geneve_sock_add);
+
+void geneve_sock_release(struct geneve_sock *gs)
+{
+ if (!atomic_dec_and_test(&gs->refcnt))
+ return;
+
+ queue_work(geneve_wq, &gs->del_work);
+}
+EXPORT_SYMBOL_GPL(geneve_sock_release);
+
+static __net_init int geneve_init_net(struct net *net)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ unsigned int h;
+
+ spin_lock_init(&gn->sock_lock);
+
+ for (h = 0; h < PORT_HASH_SIZE; ++h)
+ INIT_HLIST_HEAD(&gn->sock_list[h]);
+
+ return 0;
+}
+
+static struct pernet_operations geneve_net_ops = {
+ .init = geneve_init_net,
+ .exit = NULL,
+ .id = &geneve_net_id,
+ .size = sizeof(struct geneve_net),
+};
+
+static int __init geneve_init_module(void)
+{
+ int rc;
+
+ geneve_wq = alloc_workqueue("geneve", 0, 0);
+ if (!geneve_wq)
+ return -ENOMEM;
+
+ rc = register_pernet_subsys(&geneve_net_ops);
+ if (rc)
+ return rc;
+
+ pr_info("Geneve driver\n");
+
+ return 0;
+}
+late_initcall(geneve_init_module);
+
+static void __exit geneve_cleanup_module(void)
+{
+ destroy_workqueue(geneve_wq);
+}
+module_exit(geneve_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
+MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
+MODULE_ALIAS_RTNL_LINK("geneve");
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 0485bf7f8f03..4a7b5b2a1ce3 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -98,7 +98,6 @@ EXPORT_SYMBOL_GPL(gre_build_header);
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err)
{
- unsigned int ip_hlen = ip_hdrlen(skb);
const struct gre_base_hdr *greh;
__be32 *options;
int hdr_len;
@@ -106,7 +105,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
return -EINVAL;
- greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
@@ -116,7 +115,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!pskb_may_pull(skb, hdr_len))
return -EINVAL;
- greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
tpi->proto = greh->protocol;
options = (__be32 *)(greh + 1);
@@ -125,6 +124,10 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
*csum_err = true;
return -EINVAL;
}
+
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+
options++;
}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index f0bdd47bbbcb..a77729503071 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -15,13 +15,6 @@
#include <net/protocol.h>
#include <net/gre.h>
-static int gre_gso_send_check(struct sk_buff *skb)
-{
- if (!skb->encapsulation)
- return -EINVAL;
- return 0;
-}
-
static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -46,6 +39,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
SKB_GSO_IPIP)))
goto out;
+ if (!skb->encapsulation)
+ goto out;
+
if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
goto out;
@@ -74,7 +70,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
/* segment inner packet. */
enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
segs = skb_mac_gso_segment(skb, enc_features);
- if (!segs || IS_ERR(segs)) {
+ if (IS_ERR_OR_NULL(segs)) {
skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
goto out;
}
@@ -119,28 +115,6 @@ out:
return segs;
}
-/* Compute the whole skb csum in s/w and store it, then verify GRO csum
- * starting from gro_offset.
- */
-static __sum16 gro_skb_checksum(struct sk_buff *skb)
-{
- __sum16 sum;
-
- skb->csum = skb_checksum(skb, 0, skb->len, 0);
- NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
- csum_partial(skb->data, skb_gro_offset(skb), 0));
- sum = csum_fold(NAPI_GRO_CB(skb)->csum);
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
- if (unlikely(!sum) && !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
- } else {
- skb->ip_summed = CHECKSUM_COMPLETE;
- skb->csum_complete_sw = 1;
- }
-
- return sum;
-}
-
static struct sk_buff **gre_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
@@ -192,22 +166,16 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
if (unlikely(!greh))
goto out_unlock;
}
- if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */
- __sum16 csum = 0;
-
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- csum = csum_fold(NAPI_GRO_CB(skb)->csum);
- /* Don't trust csum error calculated/reported by h/w */
- if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
- csum = gro_skb_checksum(skb);
-
- /* GRE CSUM is the 1's complement of the 1's complement sum
- * of the GRE hdr plus payload so it should add up to 0xffff
- * (and 0 after csum_fold()) just like the IPv4 hdr csum.
- */
- if (csum)
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
+ if (skb_gro_checksum_simple_validate(skb))
goto out_unlock;
+
+ skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
}
+
flush = 0;
for (p = *head; p; p = p->next) {
@@ -284,7 +252,6 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
static const struct net_offload gre_offload = {
.callbacks = {
- .gso_send_check = gre_gso_send_check,
.gso_segment = gre_gso_segment,
.gro_receive = gre_gro_receive,
.gro_complete = gre_gro_complete,
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 42b7bcf8045b..5882f584910e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk)
spin_unlock_bh(&sk->sk_lock.slock);
}
+int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
+int sysctl_icmp_msgs_burst __read_mostly = 50;
+
+static struct {
+ spinlock_t lock;
+ u32 credit;
+ u32 stamp;
+} icmp_global = {
+ .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
+};
+
+/**
+ * icmp_global_allow - Are we allowed to send one more ICMP message ?
+ *
+ * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
+ * Returns false if we reached the limit and can not send another packet.
+ * Note: called with BH disabled
+ */
+bool icmp_global_allow(void)
+{
+ u32 credit, delta, incr = 0, now = (u32)jiffies;
+ bool rc = false;
+
+ /* Check if token bucket is empty and cannot be refilled
+ * without taking the spinlock.
+ */
+ if (!icmp_global.credit) {
+ delta = min_t(u32, now - icmp_global.stamp, HZ);
+ if (delta < HZ / 50)
+ return false;
+ }
+
+ spin_lock(&icmp_global.lock);
+ delta = min_t(u32, now - icmp_global.stamp, HZ);
+ if (delta >= HZ / 50) {
+ incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
+ if (incr)
+ icmp_global.stamp = now;
+ }
+ credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
+ if (credit) {
+ credit--;
+ rc = true;
+ }
+ icmp_global.credit = credit;
+ spin_unlock(&icmp_global.lock);
+ return rc;
+}
+EXPORT_SYMBOL(icmp_global_allow);
+
/*
* Send an ICMP frame.
*/
-static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
- struct flowi4 *fl4, int type, int code)
+static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+ struct flowi4 *fl4, int type, int code)
{
struct dst_entry *dst = &rt->dst;
bool rc = true;
@@ -253,8 +303,14 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
goto out;
/* Limit if icmp type is enabled in ratemask. */
- if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
- struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+ if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ goto out;
+
+ rc = false;
+ if (icmp_global_allow()) {
+ struct inet_peer *peer;
+
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
rc = inet_peer_xrlim_allow(peer,
net->ipv4.sysctl_icmp_ratelimit);
if (peer)
@@ -663,16 +719,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
/* Checkin full IP header plus 8 bytes of protocol to
* avoid additional coding at protocol handlers.
*/
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
return;
+ }
raw_icmp_error(skb, protocol, info);
- rcu_read_lock();
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->err_handler)
ipprot->err_handler(skb, info);
- rcu_read_unlock();
}
static bool icmp_tag_validation(int proto)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index db710b059bab..fb70e3ecc3e4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -117,7 +117,7 @@
#define IGMP_V2_Unsolicited_Report_Interval (10*HZ)
#define IGMP_V3_Unsolicited_Report_Interval (1*HZ)
#define IGMP_Query_Response_Interval (10*HZ)
-#define IGMP_Unsolicited_Report_Count 2
+#define IGMP_Query_Robustness_Variable 2
#define IGMP_Initial_Report_Delay (1)
@@ -756,8 +756,7 @@ static void igmp_ifc_event(struct in_device *in_dev)
{
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
return;
- in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv;
igmp_ifc_start_timer(in_dev, 1);
}
@@ -932,7 +931,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
in_dev->mr_qrv = ih3->qrv;
if (!group) { /* general query */
if (ih3->nsrcs)
- return false; /* no sources allowed */
+ return true; /* no sources allowed */
igmp_gq_start_timer(in_dev);
return false;
}
@@ -1086,8 +1085,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc->interface = im->interface;
in_dev_hold(in_dev);
pmc->multiaddr = im->multiaddr;
- pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
pmc->sfmode = im->sfmode;
if (pmc->sfmode == MCAST_INCLUDE) {
struct ip_sf_list *psf;
@@ -1226,8 +1224,7 @@ static void igmp_group_added(struct ip_mc_list *im)
}
/* else, v3 */
- im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
igmp_ifc_event(in_dev);
#endif
}
@@ -1321,8 +1318,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
atomic_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
- setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
- im->unsolicit_count = IGMP_Unsolicited_Report_Count;
+ setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
+ im->unsolicit_count = sysctl_igmp_qrv;
#endif
im->next_rcu = in_dev->mc_list;
@@ -1460,7 +1457,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
(unsigned long)in_dev);
setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
(unsigned long)in_dev);
- in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
+ in_dev->mr_qrv = sysctl_igmp_qrv;
#endif
spin_lock_init(&in_dev->mc_tomb_lock);
@@ -1474,6 +1471,9 @@ void ip_mc_up(struct in_device *in_dev)
ASSERT_RTNL();
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_qrv = sysctl_igmp_qrv;
+#endif
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
for_each_pmc_rtnl(in_dev, pmc)
@@ -1540,7 +1540,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
*/
int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
-
+#ifdef CONFIG_IP_MULTICAST
+int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable;
+#endif
static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
__be32 *psfsrc)
@@ -1575,8 +1577,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
#ifdef CONFIG_IP_MULTICAST
if (psf->sf_oldin &&
!IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
- psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
psf->sf_next = pmc->tomb;
pmc->tomb = psf;
rv = 1;
@@ -1639,8 +1640,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
/* filter mode change */
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
- pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
in_dev->mr_ifc_count = pmc->crcount;
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
@@ -1818,8 +1818,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
#ifdef CONFIG_IP_MULTICAST
/* else no filters; keep old mode for reports */
- pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
in_dev->mr_ifc_count = pmc->crcount;
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
@@ -2539,7 +2538,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
querier = "NONE";
#endif
- if (rcu_dereference(state->in_dev->mc_list) == im) {
+ if (rcu_access_pointer(state->in_dev->mc_list) == im) {
seq_printf(seq, "%d\t%-10s: %5d %7s\n",
state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 3b01959bf4bb..9eb89f3f0ee4 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,6 +25,12 @@
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
+#define INETFRAGS_EVICT_BUCKETS 128
+#define INETFRAGS_EVICT_MAX 512
+
+/* don't rebuild inetfrag table with new secret more often than this */
+#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
+
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped.
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -46,24 +52,39 @@ const u8 ip_frag_ecn_table[16] = {
};
EXPORT_SYMBOL(ip_frag_ecn_table);
-static void inet_frag_secret_rebuild(unsigned long dummy)
+static unsigned int
+inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
+{
+ return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
+}
+
+static bool inet_frag_may_rebuild(struct inet_frags *f)
+{
+ return time_after(jiffies,
+ f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
+}
+
+static void inet_frag_secret_rebuild(struct inet_frags *f)
{
- struct inet_frags *f = (struct inet_frags *)dummy;
- unsigned long now = jiffies;
int i;
- /* Per bucket lock NOT needed here, due to write lock protection */
- write_lock(&f->lock);
+ write_seqlock_bh(&f->rnd_seqlock);
+
+ if (!inet_frag_may_rebuild(f))
+ goto out;
get_random_bytes(&f->rnd, sizeof(u32));
+
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
struct hlist_node *n;
hb = &f->hash[i];
+ spin_lock(&hb->chain_lock);
+
hlist_for_each_entry_safe(q, n, &hb->chain, list) {
- unsigned int hval = f->hashfn(q);
+ unsigned int hval = inet_frag_hashfn(f, q);
if (hval != i) {
struct inet_frag_bucket *hb_dest;
@@ -72,76 +93,200 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
/* Relink to new hash chain. */
hb_dest = &f->hash[hval];
+
+ /* This is the only place where we take
+ * another chain_lock while already holding
+ * one. As this will not run concurrently,
+ * we cannot deadlock on hb_dest lock below, if its
+ * already locked it will be released soon since
+ * other caller cannot be waiting for hb lock
+ * that we've taken above.
+ */
+ spin_lock_nested(&hb_dest->chain_lock,
+ SINGLE_DEPTH_NESTING);
hlist_add_head(&q->list, &hb_dest->chain);
+ spin_unlock(&hb_dest->chain_lock);
}
}
+ spin_unlock(&hb->chain_lock);
+ }
+
+ f->rebuild = false;
+ f->last_rebuild_jiffies = jiffies;
+out:
+ write_sequnlock_bh(&f->rnd_seqlock);
+}
+
+static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
+{
+ return q->net->low_thresh == 0 ||
+ frag_mem_limit(q->net) >= q->net->low_thresh;
+}
+
+static unsigned int
+inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
+{
+ struct inet_frag_queue *fq;
+ struct hlist_node *n;
+ unsigned int evicted = 0;
+ HLIST_HEAD(expired);
+
+evict_again:
+ spin_lock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
+ if (!inet_fragq_should_evict(fq))
+ continue;
+
+ if (!del_timer(&fq->timer)) {
+ /* q expiring right now thus increment its refcount so
+ * it won't be freed under us and wait until the timer
+ * has finished executing then destroy it
+ */
+ atomic_inc(&fq->refcnt);
+ spin_unlock(&hb->chain_lock);
+ del_timer_sync(&fq->timer);
+ WARN_ON(atomic_read(&fq->refcnt) != 1);
+ inet_frag_put(fq, f);
+ goto evict_again;
+ }
+
+ fq->flags |= INET_FRAG_EVICTED;
+ hlist_del(&fq->list);
+ hlist_add_head(&fq->list, &expired);
+ ++evicted;
}
- write_unlock(&f->lock);
- mod_timer(&f->secret_timer, now + f->secret_interval);
+ spin_unlock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(fq, n, &expired, list)
+ f->frag_expire((unsigned long) fq);
+
+ return evicted;
}
-void inet_frags_init(struct inet_frags *f)
+static void inet_frag_worker(struct work_struct *work)
+{
+ unsigned int budget = INETFRAGS_EVICT_BUCKETS;
+ unsigned int i, evicted = 0;
+ struct inet_frags *f;
+
+ f = container_of(work, struct inet_frags, frags_work);
+
+ BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
+
+ local_bh_disable();
+
+ for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+ evicted += inet_evict_bucket(f, &f->hash[i]);
+ i = (i + 1) & (INETFRAGS_HASHSZ - 1);
+ if (evicted > INETFRAGS_EVICT_MAX)
+ break;
+ }
+
+ f->next_bucket = i;
+
+ local_bh_enable();
+
+ if (f->rebuild && inet_frag_may_rebuild(f))
+ inet_frag_secret_rebuild(f);
+}
+
+static void inet_frag_schedule_worker(struct inet_frags *f)
+{
+ if (unlikely(!work_pending(&f->frags_work)))
+ schedule_work(&f->frags_work);
+}
+
+int inet_frags_init(struct inet_frags *f)
{
int i;
+ INIT_WORK(&f->frags_work, inet_frag_worker);
+
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb = &f->hash[i];
spin_lock_init(&hb->chain_lock);
INIT_HLIST_HEAD(&hb->chain);
}
- rwlock_init(&f->lock);
- setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
- (unsigned long)f);
- f->secret_timer.expires = jiffies + f->secret_interval;
- add_timer(&f->secret_timer);
+ seqlock_init(&f->rnd_seqlock);
+ f->last_rebuild_jiffies = 0;
+ f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
+ NULL);
+ if (!f->frags_cachep)
+ return -ENOMEM;
+
+ return 0;
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_init_net(struct netns_frags *nf)
{
- nf->nqueues = 0;
init_frag_mem_limit(nf);
- INIT_LIST_HEAD(&nf->lru_list);
- spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);
void inet_frags_fini(struct inet_frags *f)
{
- del_timer(&f->secret_timer);
+ cancel_work_sync(&f->frags_work);
+ kmem_cache_destroy(f->frags_cachep);
}
EXPORT_SYMBOL(inet_frags_fini);
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
- nf->low_thresh = 0;
+ unsigned int seq;
+ int i;
+ nf->low_thresh = 0;
local_bh_disable();
- inet_frag_evictor(nf, f, true);
+
+evict_again:
+ seq = read_seqbegin(&f->rnd_seqlock);
+
+ for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+ inet_evict_bucket(f, &f->hash[i]);
+
+ if (read_seqretry(&f->rnd_seqlock, seq))
+ goto evict_again;
+
local_bh_enable();
percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+static struct inet_frag_bucket *
+get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
+__acquires(hb->chain_lock)
{
struct inet_frag_bucket *hb;
- unsigned int hash;
+ unsigned int seq, hash;
+
+ restart:
+ seq = read_seqbegin(&f->rnd_seqlock);
- read_lock(&f->lock);
- hash = f->hashfn(fq);
+ hash = inet_frag_hashfn(f, fq);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
+ if (read_seqretry(&f->rnd_seqlock, seq)) {
+ spin_unlock(&hb->chain_lock);
+ goto restart;
+ }
+
+ return hb;
+}
+
+static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+ struct inet_frag_bucket *hb;
+
+ hb = get_frag_bucket_locked(fq, f);
hlist_del(&fq->list);
spin_unlock(&hb->chain_lock);
-
- read_unlock(&f->lock);
- inet_frag_lru_del(fq);
}
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -149,30 +294,29 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
if (del_timer(&fq->timer))
atomic_dec(&fq->refcnt);
- if (!(fq->last_in & INET_FRAG_COMPLETE)) {
+ if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f);
atomic_dec(&fq->refcnt);
- fq->last_in |= INET_FRAG_COMPLETE;
+ fq->flags |= INET_FRAG_COMPLETE;
}
}
EXPORT_SYMBOL(inet_frag_kill);
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
- struct sk_buff *skb)
+ struct sk_buff *skb)
{
if (f->skb_free)
f->skb_free(skb);
kfree_skb(skb);
}
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
- int *work)
+void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
struct sk_buff *fp;
struct netns_frags *nf;
unsigned int sum, sum_truesize = 0;
- WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
+ WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
@@ -186,87 +330,32 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
fp = xp;
}
sum = sum_truesize + f->qsize;
- if (work)
- *work -= sum;
sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
- kfree(q);
-
+ kmem_cache_free(f->frags_cachep, q);
}
EXPORT_SYMBOL(inet_frag_destroy);
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
-{
- struct inet_frag_queue *q;
- int work, evicted = 0;
-
- if (!force) {
- if (frag_mem_limit(nf) <= nf->high_thresh)
- return 0;
- }
-
- work = frag_mem_limit(nf) - nf->low_thresh;
- while (work > 0 || force) {
- spin_lock(&nf->lru_lock);
-
- if (list_empty(&nf->lru_list)) {
- spin_unlock(&nf->lru_lock);
- break;
- }
-
- q = list_first_entry(&nf->lru_list,
- struct inet_frag_queue, lru_list);
- atomic_inc(&q->refcnt);
- /* Remove q from list to avoid several CPUs grabbing it */
- list_del_init(&q->lru_list);
-
- spin_unlock(&nf->lru_lock);
-
- spin_lock(&q->lock);
- if (!(q->last_in & INET_FRAG_COMPLETE))
- inet_frag_kill(q, f);
- spin_unlock(&q->lock);
-
- if (atomic_dec_and_test(&q->refcnt))
- inet_frag_destroy(q, f, &work);
- evicted++;
- }
-
- return evicted;
-}
-EXPORT_SYMBOL(inet_frag_evictor);
-
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
- struct inet_frag_queue *qp_in, struct inet_frags *f,
- void *arg)
+ struct inet_frag_queue *qp_in,
+ struct inet_frags *f,
+ void *arg)
{
- struct inet_frag_bucket *hb;
+ struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
struct inet_frag_queue *qp;
- unsigned int hash;
-
- read_lock(&f->lock); /* Protects against hash rebuild */
- /*
- * While we stayed w/o the lock other CPU could update
- * the rnd seed, so we need to re-calculate the hash
- * chain. Fortunatelly the qp_in can be used to get one.
- */
- hash = f->hashfn(qp_in);
- hb = &f->hash[hash];
- spin_lock(&hb->chain_lock);
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
- * such entry could be created on other cpu, while we
- * released the hash bucket lock.
+ * such entry could have been created on other cpu before
+ * we acquired hash bucket lock.
*/
hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
spin_unlock(&hb->chain_lock);
- read_unlock(&f->lock);
- qp_in->last_in |= INET_FRAG_COMPLETE;
+ qp_in->flags |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
}
@@ -278,19 +367,24 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &hb->chain);
- inet_frag_lru_add(nf, qp);
+
spin_unlock(&hb->chain_lock);
- read_unlock(&f->lock);
return qp;
}
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
- struct inet_frags *f, void *arg)
+ struct inet_frags *f,
+ void *arg)
{
struct inet_frag_queue *q;
- q = kzalloc(f->qsize, GFP_ATOMIC);
+ if (frag_mem_limit(nf) > nf->high_thresh) {
+ inet_frag_schedule_worker(f);
+ return NULL;
+ }
+
+ q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
if (q == NULL)
return NULL;
@@ -301,13 +395,13 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
- INIT_LIST_HEAD(&q->lru_list);
return q;
}
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
- struct inet_frags *f, void *arg)
+ struct inet_frags *f,
+ void *arg)
{
struct inet_frag_queue *q;
@@ -319,13 +413,17 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
}
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
- struct inet_frags *f, void *key, unsigned int hash)
- __releases(&f->lock)
+ struct inet_frags *f, void *key,
+ unsigned int hash)
{
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
int depth = 0;
+ if (frag_mem_limit(nf) > nf->low_thresh)
+ inet_frag_schedule_worker(f);
+
+ hash &= (INETFRAGS_HASHSZ - 1);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
@@ -333,18 +431,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
spin_unlock(&hb->chain_lock);
- read_unlock(&f->lock);
return q;
}
depth++;
}
spin_unlock(&hb->chain_lock);
- read_unlock(&f->lock);
if (depth <= INETFRAGS_MAXDEPTH)
return inet_frag_create(nf, f, key);
- else
- return ERR_PTR(-ENOBUFS);
+
+ if (inet_frag_may_rebuild(f)) {
+ if (!f->rebuild)
+ f->rebuild = true;
+ inet_frag_schedule_worker(f);
+ }
+
+ return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 43116e8c8e13..9111a4e22155 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -229,7 +229,7 @@ begin:
}
} else if (score == hiscore && reuseport) {
matches++;
- if (((u64)phash * matches) >> 32 == 0)
+ if (reciprocal_scale(phash, matches) == 0)
result = sk;
phash = next_pseudo_random32(phash);
}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index bd5f5928167d..241afd743d2c 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -72,29 +72,10 @@ void inet_peer_base_init(struct inet_peer_base *bp)
{
bp->root = peer_avl_empty_rcu;
seqlock_init(&bp->lock);
- bp->flush_seq = ~0U;
bp->total = 0;
}
EXPORT_SYMBOL_GPL(inet_peer_base_init);
-static atomic_t v4_seq = ATOMIC_INIT(0);
-static atomic_t v6_seq = ATOMIC_INIT(0);
-
-static atomic_t *inetpeer_seq_ptr(int family)
-{
- return (family == AF_INET ? &v4_seq : &v6_seq);
-}
-
-static inline void flush_check(struct inet_peer_base *base, int family)
-{
- atomic_t *fp = inetpeer_seq_ptr(family);
-
- if (unlikely(base->flush_seq != atomic_read(fp))) {
- inetpeer_invalidate_tree(base);
- base->flush_seq = atomic_read(fp);
- }
-}
-
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
/* Exported for sysctl_net_ipv4. */
@@ -444,8 +425,6 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
unsigned int sequence;
int invalidated, gccnt = 0;
- flush_check(base, daddr->family);
-
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ed32313e307c..2811cc18701a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -55,6 +55,7 @@
*/
static int sysctl_ipfrag_max_dist __read_mostly = 64;
+static const char ip_frag_cache_name[] = "ip4-frags";
struct ipfrag_skb_cb
{
@@ -86,11 +87,6 @@ static inline u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
-int ip_frag_nqueues(struct net *net)
-{
- return net->ipv4.frags.nqueues;
-}
-
int ip_frag_mem(struct net *net)
{
return sum_frag_mem_limit(&net->ipv4.frags);
@@ -109,21 +105,21 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
return jhash_3words((__force u32)id << 16 | prot,
(__force u32)saddr, (__force u32)daddr,
- ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
+ ip4_frags.rnd);
}
-static unsigned int ip4_hashfn(struct inet_frag_queue *q)
+static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
{
- struct ipq *ipq;
+ const struct ipq *ipq;
ipq = container_of(q, struct ipq, q);
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
}
-static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
+static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
{
- struct ipq *qp;
- struct ip4_create_arg *arg = a;
+ const struct ipq *qp;
+ const struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
return qp->id == arg->iph->id &&
@@ -133,14 +129,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
qp->user == arg->user;
}
-static void ip4_frag_init(struct inet_frag_queue *q, void *a)
+static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
frags);
struct net *net = container_of(ipv4, struct net, ipv4);
- struct ip4_create_arg *arg = a;
+ const struct ip4_create_arg *arg = a;
qp->protocol = arg->iph->protocol;
qp->id = arg->iph->id;
@@ -177,18 +173,6 @@ static void ipq_kill(struct ipq *ipq)
inet_frag_kill(&ipq->q, &ip4_frags);
}
-/* Memory limiting on fragments. Evictor trashes the oldest
- * fragment queue until we are back under the threshold.
- */
-static void ip_evictor(struct net *net)
-{
- int evicted;
-
- evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
- if (evicted)
- IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
-}
-
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
@@ -202,19 +186,22 @@ static void ip_expire(unsigned long arg)
spin_lock(&qp->q.lock);
- if (qp->q.last_in & INET_FRAG_COMPLETE)
+ if (qp->q.flags & INET_FRAG_COMPLETE)
goto out;
ipq_kill(qp);
-
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
- if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
+ if (!(qp->q.flags & INET_FRAG_EVICTED)) {
struct sk_buff *head = qp->q.fragments;
const struct iphdr *iph;
int err;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+
+ if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+ goto out;
+
rcu_read_lock();
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
@@ -227,8 +214,7 @@ static void ip_expire(unsigned long arg)
if (err)
goto out_rcu_unlock;
- /*
- * Only an end host needs to send an ICMP
+ /* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (qp->user == IP_DEFRAG_AF_PACKET ||
@@ -237,7 +223,6 @@ static void ip_expire(unsigned long arg)
(skb_rtable(head)->rt_type != RTN_LOCAL)))
goto out_rcu_unlock;
-
/* Send an ICMP "Fragment Reassembly Timeout" message. */
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
out_rcu_unlock:
@@ -260,7 +245,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
arg.iph = iph;
arg.user = user;
- read_lock(&ip4_frags.lock);
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
@@ -319,7 +303,7 @@ static int ip_frag_reinit(struct ipq *qp)
} while (fp);
sub_frag_mem_limit(&qp->q, sum_truesize);
- qp->q.last_in = 0;
+ qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
@@ -340,7 +324,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
int err = -ENOENT;
u8 ecn;
- if (qp->q.last_in & INET_FRAG_COMPLETE)
+ if (qp->q.flags & INET_FRAG_COMPLETE)
goto err;
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
@@ -367,9 +351,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
* or have different end, the segment is corrupted.
*/
if (end < qp->q.len ||
- ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
+ ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
goto err;
- qp->q.last_in |= INET_FRAG_LAST_IN;
+ qp->q.flags |= INET_FRAG_LAST_IN;
qp->q.len = end;
} else {
if (end&7) {
@@ -379,7 +363,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
}
if (end > qp->q.len) {
/* Some bits beyond end -> corruption. */
- if (qp->q.last_in & INET_FRAG_LAST_IN)
+ if (qp->q.flags & INET_FRAG_LAST_IN)
goto err;
qp->q.len = end;
}
@@ -488,13 +472,13 @@ found:
qp->ecn |= ecn;
add_frag_mem_limit(&qp->q, skb->truesize);
if (offset == 0)
- qp->q.last_in |= INET_FRAG_FIRST_IN;
+ qp->q.flags |= INET_FRAG_FIRST_IN;
if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
skb->len + ihl > qp->q.max_size)
qp->q.max_size = skb->len + ihl;
- if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
qp->q.meat == qp->q.len) {
unsigned long orefdst = skb->_skb_refdst;
@@ -505,7 +489,6 @@ found:
}
skb_dst_drop(skb);
- inet_frag_lru_move(&qp->q);
return -EINPROGRESS;
err:
@@ -655,9 +638,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
- /* Start by cleaning up the memory. */
- ip_evictor(net);
-
/* Lookup (or create) queue header */
if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
int ret;
@@ -721,14 +701,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
.data = &init_net.ipv4.frags.high_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &init_net.ipv4.frags.low_thresh
},
{
.procname = "ipfrag_low_thresh",
.data = &init_net.ipv4.frags.low_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &init_net.ipv4.frags.high_thresh
},
{
.procname = "ipfrag_time",
@@ -740,10 +723,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
{ }
};
+/* secret interval has been deprecated */
+static int ip4_frags_secret_interval_unused;
static struct ctl_table ip4_frags_ctl_table[] = {
{
.procname = "ipfrag_secret_interval",
- .data = &ip4_frags.secret_interval,
+ .data = &ip4_frags_secret_interval_unused,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -771,7 +756,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
goto err_alloc;
table[0].data = &net->ipv4.frags.high_thresh;
+ table[0].extra1 = &net->ipv4.frags.low_thresh;
+ table[0].extra2 = &init_net.ipv4.frags.high_thresh;
table[1].data = &net->ipv4.frags.low_thresh;
+ table[1].extra2 = &net->ipv4.frags.high_thresh;
table[2].data = &net->ipv4.frags.timeout;
/* Don't export sysctls to unprivileged users */
@@ -802,7 +790,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
kfree(table);
}
-static void ip4_frags_ctl_register(void)
+static void __init ip4_frags_ctl_register(void)
{
register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
@@ -816,7 +804,7 @@ static inline void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
-static inline void ip4_frags_ctl_register(void)
+static inline void __init ip4_frags_ctl_register(void)
{
}
#endif
@@ -873,6 +861,7 @@ void __init ipfrag_init(void)
ip4_frags.qsize = sizeof(struct ipq);
ip4_frags.match = ip4_frag_match;
ip4_frags.frag_expire = ip_expire;
- ip4_frags.secret_interval = 10 * 60 * HZ;
- inet_frags_init(&ip4_frags);
+ ip4_frags.frags_cache_name = ip_frag_cache_name;
+ if (inet_frags_init(&ip4_frags))
+ panic("IP: failed to allocate ip4_frags cache\n");
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9b842544aea3..12055fdbe716 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -239,7 +239,9 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
tpi.seq = htonl(tunnel->o_seqno);
/* Push GRE header. */
- gre_build_header(skb, &tpi, tunnel->hlen);
+ gre_build_header(skb, &tpi, tunnel->tun_hlen);
+
+ skb_set_inner_protocol(skb, tpi.proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
@@ -310,7 +312,7 @@ out:
static int ipgre_tunnel_ioctl(struct net_device *dev,
struct ifreq *ifr, int cmd)
{
- int err = 0;
+ int err;
struct ip_tunnel_parm p;
if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
@@ -470,13 +472,18 @@ static void ipgre_tunnel_setup(struct net_device *dev)
static void __gre_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel;
+ int t_hlen;
tunnel = netdev_priv(dev);
- tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
tunnel->parms.iph.protocol = IPPROTO_GRE;
- dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
- dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+
+ t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
+ dev->mtu = ETH_DATA_LEN - t_hlen - 4;
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
@@ -503,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
memcpy(dev->broadcast, &iph->daddr, 4);
dev->flags = IFF_NOARP;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
dev->addr_len = 4;
if (iph->daddr) {
@@ -628,6 +635,40 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
parms->iph.frag_off = htons(IP_DF);
}
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipgre_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_GRE_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
static int gre_tap_init(struct net_device *dev)
{
__gre_tunnel_init(dev);
@@ -657,6 +698,15 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipgre_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
ipgre_netlink_parms(data, tb, &p);
return ip_tunnel_newlink(dev, tb, &p);
@@ -666,6 +716,15 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipgre_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
ipgre_netlink_parms(data, tb, &p);
return ip_tunnel_changelink(dev, tb, &p);
@@ -694,6 +753,14 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(1) +
/* IFLA_GRE_PMTUDISC */
nla_total_size(1) +
+ /* IFLA_GRE_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_DPORT */
+ nla_total_size(2) +
0;
}
@@ -714,6 +781,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_GRE_PMTUDISC,
!!(p->iph.frag_off & htons(IP_DF))))
goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+ t->encap.type) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT,
+ t->encap.sport) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT,
+ t->encap.dport) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+ t->encap.dport))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -731,6 +809,10 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_TTL] = { .type = NLA_U8 },
[IFLA_GRE_TOS] = { .type = NLA_U8 },
[IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ad382499bace..5b3d91be2db0 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
* NOTE: dopt cannot point to skb.
*/
-int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
+int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
+ const struct ip_options *sopt)
{
- const struct ip_options *sopt;
unsigned char *sptr, *dptr;
int soffset, doffset;
int optlen;
memset(dopt, 0, sizeof(struct ip_options));
- sopt = &(IPCB(skb)->opt);
-
if (sopt->optlen == 0)
return 0;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8d3b6b0e9857..e35b71289156 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -516,7 +516,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
hlen = iph->ihl * 4;
mtu = mtu - hlen; /* Size of data space */
-#ifdef CONFIG_BRIDGE_NETFILTER
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (skb->nf_bridge)
mtu -= nf_bridge_mtu_reduction(skb);
#endif
@@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk,
unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
+ u32 tskey = 0;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
mtu = cork->fragsize;
+ if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = sk->sk_tskey++;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -962,10 +966,6 @@ alloc_new_skb:
sk->sk_allocation);
if (unlikely(skb == NULL))
err = -ENOBUFS;
- else
- /* only the initial fragment is
- time stamped */
- cork->tx_flags = 0;
}
if (skb == NULL)
goto error;
@@ -976,7 +976,12 @@ alloc_new_skb:
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
+
+ /* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
/*
* Find where to start putting bytes.
@@ -1517,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
.uc_ttl = -1,
};
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
- __be32 saddr, const struct ip_reply_arg *arg,
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+ const struct ip_options *sopt,
+ __be32 daddr, __be32 saddr,
+ const struct ip_reply_arg *arg,
unsigned int len)
{
struct ip_options_data replyopts;
@@ -1529,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
struct sock *sk;
struct inet_sock *inet;
- if (ip_options_echo(&replyopts.opt.opt, skb))
+ if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
return;
ipc.addr = daddr;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64741b938632..c373a9ad4555 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -303,7 +303,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
}
/* dont let ip_call_ra_chain() use sk again */
ra->sk = NULL;
- rcu_assign_pointer(*rap, ra->next);
+ RCU_INIT_POINTER(*rap, ra->next);
spin_unlock_bh(&ip_ra_lock);
if (ra->destructor)
@@ -325,7 +325,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
new_ra->sk = sk;
new_ra->destructor = destructor;
- new_ra->next = ra;
+ RCU_INIT_POINTER(new_ra->next, ra);
rcu_assign_pointer(*rap, new_ra);
sock_hold(sk);
spin_unlock_bh(&ip_ra_lock);
@@ -405,7 +405,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
struct sock_exterr_skb *serr;
- struct sk_buff *skb, *skb2;
+ struct sk_buff *skb;
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct {
struct sock_extended_err ee;
@@ -415,7 +415,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
int copied;
err = -EAGAIN;
- skb = skb_dequeue(&sk->sk_error_queue);
+ skb = sock_dequeue_err_skb(sk);
if (skb == NULL)
goto out;
@@ -462,17 +462,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
- /* Reset and regenerate socket error */
- spin_lock_bh(&sk->sk_error_queue.lock);
- sk->sk_err = 0;
- skb2 = skb_peek(&sk->sk_error_queue);
- if (skb2 != NULL) {
- sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
- spin_unlock_bh(&sk->sk_error_queue.lock);
- sk->sk_error_report(sk);
- } else
- spin_unlock_bh(&sk->sk_error_queue.lock);
-
out_free_skb:
kfree_skb(skb);
out:
@@ -1319,7 +1308,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (sk->sk_type != SOCK_STREAM)
return -ENOPROTOOPT;
- msg.msg_control = optval;
+ msg.msg_control = (__force void *) optval;
msg.msg_controllen = len;
msg.msg_flags = flags;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 6f9de61dce5f..0bb8e141eacc 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,8 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
+#include <net/udp.h>
+#include <net/gue.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -69,23 +71,25 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
}
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
- struct dst_entry *dst)
+ struct dst_entry *dst, __be32 saddr)
{
struct dst_entry *old_dst;
dst_clone(dst);
old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
dst_release(old_dst);
+ idst->saddr = saddr;
}
-static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
+static noinline void tunnel_dst_set(struct ip_tunnel *t,
+ struct dst_entry *dst, __be32 saddr)
{
- __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
+ __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}
static void tunnel_dst_reset(struct ip_tunnel *t)
{
- tunnel_dst_set(t, NULL);
+ tunnel_dst_set(t, NULL, 0);
}
void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
@@ -93,20 +97,25 @@ void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
int i;
for_each_possible_cpu(i)
- __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
+ __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
-static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
+static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
+ u32 cookie, __be32 *saddr)
{
+ struct ip_tunnel_dst *idst;
struct dst_entry *dst;
rcu_read_lock();
- dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
+ idst = raw_cpu_ptr(t->dst_cache);
+ dst = rcu_dereference(idst->dst);
if (dst && !atomic_inc_not_zero(&dst->__refcnt))
dst = NULL;
if (dst) {
- if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (!dst->obsolete || dst->ops->check(dst, cookie)) {
+ *saddr = idst->saddr;
+ } else {
tunnel_dst_reset(t);
dst_release(dst);
dst = NULL;
@@ -305,7 +314,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
}
ASSERT_RTNL();
- dev = alloc_netdev(ops->priv_size, name, ops->setup);
+ dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
if (!dev) {
err = -ENOMEM;
goto failed;
@@ -367,7 +376,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
- tunnel_dst_set(tunnel, &rt->dst);
+ tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
ip_rt_put(rt);
}
if (dev->type != ARPHRD_ETHER)
@@ -480,6 +489,103 @@ drop:
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+static int ip_encap_hlen(struct ip_tunnel_encap *e)
+{
+ switch (e->type) {
+ case TUNNEL_ENCAP_NONE:
+ return 0;
+ case TUNNEL_ENCAP_FOU:
+ return sizeof(struct udphdr);
+ case TUNNEL_ENCAP_GUE:
+ return sizeof(struct udphdr) + sizeof(struct guehdr);
+ default:
+ return -EINVAL;
+ }
+}
+
+int ip_tunnel_encap_setup(struct ip_tunnel *t,
+ struct ip_tunnel_encap *ipencap)
+{
+ int hlen;
+
+ memset(&t->encap, 0, sizeof(t->encap));
+
+ hlen = ip_encap_hlen(ipencap);
+ if (hlen < 0)
+ return hlen;
+
+ t->encap.type = ipencap->type;
+ t->encap.sport = ipencap->sport;
+ t->encap.dport = ipencap->dport;
+ t->encap.flags = ipencap->flags;
+
+ t->encap_hlen = hlen;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
+
+static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ size_t hdr_len, u8 *protocol, struct flowi4 *fl4)
+{
+ struct udphdr *uh;
+ __be16 sport;
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* Get length and hash before making space in skb */
+
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+
+ if (e->type == TUNNEL_ENCAP_GUE) {
+ struct guehdr *guehdr = (struct guehdr *)&uh[1];
+
+ guehdr->version = 0;
+ guehdr->hlen = 0;
+ guehdr->flags = 0;
+ guehdr->next_hdr = *protocol;
+ }
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ uh->check = 0;
+ udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
+ fl4->saddr, fl4->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+
+ return 0;
+}
+
+int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ switch (t->encap.type) {
+ case TUNNEL_ENCAP_NONE:
+ return 0;
+ case TUNNEL_ENCAP_FOU:
+ case TUNNEL_ENCAP_GUE:
+ return fou_build_header(skb, &t->encap, t->encap_hlen,
+ protocol, fl4);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(ip_tunnel_encap);
+
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
struct rtable *rt, __be16 df)
{
@@ -529,7 +635,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
}
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
- const struct iphdr *tnl_params, const u8 protocol)
+ const struct iphdr *tnl_params, u8 protocol)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *inner_iph;
@@ -610,7 +716,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
- rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;
+ if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
+ goto tx_error;
+
+ rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
@@ -620,7 +729,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
if (connected)
- tunnel_dst_set(tunnel, &rt->dst);
+ tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
}
if (rt->dst.dev == dev) {
@@ -663,7 +772,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
df |= (inner_iph->frag_off&htons(IP_DF));
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
- + rt->dst.header_len;
+ + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
if (max_headroom > dev->needed_headroom)
dev->needed_headroom = max_headroom;
@@ -757,9 +866,14 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
- if (!t && (cmd == SIOCADDTUNNEL)) {
- t = ip_tunnel_create(net, itn, p);
- err = PTR_ERR_OR_ZERO(t);
+ if (cmd == SIOCADDTUNNEL) {
+ if (!t) {
+ t = ip_tunnel_create(net, itn, p);
+ err = PTR_ERR_OR_ZERO(t);
+ break;
+ }
+
+ err = -EEXIST;
break;
}
if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index b8960f3527f3..3e861011e4a3 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev)
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
return ip_tunnel_init(dev);
}
@@ -534,40 +534,28 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
static int __init vti_init(void)
{
+ const char *msg;
int err;
- pr_info("IPv4 over IPSec tunneling driver\n");
+ pr_info("IPv4 over IPsec tunneling driver\n");
+ msg = "tunnel device";
err = register_pernet_device(&vti_net_ops);
if (err < 0)
- return err;
- err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
- if (err < 0) {
- unregister_pernet_device(&vti_net_ops);
- pr_info("vti init: can't register tunnel\n");
-
- return err;
- }
+ goto pernet_dev_failed;
+ msg = "tunnel protocols";
+ err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
- if (err < 0) {
- xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
- unregister_pernet_device(&vti_net_ops);
- pr_info("vti init: can't register tunnel\n");
-
- return err;
- }
-
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
- if (err < 0) {
- xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
- xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
- unregister_pernet_device(&vti_net_ops);
- pr_info("vti init: can't register tunnel\n");
-
- return err;
- }
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+ msg = "netlink interface";
err = rtnl_link_register(&vti_link_ops);
if (err < 0)
goto rtnl_link_failed;
@@ -576,23 +564,23 @@ static int __init vti_init(void)
rtnl_link_failed:
xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
unregister_pernet_device(&vti_net_ops);
+pernet_dev_failed:
+ pr_err("vti init: failed to register %s\n", msg);
return err;
}
static void __exit vti_fini(void)
{
rtnl_link_unregister(&vti_link_ops);
- if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP))
- pr_info("vti close: can't deregister tunnel\n");
- if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH))
- pr_info("vti close: can't deregister tunnel\n");
- if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP))
- pr_info("vti close: can't deregister tunnel\n");
-
-
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
}
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b3e86ea7b71b..648fa1490ea7 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -143,8 +143,6 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */
__be32 root_server_addr = NONE; /* Address of NFS server */
u8 root_server_path[256] = { 0, }; /* Path to mount as root */
-__be32 ic_dev_xid; /* Device under configuration */
-
/* vendor class identifier */
static char vendor_class_identifier[253] __initdata;
@@ -264,7 +262,8 @@ static int __init ic_open_devs(void)
/* wait for a carrier on at least one device */
start = jiffies;
next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
- while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
+ while (time_before(jiffies, start +
+ msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) {
int wait, elapsed;
for_each_netdev(&init_net, dev)
@@ -654,6 +653,7 @@ static struct packet_type bootp_packet_type __initdata = {
.func = ic_bootp_recv,
};
+static __be32 ic_dev_xid; /* Device under configuration */
/*
* Initialize DHCP/BOOTP extension fields in the request.
@@ -1218,10 +1218,10 @@ static int __init ic_dynamic(void)
get_random_bytes(&timeout, sizeof(timeout));
timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
for (;;) {
+#ifdef IPCONFIG_BOOTP
/* Track the device we are configuring */
ic_dev_xid = d->xid;
-#ifdef IPCONFIG_BOOTP
if (do_bootp && (d->able & IC_BOOTP))
ic_bootp_send_if(d, jiffies - start_jiffies);
#endif
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 62eaa005e146..37096d64730e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -224,6 +224,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (IS_ERR(skb))
goto out;
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+
ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
return NETDEV_TX_OK;
@@ -287,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
dev->features |= IPIP_FEATURES;
dev->hw_features |= IPIP_FEATURES;
@@ -301,7 +303,8 @@ static int ipip_tunnel_init(struct net_device *dev)
memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
- tunnel->hlen = 0;
+ tunnel->tun_hlen = 0;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
tunnel->parms.iph.protocol = IPPROTO_IPIP;
return ip_tunnel_init(dev);
}
@@ -340,10 +343,53 @@ static void ipip_netlink_parms(struct nlattr *data[],
parms->iph.frag_off = htons(IP_DF);
}
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipip_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipip_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
ipip_netlink_parms(data, &p);
return ip_tunnel_newlink(dev, tb, &p);
@@ -353,6 +399,15 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipip_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
ipip_netlink_parms(data, &p);
@@ -378,6 +433,14 @@ static size_t ipip_get_size(const struct net_device *dev)
nla_total_size(1) +
/* IFLA_IPTUN_PMTUDISC */
nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
0;
}
@@ -394,6 +457,17 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
!!(parm->iph.frag_off & htons(IP_DF))))
goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+ tunnel->encap.type) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT,
+ tunnel->encap.sport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT,
+ tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+ tunnel->encap.dport))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -407,6 +481,10 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_TTL] = { .type = NLA_U8 },
[IFLA_IPTUN_TOS] = { .type = NLA_U8 },
[IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
};
static struct rtnl_link_ops ipip_link_ops __read_mostly = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 65bcaa789043..c8034587859d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -500,7 +500,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
else
sprintf(name, "pimreg%u", mrt->id);
- dev = alloc_netdev(0, name, reg_vif_setup);
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
if (dev == NULL)
return NULL;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a26ce035e3fa..4c019d5c3f57 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,6 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
+config NF_LOG_ARP
+ tristate "ARP packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_LOG_IPV4
+ tristate "IPv4 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
config NF_TABLES_IPV4
depends on NF_TABLES
tristate "IPv4 nf_tables support"
@@ -51,9 +61,36 @@ config NFT_CHAIN_ROUTE_IPV4
fields such as the source, destination, type of service and
the packet mark.
+config NF_REJECT_IPV4
+ tristate "IPv4 packet rejection"
+ default m if NETFILTER_ADVANCED=n
+
+config NFT_REJECT_IPV4
+ depends on NF_TABLES_IPV4
+ select NF_REJECT_IPV4
+ default NFT_REJECT
+ tristate
+
+config NF_TABLES_ARP
+ depends on NF_TABLES
+ tristate "ARP nf_tables support"
+ help
+ This option enables the ARP support for nf_tables.
+
+config NF_NAT_IPV4
+ tristate "IPv4 NAT"
+ depends on NF_CONNTRACK_IPV4
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT
+ help
+ The IPv4 NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. This can be
+ controlled by iptables or nft.
+
+if NF_NAT_IPV4
+
config NFT_CHAIN_NAT_IPV4
depends on NF_TABLES_IPV4
- depends on NF_NAT_IPV4 && NFT_NAT
tristate "IPv4 nf_tables nat chain support"
help
This option enables the "nat" chain for IPv4 in nf_tables. This
@@ -61,16 +98,54 @@ config NFT_CHAIN_NAT_IPV4
packet transformations such as the source, destination address and
source and destination ports.
-config NFT_REJECT_IPV4
+config NF_NAT_MASQUERADE_IPV4
+ tristate "IPv4 masquerade support"
+ help
+ This is the kernel functionality to provide NAT in the masquerade
+ flavour (automatic source address selection).
+
+config NFT_MASQ_IPV4
+ tristate "IPv4 masquerading support for nf_tables"
depends on NF_TABLES_IPV4
- default NFT_REJECT
+ depends on NFT_MASQ
+ select NF_NAT_MASQUERADE_IPV4
+ help
+ This is the expression that provides IPv4 masquerading support for
+ nf_tables.
+
+config NF_NAT_SNMP_BASIC
+ tristate "Basic SNMP-ALG support"
+ depends on NF_CONNTRACK_SNMP
+ depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
+ ---help---
+
+ This module implements an Application Layer Gateway (ALG) for
+ SNMP payloads. In conjunction with NAT, it allows a network
+ management system to access multiple private networks with
+ conflicting addresses. It works by modifying IP addresses
+ inside SNMP payloads to match IP-layer NAT mapping.
+
+ This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_NAT_PROTO_GRE
tristate
+ depends on NF_CT_PROTO_GRE
-config NF_TABLES_ARP
- depends on NF_TABLES
- tristate "ARP nf_tables support"
- help
- This option enables the ARP support for nf_tables.
+config NF_NAT_PPTP
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_PPTP
+ select NF_NAT_PROTO_GRE
+
+config NF_NAT_H323
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_H323
+
+endif # NF_NAT_IPV4
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
@@ -138,6 +213,7 @@ config IP_NF_FILTER
config IP_NF_TARGET_REJECT
tristate "REJECT target support"
depends on IP_NF_FILTER
+ select NF_REJECT_IPV4
default m if NETFILTER_ADVANCED=n
help
The REJECT target allows a filtering rule to specify that an ICMP
@@ -159,42 +235,26 @@ config IP_NF_TARGET_SYNPROXY
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_ULOG
- tristate "ULOG target support (obsolete)"
- default m if NETFILTER_ADVANCED=n
- ---help---
-
- This option enables the old IPv4-only "ipt_ULOG" implementation
- which has been obsoleted by the new "nfnetlink_log" code (see
- CONFIG_NETFILTER_NETLINK_LOG).
-
- This option adds a `ULOG' target, which allows you to create rules in
- any iptables table. The packet is passed to a userspace logging
- daemon using netlink multicast sockets; unlike the LOG target
- which can only be viewed through syslog.
-
- The appropriate userspace logging daemon (ulogd) may be obtained from
- <http://www.netfilter.org/projects/ulogd/index.html>
-
- To compile it as a module, choose M here. If unsure, say N.
-
# NAT + specific targets: nf_conntrack
-config NF_NAT_IPV4
- tristate "IPv4 NAT"
+config IP_NF_NAT
+ tristate "iptables NAT support"
depends on NF_CONNTRACK_IPV4
default m if NETFILTER_ADVANCED=n
select NF_NAT
+ select NF_NAT_IPV4
+ select NETFILTER_XT_NAT
help
- The IPv4 NAT option allows masquerading, port forwarding and other
- forms of full Network Address Port Translation. It is controlled by
- the `nat' table in iptables: see the man page for iptables(8).
+ This enables the `nat' table in iptables. This allows masquerading,
+ port forwarding and other forms of full Network Address Port
+ Translation.
To compile it as a module, choose M here. If unsure, say N.
-if NF_NAT_IPV4
+if IP_NF_NAT
config IP_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
+ select NF_NAT_MASQUERADE_IPV4
default m if NETFILTER_ADVANCED=n
help
Masquerading is a special case of NAT: all outgoing connections are
@@ -223,47 +283,7 @@ config IP_NF_TARGET_REDIRECT
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_REDIRECT.
-endif
-
-config NF_NAT_SNMP_BASIC
- tristate "Basic SNMP-ALG support"
- depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4
- depends on NETFILTER_ADVANCED
- default NF_NAT && NF_CONNTRACK_SNMP
- ---help---
-
- This module implements an Application Layer Gateway (ALG) for
- SNMP payloads. In conjunction with NAT, it allows a network
- management system to access multiple private networks with
- conflicting addresses. It works by modifying IP addresses
- inside SNMP payloads to match IP-layer NAT mapping.
-
- This is the "basic" form of SNMP-ALG, as described in RFC 2962
-
- To compile it as a module, choose M here. If unsure, say N.
-
-# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y),
-# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker.
-# From kconfig-language.txt:
-#
-# <expr> '&&' <expr> (6)
-#
-# (6) Returns the result of min(/expr/, /expr/).
-
-config NF_NAT_PROTO_GRE
- tristate
- depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
-
-config NF_NAT_PPTP
- tristate
- depends on NF_CONNTRACK && NF_NAT_IPV4
- default NF_NAT_IPV4 && NF_CONNTRACK_PPTP
- select NF_NAT_PROTO_GRE
-
-config NF_NAT_H323
- tristate
- depends on NF_CONNTRACK && NF_NAT_IPV4
- default NF_NAT_IPV4 && NF_CONNTRACK_H323
+endif # IP_NF_NAT
# mangle + specific targets
config IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 90b82405331e..f4cef5af0969 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -19,10 +19,18 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+# logging
+obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
+obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
+
+# reject
+obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
+
# NAT helpers (nf_conntrack)
obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
+obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
# NAT protocols (nf_nat)
obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
@@ -31,6 +39,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
# generic IP tables
@@ -39,7 +48,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
# the three instances of ip_tables
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
@@ -53,7 +62,6 @@ obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
-obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
# generic ARP tables
obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2510c02c2d21..e90f83a3415b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -285,7 +285,7 @@ clusterip_hashfn(const struct sk_buff *skb,
}
/* node numbers are 1..n, not 0..n */
- return (((u64)hashval * config->num_total_nodes) >> 32) + 1;
+ return reciprocal_scale(hashval, config->num_total_nodes) + 1;
}
static inline int
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 00352ce0f0de..da7f02a0b868 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -22,6 +22,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/x_tables.h>
#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -46,103 +47,17 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct nf_conn *ct;
- struct nf_conn_nat *nat;
- enum ip_conntrack_info ctinfo;
- struct nf_nat_range newrange;
+ struct nf_nat_range range;
const struct nf_nat_ipv4_multi_range_compat *mr;
- const struct rtable *rt;
- __be32 newsrc, nh;
-
- NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
-
- ct = nf_ct_get(skb, &ctinfo);
- nat = nfct_nat(ct);
-
- NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED_REPLY));
-
- /* Source address is 0.0.0.0 - locally generated packet that is
- * probably not supposed to be masqueraded.
- */
- if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
- return NF_ACCEPT;
mr = par->targinfo;
- rt = skb_rtable(skb);
- nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
- newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
- if (!newsrc) {
- pr_info("%s ate my IP address\n", par->out->name);
- return NF_DROP;
- }
-
- nat->masq_index = par->out->ifindex;
-
- /* Transfer from original range. */
- memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
- memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
- newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.ip = newsrc;
- newrange.max_addr.ip = newsrc;
- newrange.min_proto = mr->range[0].min;
- newrange.max_proto = mr->range[0].max;
+ range.flags = mr->range[0].flags;
+ range.min_proto = mr->range[0].min;
+ range.max_proto = mr->range[0].max;
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+ return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
}
-static int
-device_cmp(struct nf_conn *i, void *ifindex)
-{
- const struct nf_conn_nat *nat = nfct_nat(i);
-
- if (!nat)
- return 0;
- if (nf_ct_l3num(i) != NFPROTO_IPV4)
- return 0;
- return nat->masq_index == (int)(long)ifindex;
-}
-
-static int masq_device_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct net *net = dev_net(dev);
-
- if (event == NETDEV_DOWN) {
- /* Device was downed. Search entire table for
- conntracks which were associated with that device,
- and forget them. */
- NF_CT_ASSERT(dev->ifindex != 0);
-
- nf_ct_iterate_cleanup(net, device_cmp,
- (void *)(long)dev->ifindex, 0, 0);
- }
-
- return NOTIFY_DONE;
-}
-
-static int masq_inet_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
- struct netdev_notifier_info info;
-
- netdev_notifier_info_init(&info, dev);
- return masq_device_event(this, event, &info);
-}
-
-static struct notifier_block masq_dev_notifier = {
- .notifier_call = masq_device_event,
-};
-
-static struct notifier_block masq_inet_notifier = {
- .notifier_call = masq_inet_event,
-};
-
static struct xt_target masquerade_tg_reg __read_mostly = {
.name = "MASQUERADE",
.family = NFPROTO_IPV4,
@@ -160,12 +75,8 @@ static int __init masquerade_tg_init(void)
ret = xt_register_target(&masquerade_tg_reg);
- if (ret == 0) {
- /* Register for device down reports */
- register_netdevice_notifier(&masq_dev_notifier);
- /* Register IP address change reports */
- register_inetaddr_notifier(&masq_inet_notifier);
- }
+ if (ret == 0)
+ nf_nat_masquerade_ipv4_register_notifier();
return ret;
}
@@ -173,8 +84,7 @@ static int __init masquerade_tg_init(void)
static void __exit masquerade_tg_exit(void)
{
xt_unregister_target(&masquerade_tg_reg);
- unregister_netdevice_notifier(&masq_dev_notifier);
- unregister_inetaddr_notifier(&masq_inet_notifier);
+ nf_nat_masquerade_ipv4_unregister_notifier();
}
module_init(masquerade_tg_init);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 5b6e0df4ccff..8f48f5517e33 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -20,7 +20,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_REJECT.h>
-#ifdef CONFIG_BRIDGE_NETFILTER
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
#include <linux/netfilter_bridge.h>
#endif
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
deleted file mode 100644
index 9cb993cd224b..000000000000
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * netfilter module for userspace packet logging daemons
- *
- * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2005-2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This module accepts two parameters:
- *
- * nlbufsiz:
- * The parameter specifies how big the buffer for each netlink multicast
- * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
- * get accumulated in the kernel until they are sent to userspace. It is
- * NOT possible to allocate more than 128kB, and it is strongly discouraged,
- * because atomically allocating 128kB inside the network rx softirq is not
- * reliable. Please also keep in mind that this buffer size is allocated for
- * each nlgroup you are using, so the total kernel memory usage increases
- * by that factor.
- *
- * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since
- * nlbufsiz is used with alloc_skb, which adds another
- * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead.
- *
- * flushtimeout:
- * Specify, after how many hundredths of a second the queue should be
- * flushed even if it is not full yet.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/socket.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <net/netlink.h>
-#include <linux/netdevice.h>
-#include <linux/mm.h>
-#include <linux/moduleparam.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ipt_ULOG.h>
-#include <net/netfilter/nf_log.h>
-#include <net/netns/generic.h>
-#include <net/sock.h>
-#include <linux/bitops.h>
-#include <asm/unaligned.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
-
-#define ULOG_NL_EVENT 111 /* Harald's favorite number */
-#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
-
-static unsigned int nlbufsiz = NLMSG_GOODSIZE;
-module_param(nlbufsiz, uint, 0400);
-MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
-
-static unsigned int flushtimeout = 10;
-module_param(flushtimeout, uint, 0600);
-MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
-
-static bool nflog = true;
-module_param(nflog, bool, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
-/* global data structures */
-
-typedef struct {
- unsigned int qlen; /* number of nlmsgs' in the skb */
- struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
- struct sk_buff *skb; /* the pre-allocated skb */
- struct timer_list timer; /* the timer function */
-} ulog_buff_t;
-
-static int ulog_net_id __read_mostly;
-struct ulog_net {
- unsigned int nlgroup[ULOG_MAXNLGROUPS];
- ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
- struct sock *nflognl;
- spinlock_t lock;
-};
-
-static struct ulog_net *ulog_pernet(struct net *net)
-{
- return net_generic(net, ulog_net_id);
-}
-
-/* send one ulog_buff_t to userspace */
-static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
-{
- ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
-
- pr_debug("ulog_send: timer is deleting\n");
- del_timer(&ub->timer);
-
- if (!ub->skb) {
- pr_debug("ulog_send: nothing to send\n");
- return;
- }
-
- /* last nlmsg needs NLMSG_DONE */
- if (ub->qlen > 1)
- ub->lastnlh->nlmsg_type = NLMSG_DONE;
-
- NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
- pr_debug("throwing %d packets to netlink group %u\n",
- ub->qlen, nlgroupnum + 1);
- netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
- GFP_ATOMIC);
-
- ub->qlen = 0;
- ub->skb = NULL;
- ub->lastnlh = NULL;
-}
-
-
-/* timer function to flush queue in flushtimeout time */
-static void ulog_timer(unsigned long data)
-{
- unsigned int groupnum = *((unsigned int *)data);
- struct ulog_net *ulog = container_of((void *)data,
- struct ulog_net,
- nlgroup[groupnum]);
- pr_debug("timer function called, calling ulog_send\n");
-
- /* lock to protect against somebody modifying our structure
- * from ipt_ulog_target at the same time */
- spin_lock_bh(&ulog->lock);
- ulog_send(ulog, groupnum);
- spin_unlock_bh(&ulog->lock);
-}
-
-static struct sk_buff *ulog_alloc_skb(unsigned int size)
-{
- struct sk_buff *skb;
- unsigned int n;
-
- /* alloc skb which should be big enough for a whole
- * multipart message. WARNING: has to be <= 131000
- * due to slab allocator restrictions */
-
- n = max(size, nlbufsiz);
- skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
- if (!skb) {
- if (n > size) {
- /* try to allocate only as much as we need for
- * current packet */
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- pr_debug("cannot even allocate %ub\n", size);
- }
- }
-
- return skb;
-}
-
-static void ipt_ulog_packet(struct net *net,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct ipt_ulog_info *loginfo,
- const char *prefix)
-{
- ulog_buff_t *ub;
- ulog_packet_msg_t *pm;
- size_t size, copy_len;
- struct nlmsghdr *nlh;
- struct timeval tv;
- struct ulog_net *ulog = ulog_pernet(net);
-
- /* ffs == find first bit set, necessary because userspace
- * is already shifting groupnumber, but we need unshifted.
- * ffs() returns [1..32], we need [0..31] */
- unsigned int groupnum = ffs(loginfo->nl_group) - 1;
-
- /* calculate the size of the skb needed */
- if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len)
- copy_len = skb->len;
- else
- copy_len = loginfo->copy_range;
-
- size = nlmsg_total_size(sizeof(*pm) + copy_len);
-
- ub = &ulog->ulog_buffers[groupnum];
-
- spin_lock_bh(&ulog->lock);
-
- if (!ub->skb) {
- if (!(ub->skb = ulog_alloc_skb(size)))
- goto alloc_failure;
- } else if (ub->qlen >= loginfo->qthreshold ||
- size > skb_tailroom(ub->skb)) {
- /* either the queue len is too high or we don't have
- * enough room in nlskb left. send it to userspace. */
-
- ulog_send(ulog, groupnum);
-
- if (!(ub->skb = ulog_alloc_skb(size)))
- goto alloc_failure;
- }
-
- pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
-
- nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
- sizeof(*pm)+copy_len, 0);
- if (!nlh) {
- pr_debug("error during nlmsg_put\n");
- goto out_unlock;
- }
- ub->qlen++;
-
- pm = nlmsg_data(nlh);
- memset(pm, 0, sizeof(*pm));
-
- /* We might not have a timestamp, get one */
- if (skb->tstamp.tv64 == 0)
- __net_timestamp((struct sk_buff *)skb);
-
- /* copy hook, prefix, timestamp, payload, etc. */
- pm->data_len = copy_len;
- tv = ktime_to_timeval(skb->tstamp);
- put_unaligned(tv.tv_sec, &pm->timestamp_sec);
- put_unaligned(tv.tv_usec, &pm->timestamp_usec);
- put_unaligned(skb->mark, &pm->mark);
- pm->hook = hooknum;
- if (prefix != NULL) {
- strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1);
- pm->prefix[sizeof(pm->prefix) - 1] = '\0';
- }
- else if (loginfo->prefix[0] != '\0')
- strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
-
- if (in && in->hard_header_len > 0 &&
- skb->mac_header != skb->network_header &&
- in->hard_header_len <= ULOG_MAC_LEN) {
- memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
- pm->mac_len = in->hard_header_len;
- } else
- pm->mac_len = 0;
-
- if (in)
- strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
-
- if (out)
- strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
-
- /* copy_len <= skb->len, so can't fail. */
- if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
- BUG();
-
- /* check if we are building multi-part messages */
- if (ub->qlen > 1)
- ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
-
- ub->lastnlh = nlh;
-
- /* if timer isn't already running, start it */
- if (!timer_pending(&ub->timer)) {
- ub->timer.expires = jiffies + flushtimeout * HZ / 100;
- add_timer(&ub->timer);
- }
-
- /* if threshold is reached, send message to userspace */
- if (ub->qlen >= loginfo->qthreshold) {
- if (loginfo->qthreshold > 1)
- nlh->nlmsg_type = NLMSG_DONE;
- ulog_send(ulog, groupnum);
- }
-out_unlock:
- spin_unlock_bh(&ulog->lock);
-
- return;
-
-alloc_failure:
- pr_debug("Error building netlink message\n");
- spin_unlock_bh(&ulog->lock);
-}
-
-static unsigned int
-ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- struct net *net = dev_net(par->in ? par->in : par->out);
-
- ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
- par->targinfo, NULL);
- return XT_CONTINUE;
-}
-
-static void ipt_logfn(struct net *net,
- u_int8_t pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *li,
- const char *prefix)
-{
- struct ipt_ulog_info loginfo;
-
- if (!li || li->type != NF_LOG_TYPE_ULOG) {
- loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
- loginfo.copy_range = 0;
- loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
- loginfo.prefix[0] = '\0';
- } else {
- loginfo.nl_group = li->u.ulog.group;
- loginfo.copy_range = li->u.ulog.copy_len;
- loginfo.qthreshold = li->u.ulog.qthreshold;
- strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
- }
-
- ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
-}
-
-static int ulog_tg_check(const struct xt_tgchk_param *par)
-{
- const struct ipt_ulog_info *loginfo = par->targinfo;
-
- if (!par->net->xt.ulog_warn_deprecated) {
- pr_info("ULOG is deprecated and it will be removed soon, "
- "use NFLOG instead\n");
- par->net->xt.ulog_warn_deprecated = true;
- }
-
- if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
- pr_debug("prefix not null-terminated\n");
- return -EINVAL;
- }
- if (loginfo->qthreshold > ULOG_MAX_QLEN) {
- pr_debug("queue threshold %Zu > MAX_QLEN\n",
- loginfo->qthreshold);
- return -EINVAL;
- }
- return 0;
-}
-
-#ifdef CONFIG_COMPAT
-struct compat_ipt_ulog_info {
- compat_uint_t nl_group;
- compat_size_t copy_range;
- compat_size_t qthreshold;
- char prefix[ULOG_PREFIX_LEN];
-};
-
-static void ulog_tg_compat_from_user(void *dst, const void *src)
-{
- const struct compat_ipt_ulog_info *cl = src;
- struct ipt_ulog_info l = {
- .nl_group = cl->nl_group,
- .copy_range = cl->copy_range,
- .qthreshold = cl->qthreshold,
- };
-
- memcpy(l.prefix, cl->prefix, sizeof(l.prefix));
- memcpy(dst, &l, sizeof(l));
-}
-
-static int ulog_tg_compat_to_user(void __user *dst, const void *src)
-{
- const struct ipt_ulog_info *l = src;
- struct compat_ipt_ulog_info cl = {
- .nl_group = l->nl_group,
- .copy_range = l->copy_range,
- .qthreshold = l->qthreshold,
- };
-
- memcpy(cl.prefix, l->prefix, sizeof(cl.prefix));
- return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0;
-}
-#endif /* CONFIG_COMPAT */
-
-static struct xt_target ulog_tg_reg __read_mostly = {
- .name = "ULOG",
- .family = NFPROTO_IPV4,
- .target = ulog_tg,
- .targetsize = sizeof(struct ipt_ulog_info),
- .checkentry = ulog_tg_check,
-#ifdef CONFIG_COMPAT
- .compatsize = sizeof(struct compat_ipt_ulog_info),
- .compat_from_user = ulog_tg_compat_from_user,
- .compat_to_user = ulog_tg_compat_to_user,
-#endif
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ipt_ulog_logger __read_mostly = {
- .name = "ipt_ULOG",
- .logfn = ipt_logfn,
- .me = THIS_MODULE,
-};
-
-static int __net_init ulog_tg_net_init(struct net *net)
-{
- int i;
- struct ulog_net *ulog = ulog_pernet(net);
- struct netlink_kernel_cfg cfg = {
- .groups = ULOG_MAXNLGROUPS,
- };
-
- spin_lock_init(&ulog->lock);
- /* initialize ulog_buffers */
- for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- ulog->nlgroup[i] = i;
- setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer,
- (unsigned long)&ulog->nlgroup[i]);
- }
-
- ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
- if (!ulog->nflognl)
- return -ENOMEM;
-
- if (nflog)
- nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
-
- return 0;
-}
-
-static void __net_exit ulog_tg_net_exit(struct net *net)
-{
- ulog_buff_t *ub;
- int i;
- struct ulog_net *ulog = ulog_pernet(net);
-
- if (nflog)
- nf_log_unset(net, &ipt_ulog_logger);
-
- netlink_kernel_release(ulog->nflognl);
-
- /* remove pending timers and free allocated skb's */
- for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- ub = &ulog->ulog_buffers[i];
- pr_debug("timer is deleting\n");
- del_timer(&ub->timer);
-
- if (ub->skb) {
- kfree_skb(ub->skb);
- ub->skb = NULL;
- }
- }
-}
-
-static struct pernet_operations ulog_tg_net_ops = {
- .init = ulog_tg_net_init,
- .exit = ulog_tg_net_exit,
- .id = &ulog_net_id,
- .size = sizeof(struct ulog_net),
-};
-
-static int __init ulog_tg_init(void)
-{
- int ret;
- pr_debug("init module\n");
-
- if (nlbufsiz > 128*1024) {
- pr_warn("Netlink buffer has to be <= 128kB\n");
- return -EINVAL;
- }
-
- ret = register_pernet_subsys(&ulog_tg_net_ops);
- if (ret)
- goto out_pernet;
-
- ret = xt_register_target(&ulog_tg_reg);
- if (ret < 0)
- goto out_target;
-
- if (nflog)
- nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
-
- return 0;
-
-out_target:
- unregister_pernet_subsys(&ulog_tg_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit ulog_tg_exit(void)
-{
- pr_debug("cleanup_module\n");
- if (nflog)
- nf_log_unregister(&ipt_ulog_logger);
- xt_unregister_target(&ulog_tg_reg);
- unregister_pernet_subsys(&ulog_tg_net_ops);
-}
-
-module_init(ulog_tg_init);
-module_exit(ulog_tg_exit);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index f1787c04a4dd..6b67d7e9a75d 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -28,222 +28,57 @@ static const struct xt_table nf_nat_ipv4_table = {
.af = NFPROTO_IPV4,
};
-static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
-{
- /* Force range to this IP; let proto decide mapping for
- * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
- */
- struct nf_nat_range range;
-
- range.flags = 0;
- pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
- HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
-
- return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
-}
-
-static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
- const struct net_device *in,
- const struct net_device *out,
- struct nf_conn *ct)
+static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
- unsigned int ret;
- ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
- if (ret == NF_ACCEPT) {
- if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
- ret = alloc_null_binding(ct, hooknum);
- }
- return ret;
+ return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table);
}
-static unsigned int
-nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- struct nf_conn_nat *nat;
- /* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
-
- /* We never see fragments: conntrack defrags on pre-routing
- * and local-out, and nf_nat_out protects post-routing.
- */
- NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
-
- ct = nf_ct_get(skb, &ctinfo);
- /* Can't track? It's not due to stress, or conntrack would
- * have dropped it. Hence it's the user's responsibilty to
- * packet filter it out, or implement conntrack/NAT for that
- * protocol. 8) --RR
- */
- if (!ct)
- return NF_ACCEPT;
-
- /* Don't try to NAT if this packet is not conntracked */
- if (nf_ct_is_untracked(ct))
- return NF_ACCEPT;
-
- nat = nf_ct_nat_ext_add(ct);
- if (nat == NULL)
- return NF_ACCEPT;
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED_REPLY:
- if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
- if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- ops->hooknum))
- return NF_DROP;
- else
- return NF_ACCEPT;
- }
- /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
- case IP_CT_NEW:
- /* Seen it before? This can happen for loopback, retrans,
- * or local packets.
- */
- if (!nf_nat_initialized(ct, maniptype)) {
- unsigned int ret;
-
- ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
- if (ret != NF_ACCEPT)
- return ret;
- } else {
- pr_debug("Already setup manip %s for ct %p\n",
- maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
- ct);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
- goto oif_changed;
- }
- break;
-
- default:
- /* ESTABLISHED */
- NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
- ctinfo == IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
- goto oif_changed;
- }
-
- return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
-
-oif_changed:
- nf_ct_kill_acct(ct, ctinfo, skb);
- return NF_DROP;
+ return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain);
}
-static unsigned int
-nf_nat_ipv4_in(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- unsigned int ret;
- __be32 daddr = ip_hdr(skb)->daddr;
-
- ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- daddr != ip_hdr(skb)->daddr)
- skb_dst_drop(skb);
-
- return ret;
+ return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain);
}
-static unsigned int
-nf_nat_ipv4_out(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
-#ifdef CONFIG_XFRM
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- int err;
-#endif
- unsigned int ret;
-
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
- ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
-#ifdef CONFIG_XFRM
- if (ret != NF_DROP && ret != NF_STOLEN &&
- !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if ((ct->tuplehash[dir].tuple.src.u3.ip !=
- ct->tuplehash[!dir].tuple.dst.u3.ip) ||
- (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
- ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all)) {
- err = nf_xfrm_me_harder(skb, AF_INET);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
- }
-#endif
- return ret;
+ return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain);
}
-static unsigned int
-nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned int ret;
- int err;
-
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
- ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.dst.u3.ip !=
- ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-#ifdef CONFIG_XFRM
- else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
- ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all) {
- err = nf_xfrm_me_harder(skb, AF_INET);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-#endif
- }
- return ret;
+ return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain);
}
static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
- .hook = nf_nat_ipv4_in,
+ .hook = iptable_nat_ipv4_in,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
@@ -251,7 +86,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
},
/* After packet filtering, change source */
{
- .hook = nf_nat_ipv4_out,
+ .hook = iptable_nat_ipv4_out,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
@@ -259,7 +94,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
},
/* Before packet filtering, change destination */
{
- .hook = nf_nat_ipv4_local_fn,
+ .hook = iptable_nat_ipv4_local_fn,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
@@ -267,7 +102,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
},
/* After packet filtering, change source */
{
- .hook = nf_nat_ipv4_fn,
+ .hook = iptable_nat_ipv4_fn,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 8127dc802865..a054fe083431 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -314,7 +314,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
return -ENOENT;
}
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -358,7 +358,7 @@ static struct nf_sockopt_ops so_getorigdst = {
.pf = PF_INET,
.get_optmin = SO_ORIGINAL_DST,
.get_optmax = SO_ORIGINAL_DST+1,
- .get = &getorigdst,
+ .get = getorigdst,
.owner = THIS_MODULE,
};
@@ -388,7 +388,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.invert_tuple = ipv4_invert_tuple,
.print_tuple = ipv4_print_tuple,
.get_l4proto = ipv4_get_l4proto,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = ipv4_tuple_to_nlattr,
.nlattr_tuple_size = ipv4_nlattr_tuple_size,
.nlattr_to_tuple = ipv4_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index a338dad41b7d..b91b2641adda 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -226,7 +226,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
}
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -408,7 +408,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.error = icmp_error,
.destroy = NULL,
.me = NULL,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = icmp_tuple_to_nlattr,
.nlattr_tuple_size = icmp_nlattr_tuple_size,
.nlattr_to_tuple = icmp_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index b8f6381c7d0b..7e5ca6f2d0cd 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -17,7 +17,7 @@
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif
#include <net/netfilter/nf_conntrack_zones.h>
@@ -45,12 +45,12 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
{
u16 zone = NF_CT_DEFAULT_ZONE;
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (skb->nfct)
zone = nf_ct_zone((struct nf_conn *)skb->nfct);
#endif
-#ifdef CONFIG_BRIDGE_NETFILTER
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (skb->nf_bridge &&
skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
@@ -74,8 +74,8 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
inet->nodefrag)
return NF_ACCEPT;
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#if !IS_ENABLED(CONFIG_NF_NAT)
/* Previously seen (loopback)? Ignore. Do this before
fragment check. */
if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
new file mode 100644
index 000000000000..ccfc78db12ee
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -0,0 +1,149 @@
+/*
+ * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Based on code from ebt_log from:
+ *
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 5,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+struct arppayload {
+ unsigned char mac_src[ETH_ALEN];
+ unsigned char ip_src[4];
+ unsigned char mac_dst[ETH_ALEN];
+ unsigned char ip_dst[4];
+};
+
+static void dump_arp_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int nhoff)
+{
+ const struct arphdr *ah;
+ struct arphdr _arph;
+ const struct arppayload *ap;
+ struct arppayload _arpp;
+
+ ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ if (ah == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+ nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
+ ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
+
+ /* If it's for Ethernet and the lengths are OK, then log the ARP
+ * payload.
+ */
+ if (ah->ar_hrd != htons(1) ||
+ ah->ar_hln != ETH_ALEN ||
+ ah->ar_pln != sizeof(__be32))
+ return;
+
+ ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
+ if (ap == NULL) {
+ nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+ skb->len - sizeof(_arph));
+ return;
+ }
+ nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
+ ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+}
+
+void nf_log_arp_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix);
+ dump_arp_packet(m, loginfo, skb, 0);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_arp_logger __read_mostly = {
+ .name = "nf_log_arp",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_arp_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_arp_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_arp_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_arp_logger);
+}
+
+static struct pernet_operations nf_log_arp_net_ops = {
+ .init = nf_log_arp_net_init,
+ .exit = nf_log_arp_net_exit,
+};
+
+static int __init nf_log_arp_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_arp_net_ops);
+ if (ret < 0)
+ return ret;
+
+ nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+ return 0;
+}
+
+static void __exit nf_log_arp_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_arp_net_ops);
+ nf_log_unregister(&nf_arp_logger);
+}
+
+module_init(nf_log_arp_init);
+module_exit(nf_log_arp_exit);
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter ARP packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(3, 0);
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
new file mode 100644
index 000000000000..078bdca1b607
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -0,0 +1,385 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 5,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/* One level of recursion won't kill us */
+static void dump_ipv4_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int iphoff)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
+
+ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Important fields:
+ * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+ nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+ if (ntohs(ih->frag_off) & IP_CE)
+ nf_log_buf_add(m, "CE ");
+ if (ntohs(ih->frag_off) & IP_DF)
+ nf_log_buf_add(m, "DF ");
+ if (ntohs(ih->frag_off) & IP_MF)
+ nf_log_buf_add(m, "MF ");
+
+ /* Max length: 11 "FRAG:65535 " */
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+ if ((logflags & XT_LOG_IPOPT) &&
+ ih->ihl * 4 > sizeof(struct iphdr)) {
+ const unsigned char *op;
+ unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+ unsigned int i, optsize;
+
+ optsize = ih->ihl * 4 - sizeof(struct iphdr);
+ op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+ optsize, _opt);
+ if (op == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+ nf_log_buf_add(m, ") ");
+ }
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4))
+ return;
+ break;
+ case IPPROTO_ICMP: {
+ struct icmphdr _icmph;
+ const struct icmphdr *ich;
+ static const size_t required_len[NR_ICMP_TYPES+1]
+ = { [ICMP_ECHOREPLY] = 4,
+ [ICMP_DEST_UNREACH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_SOURCE_QUENCH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_REDIRECT]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_ECHO] = 4,
+ [ICMP_TIME_EXCEEDED]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_PARAMETERPROB]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_TIMESTAMP] = 20,
+ [ICMP_TIMESTAMPREPLY] = 20,
+ [ICMP_ADDRESS] = 12,
+ [ICMP_ADDRESSREPLY] = 12 };
+
+ /* Max length: 11 "PROTO=ICMP " */
+ nf_log_buf_add(m, "PROTO=ICMP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_icmph), &_icmph);
+ if (ich == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ if (ich->type <= NR_ICMP_TYPES &&
+ required_len[ich->type] &&
+ skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ switch (ich->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ich->un.echo.id),
+ ntohs(ich->un.echo.sequence));
+ break;
+
+ case ICMP_PARAMETERPROB:
+ /* Max length: 14 "PARAMETER=255 " */
+ nf_log_buf_add(m, "PARAMETER=%u ",
+ ntohl(ich->un.gateway) >> 24);
+ break;
+ case ICMP_REDIRECT:
+ /* Max length: 24 "GATEWAY=255.255.255.255 " */
+ nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+ /* Fall through */
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_TIME_EXCEEDED:
+ /* Max length: 3+maxlen */
+ if (!iphoff) { /* Only recurse once. */
+ nf_log_buf_add(m, "[");
+ dump_ipv4_packet(m, info, skb,
+ iphoff + ih->ihl*4+sizeof(_icmph));
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ich->type == ICMP_DEST_UNREACH &&
+ ich->code == ICMP_FRAG_NEEDED) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohs(ich->un.frag.mtu));
+ }
+ }
+ break;
+ }
+ /* Max Length */
+ case IPPROTO_AH: {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 9 "PROTO=AH " */
+ nf_log_buf_add(m, "PROTO=AH ");
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ break;
+ }
+ case IPPROTO_ESP: {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 10 "PROTO=ESP " */
+ nf_log_buf_add(m, "PROTO=ESP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_esph), &_esph);
+ if (eh == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
+ break;
+ }
+ /* Max length: 10 "PROTO 255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & XT_LOG_UID) && !iphoff)
+ nf_log_dump_sk_uid_gid(m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (!iphoff && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+
+ /* Proto Max log string length */
+ /* IP: 40+46+6+11+127 = 230 */
+ /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
+ /* UDP: 10+max(25,20) = 35 */
+ /* UDPLITE: 14+max(25,20) = 39 */
+ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+ /* ESP: 10+max(25)+15 = 50 */
+ /* AH: 9+max(25)+15 = 49 */
+ /* unknown: 10 */
+
+ /* (ICMP allows recursion one level deep) */
+ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
+ /* maxlen = 230+ 91 + 230 + 252 = 803 */
+}
+
+static void dump_ipv4_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & XT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ nf_log_buf_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++, p++)
+ nf_log_buf_add(m, ":%02x", *p);
+ }
+ nf_log_buf_add(m, " ");
+}
+
+static void nf_log_ip_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in,
+ out, loginfo, prefix);
+
+ if (in != NULL)
+ dump_ipv4_mac_header(m, loginfo, skb);
+
+ dump_ipv4_packet(m, loginfo, skb, 0);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip_logger __read_mostly = {
+ .name = "nf_log_ipv4",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_ipv4_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_ipv4_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_ip_logger);
+}
+
+static struct pernet_operations nf_log_ipv4_net_ops = {
+ .init = nf_log_ipv4_net_init,
+ .exit = nf_log_ipv4_net_exit,
+};
+
+static int __init nf_log_ipv4_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
+ if (ret < 0)
+ return ret;
+
+ nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+ return 0;
+}
+
+static void __exit nf_log_ipv4_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+ nf_log_unregister(&nf_ip_logger);
+}
+
+module_init(nf_log_ipv4_init);
+module_exit(nf_log_ipv4_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index d8b2e14efddc..fc37711e11f3 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -154,6 +154,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
htons(oldlen), htons(datalen), 1);
}
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
struct nf_nat_range *range)
{
@@ -169,6 +170,7 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
return 0;
}
+#endif
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
.l3proto = NFPROTO_IPV4,
@@ -177,7 +179,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
.manip_pkt = nf_nat_ipv4_manip_pkt,
.csum_update = nf_nat_ipv4_csum_update,
.csum_recalc = nf_nat_ipv4_csum_recalc,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
+#endif
#ifdef CONFIG_XFRM
.decode_session = nf_nat_ipv4_decode_session,
#endif
@@ -250,6 +254,205 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
+unsigned int
+nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_conn *ct))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ /* maniptype == SRC for postrouting. */
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+
+ /* We never see fragments: conntrack defrags on pre-routing
+ * and local-out, and nf_nat_out protects post-routing.
+ */
+ NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibilty to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
+
+ /* Don't try to NAT if this packet is not conntracked */
+ if (nf_ct_is_untracked(ct))
+ return NF_ACCEPT;
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ ops->hooknum))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ unsigned int ret;
+
+ ret = do_chain(ops, skb, in, out, ct);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
+ break;
+
+ ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ if (ret != NF_ACCEPT)
+ return ret;
+ } else {
+ pr_debug("Already setup manip %s for ct %p\n",
+ maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
+ goto oif_changed;
+ }
+ break;
+
+ default:
+ /* ESTABLISHED */
+ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+ ctinfo == IP_CT_ESTABLISHED_REPLY);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
+ goto oif_changed;
+ }
+
+ return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+
+oif_changed:
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_DROP;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
+
+unsigned int
+nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_conn *ct))
+{
+ unsigned int ret;
+ __be32 daddr = ip_hdr(skb)->daddr;
+
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ daddr != ip_hdr(skb)->daddr)
+ skb_dst_drop(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
+
+unsigned int
+nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_conn *ct))
+{
+#ifdef CONFIG_XFRM
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int err;
+#endif
+ unsigned int ret;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
+#ifdef CONFIG_XFRM
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+ ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+ (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+#endif
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
+
+unsigned int
+nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_conn *ct))
+{
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int ret;
+ int err;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+ ct->tuplehash[!dir].tuple.src.u3.ip) {
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#ifdef CONFIG_XFRM
+ else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#endif
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn);
+
static int __init nf_nat_l3proto_ipv4_init(void)
{
int err;
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
new file mode 100644
index 000000000000..c6eb42100e9a
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -0,0 +1,153 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+
+unsigned int
+nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
+ const struct nf_nat_range *range,
+ const struct net_device *out)
+{
+ struct nf_conn *ct;
+ struct nf_conn_nat *nat;
+ enum ip_conntrack_info ctinfo;
+ struct nf_nat_range newrange;
+ const struct rtable *rt;
+ __be32 newsrc, nh;
+
+ NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ nat = nfct_nat(ct);
+
+ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ /* Source address is 0.0.0.0 - locally generated packet that is
+ * probably not supposed to be masqueraded.
+ */
+ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
+ return NF_ACCEPT;
+
+ rt = skb_rtable(skb);
+ nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
+ newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
+ if (!newsrc) {
+ pr_info("%s ate my IP address\n", out->name);
+ return NF_DROP;
+ }
+
+ nat->masq_index = out->ifindex;
+
+ /* Transfer from original range. */
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = newsrc;
+ newrange.max_addr.ip = newsrc;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ /* Hand modified range to generic setup. */
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
+
+static int device_cmp(struct nf_conn *i, void *ifindex)
+{
+ const struct nf_conn_nat *nat = nfct_nat(i);
+
+ if (!nat)
+ return 0;
+ if (nf_ct_l3num(i) != NFPROTO_IPV4)
+ return 0;
+ return nat->masq_index == (int)(long)ifindex;
+}
+
+static int masq_device_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_DOWN) {
+ /* Device was downed. Search entire table for
+ * conntracks which were associated with that device,
+ * and forget them.
+ */
+ NF_CT_ASSERT(dev->ifindex != 0);
+
+ nf_ct_iterate_cleanup(net, device_cmp,
+ (void *)(long)dev->ifindex, 0, 0);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int masq_inet_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+ struct netdev_notifier_info info;
+
+ netdev_notifier_info_init(&info, dev);
+ return masq_device_event(this, event, &info);
+}
+
+static struct notifier_block masq_dev_notifier = {
+ .notifier_call = masq_device_event,
+};
+
+static struct notifier_block masq_inet_notifier = {
+ .notifier_call = masq_inet_event,
+};
+
+static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
+
+void nf_nat_masquerade_ipv4_register_notifier(void)
+{
+ /* check if the notifier was already set */
+ if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
+ return;
+
+ /* Register for device down reports */
+ register_netdevice_notifier(&masq_dev_notifier);
+ /* Register IP address change reports */
+ register_inetaddr_notifier(&masq_inet_notifier);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
+
+void nf_nat_masquerade_ipv4_unregister_notifier(void)
+{
+ /* check if the notifier still has clients */
+ if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
+ return;
+
+ unregister_netdevice_notifier(&masq_dev_notifier);
+ unregister_inetaddr_notifier(&masq_inet_notifier);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 690d890111bb..9414923f1e15 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -124,7 +124,7 @@ static const struct nf_nat_l4proto gre = {
.manip_pkt = gre_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
.unique_tuple = gre_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index eb303471bcf6..4557b4ab8342 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -77,7 +77,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
.manip_pkt = icmp_manip_pkt,
.in_range = icmp_in_range,
.unique_tuple = icmp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
new file mode 100644
index 000000000000..b023b4eb1a96
--- /dev/null
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -0,0 +1,127 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <linux/netfilter_ipv4.h>
+
+/* Send RST reply */
+void nf_send_reset(struct sk_buff *oldskb, int hook)
+{
+ struct sk_buff *nskb;
+ const struct iphdr *oiph;
+ struct iphdr *niph;
+ const struct tcphdr *oth;
+ struct tcphdr _otcph, *tcph;
+
+ /* IP header checks: fragment. */
+ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+ return;
+
+ oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
+ sizeof(_otcph), &_otcph);
+ if (oth == NULL)
+ return;
+
+ /* No RST for RST. */
+ if (oth->rst)
+ return;
+
+ if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ return;
+
+ /* Check checksum */
+ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
+ return;
+ oiph = ip_hdr(oldskb);
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+
+ skb_reset_network_header(nskb);
+ niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
+ niph->version = 4;
+ niph->ihl = sizeof(struct iphdr) / 4;
+ niph->tos = 0;
+ niph->id = 0;
+ niph->frag_off = htons(IP_DF);
+ niph->protocol = IPPROTO_TCP;
+ niph->check = 0;
+ niph->saddr = oiph->daddr;
+ niph->daddr = oiph->saddr;
+
+ skb_reset_transport_header(nskb);
+ tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+ memset(tcph, 0, sizeof(*tcph));
+ tcph->source = oth->dest;
+ tcph->dest = oth->source;
+ tcph->doff = sizeof(struct tcphdr) / 4;
+
+ if (oth->ack)
+ tcph->seq = oth->ack_seq;
+ else {
+ tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
+ oldskb->len - ip_hdrlen(oldskb) -
+ (oth->doff << 2));
+ tcph->ack = 1;
+ }
+
+ tcph->rst = 1;
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+ niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)tcph - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ /* ip_route_me_harder expects skb->dst to be set */
+ skb_dst_set_noref(nskb, skb_dst(oldskb));
+
+ nskb->protocol = htons(ETH_P_IP);
+ if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ niph->ttl = ip4_dst_hoplimit(skb_dst(nskb));
+
+ /* "Never happens" */
+ if (nskb->len > dst_mtu(skb_dst(nskb)))
+ goto free_nskb;
+
+ nf_ct_attach(nskb, oldskb);
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* If we use ip_local_out for bridged traffic, the MAC source on
+ * the RST will be ours, instead of the destination's. This confuses
+ * some routers/firewalls, and they drop the packet. So we need to
+ * build the eth header using the original destination's MAC as the
+ * source, and send the RST packet directly.
+ */
+ if (oldskb->nf_bridge) {
+ struct ethhdr *oeth = eth_hdr(oldskb);
+ nskb->dev = oldskb->nf_bridge->physindev;
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+ if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+ oeth->h_source, oeth->h_dest, nskb->len) < 0)
+ goto free_nskb;
+ dev_queue_xmit(nskb);
+ } else
+#endif
+ ip_local_out(nskb);
+
+ return;
+
+ free_nskb:
+ kfree_skb(nskb);
+}
+EXPORT_SYMBOL_GPL(nf_send_reset);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index 3964157d826c..df547bf50078 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -26,136 +26,53 @@
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/ip.h>
-/*
- * NAT chains
- */
-
-static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_conn *ct)
{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct nf_conn_nat *nat;
- enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
struct nft_pktinfo pkt;
- unsigned int ret;
-
- if (ct == NULL || nf_ct_is_untracked(ct))
- return NF_ACCEPT;
-
- NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
-
- nat = nf_ct_nat_ext_add(ct);
- if (nat == NULL)
- return NF_ACCEPT;
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED + IP_CT_IS_REPLY:
- if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
- if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- ops->hooknum))
- return NF_DROP;
- else
- return NF_ACCEPT;
- }
- /* Fall through */
- case IP_CT_NEW:
- if (nf_nat_initialized(ct, maniptype))
- break;
- nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
- ret = nft_do_chain(&pkt, ops);
- if (ret != NF_ACCEPT)
- return ret;
- if (!nf_nat_initialized(ct, maniptype)) {
- ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
- if (ret != NF_ACCEPT)
- return ret;
- }
- default:
- break;
- }
-
- return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+ return nft_do_chain(&pkt, ops);
}
-static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- __be32 daddr = ip_hdr(skb)->daddr;
- unsigned int ret;
-
- ret = nf_nat_fn(ops, skb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- ip_hdr(skb)->daddr != daddr) {
- skb_dst_drop(skb);
- }
- return ret;
+ return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain);
}
-static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- enum ip_conntrack_info ctinfo __maybe_unused;
- const struct nf_conn *ct __maybe_unused;
- unsigned int ret;
-
- ret = nf_nat_fn(ops, skb, in, out, okfn);
-#ifdef CONFIG_XFRM
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.src.u3.ip !=
- ct->tuplehash[!dir].tuple.dst.u3.ip ||
- ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all)
- return nf_xfrm_me_harder(skb, AF_INET) == 0 ?
- ret : NF_DROP;
- }
-#endif
- return ret;
+ return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain);
}
-static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- enum ip_conntrack_info ctinfo;
- const struct nf_conn *ct;
- unsigned int ret;
-
- ret = nf_nat_fn(ops, skb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ return nf_nat_ipv4_out(ops, skb, in, out, nft_nat_do_chain);
+}
- if (ct->tuplehash[dir].tuple.dst.u3.ip !=
- ct->tuplehash[!dir].tuple.src.u3.ip) {
- if (ip_route_me_harder(skb, RTN_UNSPEC))
- ret = NF_DROP;
- }
-#ifdef CONFIG_XFRM
- else if (ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all)
- if (nf_xfrm_me_harder(skb, AF_INET))
- ret = NF_DROP;
-#endif
- }
- return ret;
+static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain);
}
static const struct nf_chain_type nft_chain_nat_ipv4 = {
@@ -168,10 +85,10 @@ static const struct nf_chain_type nft_chain_nat_ipv4 = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.hooks = {
- [NF_INET_PRE_ROUTING] = nf_nat_prerouting,
- [NF_INET_POST_ROUTING] = nf_nat_postrouting,
- [NF_INET_LOCAL_OUT] = nf_nat_output,
- [NF_INET_LOCAL_IN] = nf_nat_fn,
+ [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in,
+ [NF_INET_POST_ROUTING] = nft_nat_ipv4_out,
+ [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn,
+ [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn,
},
};
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
new file mode 100644
index 000000000000..1c636d6b5b50
--- /dev/null
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_masq.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+
+static void nft_masq_ipv4_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_masq *priv = nft_expr_priv(expr);
+ struct nf_nat_range range;
+ unsigned int verdict;
+
+ range.flags = priv->flags;
+
+ verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
+ &range, pkt->out);
+
+ data[NFT_REG_VERDICT].verdict = verdict;
+}
+
+static struct nft_expr_type nft_masq_ipv4_type;
+static const struct nft_expr_ops nft_masq_ipv4_ops = {
+ .type = &nft_masq_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
+ .eval = nft_masq_ipv4_eval,
+ .init = nft_masq_init,
+ .dump = nft_masq_dump,
+};
+
+static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "masq",
+ .ops = &nft_masq_ipv4_ops,
+ .policy = nft_masq_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_masq_ipv4_module_init(void)
+{
+ int ret;
+
+ ret = nft_register_expr(&nft_masq_ipv4_type);
+ if (ret < 0)
+ return ret;
+
+ nf_nat_masquerade_ipv4_register_notifier();
+
+ return ret;
+}
+
+static void __exit nft_masq_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_masq_ipv4_type);
+ nf_nat_masquerade_ipv4_unregister_notifier();
+}
+
+module_init(nft_masq_ipv4_module_init);
+module_exit(nft_masq_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index e79718a382f2..ed33299c56d1 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -16,7 +16,6 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
-#include <net/icmp.h>
#include <net/netfilter/ipv4/nf_reject.h>
#include <net/netfilter/nft_reject.h>
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 044a0ddf6a79..57f7c9804139 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -311,7 +311,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
chk_addr_ret = RTN_LOCAL;
- if ((sysctl_ip_nonlocal_bind == 0 &&
+ if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 &&
isk->freebind == 0 && isk->transparent == 0 &&
chk_addr_ret != RTN_LOCAL) ||
chk_addr_ret == RTN_MULTICAST ||
@@ -911,7 +911,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sin6->sin6_flowinfo = ip6_flowinfo(ip6);
sin6->sin6_scope_id =
ipv6_iface_scope_id(&sin6->sin6_addr,
- IP6CB(skb)->iif);
+ inet6_iif(skb));
*addr_len = sizeof(*sin6);
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ae0af9386f7c..8e3eb39f84e7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -52,6 +52,7 @@
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
struct net *net = seq->private;
+ unsigned int frag_mem;
int orphans, sockets;
local_bh_disable();
@@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplite_prot));
seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot));
- seq_printf(seq, "FRAG: inuse %d memory %d\n",
- ip_frag_nqueues(net), ip_frag_mem(net));
+ frag_mem = ip_frag_mem(net);
+ seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
return 0;
}
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 46d6a1c923a8..4b7c0ec65251 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -30,6 +30,7 @@
const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_offloads);
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c65160565e1..739db3100c23 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -58,6 +58,7 @@
#include <linux/in_route.h>
#include <linux/route.h>
#include <linux/skbuff.h>
+#include <linux/igmp.h>
#include <net/net_namespace.h>
#include <net/dst.h>
#include <net/sock.h>
@@ -174,7 +175,9 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
while (sk) {
delivered = 1;
- if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
+ if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
+ ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
+ skb->dev->ifindex)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
@@ -365,6 +368,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->ip_summed = CHECKSUM_NONE;
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+
skb->transport_header = skb->network_header;
err = -EFAULT;
if (memcpy_fromiovecend((void *)iph, from, 0, length))
@@ -606,6 +611,8 @@ back_from_confirm:
&rt, msg->msg_flags);
else {
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
if (!ipc.addr)
ipc.addr = fl4.daddr;
lock_sock(sk);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 190199851c9a..793c0bb8c4fd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -596,12 +596,12 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
static inline u32 fnhe_hashfun(__be32 daddr)
{
+ static u32 fnhe_hashrnd __read_mostly;
u32 hval;
- hval = (__force u32) daddr;
- hval ^= (hval >> 11) ^ (hval >> 22);
-
- return hval & (FNHE_HASH_SIZE - 1);
+ net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
+ hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
+ return hash_32(hval, FNHE_HASH_SHIFT);
}
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
@@ -628,12 +628,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
spin_lock_bh(&fnhe_lock);
- hash = nh->nh_exceptions;
+ hash = rcu_dereference(nh->nh_exceptions);
if (!hash) {
hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
if (!hash)
goto out_unlock;
- nh->nh_exceptions = hash;
+ rcu_assign_pointer(nh->nh_exceptions, hash);
}
hash += hval;
@@ -746,7 +746,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
}
n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
- if (n) {
+ if (!IS_ERR(n)) {
if (!(n->nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
} else {
@@ -1242,7 +1242,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
- struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+ struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
struct fib_nh_exception *fnhe;
u32 hval;
@@ -1798,8 +1798,6 @@ local_input:
no_route:
RT_CACHE_STAT_INC(in_no_route);
res.type = RTN_UNREACHABLE;
- if (err == -ESRCH)
- err = -ENETUNREACH;
goto local_input;
/*
@@ -2267,9 +2265,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
return rt;
if (flp4->flowi4_proto)
- rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
- flowi4_to_flowi(flp4),
- sk, 0);
+ rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
+ flowi4_to_flowi(flp4),
+ sk, 0);
return rt;
}
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c86624b36a62..0431a8f3c8f4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -25,7 +25,7 @@
extern int sysctl_tcp_syncookies;
-static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
+static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+ __u16 *mssp)
{
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 79a007c52558..b3c53c8b331e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -286,13 +286,6 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &ip_ttl_max,
},
{
- .procname = "ip_nonlocal_bind",
- .data = &sysctl_ip_nonlocal_bind,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_syn_retries",
.data = &sysctl_tcp_syn_retries,
.maxlen = sizeof(int),
@@ -450,6 +443,16 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+#ifdef CONFIG_IP_MULTICAST
+ {
+ .procname = "igmp_qrv",
+ .data = &sysctl_igmp_qrv,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+#endif
{
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
@@ -628,15 +631,6 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
-#ifdef CONFIG_NET_DMA
- {
- .procname = "tcp_dma_copybreak",
- .data = &sysctl_tcp_dma_copybreak,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
-#endif
{
.procname = "tcp_slow_start_after_idle",
.data = &sysctl_tcp_slow_start_after_idle,
@@ -728,6 +722,22 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &one,
},
{
+ .procname = "icmp_msgs_per_sec",
+ .data = &sysctl_icmp_msgs_per_sec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "icmp_msgs_burst",
+ .data = &sysctl_icmp_msgs_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
@@ -839,6 +849,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "ip_nonlocal_bind",
+ .data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "fwmark_reflect",
.data = &init_net.ipv4.sysctl_fwmark_reflect,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d2118e5fbc7..86023b9be47f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -274,7 +274,6 @@
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
-#include <net/netdma.h>
#include <net/sock.h>
#include <asm/uaccess.h>
@@ -405,7 +404,7 @@ void tcp_init_sock(struct sock *sk)
tp->reordering = sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
- icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
@@ -426,6 +425,17 @@ void tcp_init_sock(struct sock *sk)
}
EXPORT_SYMBOL(tcp_init_sock);
+static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+ if (sk->sk_tsflags) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ sock_tx_timestamp(sk, &shinfo->tx_flags);
+ if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
+ shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+ }
+}
+
/*
* Wait for a TCP event.
*
@@ -523,7 +533,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
- if (sk->sk_err)
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
mask |= POLLERR;
return mask;
@@ -598,7 +608,7 @@ static inline bool forced_push(const struct tcp_sock *tp)
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
-static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
+static void skb_entail(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -607,7 +617,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
tcb->seq = tcb->end_seq = tp->write_seq;
tcb->tcp_flags = TCPHDR_ACK;
tcb->sacked = 0;
- skb_header_release(skb);
+ __skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
@@ -952,15 +962,17 @@ new_segment:
skb->ip_summed = CHECKSUM_PARTIAL;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->gso_segs = 0;
+ tcp_skb_pcount_set(skb, 0);
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
copied += copy;
offset += copy;
- if (!(size -= copy))
+ if (!(size -= copy)) {
+ tcp_tx_timestamp(sk, skb);
goto out;
+ }
if (skb->len < size_goal || (flags & MSG_OOB))
continue;
@@ -1175,13 +1187,6 @@ new_segment:
goto wait_for_memory;
/*
- * All packets are restored as if they have
- * already been sent.
- */
- if (tp->repair)
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
-
- /*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
@@ -1190,6 +1195,13 @@ new_segment:
skb_entail(sk, skb);
copy = size_goal;
max = size_goal;
+
+ /* All packets are restored as if they have
+ * already been sent. skb_mstamp isn't set to
+ * avoid wrong rtt estimation.
+ */
+ if (tp->repair)
+ TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
}
/* Try to append data to the end of skb. */
@@ -1248,12 +1260,14 @@ new_segment:
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->gso_segs = 0;
+ tcp_skb_pcount_set(skb, 0);
from += copy;
copied += copy;
- if ((seglen -= copy) == 0 && iovlen == 0)
+ if ((seglen -= copy) == 0 && iovlen == 0) {
+ tcp_tx_timestamp(sk, skb);
goto out;
+ }
if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
@@ -1379,7 +1393,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-void tcp_cleanup_rbuf(struct sock *sk, int copied)
+static void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
@@ -1455,39 +1469,6 @@ static void tcp_prequeue_process(struct sock *sk)
tp->ucopy.memory = 0;
}
-#ifdef CONFIG_NET_DMA
-static void tcp_service_net_dma(struct sock *sk, bool wait)
-{
- dma_cookie_t done, used;
- dma_cookie_t last_issued;
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (!tp->ucopy.dma_chan)
- return;
-
- last_issued = tp->ucopy.dma_cookie;
- dma_async_issue_pending(tp->ucopy.dma_chan);
-
- do {
- if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
- last_issued, &done,
- &used) == DMA_COMPLETE) {
- /* Safe to free early-copied skbs now */
- __skb_queue_purge(&sk->sk_async_wait_queue);
- break;
- } else {
- struct sk_buff *skb;
- while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
- (dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_COMPLETE)) {
- __skb_dequeue(&sk->sk_async_wait_queue);
- kfree_skb(skb);
- }
- }
- } while (wait);
-}
-#endif
-
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1495,9 +1476,9 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
offset = seq - TCP_SKB_CB(skb)->seq;
- if (tcp_hdr(skb)->syn)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
offset--;
- if (offset < skb->len || tcp_hdr(skb)->fin) {
+ if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
*off = offset;
return skb;
}
@@ -1505,7 +1486,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
* splitted a fat GRO packet, while we released socket lock
* in skb_splice_bits()
*/
- sk_eat_skb(sk, skb, false);
+ sk_eat_skb(sk, skb);
}
return NULL;
}
@@ -1570,12 +1551,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
if (offset + 1 != skb->len)
continue;
}
- if (tcp_hdr(skb)->fin) {
- sk_eat_skb(sk, skb, false);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ sk_eat_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb, false);
+ sk_eat_skb(sk, skb);
if (!desc->count)
break;
tp->copied_seq = seq;
@@ -1613,10 +1594,12 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
- bool copied_early = false;
struct sk_buff *skb;
u32 urg_hole = 0;
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return ip_recv_error(sk, msg, len, addr_len);
+
if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
(sk->sk_state == TCP_ESTABLISHED))
sk_busy_loop(sk, nonblock);
@@ -1656,28 +1639,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
-#ifdef CONFIG_NET_DMA
- tp->ucopy.dma_chan = NULL;
- preempt_disable();
- skb = skb_peek_tail(&sk->sk_receive_queue);
- {
- int available = 0;
-
- if (skb)
- available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
- if ((available < target) &&
- (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
- !sysctl_tcp_low_latency &&
- net_dma_find_channel()) {
- preempt_enable();
- tp->ucopy.pinned_list =
- dma_pin_iovec_pages(msg->msg_iov, len);
- } else {
- preempt_enable();
- }
- }
-#endif
-
do {
u32 offset;
@@ -1704,11 +1665,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
break;
offset = *seq - TCP_SKB_CB(skb)->seq;
- if (tcp_hdr(skb)->syn)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
offset--;
if (offset < skb->len)
goto found_ok_skb;
- if (tcp_hdr(skb)->fin)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
WARN(!(flags & MSG_PEEK),
"recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
@@ -1808,16 +1769,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* __ Set realtime policy in scheduler __ */
}
-#ifdef CONFIG_NET_DMA
- if (tp->ucopy.dma_chan) {
- if (tp->rcv_wnd == 0 &&
- !skb_queue_empty(&sk->sk_async_wait_queue)) {
- tcp_service_net_dma(sk, true);
- tcp_cleanup_rbuf(sk, copied);
- } else
- dma_async_issue_pending(tp->ucopy.dma_chan);
- }
-#endif
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
@@ -1825,11 +1776,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
} else
sk_wait_data(sk, &timeo);
-#ifdef CONFIG_NET_DMA
- tcp_service_net_dma(sk, false); /* Don't block */
- tp->ucopy.wakeup = 0;
-#endif
-
if (user_recv) {
int chunk;
@@ -1887,43 +1833,13 @@ do_prequeue:
}
if (!(flags & MSG_TRUNC)) {
-#ifdef CONFIG_NET_DMA
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = net_dma_find_channel();
-
- if (tp->ucopy.dma_chan) {
- tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
- tp->ucopy.dma_chan, skb, offset,
- msg->msg_iov, used,
- tp->ucopy.pinned_list);
-
- if (tp->ucopy.dma_cookie < 0) {
-
- pr_alert("%s: dma_cookie < 0\n",
- __func__);
-
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
- break;
- }
-
- dma_async_issue_pending(tp->ucopy.dma_chan);
-
- if ((offset + used) == skb->len)
- copied_early = true;
-
- } else
-#endif
- {
- err = skb_copy_datagram_iovec(skb, offset,
- msg->msg_iov, used);
- if (err) {
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
- break;
- }
+ err = skb_copy_datagram_iovec(skb, offset,
+ msg->msg_iov, used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
}
}
@@ -1941,21 +1857,17 @@ skip_copy:
if (used + offset < skb->len)
continue;
- if (tcp_hdr(skb)->fin)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
- if (!(flags & MSG_PEEK)) {
- sk_eat_skb(sk, skb, copied_early);
- copied_early = false;
- }
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
continue;
found_fin_ok:
/* Process the FIN. */
++*seq;
- if (!(flags & MSG_PEEK)) {
- sk_eat_skb(sk, skb, copied_early);
- copied_early = false;
- }
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
break;
} while (len > 0);
@@ -1978,16 +1890,6 @@ skip_copy:
tp->ucopy.len = 0;
}
-#ifdef CONFIG_NET_DMA
- tcp_service_net_dma(sk, true); /* Wait for queue to drain */
- tp->ucopy.dma_chan = NULL;
-
- if (tp->ucopy.pinned_list) {
- dma_unpin_iovec_pages(tp->ucopy.pinned_list);
- tp->ucopy.pinned_list = NULL;
- }
-#endif
-
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
@@ -2142,8 +2044,10 @@ void tcp_close(struct sock *sk, long timeout)
* reader process may not have drained the data yet!
*/
while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
- tcp_hdr(skb)->fin;
+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ len--;
data_was_unread += len;
__kfree_skb(skb);
}
@@ -2331,9 +2235,6 @@ int tcp_disconnect(struct sock *sk, int flags)
__skb_queue_purge(&sk->sk_receive_queue);
tcp_write_queue_purge(sk);
__skb_queue_purge(&tp->out_of_order_queue);
-#ifdef CONFIG_NET_DMA
- __skb_queue_purge(&sk->sk_async_wait_queue);
-#endif
inet->inet_dport = 0;
@@ -2673,7 +2574,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
break;
#endif
case TCP_USER_TIMEOUT:
- /* Cap the max timeout in ms TCP will retry/retrans
+ /* Cap the max time in ms TCP will retry or probe the window
* before giving up and aborting (ETIMEDOUT) a connection.
*/
if (val < 0)
@@ -3152,7 +3053,7 @@ static int __init set_thash_entries(char *str)
}
__setup("thash_entries=", set_thash_entries);
-static void tcp_init_mem(void)
+static void __init tcp_init_mem(void)
{
unsigned long limit = nr_free_buffer_pages() / 8;
limit = max(limit, 128UL);
@@ -3170,8 +3071,8 @@ void __init tcp_init(void)
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
- percpu_counter_init(&tcp_sockets_allocated, 0);
- percpu_counter_init(&tcp_orphan_count, 0);
+ percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
+ percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
@@ -3238,8 +3139,6 @@ void __init tcp_init(void)
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
tcp_metrics_init();
-
- tcp_register_congestion_control(&tcp_reno);
-
+ BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index d5de69bc04f5..bb395d46a389 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <net/tcp.h>
-
#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
* max_cwnd = snd_cwnd * beta
*/
@@ -46,11 +45,10 @@ MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(smooth_part, int, 0644);
MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
-
/* BIC TCP Parameters */
struct bictcp {
u32 cnt; /* increase cwnd by 1 after ACKs */
- u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
u32 loss_cwnd; /* congestion window at last loss */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
@@ -103,7 +101,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
/* binary increase */
if (cwnd < ca->last_max_cwnd) {
- __u32 dist = (ca->last_max_cwnd - cwnd)
+ __u32 dist = (ca->last_max_cwnd - cwnd)
/ BICTCP_B;
if (dist > max_increment)
@@ -154,7 +152,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
}
-
}
/*
@@ -177,7 +174,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
ca->loss_cwnd = tp->snd_cwnd;
-
if (tp->snd_cwnd <= low_window)
return max(tp->snd_cwnd >> 1U, 2U);
else
@@ -188,6 +184,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct bictcp *ca = inet_csk_ca(sk);
+
return max(tp->snd_cwnd, ca->loss_cwnd);
}
@@ -206,12 +203,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
if (icsk->icsk_ca_state == TCP_CA_Open) {
struct bictcp *ca = inet_csk_ca(sk);
+
cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ca->delayed_ack += cnt;
}
}
-
static struct tcp_congestion_ops bictcp __read_mostly = {
.init = bictcp_init,
.ssthresh = bictcp_recalc_ssthresh,
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 7b09d8b49fa5..b1c5970d47a1 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
/* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct sock *sk)
+void tcp_assign_congestion_control(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca;
- /* if no choice made yet assign the current value set as default */
- if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
- rcu_read_lock();
- list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
- if (try_module_get(ca->owner)) {
- icsk->icsk_ca_ops = ca;
- break;
- }
-
- /* fallback to next available */
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (likely(try_module_get(ca->owner))) {
+ icsk->icsk_ca_ops = ca;
+ goto out;
}
- rcu_read_unlock();
+ /* Fallback to next available. The last really
+ * guaranteed fallback is Reno from this list.
+ */
}
+out:
+ rcu_read_unlock();
+
+ /* Clear out private data before diag gets it and
+ * the ca has not been initialized.
+ */
+ if (ca->get_info)
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
@@ -142,7 +152,6 @@ static int __init tcp_congestion_default(void)
}
late_initcall(tcp_congestion_default);
-
/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
@@ -154,7 +163,6 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
-
}
rcu_read_unlock();
}
@@ -186,7 +194,6 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
-
}
rcu_read_unlock();
}
@@ -230,7 +237,6 @@ out:
return ret;
}
-
/* Change congestion control for socket */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
@@ -285,15 +291,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
* ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
* returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-int tcp_slow_start(struct tcp_sock *tp, u32 acked)
+void tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
u32 cwnd = tp->snd_cwnd + acked;
if (cwnd > tp->snd_ssthresh)
cwnd = tp->snd_ssthresh + 1;
- acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
- return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
@@ -337,6 +341,7 @@ EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
u32 tcp_reno_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+
return max(tp->snd_cwnd >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
@@ -348,15 +353,3 @@ struct tcp_congestion_ops tcp_reno = {
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
};
-
-/* Initial congestion control used (until SYN)
- * really reno under another name so we can tell difference
- * during tcp_set_default_congestion_control
- */
-struct tcp_congestion_ops tcp_init_congestion_ops = {
- .name = "",
- .owner = THIS_MODULE,
- .ssthresh = tcp_reno_ssthresh,
- .cong_avoid = tcp_reno_cong_avoid,
-};
-EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index a9bd8a4828a9..20de0118c98e 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -82,12 +82,13 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse
/* BIC TCP Parameters */
struct bictcp {
u32 cnt; /* increase cwnd by 1 after ACKs */
- u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
u32 loss_cwnd; /* congestion window at last loss */
u32 last_cwnd; /* the last snd_cwnd */
u32 last_time; /* time when updated last_cwnd */
u32 bic_origin_point;/* origin point of bic function */
- u32 bic_K; /* time to origin point from the beginning of the current epoch */
+ u32 bic_K; /* time to origin point
+ from the beginning of the current epoch */
u32 delay_min; /* min delay (msec << 3) */
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
@@ -219,7 +220,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->last_time = tcp_time_stamp;
if (ca->epoch_start == 0) {
- ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
+ ca->epoch_start = tcp_time_stamp; /* record beginning */
ca->ack_cnt = 1; /* start counting */
ca->tcp_cwnd = cwnd; /* syn with cubic */
@@ -263,9 +264,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
/* c/rtt * (t-K)^3 */
delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
- if (t < ca->bic_K) /* below origin*/
+ if (t < ca->bic_K) /* below origin*/
bic_target = ca->bic_origin_point - delta;
- else /* above origin*/
+ else /* above origin*/
bic_target = ca->bic_origin_point + delta;
/* cubic function - calc bictcp_cnt*/
@@ -285,13 +286,14 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
/* TCP Friendly */
if (tcp_friendliness) {
u32 scale = beta_scale;
+
delta = (cwnd * scale) >> 3;
while (ca->ack_cnt > delta) { /* update tcp cwnd */
ca->ack_cnt -= delta;
ca->tcp_cwnd++;
}
- if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
+ if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */
delta = ca->tcp_cwnd - cwnd;
max_cnt = cwnd / delta;
if (ca->cnt > max_cnt)
@@ -320,7 +322,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
}
-
}
static u32 bictcp_recalc_ssthresh(struct sock *sk)
@@ -452,7 +453,8 @@ static int __init cubictcp_register(void)
* based on SRTT of 100ms
*/
- beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
+ beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
+ / (BICTCP_BETA_SCALE - beta);
cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000000..b504371af742
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,344 @@
+/* DataCenter TCP (DCTCP) congestion control.
+ *
+ * http://simula.stanford.edu/~alizade/Site/DCTCP.html
+ *
+ * This is an implementation of DCTCP over Reno, an enhancement to the
+ * TCP congestion control algorithm designed for data centers. DCTCP
+ * leverages Explicit Congestion Notification (ECN) in the network to
+ * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
+ * the following three data center transport requirements:
+ *
+ * - High burst tolerance (incast due to partition/aggregate)
+ * - Low latency (short flows, queries)
+ * - High throughput (continuous data updates, large file transfers)
+ * with commodity shallow buffered switches
+ *
+ * The algorithm is described in detail in the following two papers:
+ *
+ * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+ * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+ * "Data Center TCP (DCTCP)", Data Center Networks session
+ * Proc. ACM SIGCOMM, New Delhi, 2010.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ *
+ * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+ * "Analysis of DCTCP: Stability, Convergence, and Fairness"
+ * Proc. ACM SIGMETRICS, San Jose, 2011.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+ *
+ * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <dborkman@redhat.com>
+ * Florian Westphal <fw@strlen.de>
+ * Glenn Judd <glenn.judd@morganstanley.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+#define DCTCP_MAX_ALPHA 1024U
+
+struct dctcp {
+ u32 acked_bytes_ecn;
+ u32 acked_bytes_total;
+ u32 prior_snd_una;
+ u32 prior_rcv_nxt;
+ u32 dctcp_alpha;
+ u32 next_seq;
+ u32 ce_state;
+ u32 delayed_ack_reserved;
+};
+
+static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
+module_param(dctcp_shift_g, uint, 0644);
+MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
+
+static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+module_param(dctcp_alpha_on_init, uint, 0644);
+MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+
+static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
+module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
+MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
+ "parameter for clamping alpha on loss");
+
+static struct tcp_congestion_ops dctcp_reno;
+
+static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+{
+ ca->next_seq = tp->snd_nxt;
+
+ ca->acked_bytes_ecn = 0;
+ ca->acked_bytes_total = 0;
+}
+
+static void dctcp_init(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((tp->ecn_flags & TCP_ECN_OK) ||
+ (sk->sk_state == TCP_LISTEN ||
+ sk->sk_state == TCP_CLOSE)) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ ca->prior_snd_una = tp->snd_una;
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+
+ ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+
+ ca->delayed_ack_reserved = 0;
+ ca->ce_state = 0;
+
+ dctcp_reset(tp, ca);
+ return;
+ }
+
+ /* No ECN support? Fall back to Reno. Also need to clear
+ * ECT from sk since it is set during 3WHS for DCTCP.
+ */
+ inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
+ INET_ECN_dontxmit(sk);
+}
+
+static u32 dctcp_ssthresh(struct sock *sk)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+/* Minimal DCTP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ */
+
+static void dctcp_ce_state_0_to_1(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* State has changed from CE=0 to CE=1 and delayed
+ * ACK has not sent yet.
+ */
+ if (!ca->ce_state && ca->delayed_ack_reserved) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+ tmp_rcv_nxt = tp->rcv_nxt;
+
+ /* Generate previous ack with CE=0. */
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = ca->prior_rcv_nxt;
+
+ tcp_send_ack(sk);
+
+ /* Recover current rcv_nxt. */
+ tp->rcv_nxt = tmp_rcv_nxt;
+ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 1;
+
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_ce_state_1_to_0(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* State has changed from CE=1 to CE=0 and delayed
+ * ACK has not sent yet.
+ */
+ if (ca->ce_state && ca->delayed_ack_reserved) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+ tmp_rcv_nxt = tp->rcv_nxt;
+
+ /* Generate previous ack with CE=1. */
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = ca->prior_rcv_nxt;
+
+ tcp_send_ack(sk);
+
+ /* Recover current rcv_nxt. */
+ tp->rcv_nxt = tmp_rcv_nxt;
+ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 0;
+
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_update_alpha(struct sock *sk, u32 flags)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
+ u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
+
+ /* If ack did not advance snd_una, count dupack as MSS size.
+ * If ack did update window, do not count it at all.
+ */
+ if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
+ acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
+ if (acked_bytes) {
+ ca->acked_bytes_total += acked_bytes;
+ ca->prior_snd_una = tp->snd_una;
+
+ if (flags & CA_ACK_ECE)
+ ca->acked_bytes_ecn += acked_bytes;
+ }
+
+ /* Expired RTT */
+ if (!before(tp->snd_una, ca->next_seq)) {
+ /* For avoiding denominator == 1. */
+ if (ca->acked_bytes_total == 0)
+ ca->acked_bytes_total = 1;
+
+ /* alpha = (1 - g) * alpha + g * F */
+ ca->dctcp_alpha = ca->dctcp_alpha -
+ (ca->dctcp_alpha >> dctcp_shift_g) +
+ (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
+ ca->acked_bytes_total;
+
+ if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
+ /* Clamp dctcp_alpha to max. */
+ ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+
+ dctcp_reset(tp, ca);
+ }
+}
+
+static void dctcp_state(struct sock *sk, u8 new_state)
+{
+ if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ /* If this extension is enabled, we clamp dctcp_alpha to
+ * max on packet loss; the motivation is that dctcp_alpha
+ * is an indicator to the extend of congestion and packet
+ * loss is an indicator of extreme congestion; setting
+ * this in practice turned out to be beneficial, and
+ * effectively assumes total congestion which reduces the
+ * window by half.
+ */
+ ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+ }
+}
+
+static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ switch (ev) {
+ case CA_EVENT_DELAYED_ACK:
+ if (!ca->delayed_ack_reserved)
+ ca->delayed_ack_reserved = 1;
+ break;
+ case CA_EVENT_NON_DELAYED_ACK:
+ if (ca->delayed_ack_reserved)
+ ca->delayed_ack_reserved = 0;
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+ switch (ev) {
+ case CA_EVENT_ECN_IS_CE:
+ dctcp_ce_state_0_to_1(sk);
+ break;
+ case CA_EVENT_ECN_NO_CE:
+ dctcp_ce_state_1_to_0(sk);
+ break;
+ case CA_EVENT_DELAYED_ACK:
+ case CA_EVENT_NON_DELAYED_ACK:
+ dctcp_update_ack_reserved(sk, ev);
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+
+ /* Fill it also in case of VEGASINFO due to req struct limits.
+ * We can still correctly retrieve it later.
+ */
+ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct tcp_dctcp_info info;
+
+ memset(&info, 0, sizeof(info));
+ if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
+ info.dctcp_enabled = 1;
+ info.dctcp_ce_state = (u16) ca->ce_state;
+ info.dctcp_alpha = ca->dctcp_alpha;
+ info.dctcp_ab_ecn = ca->acked_bytes_ecn;
+ info.dctcp_ab_tot = ca->acked_bytes_total;
+ }
+
+ nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
+ }
+}
+
+static struct tcp_congestion_ops dctcp __read_mostly = {
+ .init = dctcp_init,
+ .in_ack_event = dctcp_update_alpha,
+ .cwnd_event = dctcp_cwnd_event,
+ .ssthresh = dctcp_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .set_state = dctcp_state,
+ .get_info = dctcp_get_info,
+ .flags = TCP_CONG_NEEDS_ECN,
+ .owner = THIS_MODULE,
+ .name = "dctcp",
+};
+
+static struct tcp_congestion_ops dctcp_reno __read_mostly = {
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .get_info = dctcp_get_info,
+ .owner = THIS_MODULE,
+ .name = "dctcp-reno",
+};
+
+static int __init dctcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&dctcp);
+}
+
+static void __exit dctcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&dctcp);
+}
+
+module_init(dctcp_register);
+module_exit(dctcp_unregister);
+
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index ed3f2ad42e0f..0d73f9ddb55b 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -9,7 +9,6 @@
* 2 of the License, or (at your option) any later version.
*/
-
#include <linux/module.h>
#include <linux/inet_diag.h>
@@ -35,13 +34,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
{
inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
}
static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ struct inet_diag_req_v2 *req)
{
return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 9771563ab564..815c85e3b1e0 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -115,7 +115,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
struct in6_addr *buf = (struct in6_addr *) tmp.val;
- int i = 4;
+ int i;
for (i = 0; i < 4; i++)
buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 1c4908280d92..882c08aae2f5 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -9,7 +9,6 @@
#include <linux/module.h>
#include <net/tcp.h>
-
/* From AIMD tables from RFC 3649 appendix B,
* with fixed-point MD scaled <<8.
*/
@@ -17,78 +16,78 @@ static const struct hstcp_aimd_val {
unsigned int cwnd;
unsigned int md;
} hstcp_aimd_vals[] = {
- { 38, 128, /* 0.50 */ },
- { 118, 112, /* 0.44 */ },
- { 221, 104, /* 0.41 */ },
- { 347, 98, /* 0.38 */ },
- { 495, 93, /* 0.37 */ },
- { 663, 89, /* 0.35 */ },
- { 851, 86, /* 0.34 */ },
- { 1058, 83, /* 0.33 */ },
- { 1284, 81, /* 0.32 */ },
- { 1529, 78, /* 0.31 */ },
- { 1793, 76, /* 0.30 */ },
- { 2076, 74, /* 0.29 */ },
- { 2378, 72, /* 0.28 */ },
- { 2699, 71, /* 0.28 */ },
- { 3039, 69, /* 0.27 */ },
- { 3399, 68, /* 0.27 */ },
- { 3778, 66, /* 0.26 */ },
- { 4177, 65, /* 0.26 */ },
- { 4596, 64, /* 0.25 */ },
- { 5036, 62, /* 0.25 */ },
- { 5497, 61, /* 0.24 */ },
- { 5979, 60, /* 0.24 */ },
- { 6483, 59, /* 0.23 */ },
- { 7009, 58, /* 0.23 */ },
- { 7558, 57, /* 0.22 */ },
- { 8130, 56, /* 0.22 */ },
- { 8726, 55, /* 0.22 */ },
- { 9346, 54, /* 0.21 */ },
- { 9991, 53, /* 0.21 */ },
- { 10661, 52, /* 0.21 */ },
- { 11358, 52, /* 0.20 */ },
- { 12082, 51, /* 0.20 */ },
- { 12834, 50, /* 0.20 */ },
- { 13614, 49, /* 0.19 */ },
- { 14424, 48, /* 0.19 */ },
- { 15265, 48, /* 0.19 */ },
- { 16137, 47, /* 0.19 */ },
- { 17042, 46, /* 0.18 */ },
- { 17981, 45, /* 0.18 */ },
- { 18955, 45, /* 0.18 */ },
- { 19965, 44, /* 0.17 */ },
- { 21013, 43, /* 0.17 */ },
- { 22101, 43, /* 0.17 */ },
- { 23230, 42, /* 0.17 */ },
- { 24402, 41, /* 0.16 */ },
- { 25618, 41, /* 0.16 */ },
- { 26881, 40, /* 0.16 */ },
- { 28193, 39, /* 0.16 */ },
- { 29557, 39, /* 0.15 */ },
- { 30975, 38, /* 0.15 */ },
- { 32450, 38, /* 0.15 */ },
- { 33986, 37, /* 0.15 */ },
- { 35586, 36, /* 0.14 */ },
- { 37253, 36, /* 0.14 */ },
- { 38992, 35, /* 0.14 */ },
- { 40808, 35, /* 0.14 */ },
- { 42707, 34, /* 0.13 */ },
- { 44694, 33, /* 0.13 */ },
- { 46776, 33, /* 0.13 */ },
- { 48961, 32, /* 0.13 */ },
- { 51258, 32, /* 0.13 */ },
- { 53677, 31, /* 0.12 */ },
- { 56230, 30, /* 0.12 */ },
- { 58932, 30, /* 0.12 */ },
- { 61799, 29, /* 0.12 */ },
- { 64851, 28, /* 0.11 */ },
- { 68113, 28, /* 0.11 */ },
- { 71617, 27, /* 0.11 */ },
- { 75401, 26, /* 0.10 */ },
- { 79517, 26, /* 0.10 */ },
- { 84035, 25, /* 0.10 */ },
- { 89053, 24, /* 0.10 */ },
+ { 38, 128, /* 0.50 */ },
+ { 118, 112, /* 0.44 */ },
+ { 221, 104, /* 0.41 */ },
+ { 347, 98, /* 0.38 */ },
+ { 495, 93, /* 0.37 */ },
+ { 663, 89, /* 0.35 */ },
+ { 851, 86, /* 0.34 */ },
+ { 1058, 83, /* 0.33 */ },
+ { 1284, 81, /* 0.32 */ },
+ { 1529, 78, /* 0.31 */ },
+ { 1793, 76, /* 0.30 */ },
+ { 2076, 74, /* 0.29 */ },
+ { 2378, 72, /* 0.28 */ },
+ { 2699, 71, /* 0.28 */ },
+ { 3039, 69, /* 0.27 */ },
+ { 3399, 68, /* 0.27 */ },
+ { 3778, 66, /* 0.26 */ },
+ { 4177, 65, /* 0.26 */ },
+ { 4596, 64, /* 0.25 */ },
+ { 5036, 62, /* 0.25 */ },
+ { 5497, 61, /* 0.24 */ },
+ { 5979, 60, /* 0.24 */ },
+ { 6483, 59, /* 0.23 */ },
+ { 7009, 58, /* 0.23 */ },
+ { 7558, 57, /* 0.22 */ },
+ { 8130, 56, /* 0.22 */ },
+ { 8726, 55, /* 0.22 */ },
+ { 9346, 54, /* 0.21 */ },
+ { 9991, 53, /* 0.21 */ },
+ { 10661, 52, /* 0.21 */ },
+ { 11358, 52, /* 0.20 */ },
+ { 12082, 51, /* 0.20 */ },
+ { 12834, 50, /* 0.20 */ },
+ { 13614, 49, /* 0.19 */ },
+ { 14424, 48, /* 0.19 */ },
+ { 15265, 48, /* 0.19 */ },
+ { 16137, 47, /* 0.19 */ },
+ { 17042, 46, /* 0.18 */ },
+ { 17981, 45, /* 0.18 */ },
+ { 18955, 45, /* 0.18 */ },
+ { 19965, 44, /* 0.17 */ },
+ { 21013, 43, /* 0.17 */ },
+ { 22101, 43, /* 0.17 */ },
+ { 23230, 42, /* 0.17 */ },
+ { 24402, 41, /* 0.16 */ },
+ { 25618, 41, /* 0.16 */ },
+ { 26881, 40, /* 0.16 */ },
+ { 28193, 39, /* 0.16 */ },
+ { 29557, 39, /* 0.15 */ },
+ { 30975, 38, /* 0.15 */ },
+ { 32450, 38, /* 0.15 */ },
+ { 33986, 37, /* 0.15 */ },
+ { 35586, 36, /* 0.14 */ },
+ { 37253, 36, /* 0.14 */ },
+ { 38992, 35, /* 0.14 */ },
+ { 40808, 35, /* 0.14 */ },
+ { 42707, 34, /* 0.13 */ },
+ { 44694, 33, /* 0.13 */ },
+ { 46776, 33, /* 0.13 */ },
+ { 48961, 32, /* 0.13 */ },
+ { 51258, 32, /* 0.13 */ },
+ { 53677, 31, /* 0.12 */ },
+ { 56230, 30, /* 0.12 */ },
+ { 58932, 30, /* 0.12 */ },
+ { 61799, 29, /* 0.12 */ },
+ { 64851, 28, /* 0.11 */ },
+ { 68113, 28, /* 0.11 */ },
+ { 71617, 27, /* 0.11 */ },
+ { 75401, 26, /* 0.10 */ },
+ { 79517, 26, /* 0.10 */ },
+ { 84035, 25, /* 0.10 */ },
+ { 89053, 24, /* 0.10 */ },
};
#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 031361311a8b..58469fff6c18 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -98,7 +98,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
}
}
-static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt)
+static void measure_achieved_throughput(struct sock *sk,
+ u32 pkts_acked, s32 rtt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -148,8 +149,8 @@ static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
if (use_bandwidth_switch) {
u32 maxB = ca->maxB;
u32 old_maxB = ca->old_maxB;
- ca->old_maxB = ca->maxB;
+ ca->old_maxB = ca->maxB;
if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {
ca->beta = BETA_MIN;
ca->modeswitch = 0;
@@ -270,6 +271,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
case TCP_CA_Open:
{
struct htcp *ca = inet_csk_ca(sk);
+
if (ca->undo_last_cong) {
ca->last_cong = jiffies;
ca->undo_last_cong = 0;
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index d8f8f05a4951..f963b274f2b0 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -29,7 +29,6 @@ static int rtt0 = 25;
module_param(rtt0, int, 0644);
MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
-
/* This is called to refresh values for hybla parameters */
static inline void hybla_recalc_param (struct sock *sk)
{
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 5999b3972e64..1d5a30a90adf 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -284,7 +284,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
if (delta >= tp->snd_cwnd) {
tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
- (u32) tp->snd_cwnd_clamp);
+ (u32)tp->snd_cwnd_clamp);
tp->snd_cwnd_cnt = 0;
}
}
@@ -299,7 +299,6 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
}
-
/* Extract info for Tcp socket info provided via netlink. */
static void tcp_illinois_info(struct sock *sk, u32 ext,
struct sk_buff *skb)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 40639c288dc2..00a41499d52c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -73,7 +73,7 @@
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
-#include <net/netdma.h>
+#include <linux/errqueue.h>
int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -200,28 +200,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk)
return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
-static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
+static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
if (tp->ecn_flags & TCP_ECN_OK)
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
-static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
- if (!(tp->ecn_flags & TCP_ECN_OK))
- return;
-
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
@@ -232,30 +229,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
tcp_enter_quickack_mode((struct sock *)tp);
break;
case INET_ECN_CE:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode((struct sock *)tp);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
- /* fallinto */
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
default:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
}
}
-static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ if (tp->ecn_flags & TCP_ECN_OK)
+ __tcp_ecn_check_ce(tp, skb);
+}
+
+static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
return true;
@@ -652,7 +662,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
}
icsk->icsk_ack.lrcvtime = now;
- TCP_ECN_check_ce(tp, skb);
+ tcp_ecn_check_ce(tp, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb);
@@ -1294,9 +1304,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
TCP_SKB_CB(prev)->end_seq += shifted;
TCP_SKB_CB(skb)->seq += shifted;
- skb_shinfo(prev)->gso_segs += pcount;
- BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
- skb_shinfo(skb)->gso_segs -= pcount;
+ tcp_skb_pcount_add(prev, pcount);
+ BUG_ON(tcp_skb_pcount(skb) < pcount);
+ tcp_skb_pcount_add(skb, -pcount);
/* When we're adding to gso_segs == 1, gso_size will be zero,
* in theory this shouldn't be necessary but as long as DSACK
@@ -1309,7 +1319,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
}
/* CHECKME: To clear or not to clear? Mimics normal skb currently */
- if (skb_shinfo(skb)->gso_segs <= 1) {
+ if (tcp_skb_pcount(skb) <= 1) {
skb_shinfo(skb)->gso_size = 0;
skb_shinfo(skb)->gso_type = 0;
}
@@ -1887,33 +1897,34 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
tp->sacked_out = 0;
}
-static void tcp_clear_retrans_partial(struct tcp_sock *tp)
+void tcp_clear_retrans(struct tcp_sock *tp)
{
tp->retrans_out = 0;
tp->lost_out = 0;
-
tp->undo_marker = 0;
tp->undo_retrans = -1;
+ tp->fackets_out = 0;
+ tp->sacked_out = 0;
}
-void tcp_clear_retrans(struct tcp_sock *tp)
+static inline void tcp_init_undo(struct tcp_sock *tp)
{
- tcp_clear_retrans_partial(tp);
-
- tp->fackets_out = 0;
- tp->sacked_out = 0;
+ tp->undo_marker = tp->snd_una;
+ /* Retransmission still in flight may cause DSACKs later. */
+ tp->undo_retrans = tp->retrans_out ? : -1;
}
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
-void tcp_enter_loss(struct sock *sk, int how)
+void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
bool new_recovery = false;
+ bool is_reneg; /* is receiver reneging on SACKs? */
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,18 +1934,22 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
+ tcp_init_undo(tp);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tcp_clear_retrans_partial(tp);
+ tp->retrans_out = 0;
+ tp->lost_out = 0;
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- tp->undo_marker = tp->snd_una;
- if (how) {
+ skb = tcp_write_queue_head(sk);
+ is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+ if (is_reneg) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -1944,11 +1959,8 @@ void tcp_enter_loss(struct sock *sk, int how)
if (skb == tcp_send_head(sk))
break;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
- tp->undo_marker = 0;
-
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -1966,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk, int how)
sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
- TCP_ECN_queue_cwr(tp);
+ tcp_ecn_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
* loss recovery is underway except recurring timeout(s) on
@@ -1981,19 +1993,21 @@ void tcp_enter_loss(struct sock *sk, int how)
* remembered SACKs do not reflect real state of receiver i.e.
* receiver _host_ is heavily congested (or buggy).
*
- * Do processing similar to RTO timeout.
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
*/
static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
if (flag & FLAG_SACK_RENEGING) {
- struct inet_connection_sock *icsk = inet_csk(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+ msecs_to_jiffies(10));
- tcp_enter_loss(sk, 1);
- icsk->icsk_retransmits++;
- tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- icsk->icsk_rto, TCP_RTO_MAX);
+ delay, TCP_RTO_MAX);
return true;
}
return false;
@@ -2356,7 +2370,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
- TCP_ECN_withdraw_cwr(tp);
+ tcp_ecn_withdraw_cwr(tp);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
@@ -2475,7 +2489,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
* losses and/or application stalls), do not perform any further cwnd
* reductions, but instead slow start up to ssthresh.
*/
-static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+static void tcp_init_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2485,9 +2499,8 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
tp->prior_cwnd = tp->snd_cwnd;
tp->prr_delivered = 0;
tp->prr_out = 0;
- if (set_ssthresh)
- tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
- TCP_ECN_queue_cwr(tp);
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ tcp_ecn_queue_cwr(tp);
}
static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
@@ -2528,14 +2541,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
}
/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+void tcp_enter_cwr(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->prior_ssthresh = 0;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = 0;
- tcp_init_cwnd_reduction(sk, set_ssthresh);
+ tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
@@ -2564,7 +2577,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
tp->retrans_stamp = 0;
if (flag & FLAG_ECE)
- tcp_enter_cwr(sk, 1);
+ tcp_enter_cwr(sk);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
@@ -2664,13 +2677,12 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
NET_INC_STATS_BH(sock_net(sk), mib_idx);
tp->prior_ssthresh = 0;
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out ? : -1;
+ tcp_init_undo(tp);
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tcp_init_cwnd_reduction(sk, true);
+ tcp_init_cwnd_reduction(sk);
}
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
@@ -2680,7 +2692,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
*/
static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
@@ -2706,12 +2717,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
if (recovered) {
/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
- icsk->icsk_retransmits = 0;
tcp_try_undo_recovery(sk);
return;
}
- if (flag & FLAG_DATA_ACKED)
- icsk->icsk_retransmits = 0;
if (tcp_is_reno(tp)) {
/* A Reno DUPACK means new data in F-RTO step 2.b above are
* delivered. Lower inflight to clock out (re)tranmissions.
@@ -2968,7 +2976,8 @@ void tcp_rearm_rto(struct sock *sk)
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
- const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+ const u32 rto_time_stamp =
+ tcp_skb_timestamp(skb) + rto;
s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
/* delta may not be positive if the socket is locked
* when the retrans timer fires and is rescheduled.
@@ -3043,10 +3052,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
first_ackt.v64 = 0;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u8 sacked = scb->sacked;
u32 acked_pcount;
+ if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+ between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
+ __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) == 1 ||
@@ -3203,9 +3217,10 @@ static void tcp_ack_probe(struct sock *sk)
* This function is not for random using!
*/
} else {
+ unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
- TCP_RTO_MAX);
+ when, TCP_RTO_MAX);
}
}
@@ -3346,7 +3361,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
tp->tlp_high_seq = 0;
/* Don't reduce cwnd if DSACK arrives for TLP retrans. */
if (!(flag & FLAG_DSACKING_ACK)) {
- tcp_init_cwnd_reduction(sk, true);
+ tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
tcp_try_keep_open(sk);
@@ -3356,6 +3371,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
}
}
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->in_ack_event)
+ icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3393,8 +3416,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
- if (after(ack, prior_snd_una))
+ if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
+ icsk->icsk_retransmits = 0;
+ }
prior_fackets = tp->fackets_out;
@@ -3413,10 +3438,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
- tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+ tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
+ u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
@@ -3428,10 +3455,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt_us);
- if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
+ ack_ev_flags |= CA_ACK_ECE;
+ }
- tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
+
+ tcp_in_ack_event(sk, ack_ev_flags);
}
/* We passed data and got it acked, remove any soft error
@@ -4053,6 +4085,44 @@ static void tcp_sack_remove(struct tcp_sock *tp)
tp->rx_opt.num_sacks = num_sacks;
}
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int delta;
+
+ *fragstolen = false;
+
+ /* Its possible this segment overlaps with prior segment in queue */
+ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+ return false;
+
+ if (!skb_try_coalesce(to, from, fragstolen, &delta))
+ return false;
+
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+ return true;
+}
+
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
@@ -4060,7 +4130,8 @@ static void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
- struct sk_buff *skb;
+ struct sk_buff *skb, *tail;
+ bool fragstolen, eaten;
while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
@@ -4073,9 +4144,9 @@ static void tcp_ofo_queue(struct sock *sk)
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
+ __skb_unlink(skb, &tp->out_of_order_queue);
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
- __skb_unlink(skb, &tp->out_of_order_queue);
__kfree_skb(skb);
continue;
}
@@ -4083,11 +4154,15 @@ static void tcp_ofo_queue(struct sock *sk)
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
- __skb_unlink(skb, &tp->out_of_order_queue);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- if (tcp_hdr(skb)->fin)
+ if (!eaten)
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
+ if (eaten)
+ kfree_skb_partial(skb, fragstolen);
}
}
@@ -4114,53 +4189,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
return 0;
}
-/**
- * tcp_try_coalesce - try to merge skb to prior one
- * @sk: socket
- * @to: prior buffer
- * @from: buffer to add in queue
- * @fragstolen: pointer to boolean
- *
- * Before queueing skb @from after @to, try to merge them
- * to reduce overall memory use and queue lengths, if cost is small.
- * Packets in ofo or receive queues can stay a long time.
- * Better try to coalesce them right now to avoid future collapses.
- * Returns true if caller should free @from instead of queueing it
- */
-static bool tcp_try_coalesce(struct sock *sk,
- struct sk_buff *to,
- struct sk_buff *from,
- bool *fragstolen)
-{
- int delta;
-
- *fragstolen = false;
-
- if (tcp_hdr(from)->fin)
- return false;
-
- /* Its possible this segment overlaps with prior segment in queue */
- if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
- return false;
-
- if (!skb_try_coalesce(to, from, fragstolen, &delta))
- return false;
-
- atomic_add(delta, &sk->sk_rmem_alloc);
- sk_mem_charge(sk, delta);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
- TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
- TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
- return true;
-}
-
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb1;
u32 seq, end_seq;
- TCP_ECN_check_ce(tp, skb);
+ tcp_ecn_check_ce(tp, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4299,24 +4334,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
- struct sk_buff *skb = NULL;
- struct tcphdr *th;
+ struct sk_buff *skb;
bool fragstolen;
if (size == 0)
return 0;
- skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+ skb = alloc_skb(size, sk->sk_allocation);
if (!skb)
goto err;
- if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto err_free;
- th = (struct tcphdr *)skb_put(skb, sizeof(*th));
- skb_reset_transport_header(skb);
- memset(th, 0, sizeof(*th));
-
if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
goto err_free;
@@ -4324,7 +4354,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
- if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+ if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
@@ -4338,7 +4368,6 @@ err:
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
- const struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
int eaten = -1;
bool fragstolen = false;
@@ -4347,9 +4376,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
goto drop;
skb_dst_drop(skb);
- __skb_pull(skb, th->doff * 4);
+ __skb_pull(skb, tcp_hdr(skb)->doff * 4);
- TCP_ECN_accept_cwr(tp, skb);
+ tcp_ecn_accept_cwr(tp, skb);
tp->rx_opt.dsack = 0;
@@ -4391,7 +4420,7 @@ queue_and_out:
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (skb->len)
tcp_event_data_recv(sk, skb);
- if (th->fin)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
if (!skb_queue_empty(&tp->out_of_order_queue)) {
@@ -4506,7 +4535,7 @@ restart:
* - bloated or contains data before "start" or
* overlaps to the next one.
*/
- if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
+ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
(tcp_win_from_space(skb->truesize) > skb->len ||
before(TCP_SKB_CB(skb)->seq, start))) {
end_of_skbs = false;
@@ -4525,30 +4554,18 @@ restart:
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
}
- if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
+ if (end_of_skbs ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
return;
while (before(start, end)) {
+ int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
struct sk_buff *nskb;
- unsigned int header = skb_headroom(skb);
- int copy = SKB_MAX_ORDER(header, 0);
- /* Too big header? This can happen with IPv6. */
- if (copy < 0)
- return;
- if (end - start < copy)
- copy = end - start;
- nskb = alloc_skb(copy + header, GFP_ATOMIC);
+ nskb = alloc_skb(copy, GFP_ATOMIC);
if (!nskb)
return;
- skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
- skb_set_network_header(nskb, (skb_network_header(skb) -
- skb->head));
- skb_set_transport_header(nskb, (skb_transport_header(skb) -
- skb->head));
- skb_reserve(nskb, header);
- memcpy(nskb->head, skb->head, header);
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
__skb_queue_before(list, skb, nskb);
@@ -4572,8 +4589,7 @@ restart:
skb = tcp_collapse_one(sk, skb, list);
if (!skb ||
skb == tail ||
- tcp_hdr(skb)->syn ||
- tcp_hdr(skb)->fin)
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
return;
}
}
@@ -4941,53 +4957,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
__tcp_checksum_complete_user(sk, skb);
}
-#ifdef CONFIG_NET_DMA
-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
- int hlen)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int chunk = skb->len - hlen;
- int dma_cookie;
- bool copied_early = false;
-
- if (tp->ucopy.wakeup)
- return false;
-
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = net_dma_find_channel();
-
- if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
-
- dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
- skb, hlen,
- tp->ucopy.iov, chunk,
- tp->ucopy.pinned_list);
-
- if (dma_cookie < 0)
- goto out;
-
- tp->ucopy.dma_cookie = dma_cookie;
- copied_early = true;
-
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- tcp_rcv_space_adjust(sk);
-
- if ((tp->ucopy.len == 0) ||
- (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
- (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
- tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk);
- }
- } else if (chunk > 0) {
- tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk);
- }
-out:
- return copied_early;
-}
-#endif /* CONFIG_NET_DMA */
-
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5167,27 +5136,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
} else {
int eaten = 0;
- int copied_early = 0;
bool fragstolen = false;
- if (tp->copied_seq == tp->rcv_nxt &&
- len - tcp_header_len <= tp->ucopy.len) {
-#ifdef CONFIG_NET_DMA
- if (tp->ucopy.task == current &&
- sock_owned_by_user(sk) &&
- tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
- copied_early = 1;
- eaten = 1;
- }
-#endif
- if (tp->ucopy.task == current &&
- sock_owned_by_user(sk) && !copied_early) {
- __set_current_state(TASK_RUNNING);
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt &&
+ len - tcp_header_len <= tp->ucopy.len &&
+ sock_owned_by_user(sk)) {
+ __set_current_state(TASK_RUNNING);
- if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
- eaten = 1;
- }
- if (eaten) {
+ if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
@@ -5203,9 +5160,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
__skb_pull(skb, tcp_header_len);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ eaten = 1;
}
- if (copied_early)
- tcp_cleanup_rbuf(sk, skb->len);
}
if (!eaten) {
if (tcp_checksum_complete_user(sk, skb))
@@ -5242,14 +5198,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
goto no_ack;
}
- if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
- __tcp_ack_snd_check(sk, 0);
+ __tcp_ack_snd_check(sk, 0);
no_ack:
-#ifdef CONFIG_NET_DMA
- if (copied_early)
- __skb_queue_tail(&sk->sk_async_wait_queue, skb);
- else
-#endif
if (eaten)
kfree_skb_partial(skb, fragstolen);
sk->sk_data_ready(sk);
@@ -5443,7 +5393,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* state to ESTABLISHED..."
*/
- TCP_ECN_rcv_synack(tp, th);
+ tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -5562,7 +5512,7 @@ discard:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- TCP_ECN_rcv_syn(tp, th);
+ tcp_ecn_rcv_syn(tp, th);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -5877,3 +5827,190 @@ discard:
return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
+
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (family == AF_INET)
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
+ &ireq->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
+ &ireq->ir_v6_rmt_addr, port);
+#endif
+}
+
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negociation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We inverse the test in this case: If our
+ * local socket wants ECN, but peer only set ece/cwr (but not
+ * ECT in IP header) its probably a non-DCTCP aware sender.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+ const struct sk_buff *skb,
+ const struct sock *listen_sk)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct net *net = sock_net(listen_sk);
+ bool th_ecn = th->ece && th->cwr;
+ bool ect, need_ecn;
+
+ if (!th_ecn)
+ return;
+
+ ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+ need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+ if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+ inet_rsk(req)->ecn_ok = 1;
+ else if (ect && need_ecn)
+ inet_rsk(req)->ecn_ok = 1;
+}
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_options_received tmp_opt;
+ struct request_sock *req;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = NULL;
+ __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
+ bool want_cookie = false, fastopen;
+ struct flowi fl;
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int err;
+
+
+ /* TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ if ((sysctl_tcp_syncookies == 2 ||
+ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+ if (!want_cookie)
+ goto drop;
+ }
+
+
+ /* Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ req = inet_reqsk_alloc(rsk_ops);
+ if (!req)
+ goto drop;
+
+ tcp_rsk(req)->af_specific = af_ops;
+
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.mss_clamp = af_ops->mss_clamp;
+ tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+
+ if (want_cookie && !tmp_opt.saw_tstamp)
+ tcp_clear_options(&tmp_opt);
+
+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+ tcp_openreq_init(req, &tmp_opt, skb, sk);
+
+ af_ops->init_req(req, sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
+
+ if (!want_cookie || tmp_opt.tstamp_ok)
+ tcp_ecn_create_request(req, skb, sk);
+
+ if (want_cookie) {
+ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ req->cookie_ts = tmp_opt.tstamp_ok;
+ } else if (!isn) {
+ /* VJ's idea. We save last timestamp seen
+ * from the destination in peer table, when entering
+ * state TIME-WAIT, and check against it before
+ * accepting new connection request.
+ *
+ * If "isn" is not zero, this request hit alive
+ * timewait bucket, so that all the necessary checks
+ * are made in the function processing timewait state.
+ */
+ if (tcp_death_row.sysctl_tw_recycle) {
+ bool strict;
+
+ dst = af_ops->route_req(sk, &fl, req, &strict);
+
+ if (dst && strict &&
+ !tcp_peer_is_proven(req, dst, true,
+ tmp_opt.saw_tstamp)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+ goto drop_and_release;
+ }
+ }
+ /* Kill the following clause, if you dislike this way. */
+ else if (!sysctl_tcp_syncookies &&
+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (sysctl_max_syn_backlog >> 2)) &&
+ !tcp_peer_is_proven(req, dst, false,
+ tmp_opt.saw_tstamp)) {
+ /* Without syncookies last quarter of
+ * backlog is filled with destinations,
+ * proven to be alive.
+ * It means that we continue to communicate
+ * to destinations, already remembered
+ * to the moment of synflood.
+ */
+ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+ rsk_ops->family);
+ goto drop_and_release;
+ }
+
+ isn = af_ops->init_seq(skb);
+ }
+ if (!dst) {
+ dst = af_ops->route_req(sk, &fl, req, NULL);
+ if (!dst)
+ goto drop_and_free;
+ }
+
+ tcp_rsk(req)->snt_isn = isn;
+ tcp_openreq_init_rwin(req, sk, dst);
+ fastopen = !want_cookie &&
+ tcp_try_fastopen(sk, skb, req, &foc, dst);
+ err = af_ops->send_synack(sk, dst, &fl, req,
+ skb_get_queue_mapping(skb), &foc);
+ if (!fastopen) {
+ if (err || want_cookie)
+ goto drop_and_free;
+
+ tcp_rsk(req)->listener = NULL;
+ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ }
+
+ return 0;
+
+drop_and_release:
+ dst_release(dst);
+drop_and_free:
+ reqsk_free(req);
+drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77cccda1ad0c..552e87e3c269 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -72,7 +72,6 @@
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
-#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>
@@ -90,7 +89,6 @@ int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);
-
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th);
@@ -99,7 +97,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr,
@@ -208,6 +206,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_dport = usin->sin_port;
inet->inet_daddr = daddr;
+ inet_set_txhash(sk);
+
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
@@ -269,7 +269,7 @@ EXPORT_SYMBOL(tcp_v4_connect);
* It can be called through tcp_release_cb() if socket was owned by user
* at the time tcp_v4_err() was called to handle ICMP message.
*/
-static void tcp_v4_mtu_reduced(struct sock *sk)
+void tcp_v4_mtu_reduced(struct sock *sk)
{
struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
@@ -300,6 +300,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
}
+EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
@@ -342,11 +343,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
int err;
struct net *net = dev_net(icmp_skb->dev);
- if (icmp_skb->len < (iph->ihl << 2) + 8) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
- return;
- }
-
sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
iph->saddr, th->source, inet_iif(icmp_skb));
if (!sk) {
@@ -433,15 +429,16 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
break;
icsk->icsk_backoff--;
- inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
- TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
- tcp_bound_rto(sk);
+ icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
+ TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
skb = tcp_write_queue_head(sk);
BUG_ON(!skb);
- remaining = icsk->icsk_rto - min(icsk->icsk_rto,
- tcp_time_stamp - TCP_SKB_CB(skb)->when);
+ remaining = icsk->icsk_rto -
+ min(icsk->icsk_rto,
+ tcp_time_stamp - tcp_skb_timestamp(skb));
if (remaining) {
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -683,8 +680,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
net = dev_net(skb_dst(skb)->dev);
arg.tos = ip_hdr(skb)->tos;
- ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
+ ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+ &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
@@ -766,8 +764,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
- ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
+ ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+ &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
@@ -814,6 +813,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
* socket.
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
struct request_sock *req,
u16 queue_mapping,
struct tcp_fastopen_cookie *foc)
@@ -837,24 +837,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
ireq->ir_rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
- if (!tcp_rsk(req)->snt_synack && !err)
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
}
return err;
}
-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
-{
- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
-
- if (!res) {
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- }
- return res;
-}
-
/*
* IPv4 request_sock destructor.
*/
@@ -898,18 +885,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
*/
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
- const struct ip_options *opt = &(IPCB(skb)->opt);
+ const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
struct ip_options_rcu *dopt = NULL;
if (opt && opt->optlen) {
int opt_size = sizeof(*dopt) + opt->optlen;
dopt = kmalloc(opt_size, GFP_ATOMIC);
- if (dopt) {
- if (ip_options_echo(&dopt->opt, skb)) {
- kfree(dopt);
- dopt = NULL;
- }
+ if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
+ kfree(dopt);
+ dopt = NULL;
}
}
return dopt;
@@ -1064,7 +1049,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
if (sin->sin_family != AF_INET)
return -EINVAL;
- if (!cmd.tcpm_key || !cmd.tcpm_keylen)
+ if (!cmd.tcpm_keylen)
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
AF_INET);
@@ -1182,7 +1167,8 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
-static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
+ const struct sk_buff *skb)
{
/*
* This gets called for each TCP segment that arrives
@@ -1235,163 +1221,81 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
return false;
}
+static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = __tcp_v4_inbound_md5_hash(sk, skb);
+ rcu_read_unlock();
+
+ return ret;
+}
+
#endif
+static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+ ireq->no_srccheck = inet_sk(sk)->transparent;
+ ireq->opt = tcp_v4_save_options(skb);
+}
+
+static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+ const struct request_sock *req,
+ bool *strict)
+{
+ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
+
+ if (strict) {
+ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
+ *strict = true;
+ else
+ *strict = false;
+ }
+
+ return dst;
+}
+
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
- .rtx_syn_ack = tcp_v4_rtx_synack,
+ .rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
- .syn_ack_timeout = tcp_syn_ack_timeout,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
};
-#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+ .mss_clamp = TCP_MSS_DEFAULT,
+#ifdef CONFIG_TCP_MD5SIG
.md5_lookup = tcp_v4_reqsk_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
-};
#endif
+ .init_req = tcp_v4_init_req,
+#ifdef CONFIG_SYN_COOKIES
+ .cookie_init_seq = cookie_v4_init_sequence,
+#endif
+ .route_req = tcp_v4_route_req,
+ .init_seq = tcp_v4_init_sequence,
+ .send_synack = tcp_v4_send_synack,
+ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
+};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_options_received tmp_opt;
- struct request_sock *req;
- struct inet_request_sock *ireq;
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = NULL;
- __be32 saddr = ip_hdr(skb)->saddr;
- __be32 daddr = ip_hdr(skb)->daddr;
- __u32 isn = TCP_SKB_CB(skb)->when;
- bool want_cookie = false, fastopen;
- struct flowi4 fl4;
- struct tcp_fastopen_cookie foc = { .len = -1 };
- int err;
-
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if ((sysctl_tcp_syncookies == 2 ||
- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
- if (!want_cookie)
- goto drop;
- }
+ return tcp_conn_request(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, skb);
- /* Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
- goto drop;
- }
-
- req = inet_reqsk_alloc(&tcp_request_sock_ops);
- if (!req)
- goto drop;
-
-#ifdef CONFIG_TCP_MD5SIG
- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
-#endif
-
- tcp_clear_options(&tmp_opt);
- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
- tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
-
- if (want_cookie && !tmp_opt.saw_tstamp)
- tcp_clear_options(&tmp_opt);
-
- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
- tcp_openreq_init(req, &tmp_opt, skb);
-
- ireq = inet_rsk(req);
- ireq->ir_loc_addr = daddr;
- ireq->ir_rmt_addr = saddr;
- ireq->no_srccheck = inet_sk(sk)->transparent;
- ireq->opt = tcp_v4_save_options(skb);
- ireq->ir_mark = inet_request_mark(sk, skb);
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, skb, sock_net(sk));
-
- if (want_cookie) {
- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
- req->cookie_ts = tmp_opt.tstamp_ok;
- } else if (!isn) {
- /* VJ's idea. We save last timestamp seen
- * from the destination in peer table, when entering
- * state TIME-WAIT, and check against it before
- * accepting new connection request.
- *
- * If "isn" is not zero, this request hit alive
- * timewait bucket, so that all the necessary checks
- * are made in the function processing timewait state.
- */
- if (tmp_opt.saw_tstamp &&
- tcp_death_row.sysctl_tw_recycle &&
- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
- fl4.daddr == saddr) {
- if (!tcp_peer_is_proven(req, dst, true)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
- goto drop_and_release;
- }
- }
- /* Kill the following clause, if you dislike this way. */
- else if (!sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (sysctl_max_syn_backlog >> 2)) &&
- !tcp_peer_is_proven(req, dst, false)) {
- /* Without syncookies last quarter of
- * backlog is filled with destinations,
- * proven to be alive.
- * It means that we continue to communicate
- * to destinations, already remembered
- * to the moment of synflood.
- */
- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
- &saddr, ntohs(tcp_hdr(skb)->source));
- goto drop_and_release;
- }
-
- isn = tcp_v4_init_sequence(skb);
- }
- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
- goto drop_and_free;
-
- tcp_rsk(req)->snt_isn = isn;
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
- tcp_openreq_init_rwin(req, sk, dst);
- fastopen = !want_cookie &&
- tcp_try_fastopen(sk, skb, req, &foc, dst);
- err = tcp_v4_send_synack(sk, dst, req,
- skb_get_queue_mapping(skb), &foc);
- if (!fastopen) {
- if (err || want_cookie)
- goto drop_and_free;
-
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
- tcp_rsk(req)->listener = NULL;
- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- }
-
- return 0;
-
-drop_and_release:
- dst_release(dst);
-drop_and_free:
- reqsk_free(req);
drop:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return 0;
@@ -1439,6 +1343,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
+ inet_set_txhash(newsk);
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = newtp->write_seq ^ jiffies;
@@ -1523,7 +1428,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
#ifdef CONFIG_SYN_COOKIES
if (!th->syn)
- sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
+ sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt);
#endif
return sk;
}
@@ -1539,16 +1444,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * We really want to reject the packet as early as possible
- * if:
- * o We're expecting an MD5'd packet and this is no MD5 tcp option
- * o There is an MD5 option and we're not expecting one
- */
- if (tcp_v4_inbound_md5_hash(sk, skb))
- goto discard;
-#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst = sk->sk_rx_dst;
@@ -1663,7 +1558,17 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
skb_queue_len(&tp->ucopy.prequeue) == 0)
return false;
- skb_dst_force(skb);
+ /* Before escaping RCU protected region, we need to take care of skb
+ * dst. Prequeue is only enabled for established sockets.
+ * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
+ * Instead of doing full sk_rx_dst validity here, let's perform
+ * an optimistic check.
+ */
+ if (likely(sk->sk_rx_dst))
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
+
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
if (tp->ucopy.memory > sk->sk_rcvbuf) {
@@ -1728,11 +1633,19 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = tcp_hdr(skb);
iph = ip_hdr(skb);
+ /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
+ * barrier() makes sure compiler wont play fool^Waliasing games.
+ */
+ memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
+ sizeof(struct inet_skb_parm));
+ barrier();
+
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->when = 0;
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
+ TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
@@ -1751,6 +1664,18 @@ process:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
+
+#ifdef CONFIG_TCP_MD5SIG
+ /*
+ * We really want to reject the packet as early as possible
+ * if:
+ * o We're expecting an MD5'd packet and this is no MD5 tcp option
+ * o There is an MD5 option and we're not expecting one
+ */
+ if (tcp_v4_inbound_md5_hash(sk, skb))
+ goto discard_and_relse;
+#endif
+
nf_reset(skb);
if (sk_filter(sk, skb))
@@ -1762,18 +1687,8 @@ process:
bh_lock_sock_nested(sk);
ret = 0;
if (!sock_owned_by_user(sk)) {
-#ifdef CONFIG_NET_DMA
- struct tcp_sock *tp = tcp_sk(sk);
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = net_dma_find_channel();
- if (tp->ucopy.dma_chan)
+ if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
- else
-#endif
- {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v4_do_rcv(sk, skb);
- }
} else if (unlikely(sk_add_backlog(sk, skb,
sk->sk_rcvbuf + sk->sk_sndbuf))) {
bh_unlock_sock(sk);
@@ -1857,9 +1772,11 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- dst_hold(dst);
- sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ if (dst) {
+ dst_hold(dst);
+ sk->sk_rx_dst = dst;
+ inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ }
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
@@ -1880,6 +1797,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
#endif
+ .mtu_reduced = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
@@ -1932,11 +1850,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
}
#endif
-#ifdef CONFIG_NET_DMA
- /* Cleans up our sk_async_wait_queue */
- __skb_queue_purge(&sk->sk_async_wait_queue);
-#endif
-
/* Clean prequeue, it must be empty really */
__skb_queue_purge(&tp->ucopy.prequeue);
@@ -2274,7 +2187,7 @@ int tcp_seq_open(struct inode *inode, struct file *file)
s = ((struct seq_file *)file->private_data)->private;
s->family = afinfo->family;
- s->last_pos = 0;
+ s->last_pos = 0;
return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
@@ -2499,7 +2412,6 @@ struct proto tcp_prot = {
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
- .mtu_reduced = tcp_v4_mtu_reduced,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7a2ec3ac584..1d191357bf88 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -32,7 +32,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
res_parent = &parent_cg->memory_allocated;
res_counter_init(&cg_proto->memory_allocated, res_parent);
- percpu_counter_init(&cg_proto->sockets_allocated, 0);
+ percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
return 0;
}
@@ -222,7 +222,7 @@ static struct cftype tcp_files[] = {
static int __init tcp_memcontrol_init(void)
{
- WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
return 0;
}
__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4fe041805989..ed9c9a91851c 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -576,7 +576,8 @@ reset:
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
+ bool paws_check, bool timestamps)
{
struct tcp_metrics_block *tm;
bool ret;
@@ -589,7 +590,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool pa
if (paws_check) {
if (tm &&
(u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
- (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+ ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
+ !timestamps))
ret = false;
else
ret = true;
@@ -1093,7 +1095,6 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
.doit = tcp_metrics_nl_cmd_get,
.dumpit = tcp_metrics_nl_dump,
.policy = tcp_metrics_nl_policy,
- .flags = GENL_ADMIN_PERM,
},
{
.cmd = TCP_METRICS_CMD_DEL,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e68e0d4af6c9..63d2680b65db 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -232,7 +232,7 @@ kill:
u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
if (isn == 0)
isn++;
- TCP_SKB_CB(skb)->when = isn;
+ TCP_SKB_CB(skb)->tcp_tw_isn = isn;
return TCP_TW_SYN;
}
@@ -298,7 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_tclass = np->tclass;
tw->tw_flowlabel = np->flow_label >> 12;
- tw->tw_ipv6only = np->ipv6only;
+ tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
@@ -393,8 +393,8 @@ void tcp_openreq_init_rwin(struct request_sock *req,
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);
-static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
- struct request_sock *req)
+static void tcp_ecn_openreq_child(struct tcp_sock *tp,
+ const struct request_sock *req)
{
tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}
@@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
- if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
- !try_module_get(newicsk->icsk_ca_ops->owner))
- newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ if (!try_module_get(newicsk->icsk_ca_ops->owner))
+ tcp_assign_congestion_control(newsk);
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
@@ -508,7 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
- TCP_ECN_openreq_child(newtp, req);
+ tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 55046ecd083e..5b90f2f447a5 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -14,6 +14,43 @@
#include <net/tcp.h>
#include <net/protocol.h>
+static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
+ unsigned int seq, unsigned int mss)
+{
+ while (skb) {
+ if (before(ts_seq, seq + mss)) {
+ skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
+ skb_shinfo(skb)->tskey = ts_seq;
+ return;
+ }
+
+ skb = skb->next;
+ seq += mss;
+ }
+}
+
+struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ /* Set up checksum pseudo header, usually expect stack to
+ * have done this already.
+ */
+
+ th->check = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
+ }
+
+ return tcp_gso_segment(skb, features);
+}
+
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -29,9 +66,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
__sum16 newcheck;
bool ooo_okay, copy_destructor;
- if (!pskb_may_pull(skb, sizeof(*th)))
- goto out;
-
th = tcp_hdr(skb);
thlen = th->doff * 4;
if (thlen < sizeof(*th))
@@ -91,6 +125,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th = tcp_hdr(skb);
seq = ntohl(th->seq);
+ if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
+ tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+
newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
@@ -251,54 +288,16 @@ int tcp_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_gro_complete);
-static int tcp_v4_gso_send_check(struct sk_buff *skb)
-{
- const struct iphdr *iph;
- struct tcphdr *th;
-
- if (!pskb_may_pull(skb, sizeof(*th)))
- return -EINVAL;
-
- iph = ip_hdr(skb);
- th = tcp_hdr(skb);
-
- th->check = 0;
- skb->ip_summed = CHECKSUM_PARTIAL;
- __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
- return 0;
-}
-
static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
- /* Use the IP hdr immediately proceeding for this transport */
- const struct iphdr *iph = skb_gro_network_header(skb);
- __wsum wsum;
-
/* Don't bother verifying checksum if we're going to flush anyway. */
- if (NAPI_GRO_CB(skb)->flush)
- goto skip_csum;
-
- wsum = NAPI_GRO_CB(skb)->csum;
-
- switch (skb->ip_summed) {
- case CHECKSUM_NONE:
- wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
- 0);
-
- /* fall through */
-
- case CHECKSUM_COMPLETE:
- if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
- wsum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
-
+ if (!NAPI_GRO_CB(skb)->flush &&
+ skb_gro_checksum_validate(skb, IPPROTO_TCP,
+ inet_gro_compute_pseudo)) {
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
-skip_csum:
return tcp_gro_receive(head, skb);
}
@@ -316,8 +315,7 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
static const struct net_offload tcpv4_offload = {
.callbacks = {
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_gso_segment,
+ .gso_segment = tcp4_gso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
},
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 179b51e6bda3..8d4eac793700 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,36 +318,47 @@ static u16 tcp_select_window(struct sock *sk)
}
/* Packet ECN state for a SYN-ACK */
-static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+ else if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
}
/* Packet ECN state for a SYN. */
-static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
+static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ tcp_ca_needs_ecn(sk)) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
}
}
-static __inline__ void
-TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
+static void
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
+ struct sock *sk)
{
- if (inet_rsk(req)->ecn_ok)
+ if (inet_rsk(req)->ecn_ok) {
th->ece = 1;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ }
}
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
* be sent.
*/
-static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
+static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
int tcp_header_len)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
tcp_hdr(skb)->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
- } else {
+ } else if (!tcp_ca_needs_ecn(sk)) {
/* ACK or retransmitted segment: clear ECT|CE */
INET_ECN_dontxmit(sk);
}
@@ -384,7 +395,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->tcp_flags = flags;
TCP_SKB_CB(skb)->sacked = 0;
- shinfo->gso_segs = 1;
+ tcp_skb_pcount_set(skb, 1);
shinfo->gso_size = 0;
shinfo->gso_type = 0;
@@ -550,7 +561,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
opts->options |= OPTION_TS;
- opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset;
+ opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -618,7 +629,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
}
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = TCP_SKB_CB(skb)->when;
+ opts->tsval = tcp_skb_timestamp(skb);
opts->tsecr = req->ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -647,7 +658,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5)
{
- struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int size = 0;
unsigned int eff_sacks;
@@ -666,7 +676,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
if (likely(tp->rx_opt.tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcb ? tcb->when + tp->tsoffset : 0;
+ opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
opts->tsecr = tp->rx_opt.ts_recent;
size += TCPOLEN_TSTAMP_ALIGNED;
}
@@ -800,7 +810,7 @@ void tcp_release_cb(struct sock *sk)
__sock_put(sk);
}
if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
- sk->sk_prot->mtu_reduced(sk);
+ inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
}
@@ -886,8 +896,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
- /* Our usage of tstamp should remain private */
- skb->tstamp.tv64 = 0;
}
inet = inet_sk(sk);
@@ -916,6 +924,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_orphan(skb);
skb->sk = sk;
skb->destructor = tcp_wfree;
+ skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
@@ -951,7 +960,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
tcp_options_write((__be32 *)(th + 1), tp, &opts);
if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
- TCP_ECN_send(sk, skb, tcp_header_size);
+ tcp_ecn_send(sk, skb, tcp_header_size);
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
@@ -974,11 +983,22 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
+ /* OK, its time to fill skb_shinfo(skb)->gso_segs */
+ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+
+ /* Our usage of tstamp should remain private */
+ skb->tstamp.tv64 = 0;
+
+ /* Cleanup our debris for IP stacks */
+ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
+ sizeof(struct inet6_skb_parm)));
+
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+
if (likely(err <= 0))
return err;
- tcp_enter_cwr(sk, 1);
+ tcp_enter_cwr(sk);
return net_xmit_eval(err);
}
@@ -994,7 +1014,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
/* Advance write_seq and place onto the write_queue. */
tp->write_seq = TCP_SKB_CB(skb)->end_seq;
- skb_header_release(skb);
+ __skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
@@ -1013,11 +1033,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- shinfo->gso_segs = 1;
+ tcp_skb_pcount_set(skb, 1);
shinfo->gso_size = 0;
shinfo->gso_type = 0;
} else {
- shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+ tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
shinfo->gso_size = mss_now;
shinfo->gso_type = sk->sk_gso_type;
}
@@ -1068,6 +1088,21 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
tcp_verify_left_out(tp);
}
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+ !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+ struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+ u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+ shinfo->tx_flags &= ~tsflags;
+ shinfo2->tx_flags |= tsflags;
+ swap(shinfo->tskey, shinfo2->tskey);
+ }
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -1130,11 +1165,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
buff->ip_summed = skb->ip_summed;
- /* Looks stupid, but our code really uses when of
- * skbs, which it never sent before. --ANK
- */
- TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
buff->tstamp = skb->tstamp;
+ tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
@@ -1154,7 +1186,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
}
/* Link BUFF into the send queue. */
- skb_header_release(buff);
+ __skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk);
return 0;
@@ -1651,13 +1683,14 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
+ tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
tcp_set_skb_tso_segs(sk, skb, mss_now);
tcp_set_skb_tso_segs(sk, buff, mss_now);
/* Link BUFF into the send queue. */
- skb_header_release(buff);
+ __skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk);
return 0;
@@ -1856,8 +1889,8 @@ static int tcp_mtu_probe(struct sock *sk)
tcp_init_tso_segs(sk, nskb, nskb->len);
/* We're ready to send. If this fails, the probe will
- * be resegmented into mss-sized pieces by tcp_write_xmit(). */
- TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+ * be resegmented into mss-sized pieces by tcp_write_xmit().
+ */
if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
/* Decrement cwnd here because we are sending
* effectively two packets. */
@@ -1916,8 +1949,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "skb_mstamp" is used as a start point for the retransmit timer */
+ skb_mstamp_get(&skb->skb_mstamp);
goto repair; /* Skip network transmission */
+ }
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
@@ -1979,8 +2015,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
-
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2076,10 +2110,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
static bool skb_still_in_host_queue(const struct sock *sk,
const struct sk_buff *skb)
{
- const struct sk_buff *fclone = skb + 1;
-
- if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
- fclone->fclone == SKB_FCLONE_CLONE)) {
+ if (unlikely(skb_fclone_busy(skb))) {
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
return true;
@@ -2478,7 +2509,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
/* Make a copy, if the first transmission SKB clone we made
* is still in somebody's hands, else make a clone.
*/
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
@@ -2523,7 +2553,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
/* Save stamp of the first retransmit. */
if (!tp->retrans_stamp)
- tp->retrans_stamp = TCP_SKB_CB(skb)->when;
+ tp->retrans_stamp = tcp_skb_timestamp(skb);
/* snd_nxt is stored to detect loss of retransmitted segment,
* see tcp_input.c tcp_sacktag_write_queue().
@@ -2731,7 +2761,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
TCPHDR_ACK | TCPHDR_RST);
/* Send it off. */
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
@@ -2759,7 +2788,7 @@ int tcp_send_synack(struct sock *sk)
if (nskb == NULL)
return -ENOMEM;
tcp_unlink_write_queue(skb, sk);
- skb_header_release(nskb);
+ __skb_header_release(nskb);
__tcp_add_write_queue_head(sk, nskb);
sk_wmem_free_skb(sk, skb);
sk->sk_wmem_queued += nskb->truesize;
@@ -2768,9 +2797,8 @@ int tcp_send_synack(struct sock *sk)
}
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
- TCP_ECN_send_synack(tcp_sk(sk), skb);
+ tcp_ecn_send_synack(sk, skb);
}
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
@@ -2814,10 +2842,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
- TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
+ skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req);
else
#endif
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ skb_mstamp_get(&skb->skb_mstamp);
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
foc) + sizeof(*th);
@@ -2828,7 +2856,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- TCP_ECN_make_synack(req, th);
+ tcp_ecn_make_synack(req, th, sk);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
@@ -2935,7 +2963,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
tcb->end_seq += skb->len;
- skb_header_release(skb);
+ __skb_header_release(skb);
__tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
@@ -3065,9 +3093,9 @@ int tcp_connect(struct sock *sk)
skb_reserve(buff, MAX_TCP_HEADER);
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
- tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tp->retrans_stamp = tcp_time_stamp;
tcp_connect_queue_skb(sk, buff);
- TCP_ECN_send_syn(sk, buff);
+ tcp_ecn_send_syn(sk, buff);
/* Send off SYN; include data in Fast Open. */
err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3099,6 +3127,8 @@ void tcp_send_delayed_ack(struct sock *sk)
int ato = icsk->icsk_ack.ato;
unsigned long timeout;
+ tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
@@ -3155,6 +3185,8 @@ void tcp_send_ack(struct sock *sk)
if (sk->sk_state == TCP_CLOSE)
return;
+ tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
/* We are not putting this on the write queue, so
* tcp_transmit_skb() will set the ownership to this
* sock.
@@ -3173,9 +3205,10 @@ void tcp_send_ack(struct sock *sk)
tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
/* Send it off, this clears delayed acks for us. */
- TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ skb_mstamp_get(&buff->skb_mstamp);
tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
}
+EXPORT_SYMBOL_GPL(tcp_send_ack);
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
@@ -3205,7 +3238,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
* send it.
*/
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ skb_mstamp_get(&skb->skb_mstamp);
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
@@ -3249,7 +3282,6 @@ int tcp_write_wakeup(struct sock *sk)
tcp_set_skb_tso_segs(sk, skb, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (!err)
tcp_event_new_data_sent(sk, skb);
@@ -3268,6 +3300,7 @@ void tcp_send_probe0(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long probe_max;
int err;
err = tcp_write_wakeup(sk);
@@ -3283,9 +3316,7 @@ void tcp_send_probe0(struct sock *sk)
if (icsk->icsk_backoff < sysctl_tcp_retries2)
icsk->icsk_backoff++;
icsk->icsk_probes_out++;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
- TCP_RTO_MAX);
+ probe_max = TCP_RTO_MAX;
} else {
/* If packet was not sent due to local congestion,
* do not backoff and do not remember icsk_probes_out.
@@ -3295,9 +3326,24 @@ void tcp_send_probe0(struct sock *sk)
*/
if (!icsk->icsk_probes_out)
icsk->icsk_probes_out = 1;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff,
- TCP_RESOURCE_PROBE_INTERVAL),
- TCP_RTO_MAX);
+ probe_max = TCP_RESOURCE_PROBE_INTERVAL;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ inet_csk_rto_backoff(icsk, probe_max),
+ TCP_RTO_MAX);
+}
+
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+{
+ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+ struct flowi fl;
+ int res;
+
+ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ if (!res) {
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
}
+ return res;
}
+EXPORT_SYMBOL(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 3b66610d4156..ebf5ff57526e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -83,7 +83,6 @@ static struct {
struct tcp_log *log;
} tcp_probe;
-
static inline int tcp_probe_used(void)
{
return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
@@ -101,7 +100,6 @@ static inline int tcp_probe_avail(void)
si4.sin_addr.s_addr = inet->inet_##mem##addr; \
} while (0) \
-
/*
* Hook inserted to be called before each receive packet.
* Note: arguments must match tcp_rcv_established()!
@@ -194,8 +192,8 @@ static int tcpprobe_sprint(char *tbuf, int n)
return scnprintf(tbuf, n,
"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
- (unsigned long) tv.tv_sec,
- (unsigned long) tv.tv_nsec,
+ (unsigned long)tv.tv_sec,
+ (unsigned long)tv.tv_nsec,
&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 8250949b8853..6824afb65d93 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -31,10 +31,10 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
-
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
.cong_avoid = tcp_scalable_cong_avoid,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 286227abed10..9b21ae8b2e31 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk)
* limit.
* 2. If we have strong memory pressure.
*/
-static int tcp_out_of_resources(struct sock *sk, int do_reset)
+static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
struct tcp_sock *tp = tcp_sk(sk);
int shift = 0;
@@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
/* 2. Window is closed. */
(!tp->snd_wnd && !tp->packets_out))
- do_reset = 1;
+ do_reset = true;
if (do_reset)
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_done(sk);
@@ -135,10 +135,9 @@ static bool retransmits_timed_out(struct sock *sk,
if (!inet_csk(sk)->icsk_retransmits)
return false;
- if (unlikely(!tcp_sk(sk)->retrans_stamp))
- start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when;
- else
- start_ts = tcp_sk(sk)->retrans_stamp;
+ start_ts = tcp_sk(sk)->retrans_stamp;
+ if (unlikely(!start_ts))
+ start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -181,7 +180,7 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
+ const int alive = icsk->icsk_rto < TCP_RTO_MAX;
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
@@ -271,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
+ u32 start_ts;
if (tp->packets_out || !tcp_send_head(sk)) {
icsk->icsk_probes_out = 0;
return;
}
- /* *WARNING* RFC 1122 forbids this
- *
- * It doesn't AFAIK, because we kill the retransmit timer -AK
- *
- * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
- * this behaviour in Solaris down as a bug fix. [AC]
- *
- * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
- * even if they advertise zero window. Hence, connection is killed only
- * if we received no ACKs for normal connection timeout. It is not killed
- * only because window stays zero for some time, window may be zero
- * until armageddon and even later. We are in full accordance
- * with RFCs, only probe timer combines both retransmission timeout
- * and probe timeout in one bottle. --ANK
+ /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
+ * long as the receiver continues to respond probes. We support this by
+ * default and reset icsk_probes_out with incoming ACKs. But if the
+ * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
+ * kill the socket when the retry count and the time exceeds the
+ * corresponding system limit. We also implement similar policy when
+ * we use RTO to probe window in tcp_retransmit_timer().
*/
- max_probes = sysctl_tcp_retries2;
+ start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+ if (!start_ts)
+ skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp);
+ else if (icsk->icsk_user_timeout &&
+ (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
+ goto abort;
+ max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
+ const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
max_probes = tcp_orphan_retries(sk, alive);
-
- if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
+ if (!alive && icsk->icsk_backoff >= max_probes)
+ goto abort;
+ if (tcp_out_of_resources(sk, true))
return;
}
if (icsk->icsk_probes_out > max_probes) {
- tcp_write_err(sk);
+abort: tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
@@ -391,7 +391,7 @@ void tcp_retransmit_timer(struct sock *sk)
tcp_write_err(sk);
goto out;
}
- tcp_enter_loss(sk, 0);
+ tcp_enter_loss(sk);
tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
__sk_dst_reset(sk);
goto out_reset_timer;
@@ -422,7 +422,7 @@ void tcp_retransmit_timer(struct sock *sk)
NET_INC_STATS_BH(sock_net(sk), mib_idx);
}
- tcp_enter_loss(sk, 0);
+ tcp_enter_loss(sk);
if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
/* Retransmission failed because of local congestion,
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9a5e05f27f4f..a6afde666ab1 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -51,7 +51,6 @@ MODULE_PARM_DESC(beta, "upper bound of packets in network");
module_param(gamma, int, 0644);
MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
-
/* There are several situations when we must "re-start" Vegas:
*
* o when a connection is established
@@ -133,7 +132,6 @@ EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked);
void tcp_vegas_state(struct sock *sk, u8 ca_state)
{
-
if (ca_state == TCP_CA_Open)
vegas_enable(sk);
else
@@ -218,7 +216,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* This is:
* (actual rate in segments) * baseRTT
*/
- target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt;
+ target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+ do_div(target_cwnd, rtt);
/* Calculate the difference between the window we had,
* and the window we would like to have. This quantity
@@ -284,7 +283,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* Use normal slow start */
else if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp, acked);
-
}
/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 27b9825753d1..a4d2d2d88dca 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -144,7 +144,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
rtt = veno->minrtt;
- target_cwnd = (tp->snd_cwnd * veno->basertt);
+ target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
target_cwnd <<= V_PARAM_SHIFT;
do_div(target_cwnd, rtt);
@@ -175,7 +175,6 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
} else
tp->snd_cwnd_cnt++;
}
-
}
if (tp->snd_cwnd < 2)
tp->snd_cwnd = 2;
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index b94a04ae2ed5..bb63fba47d47 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -42,7 +42,6 @@ struct westwood {
u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/
};
-
/* TCP Westwood functions and constants */
#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
@@ -153,7 +152,6 @@ static inline void update_rtt_min(struct westwood *w)
w->rtt_min = min(w->rtt, w->rtt_min);
}
-
/*
* @westwood_fast_bw
* It is called when we are in fast path. In particular it is called when
@@ -208,7 +206,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
return w->cumul_ack;
}
-
/*
* TCP Westwood
* Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
@@ -219,47 +216,51 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct westwood *w = inet_csk_ca(sk);
+
return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
}
+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+ if (ack_flags & CA_ACK_SLOWPATH) {
+ struct westwood *w = inet_csk_ca(sk);
+
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
+
+ update_rtt_min(w);
+ return;
+ }
+
+ westwood_fast_bw(sk);
+}
+
static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
{
struct tcp_sock *tp = tcp_sk(sk);
struct westwood *w = inet_csk_ca(sk);
switch (event) {
- case CA_EVENT_FAST_ACK:
- westwood_fast_bw(sk);
- break;
-
case CA_EVENT_COMPLETE_CWR:
tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
break;
-
case CA_EVENT_LOSS:
tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
/* Update RTT_min when next ack arrives */
w->reset_rtt_min = 1;
break;
-
- case CA_EVENT_SLOW_ACK:
- westwood_update_window(sk);
- w->bk += westwood_acked_count(sk);
- update_rtt_min(w);
- break;
-
default:
/* don't care */
break;
}
}
-
/* Extract info for Tcp socket info provided via netlink. */
static void tcp_westwood_info(struct sock *sk, u32 ext,
struct sk_buff *skb)
{
const struct westwood *ca = inet_csk_ca(sk);
+
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
struct tcpvegas_info info = {
.tcpv_enabled = 1,
@@ -271,12 +272,12 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
}
}
-
static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
.cwnd_event = tcp_westwood_event,
+ .in_ack_event = tcp_westwood_ack,
.get_info = tcp_westwood_info,
.pkts_acked = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 599b79b8eac0..cd7273218598 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -54,10 +54,8 @@ static void tcp_yeah_init(struct sock *sk)
/* Ensure the MD arithmetic works. This is somewhat pedantic,
* since I don't think we will see a cwnd this large. :) */
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
-
}
-
static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -84,7 +82,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* Scalable */
tp->snd_cwnd_cnt += yeah->pkts_acked;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
+ if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
tp->snd_cwnd_cnt = 0;
@@ -120,7 +118,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
*/
if (after(ack, yeah->vegas.beg_snd_nxt)) {
-
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
* at least one RTT sample that wasn't from a delayed ACK.
@@ -189,7 +186,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
yeah->lastQ = queue;
-
}
/* Save the extent of the current window so we can use this
@@ -205,7 +201,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
}
-static u32 tcp_yeah_ssthresh(struct sock *sk) {
+static u32 tcp_yeah_ssthresh(struct sock *sk)
+{
const struct tcp_sock *tp = tcp_sk(sk);
struct yeah *yeah = inet_csk_ca(sk);
u32 reduction;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7d5a8661df76..cd0db5471bb5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -99,6 +99,7 @@
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
+#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
@@ -224,7 +225,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
remaining = (high - low) + 1;
rand = prandom_u32();
- first = (((u64)rand * remaining) >> 32) + low;
+ first = reciprocal_scale(rand, remaining) + low;
/*
* force rand to be an odd multiple of UDP_HTABLE_SIZE
*/
@@ -448,7 +449,7 @@ begin:
}
} else if (score == badness && reuseport) {
matches++;
- if (((u64)hash * matches) >> 32 == 0)
+ if (reciprocal_scale(hash, matches) == 0)
result = sk;
hash = next_pseudo_random32(hash);
}
@@ -529,7 +530,7 @@ begin:
}
} else if (score == badness && reuseport) {
matches++;
- if (((u64)hash * matches) >> 32 == 0)
+ if (reciprocal_scale(hash, matches) == 0)
result = sk;
hash = next_pseudo_random32(hash);
}
@@ -594,27 +595,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
return true;
}
-static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
- __be16 loc_port, __be32 loc_addr,
- __be16 rmt_port, __be32 rmt_addr,
- int dif)
-{
- struct hlist_nulls_node *node;
- struct sock *s = sk;
- unsigned short hnum = ntohs(loc_port);
-
- sk_nulls_for_each_from(s, node) {
- if (__udp_is_mcast_sock(net, s,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum))
- goto found;
- }
- s = NULL;
-found:
- return s;
-}
-
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -1588,7 +1568,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto csum_error;
- if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
goto drop;
@@ -1640,6 +1620,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
skb1 = NULL;
+
+ sock_put(sk);
}
if (unlikely(skb1))
kfree_skb(skb1);
@@ -1668,41 +1650,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udp_table *udptable)
{
struct sock *sk, *stack[256 / sizeof(struct sock *)];
- struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
- int dif;
- unsigned int i, count = 0;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(uh->dest);
+ struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+ int dif = skb->dev->ifindex;
+ unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+ unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+ if (use_hash2) {
+ hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+ udp_table.mask;
+ hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+ hslot = &udp_table.hash2[hash2];
+ offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+ }
spin_lock(&hslot->lock);
- sk = sk_nulls_head(&hslot->head);
- dif = skb->dev->ifindex;
- sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
- while (sk) {
- stack[count++] = sk;
- sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
- daddr, uh->source, saddr, dif);
- if (unlikely(count == ARRAY_SIZE(stack))) {
- if (!sk)
- break;
- flush_stack(stack, count, skb, ~0);
- count = 0;
+ sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+ if (__udp_is_mcast_sock(net, sk,
+ uh->dest, daddr,
+ uh->source, saddr,
+ dif, hnum)) {
+ if (unlikely(count == ARRAY_SIZE(stack))) {
+ flush_stack(stack, count, skb, ~0);
+ count = 0;
+ }
+ stack[count++] = sk;
+ sock_hold(sk);
}
}
- /*
- * before releasing chain lock, we must take a reference on sockets
- */
- for (i = 0; i < count; i++)
- sock_hold(stack[i]);
spin_unlock(&hslot->lock);
+ /* Also lookup *:port if we are using hash2 and haven't done so yet. */
+ if (use_hash2 && hash2 != hash2_any) {
+ hash2 = hash2_any;
+ goto start_lookup;
+ }
+
/*
* do the slow work with no lock held
*/
if (count) {
flush_stack(stack, count, skb, count - 1);
-
- for (i = 0; i < count; i++)
- sock_put(stack[i]);
} else {
kfree_skb(skb);
}
@@ -1797,6 +1788,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (sk != NULL) {
int ret;
+ if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk))
+ skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ inet_compute_pseudo);
+
ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
@@ -1977,7 +1972,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
return;
skb->sk = sk;
- skb->destructor = sock_edemux;
+ skb->destructor = sock_efree;
dst = sk->sk_rx_dst;
if (dst)
@@ -2526,79 +2521,3 @@ void __init udp_init(void)
sysctl_udp_rmem_min = SK_MEM_QUANTUM;
sysctl_udp_wmem_min = SK_MEM_QUANTUM;
}
-
-struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- u16 mac_offset = skb->mac_header;
- int mac_len = skb->mac_len;
- int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- __be16 protocol = skb->protocol;
- netdev_features_t enc_features;
- int udp_offset, outer_hlen;
- unsigned int oldlen;
- bool need_csum;
-
- oldlen = (u16)~skb->len;
-
- if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
- goto out;
-
- skb->encapsulation = 0;
- __skb_pull(skb, tnl_hlen);
- skb_reset_mac_header(skb);
- skb_set_network_header(skb, skb_inner_network_offset(skb));
- skb->mac_len = skb_inner_network_offset(skb);
- skb->protocol = htons(ETH_P_TEB);
-
- need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
- if (need_csum)
- skb->encap_hdr_csum = 1;
-
- /* segment inner packet. */
- enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
- segs = skb_mac_gso_segment(skb, enc_features);
- if (!segs || IS_ERR(segs)) {
- skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
- mac_len);
- goto out;
- }
-
- outer_hlen = skb_tnl_header_len(skb);
- udp_offset = outer_hlen - tnl_hlen;
- skb = segs;
- do {
- struct udphdr *uh;
- int len;
-
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
-
- skb->mac_len = mac_len;
-
- skb_push(skb, outer_hlen);
- skb_reset_mac_header(skb);
- skb_set_network_header(skb, mac_len);
- skb_set_transport_header(skb, udp_offset);
- len = skb->len - udp_offset;
- uh = udp_hdr(skb);
- uh->len = htons(len);
-
- if (need_csum) {
- __be32 delta = htonl(oldlen + len);
-
- uh->check = ~csum_fold((__force __wsum)
- ((__force u32)uh->check +
- (__force u32)delta));
- uh->check = gso_make_checksum(skb, ~uh->check);
-
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
- }
-
- skb->protocol = protocol;
- } while ((skb = skb->next));
-out:
- return segs;
-}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 546d2d439dda..507310ef4b56 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -25,26 +25,121 @@ struct udp_offload_priv {
struct udp_offload_priv __rcu *next;
};
-static int udp4_ufo_send_check(struct sk_buff *skb)
+static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features,
+ struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+ netdev_features_t features),
+ __be16 new_protocol)
{
- if (!pskb_may_pull(skb, sizeof(struct udphdr)))
- return -EINVAL;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ __be16 protocol = skb->protocol;
+ netdev_features_t enc_features;
+ int udp_offset, outer_hlen;
+ unsigned int oldlen;
+ bool need_csum;
+
+ oldlen = (u16)~skb->len;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
+
+ skb->encapsulation = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = new_protocol;
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
+ if (need_csum)
+ skb->encap_hdr_csum = 1;
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = gso_inner_segment(skb, enc_features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
+ goto out;
+ }
- if (likely(!skb->encapsulation)) {
- const struct iphdr *iph;
+ outer_hlen = skb_tnl_header_len(skb);
+ udp_offset = outer_hlen - tnl_hlen;
+ skb = segs;
+ do {
struct udphdr *uh;
+ int len;
- iph = ip_hdr(skb);
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
+ skb->mac_len = mac_len;
+
+ skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, udp_offset);
+ len = skb->len - udp_offset;
uh = udp_hdr(skb);
+ uh->len = htons(len);
+
+ if (need_csum) {
+ __be32 delta = htonl(oldlen + len);
+
+ uh->check = ~csum_fold((__force __wsum)
+ ((__force u32)uh->check +
+ (__force u32)delta));
+ uh->check = gso_make_checksum(skb, ~uh->check);
+
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
- uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
- IPPROTO_UDP, 0);
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct udphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features,
+ bool is_ipv6)
+{
+ __be16 protocol = skb->protocol;
+ const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+ netdev_features_t features);
+
+ rcu_read_lock();
+
+ switch (skb->inner_protocol_type) {
+ case ENCAP_TYPE_ETHER:
+ protocol = skb->inner_protocol;
+ gso_inner_segment = skb_mac_gso_segment;
+ break;
+ case ENCAP_TYPE_IPPROTO:
+ offloads = is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[skb->inner_ipproto]);
+ if (!ops || !ops->callbacks.gso_segment)
+ goto out_unlock;
+ gso_inner_segment = ops->callbacks.gso_segment;
+ break;
+ default:
+ goto out_unlock;
}
- return 0;
+ segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment,
+ protocol);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return segs;
}
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
@@ -52,16 +147,20 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
- int offset;
__wsum csum;
+ struct udphdr *uh;
+ struct iphdr *iph;
if (skb->encapsulation &&
(skb_shinfo(skb)->gso_type &
(SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
- segs = skb_udp_tunnel_segment(skb, features);
+ segs = skb_udp_tunnel_segment(skb, features, false);
goto out;
}
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto out;
+
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
@@ -89,10 +188,16 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
* HW cannot do checksum of UDP packets sent as multiple
* IP fragments.
*/
- offset = skb_checksum_start_offset(skb);
- csum = skb_checksum(skb, offset, skb->len - offset, 0);
- offset += skb->csum_offset;
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+
+ uh = udp_hdr(skb);
+ iph = ip_hdr(skb);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
skb->ip_summed = CHECKSUM_NONE;
/* Fragment the skb. IP headers of the fragments are updated in
@@ -152,30 +257,24 @@ unlock:
}
EXPORT_SYMBOL(udp_del_offload);
-static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
+ struct udphdr *uh)
{
struct udp_offload_priv *uo_priv;
struct sk_buff *p, **pp = NULL;
- struct udphdr *uh, *uh2;
- unsigned int hlen, off;
+ struct udphdr *uh2;
+ unsigned int off = skb_gro_offset(skb);
int flush = 1;
if (NAPI_GRO_CB(skb)->udp_mark ||
- (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE))
+ (skb->ip_summed != CHECKSUM_PARTIAL &&
+ NAPI_GRO_CB(skb)->csum_cnt == 0 &&
+ !NAPI_GRO_CB(skb)->csum_valid))
goto out;
/* mark that this skb passed once through the udp gro layer */
NAPI_GRO_CB(skb)->udp_mark = 1;
- off = skb_gro_offset(skb);
- hlen = off + sizeof(*uh);
- uh = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- uh = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!uh))
- goto out;
- }
-
rcu_read_lock();
uo_priv = rcu_dereference(udp_offload_base);
for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
@@ -193,7 +292,12 @@ unflush:
continue;
uh2 = (struct udphdr *)(p->data + off);
- if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+
+ /* Match ports and either checksums are either both zero
+ * or nonzero.
+ */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
+ (!uh->check ^ !uh2->check)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@@ -201,6 +305,7 @@ unflush:
skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+ NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
pp = uo_priv->offload->callbacks.gro_receive(head, skb);
out_unlock:
@@ -210,7 +315,34 @@ out:
return pp;
}
-static int udp_gro_complete(struct sk_buff *skb, int nhoff)
+static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_gro_udphdr(skb);
+
+ if (unlikely(!uh))
+ goto flush;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (NAPI_GRO_CB(skb)->flush)
+ goto skip;
+
+ if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
+ inet_gro_compute_pseudo))
+ goto flush;
+ else if (uh->check)
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ inet_gro_compute_pseudo);
+skip:
+ NAPI_GRO_CB(skb)->is_ipv6 = 0;
+ return udp_gro_receive(head, skb, uh);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+int udp_gro_complete(struct sk_buff *skb, int nhoff)
{
struct udp_offload_priv *uo_priv;
__be16 newlen = htons(skb->len - nhoff);
@@ -228,19 +360,32 @@ static int udp_gro_complete(struct sk_buff *skb, int nhoff)
break;
}
- if (uo_priv != NULL)
+ if (uo_priv != NULL) {
+ NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr));
+ }
rcu_read_unlock();
return err;
}
+static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+
+ if (uh->check)
+ uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
+ iph->daddr, 0);
+
+ return udp_gro_complete(skb, nhoff);
+}
+
static const struct net_offload udpv4_offload = {
.callbacks = {
- .gso_send_check = udp4_ufo_send_check,
.gso_segment = udp4_ufo_fragment,
- .gro_receive = udp_gro_receive,
- .gro_complete = udp_gro_complete,
+ .gro_receive = udp4_gro_receive,
+ .gro_complete = udp4_gro_complete,
},
};
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
new file mode 100644
index 000000000000..1671263e5fa0
--- /dev/null
+++ b/net/ipv4/udp_tunnel.c
@@ -0,0 +1,108 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/net_namespace.h>
+
+int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
+ struct socket **sockp)
+{
+ int err;
+ struct socket *sock = NULL;
+ struct sockaddr_in udp_addr;
+
+ err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+ if (err < 0)
+ goto error;
+
+ sk_change_net(sock->sk, net);
+
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->local_ip;
+ udp_addr.sin_port = cfg->local_udp_port;
+ err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+ sizeof(udp_addr));
+ if (err < 0)
+ goto error;
+
+ if (cfg->peer_udp_port) {
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->peer_ip;
+ udp_addr.sin_port = cfg->peer_udp_port;
+ err = kernel_connect(sock, (struct sockaddr *)&udp_addr,
+ sizeof(udp_addr), 0);
+ if (err < 0)
+ goto error;
+ }
+
+ sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
+
+ *sockp = sock;
+ return 0;
+
+error:
+ if (sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+ }
+ *sockp = NULL;
+ return err;
+}
+EXPORT_SYMBOL(udp_sock_create4);
+
+void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
+ struct udp_tunnel_sock_cfg *cfg)
+{
+ struct sock *sk = sock->sk;
+
+ /* Disable multicast loopback */
+ inet_sk(sk)->mc_loop = 0;
+
+ /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
+ udp_set_convert_csum(sk, true);
+
+ rcu_assign_sk_user_data(sk, cfg->sk_user_data);
+
+ udp_sk(sk)->encap_type = cfg->encap_type;
+ udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_destroy = cfg->encap_destroy;
+
+ udp_tunnel_encap_enable(sock);
+}
+EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
+
+int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt,
+ struct sk_buff *skb, __be32 src, __be32 dst,
+ __u8 tos, __u8 ttl, __be16 df, __be16 src_port,
+ __be16 dst_port, bool xnet)
+{
+ struct udphdr *uh;
+
+ __skb_push(skb, sizeof(*uh));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+
+ uh->dest = dst_port;
+ uh->source = src_port;
+ uh->len = htons(skb->len);
+
+ udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len);
+
+ return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP,
+ tos, ttl, df, xnet);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
+
+void udp_tunnel_sock_release(struct socket *sock)
+{
+ rcu_assign_sk_user_data(sock->sk, NULL);
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index a2ce0101eaac..dccefa9d84cf 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -124,7 +124,7 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
for_each_protocol_rcu(ah4_handlers, handler)
if ((ret = handler->handler(skb)) != -EINVAL)
- return ret;;
+ return ret;
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);