diff options
Diffstat (limited to 'net/ipv4')
40 files changed, 541 insertions, 1239 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 80dad301361d..32cae39cdff6 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -430,7 +430,7 @@ config INET_DIAG Support for INET (TCP, DCCP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: - + http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2 If unsure, say Y. @@ -600,7 +600,7 @@ config TCP_CONG_VENO distinguishing to circumvent the difficult judgment of the packet loss type. TCP Veno cuts down less congestion window in response to random loss packets. - See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> + See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> config TCP_CONG_YEAH tristate "YeAH TCP" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index eec9569ffa5c..7446b98661d8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ -obj-$(CONFIG_INET_DIAG) += inet_diag.o +obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b403499fdabe..20fda8fb8ffd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk, backlog); if (err) goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } sk->sk_max_ack_backlog = backlog; err = 0; @@ -485,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && @@ -1384,12 +1384,12 @@ out: } EXPORT_SYMBOL(inet_gso_segment); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; - struct sk_buff *p; + struct sk_buff *pp = NULL; const struct iphdr *iph; + struct sk_buff *p; unsigned int hlen; unsigned int off; unsigned int id; @@ -1425,7 +1425,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct iphdr *iph2; u16 flush_id; @@ -1505,8 +1505,8 @@ out: } EXPORT_SYMBOL(inet_gro_receive); -static struct sk_buff **ipip_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipip_gro_receive(struct list_head *head, + struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; @@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net) * We set them here, in case sysctl is not compiled. */ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; @@ -1882,6 +1883,7 @@ fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, + .list_func = ip_list_rcv, }; static int __init inet_init(void) diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile index ce262d76cc48..e9e42f99725e 100644 --- a/net/ipv4/bpfilter/Makefile +++ b/net/ipv4/bpfilter/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_BPFILTER) += sockopt.o - diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7585ab1a77a..ea4bd8a52422 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); + if (all || type == NETCONFA_BC_FORWARDING) + size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) @@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; + if ((all || type == NETCONFA_BC_FORWARDING) && + nla_put_s32(skb, NETCONFA_BC_FORWARDING, + IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0) + goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) @@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); + if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && + new_value != old_value) + rt_cache_flush(net); + if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); @@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), + DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 7cf755ef9efb..58834a10c0be 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -28,8 +28,8 @@ #include <linux/spinlock.h> #include <net/udp.h> -static struct sk_buff **esp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; @@ -135,8 +135,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) + if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~NETIF_F_CSUM_MASK; @@ -179,8 +178,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ if (!xo) return -EINVAL; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) { + if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index c9ec1603666b..500a59906b87 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -224,14 +224,14 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *fou_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - const struct net_offload *ops; - struct sk_buff **pp = NULL; u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *pp = NULL; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel @@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gue_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; @@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, skb_gro_pull(skb, hdrlen); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 6a7d980105f6..6c63524f598a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -108,10 +108,10 @@ out: return segs; } -static struct sk_buff **gre_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gre_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; @@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1617604c9284..695979b7ef6d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -429,14 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; + ipcm_init(&ipc); inet->tos = ip_hdr(skb)->tos; sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -710,11 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; + ipcm_init(&ipc); ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 75151be21413..cf75f8944b05 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1288,7 +1288,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #endif } -static void igmp_group_added(struct ip_mc_list *im, unsigned int mode) +static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST @@ -1320,7 +1320,7 @@ static void igmp_group_added(struct ip_mc_list *im, unsigned int mode) * not send filter-mode change record as the mode should be from * IN() to IN(A). */ - if (mode == MCAST_EXCLUDE) + if (im->sfmode == MCAST_EXCLUDE) im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_event(in_dev); @@ -1432,7 +1432,7 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif - igmp_group_added(im, mode); + igmp_group_added(im); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: @@ -1699,7 +1699,7 @@ void ip_mc_remap(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc, pmc->sfmode); + igmp_group_added(pmc); } } @@ -1762,7 +1762,7 @@ void ip_mc_up(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc, pmc->sfmode); + igmp_group_added(pmc); } } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 0d70608cc2e1..ccd140e4082d 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -20,6 +20,7 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <linux/rhashtable.h> #include <net/sock.h> #include <net/inet_frag.h> diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index b54b948b0596..32662e9e5d21 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb) !skb_sec_path(skb)) ip_rt_send_redirect(skb); - skb->priority = rt_tos2priority(iph->tos); + if (net->ipv4.sysctl_ip_fwd_update_priority) + skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, rt->dst.dev, diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2d8efeecf619..51a5d06085ac 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -587,6 +587,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto err_free_rt; md = ip_tunnel_info_opts(tun_info); if (!md) goto err_free_rt; @@ -983,7 +985,6 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; - int t_hlen; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); @@ -991,8 +992,6 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; - t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -1302,13 +1301,11 @@ static const struct net_device_ops gre_tap_netdev_ops = { static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - int t_hlen; tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); - t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7582713dd18f..3196cf58f418 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -307,7 +307,8 @@ drop: return true; } -static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); @@ -315,13 +316,6 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) struct rtable *rt; int err; - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -393,7 +387,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - return dst_input(skb); + return NET_RX_SUCCESS; drop: kfree_skb(skb); @@ -405,13 +399,29 @@ drop_error: goto drop; } +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + + ret = ip_rcv_finish_core(net, sk, skb); + if (ret != NET_RX_DROP) + ret = dst_input(skb); + return ret; +} + /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; - struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -421,7 +431,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - net = dev_net(dev); __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -489,9 +498,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); @@ -500,5 +507,113 @@ inhdr_error: drop: kfree_skb(skb); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + list_del(&skb->list); + /* Handle ip{6}_forward case, as sch_direct_xmit have + * another kind of SKB-list usage (see validate_xmit_skb_list) + */ + skb->next = NULL; + dst_input(skb); + } +} + +static void ip_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + continue; + if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + continue; + + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv_finish(&sublist); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + ip_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + list_del(&skb->list); + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0e3edd25f881..9c4e72e9c60a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) } /* Note: skb->sk can be different from sk, in case of tunnels */ -int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), + RT_CONN_FLAGS_TOS(sk, tos), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -478,7 +479,7 @@ packet_routed: skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); - *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else @@ -511,7 +512,7 @@ no_route: kfree_skb(skb); return -EHOSTUNREACH; } -EXPORT_SYMBOL(ip_queue_xmit); +EXPORT_SYMBOL(__ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { @@ -1147,14 +1148,15 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->fragsize = ip_sk_use_pmtu(sk) ? dst_mtu(&rt->dst) : rt->dst.dev->mtu; - cork->gso_size = sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0; + cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->priority = ipc->priority; - cork->tx_flags = ipc->tx_flags; + cork->transmit_time = ipc->sockc.transmit_time; + cork->tx_flags = 0; + sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); return 0; } @@ -1415,6 +1417,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1547,11 +1550,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; + ipcm_init(&ipc); ipc.addr = daddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9f79b9803a16..5660adcf7a04 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -60,6 +60,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/export.h> +#include <linux/rhashtable.h> #include <net/ip_tunnels.h> #include <net/checksum.h> #include <net/netlink.h> @@ -1051,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *skb; int ret; - if (assert == IGMPMSG_WHOLEPKT) + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else skb = alloc_skb(128, GFP_ATOMIC); @@ -1059,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt, if (!skb) return -ENOBUFS; - if (assert == IGMPMSG_WHOLEPKT) { + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. * And all this only to mangle msg->im_msgtype and @@ -1070,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt, skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); - msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_msgtype = assert; msg->im_mbz = 0; - msg->im_vif = mrt->mroute_reg_vif_num; + if (assert == IGMPMSG_WRVIFWHOLE) + msg->im_vif = vifi; + else + msg->im_vif = mrt->mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1371,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; + bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ @@ -1501,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, break; } + do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; + mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: @@ -1982,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + if (mrt->mroute_do_wrvifwhole) + ipmr_cache_report(mrt, skb, true_vifi, + IGMPMSG_WRVIFWHOLE); } goto dont_forward; } @@ -2658,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) mrt->mroute_reg_vif_num) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) || - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, + mrt->mroute_do_wrvifwhole)) return false; return true; diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index cafb0506c8c9..1ad9aa62a97b 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -2,6 +2,7 @@ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation */ +#include <linux/rhashtable.h> #include <linux/mroute_base.h> /* Sets everything common except 'dev', since that is done under locking */ diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e6774ccb7731..8d2e5dc9a827 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_ip_reroute); -__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) - break; - if ((protocol == 0 && !csum_fold(skb->csum)) || - !csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - dataoff, protocol, - skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - /* fall through */ - case CHECKSUM_NONE: - if (protocol == 0) - skb->csum = 0; - else - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len - dataoff, - protocol, 0); - csum = __skb_checksum_complete(skb); - } - return csum; -} -EXPORT_SYMBOL(nf_ip_checksum); - -__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (len == skb->len - dataoff) - return nf_ip_checksum(skb, hook, dataoff, protocol); - /* fall through */ - case CHECKSUM_NONE: - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, - skb->len - dataoff, 0); - skb->ip_summed = CHECKSUM_NONE; - return __skb_checksum_complete_head(skb, dataoff + len); - } - return csum; -} -EXPORT_SYMBOL_GPL(nf_ip_checksum_partial); - int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index bbfc356cb1b5..d9504adc47b3 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4 tristate default n -config NF_CONNTRACK_IPV4 - tristate "IPv4 connection tracking support (required for NAT)" - depends on NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_DEFRAG_IPV4 - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv4 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. - config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" help @@ -112,7 +96,7 @@ config NF_REJECT_IPV4 config NF_NAT_IPV4 tristate "IPv4 NAT" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT help @@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY # NAT + specific targets: nf_conntrack config IP_NF_NAT tristate "iptables NAT support" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT select NF_NAT_IPV4 @@ -340,7 +324,7 @@ config IP_NF_MANGLE config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support" depends on IP_NF_MANGLE - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK select NETFILTER_FAMILY_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 8394c17c269f..367993adf4d3 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,12 +3,6 @@ # Makefile for the netfilter modules on top of IPv4. # -# objects for l3 independent conntrack -nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o - -# connection tracking -obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o - nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c deleted file mode 100644 index 9db988f9a4d7..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ /dev/null @@ -1,472 +0,0 @@ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2006-2012 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/icmp.h> -#include <linux/sysctl.h> -#include <net/route.h> -#include <net/ip.h> - -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_l3proto.h> -#include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#include <net/netfilter/nf_log.h> - -static int conntrack4_net_id __read_mostly; -static DEFINE_MUTEX(register_ipv4_hooks); - -struct conntrack4_net { - unsigned int users; -}; - -static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - const __be32 *ap; - __be32 _addrs[2]; - ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), - sizeof(u_int32_t) * 2, _addrs); - if (ap == NULL) - return false; - - tuple->src.u3.ip = ap[0]; - tuple->dst.u3.ip = ap[1]; - - return true; -} - -static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u3.ip = orig->dst.u3.ip; - tuple->dst.u3.ip = orig->src.u3.ip; - - return true; -} - -static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - const struct iphdr *iph; - struct iphdr _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (iph == NULL) - return -NF_ACCEPT; - - /* Conntrack defragments packets, we might still see fragments - * inside ICMP packets though. */ - if (iph->frag_off & htons(IP_OFFSET)) - return -NF_ACCEPT; - - *dataoff = nhoff + (iph->ihl << 2); - *protonum = iph->protocol; - - /* Check bogus IP headers */ - if (*dataoff > skb->len) { - pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " - "nhoff %u, ihl %u, skblen %u\n", - nhoff, iph->ihl << 2, skb->len); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - -static unsigned int ipv4_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; - - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv4_conntrack_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -static unsigned int ipv4_conntrack_local(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ - enum ip_conntrack_info ctinfo; - struct nf_conn *tmpl; - - tmpl = nf_ct_get(skb, &ctinfo); - if (tmpl && nf_ct_is_template(tmpl)) { - /* when skipping ct, clear templates to avoid fooling - * later targets/matches - */ - skb->_nfct = 0; - nf_ct_put(tmpl); - } - return NF_ACCEPT; - } - - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -/* Connection tracking may drop packets, but never alters them, so - make it the first hook. */ -static const struct nf_hook_ops ipv4_conntrack_ops[] = { - { - .hook = ipv4_conntrack_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_conntrack_local, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, -}; - -/* Fast function for those who don't want to parse /proc (and I don't - blame them). */ -/* Reversing the socket's dst/src point of view gives us the reply - mapping. */ -static int -getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - - memset(&tuple, 0, sizeof(tuple)); - - lock_sock(sk); - tuple.src.u3.ip = inet->inet_rcv_saddr; - tuple.src.u.tcp.port = inet->inet_sport; - tuple.dst.u3.ip = inet->inet_daddr; - tuple.dst.u.tcp.port = inet->inet_dport; - tuple.src.l3num = PF_INET; - tuple.dst.protonum = sk->sk_protocol; - release_sock(sk); - - /* We only do TCP and SCTP at the moment: is there a better way? */ - if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); - return -ENOPROTOOPT; - } - - if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); - return -EINVAL; - } - - h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (h) { - struct sockaddr_in sin; - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - sin.sin_family = AF_INET; - sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u.tcp.port; - sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u3.ip; - memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); - nf_ct_put(ct); - if (copy_to_user(user, &sin, sizeof(sin)) != 0) - return -EFAULT; - else - return 0; - } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -static int ipv4_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) -{ - if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || - nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = { - [CTA_IP_V4_SRC] = { .type = NLA_U32 }, - [CTA_IP_V4_DST] = { .type = NLA_U32 }, -}; - -static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) -{ - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; - - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); - - return 0; -} -#endif - -static struct nf_sockopt_ops so_getorigdst = { - .pf = PF_INET, - .get_optmin = SO_ORIGINAL_DST, - .get_optmax = SO_ORIGINAL_DST+1, - .get = getorigdst, - .owner = THIS_MODULE, -}; - -static int ipv4_hooks_register(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - int err = 0; - - mutex_lock(®ister_ipv4_hooks); - - cnet->users++; - if (cnet->users > 1) - goto out_unlock; - - err = nf_defrag_ipv4_enable(net); - if (err) { - cnet->users = 0; - goto out_unlock; - } - - err = nf_register_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - - if (err) - cnet->users = 0; - out_unlock: - mutex_unlock(®ister_ipv4_hooks); - return err; -} - -static void ipv4_hooks_unregister(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - - mutex_lock(®ister_ipv4_hooks); - if (cnet->users && (--cnet->users == 0)) - nf_unregister_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - mutex_unlock(®ister_ipv4_hooks); -} - -const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { - .l3proto = PF_INET, - .pkt_to_tuple = ipv4_pkt_to_tuple, - .invert_tuple = ipv4_invert_tuple, - .get_l4proto = ipv4_get_l4proto, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = ipv4_tuple_to_nlattr, - .nlattr_to_tuple = ipv4_nlattr_to_tuple, - .nla_policy = ipv4_nla_policy, - .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */ - NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */ -#endif - .net_ns_get = ipv4_hooks_register, - .net_ns_put = ipv4_hooks_unregister, - .me = THIS_MODULE, -}; - -module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, - &nf_conntrack_htable_size, 0600); - -MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); -MODULE_ALIAS("ip_conntrack"); -MODULE_LICENSE("GPL"); - -static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = { - &nf_conntrack_l4proto_tcp4, - &nf_conntrack_l4proto_udp4, - &nf_conntrack_l4proto_icmp, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite4, -#endif -}; - -static int ipv4_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static void ipv4_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static struct pernet_operations ipv4_net_ops = { - .init = ipv4_net_init, - .exit = ipv4_net_exit, - .id = &conntrack4_net_id, - .size = sizeof(struct conntrack4_net), -}; - -static int __init nf_conntrack_l3proto_ipv4_init(void) -{ - int ret = 0; - - need_conntrack(); - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) != - nf_conntrack_l3proto_ipv4.nla_size)) - return -EINVAL; -#endif - ret = nf_register_sockopt(&so_getorigdst); - if (ret < 0) { - pr_err("Unable to register netfilter socket option\n"); - return ret; - } - - ret = register_pernet_subsys(&ipv4_net_ops); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); - goto cleanup_sockopt; - } - - ret = nf_ct_l4proto_register(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - if (ret < 0) - goto cleanup_pernet; - - ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); - goto cleanup_l4proto; - } - - return ret; -cleanup_l4proto: - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - cleanup_pernet: - unregister_pernet_subsys(&ipv4_net_ops); - cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst); - return ret; -} - -static void __exit nf_conntrack_l3proto_ipv4_fini(void) -{ - synchronize_net(); - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - unregister_pernet_subsys(&ipv4_net_ops); - nf_unregister_sockopt(&so_getorigdst); -} - -module_init(nf_conntrack_l3proto_ipv4_init); -module_exit(nf_conntrack_l3proto_ipv4_fini); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c deleted file mode 100644 index 5c15beafa711..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ /dev/null @@ -1,383 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2006-2010 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/in.h> -#include <linux/icmp.h> -#include <linux/seq_file.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack_tuple.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/nf_log.h> - -static const unsigned int nf_ct_icmp_timeout = 30*HZ; - -static inline struct nf_icmp_net *icmp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp; -} - -static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - const struct icmphdr *hp; - struct icmphdr _hdr; - - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return false; - - tuple->dst.u.icmp.type = hp->type; - tuple->src.u.icmp.id = hp->un.echo.id; - tuple->dst.u.icmp.code = hp->code; - - return true; -} - -/* Add 1; spaces filled with 0. */ -static const u_int8_t invmap[] = { - [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 -}; - -static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - if (orig->dst.u.icmp.type >= sizeof(invmap) || - !invmap[orig->dst.u.icmp.type]) - return false; - - tuple->src.u.icmp.id = orig->src.u.icmp.id; - tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; - tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return true; -} - -static unsigned int *icmp_get_timeouts(struct net *net) -{ - return &icmp_pernet(net)->timeout; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int icmp_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) -{ - /* Do not immediately delete the connection after the first - successful reply to avoid excessive conntrackd traffic - and also to handle correctly ICMP echo reply duplicates. */ - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) -{ - static const u_int8_t valid_new[] = { - [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 - }; - - if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || - !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { - /* Can't create a new ICMP `conn' with this. */ - pr_debug("icmp: can't create new conn with type %u\n", - ct->tuplehash[0].tuple.dst.u.icmp.type); - nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); - return false; - } - return true; -} - -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -static int -icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int hooknum) -{ - struct nf_conntrack_tuple innertuple, origtuple; - const struct nf_conntrack_l4proto *innerproto; - const struct nf_conntrack_tuple_hash *h; - const struct nf_conntrack_zone *zone; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - - /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) + ip_hdrlen(skb) - + sizeof(struct icmphdr), - PF_INET, net, &origtuple)) { - pr_debug("icmp_error_message: failed to get tuple\n"); - return -NF_ACCEPT; - } - - /* rcu_read_lock()ed by nf_hook_thresh */ - innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&innertuple, &origtuple, - &nf_conntrack_l3proto_ipv4, innerproto)) { - pr_debug("icmp_error_message: no match\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, zone, &innertuple); - if (!h) { - pr_debug("icmp_error_message: no match\n"); - return -NF_ACCEPT; - } - - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} - -static void icmp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) -{ - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); -} - -/* Small and modified version of icmp_rcv */ -static int -icmp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) -{ - const struct icmphdr *icmph; - struct icmphdr _ih; - - /* Not enough header? */ - icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); - if (icmph == NULL) { - icmp_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } - - /* See ip_conntrack_proto_tcp.c */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, dataoff, 0)) { - icmp_error_log(skb, net, pf, "bad hw icmp checksum"); - return -NF_ACCEPT; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) { - icmp_error_log(skb, net, pf, "invalid icmp type"); - return -NF_ACCEPT; - } - - /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) - return NF_ACCEPT; - - return icmp_error_message(net, tmpl, skb, hooknum); -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -static int icmp_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *t) -{ - if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || - nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || - nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { - [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, -}; - -static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) -{ - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; - - return 0; -} - -static unsigned int icmp_nlattr_tuple_size(void) -{ - static unsigned int size __read_mostly; - - if (!size) - size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); - - return size; -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_cttimeout.h> - -static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - unsigned int *timeout = data; - struct nf_icmp_net *in = icmp_pernet(net); - - if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { - *timeout = - ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; - } else { - /* Set default ICMP timeout. */ - *timeout = in->timeout; - } - return 0; -} - -static int -icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeout = data; - - if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { - [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - -#ifdef CONFIG_SYSCTL -static struct ctl_table icmp_sysctl_table[] = { - { - .procname = "nf_conntrack_icmp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(icmp_sysctl_table, - sizeof(icmp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &in->timeout; -#endif - return 0; -} - -static int icmp_init_net(struct net *net, u_int16_t proto) -{ - struct nf_icmp_net *in = icmp_pernet(net); - struct nf_proto_net *pn = &in->pn; - - in->timeout = nf_ct_icmp_timeout; - - return icmp_kmemdup_sysctl_table(pn, in); -} - -static struct nf_proto_net *icmp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp.pn; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = -{ - .l3proto = PF_INET, - .l4proto = IPPROTO_ICMP, - .pkt_to_tuple = icmp_pkt_to_tuple, - .invert_tuple = icmp_invert_tuple, - .packet = icmp_packet, - .get_timeouts = icmp_get_timeouts, - .new = icmp_new, - .error = icmp_error, - .destroy = NULL, - .me = NULL, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = icmp_tuple_to_nlattr, - .nlattr_tuple_size = icmp_nlattr_tuple_size, - .nlattr_to_tuple = icmp_nlattr_to_tuple, - .nla_policy = icmp_nla_policy, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - .ctnl_timeout = { - .nlattr_to_obj = icmp_timeout_nlattr_to_obj, - .obj_to_nlattr = icmp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_ICMP_MAX, - .obj_size = sizeof(unsigned int), - .nla_policy = icmp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .init_net = icmp_init_net, - .get_net_proto = icmp_get_net_proto, -}; diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 4388de0e5380..1e6f28c97d3a 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct nf_log_buf *m, +static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { @@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ nf_log_buf_add(m, "["); - dump_ipv4_packet(m, info, skb, + dump_ipv4_packet(net, m, info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); nf_log_buf_add(m, "] "); } @@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && !iphoff) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) @@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv4_mac_header(m, loginfo, skb); - dump_ipv4_packet(m, loginfo, skb, 0); + dump_ipv4_packet(net, m, loginfo, skb, 0); nf_log_buf_close(m); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2ed64bca54e3..8d7aaf118a30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && - isk->freebind == 0 && isk->transparent == 0 && + if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) @@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - isk->freebind || isk->transparent || has_addr || + if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; @@ -739,13 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.oif = sk->sk_bound_dev_if; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -769,8 +761,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 77350c1256ce..b46e4cf9a55a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -287,6 +287,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), + SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), + SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index abb3c9490c55..33df4d76db2d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -561,13 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -670,8 +665,6 @@ back_from_confirm: &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1df6e97106d7..b678466da451 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto no_route; } - if (res->type == RTN_BROADCAST) + if (res->type == RTN_BROADCAST) { + if (IN_DEV_BFORWARD(in_dev)) + goto make_route; goto brd_input; + } if (res->type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, @@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type != RTN_UNICAST) goto martian_destination; +make_route: err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); out: return err; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5fa335fd3852..b92f422f2fa8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -201,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, return ret; } +static int ipv4_fwd_update_priority(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv4.sysctl_ip_fwd_update_priority); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, + net); + + return ret; +} + static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -664,6 +681,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "ip_forward_update_priority", + .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipv4_fwd_update_priority, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4491faf83f4f..31fa1c080f28 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) const struct tcp_sock *tp = tcp_sk(sk); int state; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) @@ -817,8 +817,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, * This occurs when user tries to read * from never connected socket. */ - if (!sock_flag(sk, SOCK_DONE)) - ret = -ENOTCONN; + ret = -ENOTCONN; break; } if (!timeo) { @@ -1241,7 +1240,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { @@ -1275,9 +1274,6 @@ restart: int linear; new_segment: - /* Allocate new segment. If the interface is SG, - * allocate skb fitting to single page. - */ if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -2042,13 +2038,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; if (sk->sk_state == TCP_CLOSE) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - copied = -ENOTCONN; - break; - } + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; break; } @@ -2576,6 +2569,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; + tp->rcv_rtt_last_tsecr = 0; tp->write_seq += tp->max_window + 2; if (tp->write_seq == 0) tp->write_seq = 1; @@ -2600,6 +2594,10 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); tp->compressed_ack = 0; + tp->bytes_sent = 0; + tp->bytes_retrans = 0; + tp->dsack_dups = 0; + tp->reord_seen = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -2995,7 +2993,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (val < 0) err = -EINVAL; else - icsk->icsk_user_timeout = msecs_to_jiffies(val); + icsk->icsk_user_timeout = val; break; case TCP_FASTOPEN: @@ -3207,10 +3205,41 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; + info->tcpi_bytes_sent = tp->bytes_sent; + info->tcpi_bytes_retrans = tp->bytes_retrans; + info->tcpi_dsack_dups = tp->dsack_dups; + info->tcpi_reord_seen = tp->reord_seen; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); +static size_t tcp_opt_stats_get_size(void) +{ + return + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ + 0; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -3219,9 +3248,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u64 rate64; u32 rate; - stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 7 * nla_total_size(sizeof(u32)) + - 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; @@ -3257,6 +3284,13 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, + TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, + TCP_NLA_PAD); + nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); + nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); + return stats; } @@ -3451,7 +3485,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_USER_TIMEOUT: - val = jiffies_to_msecs(icsk->icsk_user_timeout); + val = icsk->icsk_user_timeout; break; case TCP_FASTOPEN: diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 4bfff3c87e8e..13d34427ca3d 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -205,7 +205,11 @@ static u32 bbr_bw(const struct sock *sk) */ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { - rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + unsigned int mss = tcp_sk(sk)->mss_cache; + + if (!tcp_needs_internal_pacing(sk)) + mss = tcp_mss_to_mtu(sk, mss); + rate *= mss; rate *= gain; rate >>= BBR_SCALE; rate *= USEC_PER_SEC; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f9dcb29be12d..715d541b52dd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include <linux/errqueue.h> #include <trace/events/tcp.h> #include <linux/static_key.h> +#include <net/busy_poll.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -590,9 +591,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, { struct tcp_sock *tp = tcp_sk(sk); - if (tp->rx_opt.rcv_tsecr && - (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) + return; + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; + + if (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; @@ -877,6 +881,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; + tp->dsack_dups++; } /* It's reordering when higher sequence was delivered (i.e. sacked) before @@ -908,8 +913,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } - tp->rack.reord = 1; /* This exciting event is worth to be remembered. 8) */ + tp->reord_seen++; NET_INC_STATS(sock_net(sk), ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } @@ -1873,6 +1878,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) tp->reordering = min_t(u32, tp->packets_out + addend, sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } @@ -3466,7 +3472,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -4347,6 +4353,11 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; +#ifdef CONFIG_TLS_DEVICE + if (from->decrypted != to->decrypted) + return false; +#endif + if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -4642,8 +4653,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) skb->data_len = data_len; skb->len = size; - if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto err_free; + } err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) @@ -4699,18 +4712,21 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (tcp_receive_window(tp) == 0) + if (tcp_receive_window(tp) == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } /* Ok. In sequence. In window. */ queue_and_out: if (skb_queue_len(&sk->sk_receive_queue) == 0) sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4766,8 +4782,10 @@ drop: /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ - if (!tcp_receive_window(tp)) + if (!tcp_receive_window(tp)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } goto queue_and_out; } @@ -4885,6 +4903,9 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); +#ifdef CONFIG_TLS_DEVICE + nskb->decrypted = skb->decrypted; +#endif TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -4912,6 +4933,10 @@ restart: skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; +#ifdef CONFIG_TLS_DEVICE + if (skb->decrypted != nskb->decrypted) + goto end; +#endif } } } @@ -5530,6 +5555,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_ack(sk, skb, 0); __kfree_skb(skb); tcp_data_snd_check(sk); + /* When receiving pure ack in fast path, update + * last ts ecr directly instead of calling + * tcp_rcv_rtt_measure_ts() + */ + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); @@ -5631,6 +5661,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); + sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); @@ -6459,6 +6490,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); + sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3b2711e33e4c..9e041fa5c545 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -155,7 +155,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + (!twp || (reuse && time_after32(ktime_get_seconds(), + tcptw->tw_ts_recent_stamp)))) { /* In case of repair and re-using TIME-WAIT sockets we still * want to be sure that it is safe as above but honor the * sequence numbers and time stamps set as part of the repair diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dda1341a223..75ef332a7caf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -189,7 +189,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); @@ -449,119 +449,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk; + struct tcp_sock *oldtp, *newtp; - if (newsk) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_request_sock *treq = tcp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp = tcp_sk(newsk); - struct tcp_sock *oldtp = tcp_sk(sk); - - smc_check_reset_syn_req(oldtp, req, newtp); - - /* Now setup tcp_sock */ - newtp->pred_flags = 0; - - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 1; - - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - - INIT_LIST_HEAD(&newtp->tsq_node); - INIT_LIST_HEAD(&newtp->tsorted_sent_queue); - - tcp_init_wl(newtp, treq->rcv_isn); - - newtp->srtt_us = 0; - newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); - newicsk->icsk_rto = TCP_TIMEOUT_INIT; - newicsk->icsk_ack.lrcvtime = tcp_jiffies32; - - newtp->packets_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - newtp->tlp_high_seq = 0; - newtp->lsndtime = tcp_jiffies32; - newsk->sk_txhash = treq->txhash; - newtp->last_oow_ack_time = 0; - newtp->total_retrans = req->num_retrans; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - newtp->snd_cwnd = TCP_INIT_CWND; - newtp->snd_cwnd_cnt = 0; - - /* There's a bubble in the pipe until at least the first ACK. */ - newtp->app_limited = ~0U; - - tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; - - newtp->rx_opt.saw_tstamp = 0; - - newtp->rx_opt.dsack = 0; - newtp->rx_opt.num_sacks = 0; - - newtp->urg_data = 0; - - if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); - - newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - newtp->rx_opt.sack_ok = ireq->sack_ok; - newtp->window_clamp = req->rsk_window_clamp; - newtp->rcv_ssthresh = req->rsk_rcv_wnd; - newtp->rcv_wnd = req->rsk_rcv_wnd; - newtp->rx_opt.wscale_ok = ireq->wscale_ok; - if (newtp->rx_opt.wscale_ok) { - newtp->rx_opt.snd_wscale = ireq->snd_wscale; - newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; - } else { - newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp, 65535U); - } - newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << - newtp->rx_opt.snd_wscale); - newtp->max_window = newtp->snd_wnd; - - if (newtp->rx_opt.tstamp_ok) { - newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = get_seconds(); - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - } else { - newtp->rx_opt.ts_recent_stamp = 0; - newtp->tcp_header_len = sizeof(struct tcphdr); - } - newtp->tsoffset = treq->ts_off; + if (!newsk) + return NULL; + + newicsk = inet_csk(newsk); + newtp = tcp_sk(newsk); + oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); + + /* Now setup tcp_sock */ + newtp->pred_flags = 0; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + + INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); + + tcp_init_wl(newtp, treq->rcv_isn); + + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + + newtp->packets_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + newtp->tlp_high_seq = 0; + newtp->lsndtime = tcp_jiffies32; + newsk->sk_txhash = treq->txhash; + newtp->last_oow_ack_time = 0; + newtp->total_retrans = req->num_retrans; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = TCP_INIT_CWND; + newtp->snd_cwnd_cnt = 0; + + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + + tcp_init_xmit_timers(newsk); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.num_sacks = 0; + + newtp->urg_data = 0; + + if (sock_flag(newsk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); + + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + newtp->rx_opt.sack_ok = ireq->sack_ok; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG - newtp->md5sig_info = NULL; /*XXX*/ - if (newtp->af_specific->md5_lookup(sk, newsk)) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; + newtp->md5sig_info = NULL; /*XXX*/ + if (newtp->af_specific->md5_lookup(sk, newsk)) + newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif - if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) - newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; - newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); - newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; - newtp->syn_data_acked = 0; - newtp->rack.mstamp = 0; - newtp->rack.advanced = 0; - newtp->rack.reo_wnd_steps = 1; - newtp->rack.last_delivered = 0; - newtp->rack.reo_wnd_persist = 0; - newtp->rack.dsack_seen = 0; - - __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); - } + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; + newtp->rack.mstamp = 0; + newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); @@ -600,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); + tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8cc7c3487330..870b0a335061 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -180,9 +180,9 @@ out: return segs; } -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct tcphdr *th; struct tcphdr *th2; @@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) len = skb_gro_len(skb); flags = tcp_flag_word(th); - for (; (p = *head); head = &p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto found; } - + p = NULL; goto out_check_final; found: @@ -262,8 +262,11 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); +#ifdef CONFIG_TLS_DEVICE + flush |= p->decrypted ^ skb->decrypted; +#endif - if (flush || skb_gro_receive(head, skb)) { + if (flush || skb_gro_receive(p, skb)) { mss = 1; goto out_check_final; } @@ -277,7 +280,7 @@ out_check_final: TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; + pp = p; out: NAPI_GRO_CB(skb)->flush |= (flush != 0); @@ -302,7 +305,7 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c4172c1fb198..597dbd749f05 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -977,17 +977,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -/* BBR congestion control needs pacing. - * Same remark for SO_MAX_PACING_RATE. - * sch_fq packet scheduler is efficiently handling pacing, - * but is not always installed/used. - * Return true if TCP stack should pace packets itself. - */ -static bool tcp_needs_internal_pacing(const struct sock *sk) -{ - return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; -} - static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) { u64 len_ns; @@ -999,9 +988,6 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) if (!rate || rate == ~0U) return; - /* Should account for header sizes as sch_fq does, - * but lets make things simple. - */ len_ns = (u64)skb->len * NSEC_PER_SEC; do_div(len_ns, rate); hrtimer_start(&tcp_sk(sk)->pacing_timer, @@ -1150,6 +1136,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); + tp->bytes_sent += skb->len - tcp_header_size; tcp_internal_pacing(sk, skb); } @@ -2711,9 +2698,8 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); - int skb_size, next_skb_size; + int next_skb_size; - skb_size = skb->len; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); @@ -2884,6 +2870,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; + tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c61240e43923..4dff40dad4dc 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + /* Normally we expect interval_us >= min-rtt. * Note that rate may still be over-estimated when a spuriously * retransmistted skb was first (s)acked because "interval_us" diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 71593e4400ab..c81aadff769b 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (!tp->rack.reord) { + if (!tp->reord_seen) { /* If reordering has not been observed, be aggressive during * the recovery or starting the recovery by DUPACK threshold. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3b3611729928..7fdf222a0bdf 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,35 @@ #include <linux/gfp.h> #include <net/tcp.h> +static u32 tcp_retransmit_stamp(const struct sock *sk) +{ + u32 start_ts = tcp_sk(sk)->retrans_stamp; + + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return 0; + start_ts = tcp_skb_timestamp(head); + } + return start_ts; +} + +static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + u32 elapsed, start_ts; + + start_ts = tcp_retransmit_stamp(sk); + if (!icsk->icsk_user_timeout || !start_ts) + return icsk->icsk_rto; + elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; + if (elapsed >= icsk->icsk_user_timeout) + return 1; /* user timeout has passed; fire ASAP */ + else + return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed)); +} + /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. @@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) { - struct sk_buff *head = tcp_rtx_queue_head(sk); - - if (!head) - return false; - start_ts = tcp_skb_timestamp(head); - } + start_ts = tcp_retransmit_stamp(sk); + if (!start_ts) + return false; if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk, else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + timeout = jiffies_to_msecs(timeout); } - return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; } /* A write timeout has occurred. Process the after effects. */ @@ -337,8 +362,7 @@ static void tcp_probe_timer(struct sock *sk) if (!start_ts) skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > - jiffies_to_msecs(icsk->icsk_user_timeout)) + (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; @@ -535,7 +559,8 @@ out_reset_timer: /* Use normal (exponential) backoff */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); @@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t) * to determine when to timeout instead. */ if ((icsk->icsk_user_timeout != 0 && - elapsed >= icsk->icsk_user_timeout && + elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) && icsk->icsk_probes_out > 0) || (icsk->icsk_user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 24e116ddae79..060e841dde40 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -926,11 +926,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; @@ -977,9 +972,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) connected = 1; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); ipc.gso_size = up->gso_size; if (msg->msg_controllen) { @@ -1027,8 +1020,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 69c54540d5b4..0c0522b79b43 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,10 +343,11 @@ out: return segs; } -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; @@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unflush: flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -399,8 +400,8 @@ out: } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff **udp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); |