diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/Kconfig | 16 | ||||
-rw-r--r-- | net/ipv4/Makefile | 1 | ||||
-rw-r--r-- | net/ipv4/fib_rules.c | 6 | ||||
-rw-r--r-- | net/ipv4/fou.c | 81 | ||||
-rw-r--r-- | net/ipv4/gre_demux.c | 1 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 25 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_forward.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 51 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/arp_tables.c | 41 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_tables.c | 20 | ||||
-rw-r--r-- | net/ipv4/netfilter/iptable_mangle.c | 4 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_reject_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 67 | ||||
-rw-r--r-- | net/ipv4/tcp_dctcp.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 31 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 31 | ||||
-rw-r--r-- | net/ipv4/tcp_nv.c | 476 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 4 | ||||
-rw-r--r-- | net/ipv4/udp_tunnel.c | 61 | ||||
-rw-r--r-- | net/ipv4/xfrm4_policy.c | 8 |
23 files changed, 855 insertions, 84 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 238225b0c970..50d6a9b49f6c 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -532,6 +532,22 @@ config TCP_CONG_VEGAS window. TCP Vegas should provide less packet loss, but it is not as aggressive as TCP Reno. +config TCP_CONG_NV + tristate "TCP NV" + default n + ---help--- + TCP NV is a follow up to TCP Vegas. It has been modified to deal with + 10G networks, measurement noise introduced by LRO, GRO and interrupt + coalescence. In addition, it will decrease its cwnd multiplicatively + instead of linearly. + + Note that in general congestion avoidance (cwnd decreased when # packets + queued grows) cannot coexist with congestion control (cwnd decreased only + when there is packet loss) due to fairness issues. One scenario when they + can coexist safely is when the CA flows have RTTs << CC flows RTTs. + + For further details see http://www.brakmo.org/networking/tcp-nv/ + config TCP_CONG_SCALABLE tristate "Scalable TCP" default n diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bfa133691cde..24629b6f57cc 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f2bda9e89c61..6e9ea69e5f75 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -76,6 +76,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, { int err = -EAGAIN; struct fib_table *tbl; + u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: @@ -94,7 +95,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, rcu_read_lock(); - tbl = fib_get_table(rule->fr_net, rule->table); + tb_id = fib_rule_get_table(rule, arg); + tbl = fib_get_table(rule->fr_net, tb_id); if (tbl) err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *)arg->result, @@ -180,7 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (err) goto errout; - if (rule->table == RT_TABLE_UNSPEC) { + if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 5f9207c039e7..321d57f825ce 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -129,6 +129,36 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + + int prot; + + switch (((struct iphdr *)guehdr)->version) { + case 4: + prot = IPPROTO_IPIP; + break; + case 6: + prot = IPPROTO_IPV6; + break; + default: + goto drop; + } + + if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) + goto drop; + + return -prot; + } + + default: /* Undefined version */ + goto drop; + } + optlen = guehdr->hlen << 2; len += optlen; @@ -289,6 +319,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, int flush = 1; struct fou *fou = fou_from_sock(sk); struct gro_remcsum grc; + u8 proto; skb_gro_remcsum_init(&grc); @@ -302,6 +333,25 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, goto out; } + switch (guehdr->version) { + case 0: + break; + case 1: + switch (((struct iphdr *)guehdr)->version) { + case 4: + proto = IPPROTO_IPIP; + break; + case 6: + proto = IPPROTO_IPV6; + break; + default: + goto out; + } + goto next_proto; + default: + goto out; + } + optlen = guehdr->hlen << 2; len += optlen; @@ -370,6 +420,10 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, } } + proto = guehdr->proto_ctype; + +next_proto: + /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are are simply @@ -383,7 +437,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[guehdr->proto_ctype]); + ops = rcu_dereference(offloads[proto]); if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; @@ -404,13 +458,30 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); const struct net_offload *ops; - unsigned int guehlen; + unsigned int guehlen = 0; u8 proto; int err = -ENOENT; - proto = guehdr->proto_ctype; - - guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + switch (guehdr->version) { + case 0: + proto = guehdr->proto_ctype; + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + break; + case 1: + switch (((struct iphdr *)guehdr)->version) { + case 4: + proto = IPPROTO_IPIP; + break; + case 6: + proto = IPPROTO_IPV6; + break; + default: + return err; + } + break; + default: + return err; + } rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index de1d119a4497..b798862b6be5 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -117,6 +117,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if ((*(u8 *)options & 0xF0) != 0x40) hdr_len += 4; } + tpi->hdr_len = hdr_len; return hdr_len; } EXPORT_SYMBOL(gre_parse_header); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 25af1243649b..38c2c47fe0e8 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,6 +44,7 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; + u32 ifindex; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -571,6 +572,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_DEV_COND: { + u32 ifindex; + + ifindex = *((const u32 *)(op + 1)); + if (ifindex != entry->ifindex) + yes = 0; + break; + } } if (yes) { @@ -613,6 +622,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); + entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); @@ -636,6 +646,17 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* data is u32 ifindex */ +static bool valid_devcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + /* Check ifindex space. */ + *min_len += sizeof(u32); + if (len < *min_len) + return false; + + return true; +} /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) @@ -700,6 +721,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_DEV_COND: + if (!valid_devcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 3a88b0c73797..b5e9317eaf9e 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -355,7 +355,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; - if (frag_mem_limit(nf) > nf->high_thresh) { + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { inet_frag_schedule_worker(f); return NULL; } diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index cbfb1808fcc4..9f0a7b96646f 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -54,7 +54,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1d000af7f561..5b1481be0282 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -138,6 +138,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + unsigned int data_len = 0; struct ip_tunnel *t; switch (type) { @@ -163,6 +164,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return; + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -181,6 +183,13 @@ static void ipgre_err(struct sk_buff *skb, u32 info, if (!t) return; +#if IS_ENABLED(CONFIG_IPV6) + if (tpi->proto == htons(ETH_P_IPV6) && + !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return; +#endif + if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) return; @@ -837,17 +846,19 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct net_device *dev, +static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm *parms) { + struct ip_tunnel *t = netdev_priv(dev); + memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_GRE; if (!data) - return; + return 0; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); @@ -876,16 +887,26 @@ static void ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); - if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { + if (t->ignore_df) + return -EINVAL; parms->iph.frag_off = htons(IP_DF); + } if (data[IFLA_GRE_COLLECT_METADATA]) { - struct ip_tunnel *t = netdev_priv(dev); - t->collect_md = true; if (dev->type == ARPHRD_IPGRE) dev->type = ARPHRD_NONE; } + + if (data[IFLA_GRE_IGNORE_DF]) { + if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) + && (parms->iph.frag_off & htons(IP_DF))) + return -EINVAL; + t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); + } + + return 0; } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -956,16 +977,19 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_newlink(dev, tb, &p); } @@ -974,16 +998,19 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_changelink(dev, tb, &p); } @@ -1020,6 +1047,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_GRE_IGNORE_DF */ + nla_total_size(1) + 0; } @@ -1053,6 +1082,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df)) + goto nla_put_failure; + if (t->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; @@ -1080,6 +1112,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4bd4921639c3..e23f141c9ba5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -225,7 +225,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= mtu) + skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d8f5e0a269f5..95649ebd2874 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -682,7 +682,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off&htons(IP_DF)); max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 2033f929aa66..c8dd9e26b185 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -89,22 +89,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr, __be32 src_ipaddr, tgt_ipaddr; long ret; -#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) - - if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, - ARPT_INV_ARPOP)) + if (NF_INVF(arpinfo, ARPT_INV_ARPOP, + (arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop)) return 0; - if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, - ARPT_INV_ARPHRD)) + if (NF_INVF(arpinfo, ARPT_INV_ARPHRD, + (arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd)) return 0; - if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, - ARPT_INV_ARPPRO)) + if (NF_INVF(arpinfo, ARPT_INV_ARPPRO, + (arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro)) return 0; - if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, - ARPT_INV_ARPHLN)) + if (NF_INVF(arpinfo, ARPT_INV_ARPHLN, + (arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln)) return 0; src_devaddr = arpptr; @@ -115,31 +113,32 @@ static inline int arp_packet_match(const struct arphdr *arphdr, arpptr += dev->addr_len; memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); - if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), - ARPT_INV_SRCDEVADDR) || - FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), - ARPT_INV_TGTDEVADDR)) + if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR, + arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, + dev->addr_len)) || + NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR, + arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, + dev->addr_len))) return 0; - if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, - ARPT_INV_SRCIP) || - FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), - ARPT_INV_TGTIP)) + if (NF_INVF(arpinfo, ARPT_INV_SRCIP, + (src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr) || + NF_INVF(arpinfo, ARPT_INV_TGTIP, + (tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr)) return 0; /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_IN)) + if (NF_INVF(arpinfo, ARPT_INV_VIA_IN, ret != 0)) return 0; ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) + if (NF_INVF(arpinfo, ARPT_INV_VIA_OUT, ret != 0)) return 0; return 1; -#undef FWINV } static inline int arp_checkentry(const struct arpt_arp *arp) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 54906e0e8e0c..f0df66f54ce6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -58,32 +58,31 @@ ip_packet_match(const struct iphdr *ip, { unsigned long ret; -#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) - - if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, - IPT_INV_SRCIP) || - FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) + if (NF_INVF(ipinfo, IPT_INV_SRCIP, + (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) || + NF_INVF(ipinfo, IPT_INV_DSTIP, + (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr)) return false; ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_IN)) + if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_OUT)) + if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0)) return false; /* Check specific protocol */ if (ipinfo->proto && - FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) + NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto)) return false; /* If we have a fragment rule but the packet is not a fragment * then we return zero */ - if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) + if (NF_INVF(ipinfo, IPT_INV_FRAG, + (ipinfo->flags & IPT_F_FRAG) && !isfrag)) return false; return true; @@ -122,7 +121,6 @@ static inline bool unconditional(const struct ipt_entry *e) return e->target_offset == sizeof(struct ipt_entry) && memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; -#undef FWINV } /* for const-correctness */ diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 57fc97cdac70..aebdb337fd7e 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -87,10 +87,6 @@ iptable_mangle_hook(void *priv, { if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state); - if (state->hook == NF_INET_POST_ROUTING) - return ipt_do_table(skb, state, - state->net->ipv4.iptable_mangle); - /* PREROUTING/INPUT/FORWARD: */ return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index b6ea57ec5e14..fd8220213afc 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -24,6 +24,9 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) return NULL; + if (ip_hdr(oldskb)->protocol != IPPROTO_TCP) + return NULL; + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), sizeof(struct tcphdr), _oth); if (oth == NULL) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5c7ed147449c..032a96d78c99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2277,6 +2277,38 @@ static inline bool tcp_can_repair_sock(const struct sock *sk) ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); } +static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) +{ + struct tcp_repair_window opt; + + if (!tp->repair) + return -EPERM; + + if (len != sizeof(opt)) + return -EINVAL; + + if (copy_from_user(&opt, optbuf, sizeof(opt))) + return -EFAULT; + + if (opt.max_window < opt.snd_wnd) + return -EINVAL; + + if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) + return -EINVAL; + + if (after(opt.rcv_wup, tp->rcv_nxt)) + return -EINVAL; + + tp->snd_wl1 = opt.snd_wl1; + tp->snd_wnd = opt.snd_wnd; + tp->max_window = opt.max_window; + + tp->rcv_wnd = opt.rcv_wnd; + tp->rcv_wup = opt.rcv_wup; + + return 0; +} + static int tcp_repair_options_est(struct tcp_sock *tp, struct tcp_repair_opt __user *optbuf, unsigned int len) { @@ -2604,6 +2636,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->tsoffset = val - tcp_time_stamp; break; + case TCP_REPAIR_WINDOW: + err = tcp_repair_set_window(tp, optval, optlen); + break; case TCP_NOTSENT_LOWAT: tp->notsent_lowat = val; sk->sk_write_space(sk); @@ -2860,6 +2895,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EINVAL; break; + case TCP_REPAIR_WINDOW: { + struct tcp_repair_window opt; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len != sizeof(opt)) + return -EINVAL; + + if (!tp->repair) + return -EPERM; + + opt.snd_wl1 = tp->snd_wl1; + opt.snd_wnd = tp->snd_wnd; + opt.max_window = tp->max_window; + opt.rcv_wnd = tp->rcv_wnd; + opt.rcv_wup = tp->rcv_wup; + + if (copy_to_user(optval, &opt, len)) + return -EFAULT; + return 0; + } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; @@ -2969,8 +3026,18 @@ static void __tcp_alloc_md5sig_pool(void) return; for_each_possible_cpu(cpu) { + void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch; struct ahash_request *req; + if (!scratch) { + scratch = kmalloc_node(sizeof(union tcp_md5sum_block) + + sizeof(struct tcphdr), + GFP_KERNEL, + cpu_to_node(cpu)); + if (!scratch) + return; + per_cpu(tcp_md5sig_pool, cpu).scratch = scratch; + } if (per_cpu(tcp_md5sig_pool, cpu).md5_req) continue; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 7e538f71f5fb..10d728b6804c 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -293,7 +293,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - memset(info, 0, sizeof(struct tcp_dctcp_info)); + memset(&info->dctcp, 0, sizeof(info->dctcp)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { info->dctcp.dctcp_enabled = 1; info->dctcp.dctcp_ce_state = (u16) ca->ce_state; @@ -303,7 +303,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, } *attr = INET_DIAG_DCTCPINFO; - return sizeof(*info); + return sizeof(info->dctcp); } return 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6c8f4cd0800..94d4aff97523 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3115,6 +3115,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; + u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3154,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!first_ackt.v64) first_ackt = last_ackt; + last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; @@ -3250,7 +3252,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = ca_rtt_us }; + .rtt_us = ca_rtt_us, + .in_flight = last_in_flight }; icsk->icsk_ca_ops->pkts_acked(sk, &sample); } @@ -5159,6 +5162,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool rst_seq_match = false; /* RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && @@ -5195,13 +5199,32 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* Step 2: check RST bit */ if (th->rst) { - /* RFC 5961 3.2 : - * If sequence number exactly matches RCV.NXT, then + /* RFC 5961 3.2 (extend to match against SACK too if available): + * If seq num matches RCV.NXT or the right-most SACK block, + * then * RESET the connection * else * Send a challenge ACK */ - if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + rst_seq_match = true; + } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int max_sack = sp[0].end_seq; + int this_sack; + + for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; + ++this_sack) { + max_sack = after(sp[this_sack].end_seq, + max_sack) ? + sp[this_sack].end_seq : max_sack; + } + + if (TCP_SKB_CB(skb)->seq == max_sack) + rst_seq_match = true; + } + + if (rst_seq_match) tcp_reset(sk); else tcp_send_challenge_ack(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3708de2a6683..32b048e524d6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1018,27 +1018,28 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, GFP_KERNEL); } -static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, - __be32 daddr, __be32 saddr, int nbytes) +static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, + __be32 daddr, __be32 saddr, + const struct tcphdr *th, int nbytes) { struct tcp4_pseudohdr *bp; struct scatterlist sg; + struct tcphdr *_th; - bp = &hp->md5_blk.ip4; - - /* - * 1. the TCP pseudo-header (in the order: source IP address, - * destination IP address, zero-padded protocol number, and - * segment length) - */ + bp = hp->scratch; bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; bp->protocol = IPPROTO_TCP; bp->len = cpu_to_be16(nbytes); - sg_init_one(&sg, bp, sizeof(*bp)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + _th = (struct tcphdr *)(bp + 1); + memcpy(_th, th, sizeof(*th)); + _th->check = 0; + + sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, + sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->md5_req); } @@ -1055,9 +1056,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; @@ -1101,9 +1100,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) goto clear_hash; diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c new file mode 100644 index 000000000000..5de82a8d4d87 --- /dev/null +++ b/net/ipv4/tcp_nv.c @@ -0,0 +1,476 @@ +/* + * TCP NV: TCP with Congestion Avoidance + * + * TCP-NV is a successor of TCP-Vegas that has been developed to + * deal with the issues that occur in modern networks. + * Like TCP-Vegas, TCP-NV supports true congestion avoidance, + * the ability to detect congestion before packet losses occur. + * When congestion (queue buildup) starts to occur, TCP-NV + * predicts what the cwnd size should be for the current + * throughput and it reduces the cwnd proportionally to + * the difference between the current cwnd and the predicted cwnd. + * + * NV is only recommeneded for traffic within a data center, and when + * all the flows are NV (at least those within the data center). This + * is due to the inherent unfairness between flows using losses to + * detect congestion (congestion control) and those that use queue + * buildup to detect congestion (congestion avoidance). + * + * Note: High NIC coalescence values may lower the performance of NV + * due to the increased noise in RTT values. In particular, we have + * seen issues with rx-frames values greater than 8. + * + * TODO: + * 1) Add mechanism to deal with reverse congestion. + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/math64.h> +#include <net/tcp.h> +#include <linux/inet_diag.h> + +/* TCP NV parameters + * + * nv_pad Max number of queued packets allowed in network + * nv_pad_buffer Do not grow cwnd if this closed to nv_pad + * nv_reset_period How often (in) seconds)to reset min_rtt + * nv_min_cwnd Don't decrease cwnd below this if there are no losses + * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected + * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8 + * nv_rtt_factor RTT averaging factor + * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd + * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd + * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping + * slow-start due to congestion + * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion + * nv_rtt_min_cnt Wait these many RTTs before making congesion decision + * nv_cwnd_growth_rate_neg + * nv_cwnd_growth_rate_pos + * How quickly to double growth rate (not rate) of cwnd when not + * congested. One value (nv_cwnd_growth_rate_neg) for when + * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos) + * otherwise. + */ + +static int nv_pad __read_mostly = 10; +static int nv_pad_buffer __read_mostly = 2; +static int nv_reset_period __read_mostly = 5; /* in seconds */ +static int nv_min_cwnd __read_mostly = 2; +static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ +static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ +static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ +static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_cwnd_growth_rate_neg __read_mostly = 8; +static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ +static int nv_dec_eval_min_calls __read_mostly = 60; +static int nv_inc_eval_min_calls __read_mostly = 20; +static int nv_ssthresh_eval_min_calls __read_mostly = 30; +static int nv_stop_rtt_cnt __read_mostly = 10; +static int nv_rtt_min_cnt __read_mostly = 2; + +module_param(nv_pad, int, 0644); +MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network"); +module_param(nv_reset_period, int, 0644); +MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)"); +module_param(nv_min_cwnd, int, 0644); +MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value" + " without losses"); + +/* TCP NV Parameters */ +struct tcpnv { + unsigned long nv_min_rtt_reset_jiffies; /* when to switch to + * nv_min_rtt_new */ + s8 cwnd_growth_factor; /* Current cwnd growth factor, + * < 0 => less than 1 packet/RTT */ + u8 available8; + u16 available16; + u32 loss_cwnd; /* cwnd at last loss */ + u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ + nv_reset:1, /* whether to reset values */ + nv_catchup:1; /* whether we are growing because + * of temporary cwnd decrease */ + u8 nv_eval_call_cnt; /* call count since last eval */ + u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is + * smaller than this. It may grow to handle + * TSO, LRO and interrupt coalescence because + * with these a small cwnd cannot saturate + * the link. Note that this is different from + * the file local nv_min_cwnd */ + u8 nv_rtt_cnt; /* RTTs without making ca decision */; + u32 nv_last_rtt; /* last rtt */ + u32 nv_min_rtt; /* active min rtt. Used to determine slope */ + u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_rtt_max_rate; /* max rate seen during current RTT */ + u32 nv_rtt_start_seq; /* current RTT ends when packet arrives + * acking beyond nv_rtt_start_seq */ + u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is + * used to determine bytes acked since last + * call to bictcp_acked */ + u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */ +}; + +#define NV_INIT_RTT U32_MAX +#define NV_MIN_CWND 4 +#define NV_MIN_CWND_GROW 2 +#define NV_TSO_CWND_BOUND 80 + +static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + ca->nv_reset = 0; + ca->loss_cwnd = 0; + ca->nv_no_cong_cnt = 0; + ca->nv_rtt_cnt = 0; + ca->nv_last_rtt = 0; + ca->nv_rtt_max_rate = 0; + ca->nv_rtt_start_seq = tp->snd_una; + ca->nv_eval_call_cnt = 0; + ca->nv_last_snd_una = tp->snd_una; +} + +static void tcpnv_init(struct sock *sk) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + tcpnv_reset(ca, sk); + + ca->nv_allow_cwnd_growth = 1; + ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; + ca->nv_min_rtt = NV_INIT_RTT; + ca->nv_min_rtt_new = NV_INIT_RTT; + ca->nv_min_cwnd = NV_MIN_CWND; + ca->nv_catchup = 0; + ca->cwnd_growth_factor = 0; +} + +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + u32 cnt; + + if (!tcp_is_cwnd_limited(sk)) + return; + + /* Only grow cwnd if NV has not detected congestion */ + if (!ca->nv_allow_cwnd_growth) + return; + + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } + + if (ca->cwnd_growth_factor < 0) { + cnt = tp->snd_cwnd << -ca->cwnd_growth_factor; + tcp_cong_avoid_ai(tp, cnt, acked); + } else { + cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor); + tcp_cong_avoid_ai(tp, cnt, acked); + } +} + +static u32 tcpnv_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + + ca->loss_cwnd = tp->snd_cwnd; + return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); +} + +static u32 tcpnv_undo_cwnd(struct sock *sk) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + +static void tcpnv_state(struct sock *sk, u8 new_state) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + if (new_state == TCP_CA_Open && ca->nv_reset) { + tcpnv_reset(ca, sk); + } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR || + new_state == TCP_CA_Recovery) { + ca->nv_reset = 1; + ca->nv_allow_cwnd_growth = 0; + if (new_state == TCP_CA_Loss) { + /* Reset cwnd growth factor to Reno value */ + if (ca->cwnd_growth_factor > 0) + ca->cwnd_growth_factor = 0; + /* Decrease growth rate if allowed */ + if (nv_cwnd_growth_rate_neg > 0 && + ca->cwnd_growth_factor > -8) + ca->cwnd_growth_factor--; + } + } +} + +/* Do congestion avoidance calculations for TCP-NV + */ +static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + unsigned long now = jiffies; + s64 rate64 = 0; + u32 rate, max_win, cwnd_by_slope; + u32 avg_rtt; + u32 bytes_acked = 0; + + /* Some calls are for duplicates without timetamps */ + if (sample->rtt_us < 0) + return; + + /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */ + if (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Disorder) + return; + + /* Stop cwnd growth if we were in catch up mode */ + if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) { + ca->nv_catchup = 0; + ca->nv_allow_cwnd_growth = 0; + } + + bytes_acked = tp->snd_una - ca->nv_last_snd_una; + ca->nv_last_snd_una = tp->snd_una; + + if (sample->in_flight == 0) + return; + + /* Calculate moving average of RTT */ + if (nv_rtt_factor > 0) { + if (ca->nv_last_rtt > 0) { + avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor + + ((u64)ca->nv_last_rtt) + * (256 - nv_rtt_factor)) >> 8; + } else { + avg_rtt = sample->rtt_us; + ca->nv_min_rtt = avg_rtt << 1; + } + ca->nv_last_rtt = avg_rtt; + } else { + avg_rtt = sample->rtt_us; + } + + /* rate in 100's bits per second */ + rate64 = ((u64)sample->in_flight) * 8000000; + rate = (u32)div64_u64(rate64, (u64)(avg_rtt * 100)); + + /* Remember the maximum rate seen during this RTT + * Note: It may be more than one RTT. This function should be + * called at least nv_dec_eval_min_calls times. + */ + if (ca->nv_rtt_max_rate < rate) + ca->nv_rtt_max_rate = rate; + + /* We have valid information, increment counter */ + if (ca->nv_eval_call_cnt < 255) + ca->nv_eval_call_cnt++; + + /* update min rtt if necessary */ + if (avg_rtt < ca->nv_min_rtt) + ca->nv_min_rtt = avg_rtt; + + /* update future min_rtt if necessary */ + if (avg_rtt < ca->nv_min_rtt_new) + ca->nv_min_rtt_new = avg_rtt; + + /* nv_min_rtt is updated with the minimum (possibley averaged) rtt + * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a + * warm reset). This new nv_min_rtt will be continued to be updated + * and be used for another sysctl_tcp_nv_reset_period seconds, + * when it will be updated again. + * In practice we introduce some randomness, so the actual period used + * is chosen randomly from the range: + * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4) + */ + if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) { + unsigned char rand; + + ca->nv_min_rtt = ca->nv_min_rtt_new; + ca->nv_min_rtt_new = NV_INIT_RTT; + get_random_bytes(&rand, 1); + ca->nv_min_rtt_reset_jiffies = + now + ((nv_reset_period * (384 + rand) * HZ) >> 9); + /* Every so often we decrease ca->nv_min_cwnd in case previous + * value is no longer accurate. + */ + ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND); + } + + /* Once per RTT check if we need to do congestion avoidance */ + if (before(ca->nv_rtt_start_seq, tp->snd_una)) { + ca->nv_rtt_start_seq = tp->snd_nxt; + if (ca->nv_rtt_cnt < 0xff) + /* Increase counter for RTTs without CA decision */ + ca->nv_rtt_cnt++; + + /* If this function is only called once within an RTT + * the cwnd is probably too small (in some cases due to + * tso, lro or interrupt coalescence), so we increase + * ca->nv_min_cwnd. + */ + if (ca->nv_eval_call_cnt == 1 && + bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache && + ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) { + ca->nv_min_cwnd = min(ca->nv_min_cwnd + + NV_MIN_CWND_GROW, + NV_TSO_CWND_BOUND + 1); + ca->nv_rtt_start_seq = tp->snd_nxt + + ca->nv_min_cwnd * tp->mss_cache; + ca->nv_eval_call_cnt = 0; + ca->nv_allow_cwnd_growth = 1; + return; + } + + /* Find the ideal cwnd for current rate from slope + * slope = 80000.0 * mss / nv_min_rtt + * cwnd_by_slope = nv_rtt_max_rate / slope + */ + cwnd_by_slope = (u32) + div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, + (u64)(80000 * tp->mss_cache)); + max_win = cwnd_by_slope + nv_pad; + + /* If cwnd > max_win, decrease cwnd + * if cwnd < max_win, grow cwnd + * else leave the same + */ + if (tp->snd_cwnd > max_win) { + /* there is congestion, check that it is ok + * to make a CA decision + * 1. We should have at least nv_dec_eval_min_calls + * data points before making a CA decision + * 2. We only make a congesion decision after + * nv_rtt_min_cnt RTTs + */ + if (ca->nv_rtt_cnt < nv_rtt_min_cnt) { + return; + } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { + if (ca->nv_eval_call_cnt < + nv_ssthresh_eval_min_calls) + return; + /* otherwise we will decrease cwnd */ + } else if (ca->nv_eval_call_cnt < + nv_dec_eval_min_calls) { + if (ca->nv_allow_cwnd_growth && + ca->nv_rtt_cnt > nv_stop_rtt_cnt) + ca->nv_allow_cwnd_growth = 0; + return; + } + + /* We have enough data to determine we are congested */ + ca->nv_allow_cwnd_growth = 0; + tp->snd_ssthresh = + (nv_ssthresh_factor * max_win) >> 3; + if (tp->snd_cwnd - max_win > 2) { + /* gap > 2, we do exponential cwnd decrease */ + int dec; + + dec = max(2U, ((tp->snd_cwnd - max_win) * + nv_cong_dec_mult) >> 7); + tp->snd_cwnd -= dec; + } else if (nv_cong_dec_mult > 0) { + tp->snd_cwnd = max_win; + } + if (ca->cwnd_growth_factor > 0) + ca->cwnd_growth_factor = 0; + ca->nv_no_cong_cnt = 0; + } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) { + /* There is no congestion, grow cwnd if allowed*/ + if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls) + return; + + ca->nv_allow_cwnd_growth = 1; + ca->nv_no_cong_cnt++; + if (ca->cwnd_growth_factor < 0 && + nv_cwnd_growth_rate_neg > 0 && + ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) { + ca->cwnd_growth_factor++; + ca->nv_no_cong_cnt = 0; + } else if (ca->cwnd_growth_factor >= 0 && + nv_cwnd_growth_rate_pos > 0 && + ca->nv_no_cong_cnt > + nv_cwnd_growth_rate_pos) { + ca->cwnd_growth_factor++; + ca->nv_no_cong_cnt = 0; + } + } else { + /* cwnd is in-between, so do nothing */ + return; + } + + /* update state */ + ca->nv_eval_call_cnt = 0; + ca->nv_rtt_cnt = 0; + ca->nv_rtt_max_rate = 0; + + /* Don't want to make cwnd < nv_min_cwnd + * (it wasn't before, if it is now is because nv + * decreased it). + */ + if (tp->snd_cwnd < nv_min_cwnd) + tp->snd_cwnd = nv_min_cwnd; + } +} + +/* Extract info for Tcp socket info provided via netlink */ +size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + const struct tcpnv *ca = inet_csk_ca(sk); + + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt; + info->vegas.tcpv_rtt = ca->nv_last_rtt; + info->vegas.tcpv_minrtt = ca->nv_min_rtt; + + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); + } + return 0; +} +EXPORT_SYMBOL_GPL(tcpnv_get_info); + +static struct tcp_congestion_ops tcpnv __read_mostly = { + .init = tcpnv_init, + .ssthresh = tcpnv_recalc_ssthresh, + .cong_avoid = tcpnv_cong_avoid, + .set_state = tcpnv_state, + .undo_cwnd = tcpnv_undo_cwnd, + .pkts_acked = tcpnv_acked, + .get_info = tcpnv_get_info, + + .owner = THIS_MODULE, + .name = "nv", +}; + +static int __init tcpnv_register(void) +{ + BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE); + + return tcp_register_congestion_control(&tcpnv); +} + +static void __exit tcpnv_unregister(void) +{ + tcp_unregister_congestion_control(&tcpnv); +} + +module_init(tcpnv_register); +module_exit(tcpnv_unregister); + +MODULE_AUTHOR("Lawrence Brakmo"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP NV"); +MODULE_VERSION("1.0"); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e00e972c4e6a..b26aa870adc0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -911,9 +911,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); + tp = tcp_sk(sk); if (clone_it) { skb_mstamp_get(&skb->skb_mstamp); + TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq + - tp->snd_una; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -924,7 +927,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } inet = inet_sk(sk); - tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 47f12c73d959..58bd39fb14b4 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -76,6 +76,67 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); +void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + if (!dev->netdev_ops->ndo_udp_tunnel_add) + return; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); + +/* Notify netdevs that UDP port started listening */ +void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_add) + continue; + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); + +/* Notify netdevs that UDP port is no more listening */ +void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_del) + continue; + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); + void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7b0edb37a115..b644a23c3db0 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -295,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = { { } }; -static int __net_init xfrm4_net_sysctl_init(struct net *net) +static __net_init int xfrm4_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -323,7 +323,7 @@ err_alloc: return -ENOMEM; } -static void __net_exit xfrm4_net_sysctl_exit(struct net *net) +static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -336,12 +336,12 @@ static void __net_exit xfrm4_net_sysctl_exit(struct net *net) kfree(table); } #else /* CONFIG_SYSCTL */ -static int inline xfrm4_net_sysctl_init(struct net *net) +static inline int xfrm4_net_sysctl_init(struct net *net) { return 0; } -static void inline xfrm4_net_sysctl_exit(struct net *net) +static inline void xfrm4_net_sysctl_exit(struct net *net) { } #endif |