diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 12 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 2 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 64 | ||||
-rw-r--r-- | net/ipv4/inet_timewait_sock.c | 6 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 4 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 9 | ||||
-rw-r--r-- | net/ipv4/raw.c | 15 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_bbr.c | 12 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 12 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 61 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 6 | ||||
-rw-r--r-- | net/ipv4/tcp_rate.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp_recovery.c | 28 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 49 |
15 files changed, 206 insertions, 85 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ce4aa827be05..f00499a46927 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1223,9 +1223,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool fixedid = false, gso_partial, encap; + bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; + unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; @@ -1260,6 +1261,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ @@ -1279,7 +1281,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) { + if (udpfrag) { + iph->frag_off = htons(offset >> 3); + if (skb->next) + iph->frag_off |= htons(IP_MF); + offset += skb->len - nhoff - ihl; + tot_len = skb->len - nhoff; + } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a4573bccd6da..7a93359fbc72 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1428,7 +1428,7 @@ skip: static bool inetdev_valid_mtu(unsigned int mtu) { - return mtu >= 68; + return mtu >= IPV4_MIN_MTU; } static void inetdev_send_gratuitous_arp(struct net_device *dev, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index ab183af0b5b6..726f6b608274 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -89,6 +89,7 @@ #include <linux/rtnetlink.h> #include <linux/times.h> #include <linux/pkt_sched.h> +#include <linux/byteorder/generic.h> #include <net/net_namespace.h> #include <net/arp.h> @@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } +/* source address selection per RFC 3376 section 4.2.13 */ +static __be32 igmpv3_get_srcaddr(struct net_device *dev, + const struct flowi4 *fl4) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return htonl(INADDR_ANY); + + for_ifa(in_dev) { + if (inet_ifa_match(fl4->saddr, ifa)) + return fl4->saddr; + } endfor_ifa(in_dev); + + return htonl(INADDR_ANY); +} + static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; @@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; - pip->saddr = fl4.saddr; + pip->saddr = igmpv3_get_srcaddr(dev, &fl4); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); @@ -404,16 +422,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, - int type, struct igmpv3_grec **ppgr) + int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; - if (!skb) - skb = igmpv3_newpack(dev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = igmpv3_newpack(dev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -436,12 +455,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV4_MIN_MTU) + return skb; + isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || @@ -462,7 +486,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); } } first = 1; @@ -498,12 +522,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -538,7 +562,7 @@ empty_source: igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) @@ -752,18 +776,18 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, return ip_local_out(net, skb->sk, skb); } -static void igmp_gq_timer_expire(unsigned long data) +static void igmp_gq_timer_expire(struct timer_list *t) { - struct in_device *in_dev = (struct in_device *)data; + struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer); in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); in_dev_put(in_dev); } -static void igmp_ifc_timer_expire(unsigned long data) +static void igmp_ifc_timer_expire(struct timer_list *t) { - struct in_device *in_dev = (struct in_device *)data; + struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); igmpv3_send_cr(in_dev); if (in_dev->mr_ifc_count) { @@ -784,9 +808,9 @@ static void igmp_ifc_event(struct in_device *in_dev) } -static void igmp_timer_expire(unsigned long data) +static void igmp_timer_expire(struct timer_list *t) { - struct ip_mc_list *im = (struct ip_mc_list *)data; + struct ip_mc_list *im = from_timer(im, t, timer); struct in_device *in_dev = im->interface; spin_lock(&im->lock); @@ -1385,7 +1409,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST - setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); + timer_setup(&im->timer, igmp_timer_expire, 0); im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; #endif @@ -1695,10 +1719,8 @@ void ip_mc_init_dev(struct in_device *in_dev) ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST - setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire, - (unsigned long)in_dev); - setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, - (unsigned long)in_dev); + timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); + timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c690cd0d9b3f..b563e0c46bac 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -93,7 +93,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, } /* - * Enter the time wait state. + * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ @@ -111,7 +111,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; - spin_lock_bh(&bhead->lock); + spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); @@ -137,7 +137,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock_bh(lock); + spin_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index fe6fee728ce4..5ddb1cb52bd4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); - if (mtu < 68) - mtu = 68; + if (mtu < IPV4_MIN_MTU) + mtu = IPV4_MIN_MTU; return mtu; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 40a43ad294cb..fd5f19c988e4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -112,7 +112,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, bool all); -static void ipmr_expire_process(unsigned long arg); +static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES #define ipmr_for_each_table(mrt, net) \ @@ -375,8 +375,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); - setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, - (unsigned long)mrt); + timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); mrt->mroute_reg_vif_num = -1; #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -804,9 +803,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) } /* Timer process for the unresolved queue. */ -static void ipmr_expire_process(unsigned long arg) +static void ipmr_expire_process(struct timer_list *t) { - struct mr_table *mrt = (struct mr_table *)arg; + struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); unsigned long now; unsigned long expires; struct mfc_cache *c, *next; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33b70bfd1122..125c1eab3eaa 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; + int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; + /* hdrincl should be READ_ONCE(inet->hdrincl) + * but READ_ONCE() doesn't work with bit fields + */ + hdrincl = inet->hdrincl; /* * Check the flags. */ @@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ - if (inet->hdrincl) + if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) @@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | - (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), + (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); - if (!inet->hdrincl) { + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_confirm; back_from_confirm: - if (inet->hdrincl) + if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bf97317e6c97..f08eebe60446 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2412,6 +2412,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; tcp_clear_retrans(tp); inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 69ee877574d0..8322f26e770e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -110,7 +110,8 @@ struct bbr { u32 lt_last_lost; /* LT intvl start: tp->lost */ u32 pacing_gain:10, /* current gain for setting pacing rate */ cwnd_gain:10, /* current gain for setting cwnd */ - full_bw_cnt:3, /* number of rounds without large bw gains */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ cycle_idx:3, /* current index in pacing_gain cycle array */ has_seen_rtt:1, /* have we seen an RTT sample yet? */ unused_b:5; @@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const struct sock *sk) { const struct bbr *bbr = inet_csk_ca(sk); - return bbr->full_bw_cnt >= bbr_full_bw_cnt; + return bbr->full_bw_reached; } /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ @@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(struct sock *sk, return; } ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; } /* If pipe is probably full, drain the queue and then enter steady-state. */ @@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk) bbr->restore_cwnd = 0; bbr->round_start = 0; bbr->idle_restart = 0; + bbr->full_bw_reached = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; bbr->cycle_mstamp = 0; @@ -871,6 +874,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk) */ static u32 bbr_undo_cwnd(struct sock *sk) { + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; + bbr_reset_lt_bw_sampling(sk); return tcp_sk(sk)->snd_cwnd; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 734cfc8ff76e..9550cc42de2d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -579,6 +579,7 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; + tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; @@ -1941,6 +1942,8 @@ void tcp_enter_loss(struct sock *sk) if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; + /* Mark SACK reneging until we recover from this loss event. */ + tp->is_sack_reneg = 1; } tcp_clear_all_retrans_hints(tp); @@ -2326,6 +2329,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) } tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; + tp->rack.advanced = 1; /* Force RACK to re-exam losses */ } static inline bool tcp_may_undo(const struct tcp_sock *tp) @@ -2364,6 +2368,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) return true; } tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; return false; } @@ -2397,8 +2402,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - if (frto_undo || tcp_is_sack(tp)) + if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; + } return true; } return false; @@ -3495,6 +3502,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct tcp_sacktag_state sack_state; struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; + bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; @@ -3611,7 +3619,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, sack_state.rate); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c6bc0c4d19c6..94e28350f420 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -848,7 +848,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); @@ -1591,6 +1591,34 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_filter); +static void tcp_v4_restore_cb(struct sk_buff *skb) +{ + memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, + sizeof(struct inet_skb_parm)); +} + +static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, + const struct tcphdr *th) +{ + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; +} + /* * From tcp_input.c */ @@ -1631,24 +1659,6 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); - /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() - * barrier() makes sure compiler wont play fool^Waliasing games. - */ - memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), - sizeof(struct inet_skb_parm)); - barrier(); - - TCP_SKB_CB(skb)->seq = ntohl(th->seq); - TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + - skb->len - th->doff * 4); - TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; - TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->has_rxtstamp = - skb->tstamp || skb_hwtstamps(skb)->hwtstamp; - lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); @@ -1679,14 +1689,19 @@ process: sock_hold(sk); refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); + tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false); + } if (!nsk) { reqsk_put(req); goto discard_and_relse; } if (nsk == sk) { reqsk_put(req); + tcp_v4_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; @@ -1712,6 +1727,7 @@ process: goto discard_and_relse; th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); + tcp_v4_fill_cb(skb, iph, th); skb->dev = NULL; @@ -1742,6 +1758,8 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; + tcp_v4_fill_cb(skb, iph, th); + if (tcp_checksum_complete(skb)) { csum_error: __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); @@ -1768,6 +1786,8 @@ do_time_wait: goto discard_it; } + tcp_v4_fill_cb(skb, iph, th); + if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1784,6 +1804,7 @@ do_time_wait: if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; + tcp_v4_restore_cb(skb); refcounted = false; goto process; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e36eff0403f4..b079b619b60c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -310,10 +310,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; + /* tw_timer is pinned, so we need to make sure BH are disabled + * in following section, otherwise timer handler could run before + * we complete the initialization. + */ + local_bh_disable(); inet_twsk_schedule(tw, timeo); /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); + local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 3330a370d306..c61240e43923 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample. */ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs) + bool is_sack_reneg, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp) { + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { rs->delivered = -1; rs->interval_us = -1; return; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index d3ea89020c69..3a81720ac0c4 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -55,7 +55,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) && + min_rtt != ~0U) { reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); reo_wnd = min(reo_wnd, tp->srtt_us >> 3); } @@ -79,12 +80,12 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) */ remaining = tp->rack.rtt_us + reo_wnd - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); - if (remaining < 0) { + if (remaining <= 0) { tcp_rack_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { - /* Record maximum wait time (+1 to avoid 0) */ - *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + /* Record maximum wait time */ + *reo_timeout = max_t(u32, *reo_timeout, remaining); } } } @@ -116,13 +117,8 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, { u32 rtt_us; - if (tp->rack.mstamp && - !tcp_rack_sent_after(xmit_time, tp->rack.mstamp, - end_seq, tp->rack.end_seq)) - return; - rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); - if (sacked & TCPCB_RETRANS) { + if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -133,13 +129,15 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ - if (rtt_us < tcp_min_rtt(tp)) - return; + return; } - tp->rack.rtt_us = rtt_us; - tp->rack.mstamp = xmit_time; - tp->rack.end_seq = end_seq; tp->rack.advanced = 1; + tp->rack.rtt_us = rtt_us; + if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp, + end_seq, tp->rack.end_seq)) { + tp->rack.mstamp = xmit_time; + tp->rack.end_seq = end_seq; + } } /* We have waited long enough to accommodate reordering. Mark the expired diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e360d55be555..01801b77bd0d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -187,16 +187,57 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); -static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + __wsum csum; + struct udphdr *uh; + struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { segs = skb_udp_tunnel_segment(skb, features, false); + goto out; + } + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + uh = udp_hdr(skb); + iph = ip_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: return segs; } @@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_segment = udp4_tunnel_segment, + .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, }, |