diff options
Diffstat (limited to 'net')
90 files changed, 2102 insertions, 1066 deletions
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 42d0997e2fbb..8a8f77a247e6 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -733,7 +733,7 @@ void bt_procfs_cleanup(struct net *net, const char *name) EXPORT_SYMBOL(bt_procfs_init); EXPORT_SYMBOL(bt_procfs_cleanup); -static struct net_proto_family bt_sock_family_ops = { +static const struct net_proto_family bt_sock_family_ops = { .owner = THIS_MODULE, .family = PF_BLUETOOTH, .create = bt_sock_create, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 0db8102995a5..4efd5d54498a 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -150,7 +150,6 @@ static int br_stp_call_user(struct net_bridge *br, char *arg) static void br_stp_start(struct net_bridge *br) { - struct net_bridge_port *p; int err = -ENOENT; if (net_eq(dev_net(br->dev), &init_net)) @@ -169,11 +168,6 @@ static void br_stp_start(struct net_bridge *br) if (!err) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); - - /* Stop hello and hold timers */ - del_timer(&br->hello_timer); - list_for_each_entry(p, &br->port_list, list) - del_timer(&p->hold_timer); } else { br->stp_enabled = BR_KERNEL_STP; br_debug(br, "using kernel STP\n"); @@ -188,7 +182,6 @@ static void br_stp_start(struct net_bridge *br) static void br_stp_stop(struct net_bridge *br) { - struct net_bridge_port *p; int err; if (br->stp_enabled == BR_USER_STP) { @@ -197,10 +190,6 @@ static void br_stp_stop(struct net_bridge *br) br_err(br, "failed to stop userspace STP (%d)\n", err); /* To start timers on any ports left in blocking */ - mod_timer(&br->hello_timer, jiffies + br->hello_time); - list_for_each_entry(p, &br->port_list, list) - mod_timer(&p->hold_timer, - round_jiffies(jiffies + BR_HOLD_TIME)); spin_lock_bh(&br->lock); br_port_state_selection(br); spin_unlock_bh(&br->lock); diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 346ef6b00b8f..c16dd3a47fc6 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -111,7 +111,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net, __wsum csum; u8 proto; - if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb)) + if (!nft_bridge_iphdr_validate(oldskb)) return; /* IP header checks: fragment. */ @@ -226,9 +226,6 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) __be16 fo; u8 proto = ip6h->nexthdr; - if (skb->csum_bad) - return false; - if (skb_csum_unnecessary(skb)) return true; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index adcad344c843..4674d17e7c08 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1099,7 +1099,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, } -static struct net_proto_family caif_family_ops = { +static const struct net_proto_family caif_family_ops = { .family = PF_CAIF, .create = caif_create, .owner = THIS_MODULE, diff --git a/net/core/datagram.c b/net/core/datagram.c index db1866f2ffcf..bc46118486fe 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -161,6 +161,45 @@ done: return skb; } +struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb), + int *peeked, int *off, int *err, + struct sk_buff **last) +{ + struct sk_buff *skb; + int _off = *off; + + *last = queue->prev; + skb_queue_walk(queue, skb) { + if (flags & MSG_PEEK) { + if (_off >= skb->len && (skb->len || _off || + skb->peeked)) { + _off -= skb->len; + continue; + } + if (!skb->len) { + skb = skb_set_peeked(skb); + if (unlikely(IS_ERR(skb))) { + *err = PTR_ERR(skb); + return NULL; + } + } + *peeked = 1; + atomic_inc(&skb->users); + } else { + __skb_unlink(skb, queue); + if (destructor) + destructor(sk, skb); + } + *off = _off; + return skb; + } + return NULL; +} + /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket @@ -222,40 +261,14 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ - int _off = *off; - - *last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); - skb_queue_walk(queue, skb) { - *last = skb; - if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { - _off -= skb->len; - continue; - } - if (!skb->len) { - skb = skb_set_peeked(skb); - if (IS_ERR(skb)) { - error = PTR_ERR(skb); - spin_unlock_irqrestore(&queue->lock, - cpu_flags); - goto no_packet; - } - } - *peeked = 1; - atomic_inc(&skb->users); - } else { - __skb_unlink(skb, queue); - if (destructor) - destructor(sk, skb); - } - spin_unlock_irqrestore(&queue->lock, cpu_flags); - *off = _off; - return skb; - } - + skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, + peeked, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (error) + goto no_packet; + if (skb) + return skb; if (!sk_can_busy_loop(sk)) break; @@ -335,8 +348,8 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) } EXPORT_SYMBOL(__skb_free_datagram_locked); -int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, - unsigned int flags, +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, + struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)) { @@ -344,15 +357,15 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, if (flags & MSG_PEEK) { err = -ENOENT; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); + spin_lock_bh(&sk_queue->lock); + if (skb == skb_peek(sk_queue)) { + __skb_unlink(skb, sk_queue); atomic_dec(&skb->users); if (destructor) destructor(sk, skb); err = 0; } - spin_unlock_bh(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk_queue->lock); } atomic_inc(&sk->sk_drops); @@ -383,7 +396,8 @@ EXPORT_SYMBOL(__sk_queue_drop_skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { - int err = __sk_queue_drop_skb(sk, skb, flags, NULL); + int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, + NULL); kfree_skb(skb); sk_mem_reclaim_partial(sk); diff --git a/net/core/dev.c b/net/core/dev.c index fca407b4a6ea..3d98fbf4cbb0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -105,6 +105,7 @@ #include <net/dst.h> #include <net/dst_metadata.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/highmem.h> @@ -142,6 +143,7 @@ #include <linux/hrtimer.h> #include <linux/netfilter_ingress.h> #include <linux/crash_dump.h> +#include <linux/sctp.h> #include "net-sysfs.h" @@ -161,6 +163,7 @@ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, struct netdev_notifier_info *info); +static struct napi_struct *napi_by_id(unsigned int napi_id); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -865,6 +868,31 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) EXPORT_SYMBOL(dev_get_by_index); /** + * dev_get_by_napi_id - find a device by napi_id + * @napi_id: ID of the NAPI struct + * + * Search for an interface by NAPI ID. Returns %NULL if the device + * is not found or a pointer to the device. The device has not had + * its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_napi_id(unsigned int napi_id) +{ + struct napi_struct *napi; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (napi_id < MIN_NAPI_ID) + return NULL; + + napi = napi_by_id(napi_id); + + return napi ? napi->dev : NULL; +} +EXPORT_SYMBOL(dev_get_by_napi_id); + +/** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. @@ -2611,6 +2639,47 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +int skb_crc32c_csum_help(struct sk_buff *skb) +{ + __le32 crc32c_csum; + int ret = 0, offset, start; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + goto out; + + if (unlikely(skb_is_gso(skb))) + goto out; + + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (unlikely(skb_has_shared_frag(skb))) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + start = skb_checksum_start_offset(skb); + offset = start + offsetof(struct sctphdr, checksum); + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + ret = -EINVAL; + goto out; + } + if (skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(__le32))) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, + skb->len - start, ~(__u32)0, + crc32c_csum_stub)); + *(__le32 *)(skb->data + offset) = crc32c_csum; + skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; +out: + return ret; +} + __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; @@ -2953,6 +3022,17 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, return skb; } +int skb_csum_hwoffload_help(struct sk_buff *skb, + const netdev_features_t features) +{ + if (unlikely(skb->csum_not_inet)) + return !!(features & NETIF_F_SCTP_CRC) ? 0 : + skb_crc32c_csum_help(skb); + + return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); +} +EXPORT_SYMBOL(skb_csum_hwoffload_help); + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2991,8 +3071,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_CSUM_MASK) && - skb_checksum_help(skb)) + if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } @@ -3178,7 +3257,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3948,7 +4027,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, skb->tc_at_ingress = 1; qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -4636,9 +4715,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - if (skb->csum_bad) - goto normal; - gro_list_prepare(napi, skb); rcu_read_lock(); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index b94b1d293506..77f04e71100f 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -225,6 +225,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 65ea0ff4017c..58e6cc70500d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -323,7 +323,11 @@ NETDEVICE_SHOW_RW(flags, fmt_hex); static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - int res, orig_len = dev->tx_queue_len; + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; if (new_len != orig_len) { dev->tx_queue_len = new_len; @@ -349,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev, return netdev_store(dev, attr, buf, len, change_tx_queue_len); } -NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); +NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 49a279a7cc15..dab2834f58f8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2048,8 +2048,8 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_TXQLEN]) { - unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); - unsigned long orig_len = dev->tx_queue_len; + unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); + unsigned int orig_len = dev->tx_queue_len; if (dev->tx_queue_len ^ value) { dev->tx_queue_len = value; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 346d3e85dfbc..780b7c1563d0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2243,6 +2243,32 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) +{ + net_warn_ratelimited( + "%s: attempt to compute crc32c without libcrc32c.ko\n", + __func__); + return 0; +} + +static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, + int offset, int len) +{ + net_warn_ratelimited( + "%s: attempt to compute crc32c without libcrc32c.ko\n", + __func__); + return 0; +} + +static const struct skb_checksum_ops default_crc32c_ops = { + .update = warn_crc32c_csum_update, + .combine = warn_crc32c_csum_combine, +}; + +const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = + &default_crc32c_ops; +EXPORT_SYMBOL(crc32c_csum_stub); + /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer @@ -3875,6 +3901,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; + if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) + return; + tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; diff --git a/net/core/sock.c b/net/core/sock.c index 727f924b7f91..bef844127e01 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1038,6 +1038,10 @@ set_rcvbuf: #endif case SO_MAX_PACING_RATE: + if (val != ~0U) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); sk->sk_max_pacing_rate = val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 93106120f987..733f523707ac 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -178,10 +178,6 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, }; -static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = { - [DCB_ATTR_IEEE_APP] = {.len = sizeof(struct dcb_app)}, -}; - /* DCB number of traffic classes nested attributes. */ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, @@ -1463,8 +1459,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { struct dcb_app *app_data; + if (nla_type(attr) != DCB_ATTR_IEEE_APP) continue; + + if (nla_len(attr) < sizeof(struct dcb_app)) { + err = -ERANGE; + goto err; + } + app_data = nla_data(attr); if (ops->ieee_setapp) err = ops->ieee_setapp(netdev, app_data); diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 5e3a7302f774..e1295d5f2c56 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -233,7 +233,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const u32 now = ccid2_time_stamp; + const u32 now = ccid2_jiffies32; struct ccid2_seq *next; /* slow-start after idle periods (RFC 2581, RFC 2861) */ @@ -466,7 +466,7 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, * The cleanest solution is to not use the ccid2s_sent field at all * and instead use DCCP timestamps: requires changes in other places. */ - ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); + ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) @@ -478,7 +478,7 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) return; } - hc->tx_last_cong = ccid2_time_stamp; + hc->tx_last_cong = ccid2_jiffies32; hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; hc->tx_ssthresh = max(hc->tx_cwnd, 2U); @@ -731,7 +731,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; - hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp; + hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; hc->tx_cwnd_used = 0; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 18c97543e522..6e50ef2898fb 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -27,7 +27,7 @@ * CCID-2 timestamping faces the same issues as TCP timestamping. * Hence we reuse/share as much of the code as possible. */ -#define ccid2_time_stamp tcp_time_stamp +#define ccid2_jiffies32 ((u32)jiffies) /* NUMDUPACK parameter from RFC 4341, p. 6 */ #define NUMDUPACK 3 diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 81a0868edb1d..297389b2ab35 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -25,16 +25,16 @@ config NET_DSA_TAG_DSA config NET_DSA_TAG_EDSA bool -config NET_DSA_TAG_TRAILER +config NET_DSA_TAG_LAN9303 bool -config NET_DSA_TAG_QCA +config NET_DSA_TAG_MTK bool -config NET_DSA_TAG_MTK +config NET_DSA_TAG_TRAILER bool -config NET_DSA_TAG_LAN9303 +config NET_DSA_TAG_QCA bool endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 0b747d75e65a..90e5aa6f7d0f 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,12 +1,12 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o slave.o dsa2.o switch.o legacy.o +dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o -dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o -dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o -dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o +dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o +dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 26130ae438da..3288a80d4d6c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -24,7 +24,7 @@ #include <linux/phy_fixed.h> #include <linux/gpio/consumer.h> #include <linux/etherdevice.h> -#include <net/dsa.h> + #include "dsa_priv.h" static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, @@ -40,26 +40,26 @@ static const struct dsa_device_ops none_ops = { }; const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { +#ifdef CONFIG_NET_DSA_TAG_BRCM + [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#endif #ifdef CONFIG_NET_DSA_TAG_DSA [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, #endif #ifdef CONFIG_NET_DSA_TAG_EDSA [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, #endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops, +#ifdef CONFIG_NET_DSA_TAG_LAN9303 + [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, #endif -#ifdef CONFIG_NET_DSA_TAG_BRCM - [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#ifdef CONFIG_NET_DSA_TAG_MTK + [DSA_TAG_PROTO_MTK] = &mtk_netdev_ops, #endif #ifdef CONFIG_NET_DSA_TAG_QCA [DSA_TAG_PROTO_QCA] = &qca_netdev_ops, #endif -#ifdef CONFIG_NET_DSA_TAG_MTK - [DSA_TAG_PROTO_MTK] = &mtk_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_LAN9303 - [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, +#ifdef CONFIG_NET_DSA_TAG_TRAILER + [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops, #endif [DSA_TAG_PROTO_NONE] = &none_ops, }; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 033b3bfb63dc..4301f52e4f5a 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -18,7 +18,7 @@ #include <linux/rtnetlink.h> #include <linux/of.h> #include <linux/of_net.h> -#include <net/dsa.h> + #include "dsa_priv.h" static LIST_HEAD(dsa_switch_trees); @@ -443,8 +443,8 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - if (dst->cpu_switch) { - err = dsa_cpu_port_ethtool_setup(dst->cpu_switch); + if (dst->cpu_dp) { + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp->ds); if (err) return err; } @@ -484,8 +484,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - if (dst->cpu_switch) - dsa_cpu_port_ethtool_restore(dst->cpu_switch); + if (dst->cpu_dp) + dsa_cpu_port_ethtool_restore(dst->cpu_dp->ds); pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; @@ -518,10 +518,8 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, if (!dst->master_netdev) dst->master_netdev = ethernet_dev; - if (!dst->cpu_switch) { - dst->cpu_switch = ds; - dst->cpu_port = index; - } + if (!dst->cpu_dp) + dst->cpu_dp = port; tag_protocol = ds->ops->get_tag_protocol(ds); dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f4a88e485213..1d52f9051d0e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -14,6 +14,57 @@ #include <linux/phy.h> #include <linux/netdevice.h> #include <linux/netpoll.h> +#include <net/dsa.h> + +enum { + DSA_NOTIFIER_AGEING_TIME, + DSA_NOTIFIER_BRIDGE_JOIN, + DSA_NOTIFIER_BRIDGE_LEAVE, + DSA_NOTIFIER_FDB_ADD, + DSA_NOTIFIER_FDB_DEL, + DSA_NOTIFIER_MDB_ADD, + DSA_NOTIFIER_MDB_DEL, + DSA_NOTIFIER_VLAN_ADD, + DSA_NOTIFIER_VLAN_DEL, +}; + +/* DSA_NOTIFIER_AGEING_TIME */ +struct dsa_notifier_ageing_time_info { + struct switchdev_trans *trans; + unsigned int ageing_time; + int sw_index; +}; + +/* DSA_NOTIFIER_BRIDGE_* */ +struct dsa_notifier_bridge_info { + struct net_device *br; + int sw_index; + int port; +}; + +/* DSA_NOTIFIER_FDB_* */ +struct dsa_notifier_fdb_info { + const struct switchdev_obj_port_fdb *fdb; + struct switchdev_trans *trans; + int sw_index; + int port; +}; + +/* DSA_NOTIFIER_MDB_* */ +struct dsa_notifier_mdb_info { + const struct switchdev_obj_port_mdb *mdb; + struct switchdev_trans *trans; + int sw_index; + int port; +}; + +/* DSA_NOTIFIER_VLAN_* */ +struct dsa_notifier_vlan_info { + const struct switchdev_obj_port_vlan *vlan; + struct switchdev_trans *trans; + int sw_index; + int port; +}; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); @@ -59,6 +110,39 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); int dsa_legacy_register(void); void dsa_legacy_unregister(void); +/* port.c */ +int dsa_port_set_state(struct dsa_port *dp, u8 state, + struct switchdev_trans *trans); +void dsa_port_set_state_now(struct dsa_port *dp, u8 state); +int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); +void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct switchdev_trans *trans); +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, + struct switchdev_trans *trans); +int dsa_port_fdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb, + struct switchdev_trans *trans); +int dsa_port_fdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb); +int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, + switchdev_obj_dump_cb_t *cb); +int dsa_port_mdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans); +int dsa_port_mdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, + switchdev_obj_dump_cb_t *cb); +int dsa_port_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct switchdev_trans *trans); +int dsa_port_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan); +int dsa_port_vlan_dump(struct dsa_port *dp, + struct switchdev_obj_port_vlan *vlan, + switchdev_obj_dump_cb_t *cb); + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); @@ -75,25 +159,25 @@ void dsa_slave_unregister_notifier(void); int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); +/* tag_brcm.c */ +extern const struct dsa_device_ops brcm_netdev_ops; + /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ extern const struct dsa_device_ops edsa_netdev_ops; -/* tag_trailer.c */ -extern const struct dsa_device_ops trailer_netdev_ops; +/* tag_lan9303.c */ +extern const struct dsa_device_ops lan9303_netdev_ops; -/* tag_brcm.c */ -extern const struct dsa_device_ops brcm_netdev_ops; +/* tag_mtk.c */ +extern const struct dsa_device_ops mtk_netdev_ops; /* tag_qca.c */ extern const struct dsa_device_ops qca_netdev_ops; -/* tag_mtk.c */ -extern const struct dsa_device_ops mtk_netdev_ops; - -/* tag_lan9303.c */ -extern const struct dsa_device_ops lan9303_netdev_ops; +/* tag_trailer.c */ +extern const struct dsa_device_ops trailer_netdev_ops; #endif diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index ad345c8b0b06..ac4379b8d7ac 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -22,7 +22,7 @@ #include <linux/sysfs.h> #include <linux/phy_fixed.h> #include <linux/etherdevice.h> -#include <net/dsa.h> + #include "dsa_priv.h" /* switch driver registration ***********************************************/ @@ -115,13 +115,12 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) continue; if (!strcmp(name, "cpu")) { - if (dst->cpu_switch) { + if (dst->cpu_dp) { netdev_err(dst->master_netdev, "multiple cpu ports?!\n"); return -EINVAL; } - dst->cpu_switch = ds; - dst->cpu_port = i; + dst->cpu_dp = &ds->ports[i]; ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; @@ -144,7 +143,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * tagging protocol to the preferred tagging format of this * switch. */ - if (dst->cpu_switch == ds) { + if (dst->cpu_dp->ds == ds) { enum dsa_tag_protocol tag_protocol; tag_protocol = ops->get_tag_protocol(ds); @@ -624,7 +623,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, dst->pd = pd; dst->master_netdev = dev; - dst->cpu_port = -1; for (i = 0; i < pd->nr_chips; i++) { struct dsa_switch *ds; @@ -735,7 +733,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_switch); + dsa_cpu_port_ethtool_restore(dst->cpu_dp->ds); dev_put(dst->master_netdev); } diff --git a/net/dsa/port.c b/net/dsa/port.c new file mode 100644 index 000000000000..c88c0cec8454 --- /dev/null +++ b/net/dsa/port.c @@ -0,0 +1,260 @@ +/* + * Handling of a single switch port + * + * Copyright (c) 2017 Savoir-faire Linux Inc. + * Vivien Didelot <vivien.didelot@savoirfairelinux.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/if_bridge.h> +#include <linux/notifier.h> + +#include "dsa_priv.h" + +static int dsa_port_notify(struct dsa_port *dp, unsigned long e, void *v) +{ + struct raw_notifier_head *nh = &dp->ds->dst->nh; + int err; + + err = raw_notifier_call_chain(nh, e, v); + + return notifier_to_errno(err); +} + +int dsa_port_set_state(struct dsa_port *dp, u8 state, + struct switchdev_trans *trans) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (switchdev_trans_ph_prepare(trans)) + return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; + + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, port, state); + + if (ds->ops->port_fast_age) { + /* Fast age FDB entries or flush appropriate forwarding database + * for the given port, if we are moving it from Learning or + * Forwarding state, to Disabled or Blocking or Listening state. + */ + + if ((dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING) && + (state == BR_STATE_DISABLED || + state == BR_STATE_BLOCKING || + state == BR_STATE_LISTENING)) + ds->ops->port_fast_age(ds, port); + } + + dp->stp_state = state; + + return 0; +} + +void dsa_port_set_state_now(struct dsa_port *dp, u8 state) +{ + int err; + + err = dsa_port_set_state(dp, state, NULL); + if (err) + pr_err("DSA: failed to set STP state %u (%d)\n", state, err); +} + +int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) +{ + struct dsa_notifier_bridge_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .br = br, + }; + int err; + + /* Here the port is already bridged. Reflect the current configuration + * so that drivers can program their chips accordingly. + */ + dp->bridge_dev = br; + + err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); + + /* The bridging is rolled back on error */ + if (err) + dp->bridge_dev = NULL; + + return err; +} + +void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) +{ + struct dsa_notifier_bridge_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .br = br, + }; + int err; + + /* Here the port is already unbridged. Reflect the current configuration + * so that drivers can program their chips accordingly. + */ + dp->bridge_dev = NULL; + + err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); + if (err) + pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); + + /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, + * so allow it to be in BR_STATE_FORWARDING to be kept functional + */ + dsa_port_set_state_now(dp, BR_STATE_FORWARDING); +} + +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct switchdev_trans *trans) +{ + struct dsa_switch *ds = dp->ds; + + /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ + if (switchdev_trans_ph_prepare(trans)) + return 0; + + if (ds->ops->port_vlan_filtering) + return ds->ops->port_vlan_filtering(ds, dp->index, + vlan_filtering); + + return 0; +} + +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, + struct switchdev_trans *trans) +{ + unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock); + unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); + struct dsa_notifier_ageing_time_info info = { + .ageing_time = ageing_time, + .sw_index = dp->ds->index, + .trans = trans, + }; + + if (switchdev_trans_ph_prepare(trans)) + return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); + + dp->ageing_time = ageing_time; + + return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); +} + +int dsa_port_fdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb, + struct switchdev_trans *trans) +{ + struct dsa_notifier_fdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .trans = trans, + .fdb = fdb, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_FDB_ADD, &info); +} + +int dsa_port_fdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_fdb *fdb) +{ + struct dsa_notifier_fdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .fdb = fdb, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); +} + +int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_fdb_dump) + return ds->ops->port_fdb_dump(ds, dp->index, fdb, cb); + + return -EOPNOTSUPP; +} + +int dsa_port_mdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans) +{ + struct dsa_notifier_mdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .trans = trans, + .mdb = mdb, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info); +} + +int dsa_port_mdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_notifier_mdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mdb = mdb, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info); +} + +int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_mdb_dump) + return ds->ops->port_mdb_dump(ds, dp->index, mdb, cb); + + return -EOPNOTSUPP; +} + +int dsa_port_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct switchdev_trans *trans) +{ + struct dsa_notifier_vlan_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .trans = trans, + .vlan = vlan, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); +} + +int dsa_port_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan) +{ + struct dsa_notifier_vlan_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .vlan = vlan, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); +} + +int dsa_port_vlan_dump(struct dsa_port *dp, + struct switchdev_obj_port_vlan *vlan, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_vlan_dump) + return ds->ops->port_vlan_dump(ds, dp->index, vlan, cb); + + return -EOPNOTSUPP; +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7693182df81e..887e26695519 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -17,28 +17,16 @@ #include <linux/of_mdio.h> #include <linux/mdio.h> #include <linux/list.h> -#include <net/dsa.h> #include <net/rtnetlink.h> -#include <net/switchdev.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/netpoll.h> + #include "dsa_priv.h" static bool dsa_slave_dev_check(struct net_device *dev); -static int dsa_slave_notify(struct net_device *dev, unsigned long e, void *v) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct raw_notifier_head *nh = &p->dp->ds->dst->nh; - int err; - - err = raw_notifier_call_chain(nh, e, v); - - return notifier_to_errno(err); -} - /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -86,33 +74,6 @@ static inline bool dsa_port_is_bridged(struct dsa_port *dp) return !!dp->bridge_dev; } -static void dsa_slave_set_state(struct net_device *dev, u8 state) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; - struct dsa_switch *ds = dp->ds; - int port = dp->index; - - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, port, state); - - if (ds->ops->port_fast_age) { - /* Fast age FDB entries or flush appropriate forwarding database - * for the given port, if we are moving it from Learning or - * Forwarding state, to Disabled or Blocking or Listening state. - */ - - if ((dp->stp_state == BR_STATE_LEARNING || - dp->stp_state == BR_STATE_FORWARDING) && - (state == BR_STATE_DISABLED || - state == BR_STATE_BLOCKING || - state == BR_STATE_LISTENING)) - ds->ops->port_fast_age(ds, port); - } - - dp->stp_state = state; -} - static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -148,7 +109,7 @@ static int dsa_slave_open(struct net_device *dev) goto clear_promisc; } - dsa_slave_set_state(dev, stp_state); + dsa_port_set_state_now(p->dp, stp_state); if (p->phy) phy_start(p->phy); @@ -190,7 +151,7 @@ static int dsa_slave_close(struct net_device *dev) if (ds->ops->port_disable) ds->ops->port_disable(ds, p->dp->index, p->phy); - dsa_slave_set_state(dev, BR_STATE_DISABLED); + dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); return 0; } @@ -243,140 +204,6 @@ out: return 0; } -static int dsa_slave_port_vlan_add(struct net_device *dev, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; - struct dsa_switch *ds = dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) - return -EOPNOTSUPP; - - return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans); - } - - ds->ops->port_vlan_add(ds, dp->index, vlan, trans); - - return 0; -} - -static int dsa_slave_port_vlan_del(struct net_device *dev, - const struct switchdev_obj_port_vlan *vlan) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (!ds->ops->port_vlan_del) - return -EOPNOTSUPP; - - return ds->ops->port_vlan_del(ds, p->dp->index, vlan); -} - -static int dsa_slave_port_vlan_dump(struct net_device *dev, - struct switchdev_obj_port_vlan *vlan, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (ds->ops->port_vlan_dump) - return ds->ops->port_vlan_dump(ds, p->dp->index, vlan, cb); - - return -EOPNOTSUPP; -} - -static int dsa_slave_port_fdb_add(struct net_device *dev, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) - return -EOPNOTSUPP; - - return ds->ops->port_fdb_prepare(ds, p->dp->index, fdb, trans); - } - - ds->ops->port_fdb_add(ds, p->dp->index, fdb, trans); - - return 0; -} - -static int dsa_slave_port_fdb_del(struct net_device *dev, - const struct switchdev_obj_port_fdb *fdb) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - int ret = -EOPNOTSUPP; - - if (ds->ops->port_fdb_del) - ret = ds->ops->port_fdb_del(ds, p->dp->index, fdb); - - return ret; -} - -static int dsa_slave_port_fdb_dump(struct net_device *dev, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (ds->ops->port_fdb_dump) - return ds->ops->port_fdb_dump(ds, p->dp->index, fdb, cb); - - return -EOPNOTSUPP; -} - -static int dsa_slave_port_mdb_add(struct net_device *dev, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) - return -EOPNOTSUPP; - - return ds->ops->port_mdb_prepare(ds, p->dp->index, mdb, trans); - } - - ds->ops->port_mdb_add(ds, p->dp->index, mdb, trans); - - return 0; -} - -static int dsa_slave_port_mdb_del(struct net_device *dev, - const struct switchdev_obj_port_mdb *mdb) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (ds->ops->port_mdb_del) - return ds->ops->port_mdb_del(ds, p->dp->index, mdb); - - return -EOPNOTSUPP; -} - -static int dsa_slave_port_mdb_dump(struct net_device *dev, - struct switchdev_obj_port_mdb *mdb, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (ds->ops->port_mdb_dump) - return ds->ops->port_mdb_dump(ds, p->dp->index, mdb, cb); - - return -EOPNOTSUPP; -} - static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -387,96 +214,24 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -static int dsa_slave_stp_state_set(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - if (switchdev_trans_ph_prepare(trans)) - return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; - - dsa_slave_set_state(dev, attr->u.stp_state); - - return 0; -} - -static int dsa_slave_vlan_filtering(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - - /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ - if (switchdev_trans_ph_prepare(trans)) - return 0; - - if (ds->ops->port_vlan_filtering) - return ds->ops->port_vlan_filtering(ds, p->dp->index, - attr->u.vlan_filtering); - - return 0; -} - -static unsigned int dsa_fastest_ageing_time(struct dsa_switch *ds, - unsigned int ageing_time) -{ - int i; - - for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = &ds->ports[i]; - - if (dp && dp->ageing_time && dp->ageing_time < ageing_time) - ageing_time = dp->ageing_time; - } - - return ageing_time; -} - -static int dsa_slave_ageing_time(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); - unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); - - if (switchdev_trans_ph_prepare(trans)) { - if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) - return -ERANGE; - if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) - return -ERANGE; - return 0; - } - - /* Keep the fastest ageing time in case of multiple bridges */ - p->dp->ageing_time = ageing_time; - ageing_time = dsa_fastest_ageing_time(ds, ageing_time); - - if (ds->ops->set_ageing_time) - return ds->ops->set_ageing_time(ds, ageing_time); - - return 0; -} - static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int ret; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - ret = dsa_slave_stp_state_set(dev, attr, trans); + ret = dsa_port_set_state(dp, attr->u.stp_state, trans); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: - ret = dsa_slave_vlan_filtering(dev, attr, trans); + ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, + trans); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: - ret = dsa_slave_ageing_time(dev, attr, trans); + ret = dsa_port_ageing_time(dp, attr->u.ageing_time, trans); break; default: ret = -EOPNOTSUPP; @@ -490,6 +245,8 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct switchdev_trans *trans) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int err; /* For the prepare phase, ensure the full set of changes is feasable in @@ -499,18 +256,14 @@ static int dsa_slave_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_FDB: - err = dsa_slave_port_fdb_add(dev, - SWITCHDEV_OBJ_PORT_FDB(obj), - trans); + err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj), trans); break; case SWITCHDEV_OBJ_ID_PORT_MDB: - err = dsa_slave_port_mdb_add(dev, SWITCHDEV_OBJ_PORT_MDB(obj), - trans); + err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = dsa_slave_port_vlan_add(dev, - SWITCHDEV_OBJ_PORT_VLAN(obj), - trans); + err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), + trans); break; default: err = -EOPNOTSUPP; @@ -523,19 +276,19 @@ static int dsa_slave_port_obj_add(struct net_device *dev, static int dsa_slave_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int err; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_FDB: - err = dsa_slave_port_fdb_del(dev, - SWITCHDEV_OBJ_PORT_FDB(obj)); + err = dsa_port_fdb_del(dp, SWITCHDEV_OBJ_PORT_FDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_MDB: - err = dsa_slave_port_mdb_del(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); + err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = dsa_slave_port_vlan_del(dev, - SWITCHDEV_OBJ_PORT_VLAN(obj)); + err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj)); break; default: err = -EOPNOTSUPP; @@ -549,22 +302,19 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, switchdev_obj_dump_cb_t *cb) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int err; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_FDB: - err = dsa_slave_port_fdb_dump(dev, - SWITCHDEV_OBJ_PORT_FDB(obj), - cb); + err = dsa_port_fdb_dump(dp, SWITCHDEV_OBJ_PORT_FDB(obj), cb); break; case SWITCHDEV_OBJ_ID_PORT_MDB: - err = dsa_slave_port_mdb_dump(dev, SWITCHDEV_OBJ_PORT_MDB(obj), - cb); + err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = dsa_slave_port_vlan_dump(dev, - SWITCHDEV_OBJ_PORT_VLAN(obj), - cb); + err = dsa_port_vlan_dump(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), cb); break; default: err = -EOPNOTSUPP; @@ -574,57 +324,6 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, return err; } -static int dsa_slave_bridge_port_join(struct net_device *dev, - struct net_device *br) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_notifier_bridge_info info = { - .sw_index = p->dp->ds->index, - .port = p->dp->index, - .br = br, - }; - int err; - - /* Here the port is already bridged. Reflect the current configuration - * so that drivers can program their chips accordingly. - */ - p->dp->bridge_dev = br; - - err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_JOIN, &info); - - /* The bridging is rolled back on error */ - if (err) - p->dp->bridge_dev = NULL; - - return err; -} - -static void dsa_slave_bridge_port_leave(struct net_device *dev, - struct net_device *br) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_notifier_bridge_info info = { - .sw_index = p->dp->ds->index, - .port = p->dp->index, - .br = br, - }; - int err; - - /* Here the port is already unbridged. Reflect the current configuration - * so that drivers can program their chips accordingly. - */ - p->dp->bridge_dev = NULL; - - err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_LEAVE, &info); - if (err) - netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); - - /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, - * so allow it to be in BR_STATE_FORWARDING to be kept functional - */ - dsa_slave_set_state(dev, BR_STATE_FORWARDING); -} - static int dsa_slave_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { @@ -821,8 +520,8 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, uint64_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->cpu_switch; - s8 cpu_port = dst->cpu_port; + struct dsa_switch *ds = dst->cpu_dp->ds; + s8 cpu_port = dst->cpu_dp->index; int count = 0; if (dst->master_ethtool_ops.get_sset_count) { @@ -838,7 +537,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->cpu_switch; + struct dsa_switch *ds = dst->cpu_dp->ds; int count = 0; if (dst->master_ethtool_ops.get_sset_count) @@ -854,8 +553,8 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->cpu_switch; - s8 cpu_port = dst->cpu_port; + struct dsa_switch *ds = dst->cpu_dp->ds; + s8 cpu_port = dst->cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; unsigned int i; @@ -1528,14 +1227,16 @@ static bool dsa_slave_dev_check(struct net_device *dev) static int dsa_slave_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; int err = NOTIFY_DONE; if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { - err = dsa_slave_bridge_port_join(dev, info->upper_dev); + err = dsa_port_bridge_join(dp, info->upper_dev); err = notifier_from_errno(err); } else { - dsa_slave_bridge_port_leave(dev, info->upper_dev); + dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index ca6e26e514f0..c1e4b2d5a3ae 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -12,7 +12,51 @@ #include <linux/netdevice.h> #include <linux/notifier.h> -#include <net/dsa.h> +#include <net/switchdev.h> + +#include "dsa_priv.h" + +static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, + unsigned int ageing_time) +{ + int i; + + for (i = 0; i < ds->num_ports; ++i) { + struct dsa_port *dp = &ds->ports[i]; + + if (dp->ageing_time && dp->ageing_time < ageing_time) + ageing_time = dp->ageing_time; + } + + return ageing_time; +} + +static int dsa_switch_ageing_time(struct dsa_switch *ds, + struct dsa_notifier_ageing_time_info *info) +{ + unsigned int ageing_time = info->ageing_time; + struct switchdev_trans *trans = info->trans; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (switchdev_trans_ph_prepare(trans)) { + if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) + return -ERANGE; + if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) + return -ERANGE; + return 0; + } + + /* Program the fastest ageing time in case of multiple bridges */ + ageing_time = dsa_switch_fastest_ageing_time(ds, ageing_time); + + if (ds->ops->set_ageing_time) + return ds->ops->set_ageing_time(ds, ageing_time); + + return 0; +} static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) @@ -40,6 +84,117 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, return 0; } +static int dsa_switch_fdb_add(struct dsa_switch *ds, + struct dsa_notifier_fdb_info *info) +{ + const struct switchdev_obj_port_fdb *fdb = info->fdb; + struct switchdev_trans *trans = info->trans; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_prepare(ds, info->port, fdb, trans); + } + + ds->ops->port_fdb_add(ds, info->port, fdb, trans); + + return 0; +} + +static int dsa_switch_fdb_del(struct dsa_switch *ds, + struct dsa_notifier_fdb_info *info) +{ + const struct switchdev_obj_port_fdb *fdb = info->fdb; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (!ds->ops->port_fdb_del) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_del(ds, info->port, fdb); +} + +static int dsa_switch_mdb_add(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) +{ + const struct switchdev_obj_port_mdb *mdb = info->mdb; + struct switchdev_trans *trans = info->trans; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + return -EOPNOTSUPP; + + return ds->ops->port_mdb_prepare(ds, info->port, mdb, trans); + } + + ds->ops->port_mdb_add(ds, info->port, mdb, trans); + + return 0; +} + +static int dsa_switch_mdb_del(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) +{ + const struct switchdev_obj_port_mdb *mdb = info->mdb; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (!ds->ops->port_mdb_del) + return -EOPNOTSUPP; + + return ds->ops->port_mdb_del(ds, info->port, mdb); +} + +static int dsa_switch_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) +{ + const struct switchdev_obj_port_vlan *vlan = info->vlan; + struct switchdev_trans *trans = info->trans; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) + return -EOPNOTSUPP; + + return ds->ops->port_vlan_prepare(ds, info->port, vlan, trans); + } + + ds->ops->port_vlan_add(ds, info->port, vlan, trans); + + return 0; +} + +static int dsa_switch_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) +{ + const struct switchdev_obj_port_vlan *vlan = info->vlan; + + /* Do not care yet about other switch chips of the fabric */ + if (ds->index != info->sw_index) + return 0; + + if (!ds->ops->port_vlan_del) + return -EOPNOTSUPP; + + return ds->ops->port_vlan_del(ds, info->port, vlan); +} + static int dsa_switch_event(struct notifier_block *nb, unsigned long event, void *info) { @@ -47,12 +202,33 @@ static int dsa_switch_event(struct notifier_block *nb, int err; switch (event) { + case DSA_NOTIFIER_AGEING_TIME: + err = dsa_switch_ageing_time(ds, info); + break; case DSA_NOTIFIER_BRIDGE_JOIN: err = dsa_switch_bridge_join(ds, info); break; case DSA_NOTIFIER_BRIDGE_LEAVE: err = dsa_switch_bridge_leave(ds, info); break; + case DSA_NOTIFIER_FDB_ADD: + err = dsa_switch_fdb_add(ds, info); + break; + case DSA_NOTIFIER_FDB_DEL: + err = dsa_switch_fdb_del(ds, info); + break; + case DSA_NOTIFIER_MDB_ADD: + err = dsa_switch_mdb_add(ds, info); + break; + case DSA_NOTIFIER_MDB_DEL: + err = dsa_switch_mdb_del(ds, info); + break; + case DSA_NOTIFIER_VLAN_ADD: + err = dsa_switch_vlan_add(ds, info); + break; + case DSA_NOTIFIER_VLAN_DEL: + err = dsa_switch_vlan_del(ds, info); + break; default: err = -EOPNOTSUPP; break; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 2a9b52c5af86..9f204f18ada3 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -12,7 +12,7 @@ #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> -#include <net/dsa.h> + #include "dsa_priv.h" /* This tag length is 4 bytes, older ones were 6 bytes, we do not @@ -101,7 +101,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, int source_port; u8 *brcm_tag; - ds = dst->cpu_switch; + ds = dst->cpu_dp->ds; if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) goto out_drop; diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 1c6633f0de01..3b62a57956a3 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -11,7 +11,7 @@ #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> -#include <net/dsa.h> + #include "dsa_priv.h" #define DSA_HLEN 4 diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index d9c668aa5e54..f95cafd05702 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -11,7 +11,7 @@ #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> -#include <net/dsa.h> + #include "dsa_priv.h" #define DSA_HLEN 4 diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 70130ed5c21a..afd59330b5f1 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -14,7 +14,7 @@ #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> -#include <net/dsa.h> + #include "dsa_priv.h" /* To define the outgoing port and to discover the incoming port a regular diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 837cdddb53f0..d1258e84cd71 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -13,7 +13,7 @@ */ #include <linux/etherdevice.h> -#include <net/dsa.h> + #include "dsa_priv.h" #define MTK_HDR_LEN 4 diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 3ba3f59f7a34..2451007699b7 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -12,7 +12,7 @@ */ #include <linux/etherdevice.h> -#include <net/dsa.h> + #include "dsa_priv.h" #define QCA_HDR_LEN 2 @@ -99,7 +99,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* This protocol doesn't support cascading multiple switches so it's * safe to assume the switch is first in the tree */ - ds = dst->cpu_switch; + ds = dst->cpu_dp->ds; if (!ds) goto out_drop; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index aafc2fc74c30..7488ab2932ab 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -11,7 +11,7 @@ #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> -#include <net/dsa.h> + #include "dsa_priv.h" static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) @@ -67,7 +67,7 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, u8 *trailer; int source_port; - ds = dst->cpu_switch; + ds = dst->cpu_dp->ds; if (skb_linearize(skb)) goto out_drop; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 83e3ed258467..14d2f7bd7c76 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -594,7 +594,8 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) } else { tb = fib_new_table(net, cfg.fc_table); if (tb) - err = fib_table_insert(net, tb, &cfg); + err = fib_table_insert(net, tb, + &cfg, NULL); else err = -ENOBUFS; } @@ -626,14 +627,15 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_config *cfg) + struct nlmsghdr *nlh, struct fib_config *cfg, + struct netlink_ext_ack *extack) { struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, - NULL); + extack); if (err < 0) goto errout; @@ -654,6 +656,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } @@ -718,12 +721,13 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_table *tb; int err; - err = rtm_to_fib_config(net, skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; tb = fib_get_table(net, cfg.fc_table); if (!tb) { + NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto errout; } @@ -741,7 +745,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_table *tb; int err; - err = rtm_to_fib_config(net, skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; @@ -751,7 +755,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_table_insert(net, tb, &cfg); + err = fib_table_insert(net, tb, &cfg, extack); errout: return err; } @@ -845,7 +849,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) - fib_table_insert(net, tb, &cfg); + fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg); } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 9c02920725db..2704e08545da 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -28,7 +28,8 @@ static inline void fib_alias_accessed(struct fib_alias *fa) /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); -struct fib_info *fib_create_info(struct fib_config *cfg); +struct fib_info *fib_create_info(struct fib_config *cfg, + struct netlink_ext_ack *extack); int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da449ddb8cc1..4852e183afe0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -32,6 +32,7 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/netlink.h> #include <net/arp.h> #include <net/ip.h> @@ -454,7 +455,8 @@ static int fib_detect_death(struct fib_info *fi, int order, #ifdef CONFIG_IP_ROUTE_MULTIPATH -static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) +static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, + struct netlink_ext_ack *extack) { int nhs = 0; @@ -464,22 +466,35 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) } /* leftover implies invalid nexthop configuration, discard it */ - return remaining > 0 ? 0 : nhs; + if (remaining > 0) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - extra data after nexthops"); + nhs = 0; + } + + return nhs; } static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, - int remaining, struct fib_config *cfg) + int remaining, struct fib_config *cfg, + struct netlink_ext_ack *extack) { int ret; change_nexthops(fi) { int attrlen; - if (!rtnh_ok(rtnh, remaining)) + if (!rtnh_ok(rtnh, remaining)) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - extra data after nexthop"); return -EINVAL; + } - if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - can not contain DEAD or LINKDOWN"); return -EINVAL; + } nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; @@ -505,8 +520,12 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); - if (!nla_entype) + if (!nla_entype) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, + "Encap type is missing"); goto err_inval; + } ret = lwtunnel_build_state(nla_get_u16( nla_entype), @@ -714,7 +733,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) * |-> {local prefix} (terminal node) */ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh) + struct fib_nh *nh, struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -727,16 +746,25 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (nh->nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; - if (cfg->fc_scope >= RT_SCOPE_LINK) + if (cfg->fc_scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid scope"); return -EINVAL; + } dev = __dev_get_by_index(net, nh->nh_oif); if (!dev) return -ENODEV; - if (!(dev->flags & IFF_UP)) + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, + "Nexthop device is not up"); return -ENETDOWN; + } addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); - if (addr_type != RTN_UNICAST) + if (addr_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway"); return -EINVAL; + } if (!netif_carrier_ok(dev)) nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; @@ -776,18 +804,25 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } if (err) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway"); rcu_read_unlock(); return err; } } err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; + } nh->nh_scope = res.scope; nh->nh_oif = FIB_RES_OIF(res); nh->nh_dev = dev = FIB_RES_DEV(res); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG(extack, + "No egress device for nexthop gateway"); goto out; + } dev_hold(dev); if (!netif_carrier_ok(dev)) nh->nh_flags |= RTNH_F_LINKDOWN; @@ -795,17 +830,21 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } else { struct in_device *in_dev; - if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) + if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); return -EINVAL; - + } rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->nh_oif); if (!in_dev) goto out; err = -ENETDOWN; - if (!(in_dev->dev->flags & IFF_UP)) + if (!(in_dev->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); goto out; + } nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; @@ -980,7 +1019,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) return 0; } -struct fib_info *fib_create_info(struct fib_config *cfg) +struct fib_info *fib_create_info(struct fib_config *cfg, + struct netlink_ext_ack *extack) { int err; struct fib_info *fi = NULL; @@ -992,15 +1032,20 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; /* Fast check to catch the most weird cases */ - if (fib_props[cfg->fc_type].scope > cfg->fc_scope) + if (fib_props[cfg->fc_type].scope > cfg->fc_scope) { + NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; + } - if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { + NL_SET_ERR_MSG(extack, + "Invalid rtm_flags - can not contain DEAD or LINKDOWN"); goto err_inval; + } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { - nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); + nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); if (nhs == 0) goto err_inval; } @@ -1062,18 +1107,29 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); + err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); if (err != 0) goto failure; - if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) + if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) { + NL_SET_ERR_MSG(extack, + "Nexthop device index does not match RTA_OIF"); goto err_inval; - if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) + } + if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) { + NL_SET_ERR_MSG(extack, + "Nexthop gateway does not match RTA_GATEWAY"); goto err_inval; + } #ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) + if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + NL_SET_ERR_MSG(extack, + "Nexthop class id does not match RTA_FLOW"); goto err_inval; + } #endif #else + NL_SET_ERR_MSG(extack, + "Multipath support not enabled in kernel"); goto err_inval; #endif } else { @@ -1082,8 +1138,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_encap) { struct lwtunnel_state *lwtstate; - if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) { + NL_SET_ERR_MSG(extack, + "LWT encap type not specified"); goto err_inval; + } err = lwtunnel_build_state(cfg->fc_encap_type, cfg->fc_encap, AF_INET, cfg, &lwtstate); @@ -1106,8 +1165,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } if (fib_props[cfg->fc_type].error) { - if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) + if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) { + NL_SET_ERR_MSG(extack, + "Gateway, device and multipath can not be specified for this route type"); goto err_inval; + } goto link_it; } else { switch (cfg->fc_type) { @@ -1118,19 +1180,30 @@ struct fib_info *fib_create_info(struct fib_config *cfg) case RTN_MULTICAST: break; default: + NL_SET_ERR_MSG(extack, "Invalid route type"); goto err_inval; } } - if (cfg->fc_scope > RT_SCOPE_HOST) + if (cfg->fc_scope > RT_SCOPE_HOST) { + NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; + } if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ - if (nhs != 1 || nh->nh_gw) + if (nhs != 1) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have multiple nexthops"); + goto err_inval; + } + if (nh->nh_gw) { + NL_SET_ERR_MSG(extack, + "Route with host scope can not have a gateway"); goto err_inval; + } nh->nh_scope = RT_SCOPE_NOWHERE; nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); err = -ENODEV; @@ -1140,7 +1213,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh); + err = fib_check_nh(cfg, fi, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) @@ -1150,8 +1223,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_flags |= RTNH_F_LINKDOWN; } - if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) + if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { + NL_SET_ERR_MSG(extack, "Invalid prefsrc address"); goto err_inval; + } change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 51182ff2b441..6d0f6c79d9aa 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1101,7 +1101,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, - struct fib_config *cfg) + struct fib_config *cfg, struct netlink_ext_ack *extack) { enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; @@ -1125,7 +1125,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - fi = fib_create_info(cfg); + fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 805f6607f8d9..8e0257d01200 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <net/genetlink.h> #include <net/gue.h> +#include <net/fou.h> #include <net/ip.h> #include <net/protocol.h> #include <net/udp.h> @@ -859,25 +860,6 @@ size_t gue_encap_hlen(struct ip_tunnel_encap *e) } EXPORT_SYMBOL(gue_encap_hlen); -static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, - struct flowi4 *fl4, u8 *protocol, __be16 sport) -{ - struct udphdr *uh; - - skb_push(skb, sizeof(struct udphdr)); - skb_reset_transport_header(skb); - - uh = udp_hdr(skb); - - uh->dest = e->dport; - uh->source = sport; - uh->len = htons(skb->len); - udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, - fl4->saddr, fl4->daddr, skb->len); - - *protocol = IPPROTO_UDP; -} - int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { @@ -894,24 +876,6 @@ int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } EXPORT_SYMBOL(__fou_build_header); -int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi4 *fl4) -{ - int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : - SKB_GSO_UDP_TUNNEL; - __be16 sport; - int err; - - err = __fou_build_header(skb, e, protocol, &sport, type); - if (err) - return err; - - fou_build_udp(skb, e, fl4, protocol, sport); - - return 0; -} -EXPORT_SYMBOL(fou_build_header); - int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { @@ -985,8 +949,46 @@ int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } EXPORT_SYMBOL(__gue_build_header); -int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi4 *fl4) +#ifdef CONFIG_NET_FOU_IP_TUNNELS + +static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi4 *fl4, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; + __be16 sport; + int err; + + err = __fou_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} + +static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; @@ -1001,9 +1003,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } -EXPORT_SYMBOL(gue_build_header); -#ifdef CONFIG_NET_FOU_IP_TUNNELS static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1054d330bf9d..82dec8825d28 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -25,6 +25,7 @@ #include <net/xfrm.h> #include <net/tcp.h> #include <net/sock_reuseport.h> +#include <net/addrconf.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 7cd8d0d918f8..6f8d9e5e062b 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -172,7 +172,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) struct iphdr *iph = ip_hdr(skb_in); u8 proto; - if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET)) + if (iph->frag_off & htons(IP_OFFSET)) return; if (skb_csum_unnecessary(skb_in)) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0257d965f111..6426250a58ea 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -66,10 +66,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ -__u32 cookie_init_timestamp(struct request_sock *req) +u64 cookie_init_timestamp(struct request_sock *req) { struct inet_request_sock *ireq; - u32 ts, ts_now = tcp_time_stamp; + u32 ts, ts_now = tcp_time_stamp_raw(); u32 options = 0; ireq = inet_rsk(req); @@ -88,7 +88,7 @@ __u32 cookie_init_timestamp(struct request_sock *req) ts <<= TSBITS; ts |= options; } - return ts; + return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ); } @@ -343,7 +343,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack.v64 = 0; + treq->snt_synack = 0; treq->tfo_listener = false; ireq->ir_iif = inet_request_bound_dev_if(sk, skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 842b575f8fdd..aaf663a1e571 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -386,7 +386,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -2183,7 +2183,7 @@ adjudge_to_death: /* Now socket is owned by kernel and we acquire BH lock - to finish close. No need to check for user refs. + * to finish close. No need to check for user refs. */ local_bh_disable(); bh_lock_sock(sk); @@ -2475,7 +2475,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet - * know which interface is going to be used */ + * know which interface is going to be used + */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; @@ -2710,7 +2711,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EPERM; else - tp->tsoffset = val - tcp_time_stamp; + tp->tsoffset = val - tcp_time_stamp_raw(); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); @@ -2761,7 +2762,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) - stats[i] += tcp_time_stamp - tp->chrono_start; + stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } @@ -2845,7 +2846,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; - now = tcp_time_stamp; + now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); @@ -3076,7 +3077,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TIMESTAMP: - val = tcp_time_stamp + tp->tsoffset; + val = tcp_time_stamp_raw() + tp->tsoffset; break; case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index b89bce4c721e..dbcc9352a48f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -52,10 +52,9 @@ * There is a public e-mail list for discussing BBR development and testing: * https://groups.google.com/forum/#!forum/bbr-dev * - * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, - * since pacing is integral to the BBR design and implementation. - * BBR without pacing would not function properly, and may incur unnecessary - * high packet loss rates. + * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, + * otherwise TCP stack falls back to an internal pacing using one high + * resolution timer per TCP socket and may use more resources. */ #include <linux/module.h> #include <net/tcp.h> @@ -92,7 +91,7 @@ struct bbr { struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ u32 rtt_cnt; /* count of packet-timed rounds elapsed */ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ - struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ + u64 cycle_mstamp; /* time of this cycle phase start */ u32 mode:3, /* current bbr_mode in state machine */ prev_ca_state:3, /* CA state on previous ACK */ packet_conservation:1, /* use packet conservation? */ @@ -412,7 +411,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); bool is_full_length = - skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > + tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > bbr->min_rtt_us; u32 inflight, bw; @@ -498,7 +497,7 @@ static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; + bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); bbr->lt_last_delivered = tp->delivered; bbr->lt_last_lost = tp->lost; bbr->lt_rtt_cnt = 0; @@ -552,7 +551,7 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) struct bbr *bbr = inet_csk_ca(sk); u32 lost, delivered; u64 bw; - s32 t; + u32 t; if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ if (bbr->mode == BBR_PROBE_BW && bbr->round_start && @@ -604,15 +603,15 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) return; /* Find average delivery rate in this sampling interval. */ - t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); - if (t < 1) - return; /* interval is less than one jiffy, so wait */ - t = jiffies_to_usecs(t); - /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */ - if (t < 1) { + t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; + if ((s32)t < 1) + return; /* interval is less than one ms, so wait */ + /* Check if can multiply without overflow */ + if (t >= ~0U / USEC_PER_MSEC) { bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ return; } + t *= USEC_PER_MSEC; bw = (u64)delivered * BW_UNIT; do_div(bw, t); bbr_lt_bw_interval_done(sk, bw); @@ -731,12 +730,12 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) bool filter_expired; /* Track min RTT seen in the min_rtt_win_sec filter window: */ - filter_expired = after(tcp_time_stamp, + filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { bbr->min_rtt_us = rs->rtt_us; - bbr->min_rtt_stamp = tcp_time_stamp; + bbr->min_rtt_stamp = tcp_jiffies32; } if (bbr_probe_rtt_mode_ms > 0 && filter_expired && @@ -755,7 +754,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) /* Maintain min packets in flight for max(200 ms, 1 round). */ if (!bbr->probe_rtt_done_stamp && tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { - bbr->probe_rtt_done_stamp = tcp_time_stamp + + bbr->probe_rtt_done_stamp = tcp_jiffies32 + msecs_to_jiffies(bbr_probe_rtt_mode_ms); bbr->probe_rtt_round_done = 0; bbr->next_rtt_delivered = tp->delivered; @@ -763,8 +762,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) if (bbr->round_start) bbr->probe_rtt_round_done = 1; if (bbr->probe_rtt_round_done && - after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { - bbr->min_rtt_stamp = tcp_time_stamp; + after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { + bbr->min_rtt_stamp = tcp_jiffies32; bbr->restore_cwnd = 1; /* snap to prior_cwnd */ bbr_reset_mode(sk); } @@ -811,7 +810,7 @@ static void bbr_init(struct sock *sk) bbr->probe_rtt_done_stamp = 0; bbr->probe_rtt_round_done = 0; bbr->min_rtt_us = tcp_min_rtt(tp); - bbr->min_rtt_stamp = tcp_time_stamp; + bbr->min_rtt_stamp = tcp_jiffies32; minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ @@ -826,10 +825,12 @@ static void bbr_init(struct sock *sk) bbr->idle_restart = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; - bbr->cycle_mstamp.v64 = 0; + bbr->cycle_mstamp = 0; bbr->cycle_idx = 0; bbr_reset_lt_bw_sampling(sk); bbr_reset_startup_mode(sk); + + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); } static u32 bbr_sndbuf_expand(struct sock *sk) diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 36087bca9f48..609965f0e298 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -84,14 +84,14 @@ static void bictcp_init(struct sock *sk) static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { if (ca->last_cwnd == cwnd && - (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; ca->last_cwnd = cwnd; - ca->last_time = tcp_time_stamp; + ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) /* record the beginning of an epoch */ - ca->epoch_start = tcp_time_stamp; + ca->epoch_start = tcp_jiffies32; /* start off normal */ if (cwnd <= low_window) { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 0683ba447d77..57ae5b5ae643 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -155,7 +155,7 @@ static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_jiffies32; s32 delta; delta = now - tcp_sk(sk)->lsndtime; @@ -231,21 +231,21 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) ca->ack_cnt += acked; /* count the number of ACKed packets */ if (ca->last_cwnd == cwnd && - (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; /* The CUBIC function can update ca->cnt at most once per jiffy. * On all cwnd reduction events, ca->epoch_start is set to 0, * which will force a recalculation of ca->cnt. */ - if (ca->epoch_start && tcp_time_stamp == ca->last_time) + if (ca->epoch_start && tcp_jiffies32 == ca->last_time) goto tcp_friendliness; ca->last_cwnd = cwnd; - ca->last_time = tcp_time_stamp; + ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) { - ca->epoch_start = tcp_time_stamp; /* record beginning */ + ca->epoch_start = tcp_jiffies32; /* record beginning */ ca->ack_cnt = acked; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ @@ -276,7 +276,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) * if the cwnd < 1 million packets !!! */ - t = (s32)(tcp_time_stamp - ca->epoch_start); + t = (s32)(tcp_jiffies32 - ca->epoch_start); t += msecs_to_jiffies(ca->delay_min >> 3); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; @@ -448,7 +448,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) return; /* Discard delay samples right after fast recovery */ - if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; delay = (sample->rtt_us << 3) / USEC_PER_MSEC; diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4a4d8e76738f..3eb78cde6ff0 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -104,7 +104,7 @@ static void measure_achieved_throughput(struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_jiffies32; if (icsk->icsk_ca_state == TCP_CA_Open) ca->pkts_acked = sample->pkts_acked; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 174d4376baa5..2fa55f57ac06 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -441,7 +441,7 @@ void tcp_init_buffer_space(struct sock *sk) tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -463,7 +463,7 @@ void tcp_init_buffer_space(struct sock *sk) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } /* 5. Recalculate window clamp after socket hit its memory bounds. */ @@ -555,11 +555,11 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { u32 delta_us; - if (tp->rcv_rtt_est.time.v64 == 0) + if (tp->rcv_rtt_est.time == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); + delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: @@ -571,13 +571,15 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) - tcp_rcv_rtt_update(tp, - jiffies_to_usecs(tcp_time_stamp - - tp->rx_opt.rcv_tsecr), - 0); + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + + tcp_rcv_rtt_update(tp, delta_us, 0); + } } /* @@ -590,7 +592,7 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; - time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); + time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; @@ -672,7 +674,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_rcv_rtt_measure(tp); - now = tcp_time_stamp; + now = tcp_jiffies32; if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize @@ -885,6 +887,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, struct tcp_sock *tp = tcp_sk(sk); int mib_idx; + if (WARN_ON_ONCE(metric < 0)) + return; + if (metric > tp->reordering) { tp->reordering = min(sysctl_tcp_max_reordering, metric); @@ -1134,8 +1139,8 @@ struct tcp_sacktag_state { * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. */ - struct skb_mstamp first_sackt; - struct skb_mstamp last_sackt; + u64 first_sackt; + u64 last_sackt; struct rate_sample *rate; int flag; }; @@ -1200,7 +1205,7 @@ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, int dup_sack, int pcount, - const struct skb_mstamp *xmit_time) + u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; @@ -1242,9 +1247,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; - if (state->first_sackt.v64 == 0) - state->first_sackt = *xmit_time; - state->last_sackt = *xmit_time; + if (state->first_sackt == 0) + state->first_sackt = xmit_time; + state->last_sackt = xmit_time; } if (sacked & TCPCB_LOST) { @@ -1304,7 +1309,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - &skb->skb_mstamp); + skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) @@ -1356,8 +1361,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); - if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) - TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; + if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) + TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1587,7 +1592,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - &skb->skb_mstamp); + skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); if (!before(TCP_SKB_CB(skb)->seq, @@ -1954,7 +1959,7 @@ void tcp_enter_loss(struct sock *sk) } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->retrans_out = 0; tp->lost_out = 0; @@ -2383,7 +2388,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) tcp_ecn_withdraw_cwr(tp); } } - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->undo_marker = 0; } @@ -2520,7 +2525,7 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } @@ -2590,7 +2595,7 @@ static void tcp_mtup_probe_success(struct sock *sk) tcp_mss_to_mtu(sk, tp->mss_cache) / icsk->icsk_mtup.probe_size; tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_ssthresh = tcp_current_ssthresh(sk); icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; @@ -2911,7 +2916,7 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) struct tcp_sock *tp = tcp_sk(sk); u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; - minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, + minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); } @@ -2936,9 +2941,12 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * See draft-ietf-tcplw-high-performance-00, section 3.3. */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - flag & FLAG_ACKED) - seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp - - tp->rx_opt.rcv_tsecr); + flag & FLAG_ACKED) { + u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + + seq_rtt_us = ca_rtt_us = delta_us; + } if (seq_rtt_us < 0) return false; @@ -2960,12 +2968,8 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { long rtt_us = -1L; - if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) { - struct skb_mstamp now; - - skb_mstamp_get(&now); - rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); - } + if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) + rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); } @@ -2976,7 +2980,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); - tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; + tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32; } /* Restart timer after forward progress on connection. @@ -3001,14 +3005,14 @@ void tcp_rearm_rto(struct sock *sk) if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); - const u32 rto_time_stamp = - tcp_skb_timestamp(skb) + rto; - s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); - /* delta may not be positive if the socket is locked + u64 rto_time_stamp = skb->skb_mstamp + + jiffies_to_usecs(rto); + s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ - if (delta > 0) - rto = delta; + if (delta_us > 0) + rto = usecs_to_jiffies(delta_us); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); @@ -3060,9 +3064,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct skb_mstamp first_ackt, last_ackt; + u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp *now = &tp->tcp_mstamp; u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; @@ -3075,7 +3078,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, bool rtt_update; int flag = 0; - first_ackt.v64 = 0; + first_ackt = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -3106,8 +3109,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_RETRANS_DATA_ACKED; } else if (!(sacked & TCPCB_SACKED_ACKED)) { last_ackt = skb->skb_mstamp; - WARN_ON_ONCE(last_ackt.v64 == 0); - if (!first_ackt.v64) + WARN_ON_ONCE(last_ackt == 0); + if (!first_ackt) first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; @@ -3122,7 +3125,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - &skb->skb_mstamp); + skb->skb_mstamp); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3165,13 +3168,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { - seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); - ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); + if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { + seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); + ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); } - if (sack->first_sackt.v64) { - sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); - ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); + if (sack->first_sackt) { + sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); + ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); } sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, @@ -3201,7 +3204,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3390,7 +3393,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, u32 *last_oow_ack_time) { if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { NET_INC_STATS(net, mib_idx); @@ -3398,7 +3401,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, } } - *last_oow_ack_time = tcp_time_stamp; + *last_oow_ack_time = tcp_jiffies32; return false; /* not rate-limited: go ahead, send dupack now! */ } @@ -3553,7 +3556,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - sack_state.first_sackt.v64 = 0; + sack_state.first_sackt = 0; sack_state.rate = &rs; /* We very likely will need to access write queue head. */ @@ -3636,7 +3639,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) */ sk->sk_err_soft = 0; icsk->icsk_probes_out = 0; - tp->rcv_tstamp = tcp_time_stamp; + tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; @@ -5019,7 +5022,7 @@ static void tcp_new_space(struct sock *sk) if (tcp_should_expand_sndbuf(sk)) { tcp_sndbuf_expand(sk); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } sk->sk_write_space(sk); @@ -5356,7 +5359,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* @@ -5554,7 +5557,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) struct inet_connection_sock *icsk = inet_csk(sk); tcp_set_state(sk, TCP_ESTABLISHED); - icsk->icsk_ack.lrcvtime = tcp_time_stamp; + icsk->icsk_ack.lrcvtime = tcp_jiffies32; if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -5571,7 +5574,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ - tp->lsndtime = tcp_time_stamp; + tp->lsndtime = tcp_jiffies32; tcp_init_buffer_space(sk); @@ -5672,7 +5675,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, - tcp_time_stamp)) { + tcp_time_stamp(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; @@ -5917,7 +5920,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5929,7 +5932,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; } - skb_mstamp_get(&tp->tcp_mstamp); + tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { @@ -6008,7 +6011,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ - tp->lsndtime = tcp_time_stamp; + tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); @@ -6202,7 +6205,7 @@ static void tcp_openreq_init(struct request_sock *req, req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - skb_mstamp_get(&tcp_rsk(req)->snt_synack); + tcp_rsk(req)->snt_synack = tcp_clock_us(); tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5ab2aac5ca19..191b2f78b19d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -376,8 +376,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) struct sock *sk; struct sk_buff *skb; struct request_sock *fastopen; - __u32 seq, snd_una; - __u32 remaining; + u32 seq, snd_una; + s32 remaining; + u32 delta_us; int err; struct net *net = dev_net(icmp_skb->dev); @@ -483,11 +484,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) skb = tcp_write_queue_head(sk); BUG_ON(!skb); + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); remaining = icsk->icsk_rto - - min(icsk->icsk_rto, - tcp_time_stamp - tcp_skb_timestamp(skb)); + usecs_to_jiffies(delta_us); - if (remaining) { + if (remaining > 0) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); } else { @@ -811,7 +813,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp + tcptw->tw_ts_offset, + tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -839,7 +841,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp + tcp_rsk(req)->ts_off, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index d6fb6c067af4..ae10ed64fe13 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -37,7 +37,7 @@ #include <net/tcp.h> /* resolution of owd */ -#define LP_RESOL 1000 +#define LP_RESOL TCP_TS_HZ /** * enum tcp_lp_state @@ -147,9 +147,9 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) tp->rx_opt.rcv_tsecr == lp->local_ref_time) goto out; - m = HZ * (tp->rx_opt.rcv_tsval - - lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - - lp->local_ref_time); + m = TCP_TS_HZ * + (tp->rx_opt.rcv_tsval - lp->remote_ref_time) / + (tp->rx_opt.rcv_tsecr - lp->local_ref_time); if (m < 0) m = -m; @@ -194,7 +194,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) if (lp->flag & LP_VALID_RHZ) { owd = tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - - tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ); if (owd < 0) owd = -owd; } @@ -264,18 +264,19 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); + u32 now = tcp_time_stamp(tp); u32 delta; if (sample->rtt_us > 0) tcp_lp_rtt_sample(sk, sample->rtt_us); /* calc inference */ - delta = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + delta = now - tp->rx_opt.rcv_tsecr; if ((s32)delta > 0) lp->inference = 3 * delta; /* test if within inference */ - if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + if (lp->last_drop && (now - lp->last_drop < lp->inference)) lp->flag |= LP_WITHIN_INF; else lp->flag &= ~LP_WITHIN_INF; @@ -312,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); /* record this drop time */ - lp->last_drop = tcp_time_stamp; + lp->last_drop = now; } static struct tcp_congestion_ops tcp_lp __read_mostly = { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 653bbd67e3a3..102b2c90bb80 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -524,7 +524,7 @@ reset: tp->snd_cwnd = 1; else tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 717be4de5324..d0642df73044 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,9 +445,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; - newicsk->icsk_ack.lrcvtime = tcp_time_stamp; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->packets_out = 0; newtp->retrans_out = 0; @@ -455,7 +455,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; - newtp->lsndtime = treq->snt_synack.stamp_jiffies; + newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; @@ -526,7 +526,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; - newtp->rack.mstamp.v64 = 0; + newtp->rack.mstamp = 0; newtp->rack.advanced = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 5de82a8d4d87..6d650ed3cb59 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -424,8 +424,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) } /* Extract info for Tcp socket info provided via netlink */ -size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, - union tcp_cc_info *info) +static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct tcpnv *ca = inet_csk_ca(sk); @@ -440,7 +440,6 @@ size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, } return 0; } -EXPORT_SYMBOL_GPL(tcpnv_get_info); static struct tcp_congestion_ops tcpnv __read_mostly = { .init = tcpnv_init, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4858e190f6ac..478f75baee31 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -151,7 +151,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } @@ -160,7 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - const u32 now = tcp_time_stamp; + const u32 now = tcp_jiffies32; if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); @@ -904,6 +904,72 @@ out: sk_free(sk); } +/* Note: Called under hard irq. + * We can not call TCP stack right away. + */ +enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) +{ + struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); + struct sock *sk = (struct sock *)tp; + unsigned long nval, oval; + + for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { + struct tsq_tasklet *tsq; + bool empty; + + if (oval & TSQF_QUEUED) + break; + + nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; + nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); + if (nval != oval) + continue; + + if (!atomic_inc_not_zero(&sk->sk_wmem_alloc)) + break; + /* queue this socket to tasklet queue */ + tsq = this_cpu_ptr(&tsq_tasklet); + empty = list_empty(&tsq->head); + list_add(&tp->tsq_node, &tsq->head); + if (empty) + tasklet_schedule(&tsq->tasklet); + break; + } + return HRTIMER_NORESTART; +} + +/* BBR congestion control needs pacing. + * Same remark for SO_MAX_PACING_RATE. + * sch_fq packet scheduler is efficiently handling pacing, + * but is not always installed/used. + * Return true if TCP stack should pace packets itself. + */ +static bool tcp_needs_internal_pacing(const struct sock *sk) +{ + return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; +} + +static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) +{ + u64 len_ns; + u32 rate; + + if (!tcp_needs_internal_pacing(sk)) + return; + rate = sk->sk_pacing_rate; + if (!rate || rate == ~0U) + return; + + /* Should account for header sizes as sch_fq does, + * but lets make things simple. + */ + len_ns = (u64)skb->len * NSEC_PER_SEC; + do_div(len_ns, rate); + hrtimer_start(&tcp_sk(sk)->pacing_timer, + ktime_add_ns(ktime_get(), len_ns), + HRTIMER_MODE_ABS_PINNED); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -931,8 +997,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); + skb->skb_mstamp = tp->tcp_mstamp; if (clone_it) { - skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; tcp_rate_skb_sent(sk, skb); @@ -1034,6 +1100,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); + tcp_internal_pacing(sk, skb); } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) @@ -1408,7 +1475,7 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } EXPORT_SYMBOL(tcp_mtup_init); @@ -1509,7 +1576,7 @@ static void tcp_cwnd_application_limited(struct sock *sk) } tp->snd_cwnd_used = 0; } - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) @@ -1530,14 +1597,14 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_jiffies32; } else { /* Network starves. */ if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; if (sysctl_tcp_slow_start_after_idle && - (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && + (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); @@ -1839,7 +1906,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, const struct inet_connection_sock *icsk = inet_csk(sk); u32 age, send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp now; struct sk_buff *head; int win_divisor; @@ -1852,7 +1918,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, /* Avoid bursty behavior by allowing defer * only if the last write was recent. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) > 0) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1895,8 +1961,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, } head = tcp_write_queue_head(sk); - skb_mstamp_get(&now); - age = skb_mstamp_us_delta(&now, &head->skb_mstamp); + + age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) goto send_now; @@ -1921,7 +1987,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) s32 delta; interval = net->ipv4.sysctl_tcp_probe_interval; - delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp; + delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); @@ -1933,7 +1999,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); /* Update probe time stamp */ - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } } @@ -2086,6 +2152,12 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } +static bool tcp_pacing_check(const struct sock *sk) +{ + return tcp_needs_internal_pacing(sk) && + hrtimer_active(&tcp_sk(sk)->pacing_timer); +} + /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * (These limits are doubled for retransmits) @@ -2130,7 +2202,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { - const u32 now = tcp_time_stamp; + const u32 now = tcp_jiffies32; if (tp->chrono_type > TCP_CHRONO_UNSPEC) tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; @@ -2207,15 +2279,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } max_segs = tcp_tso_segs(sk, mss_now); + tcp_mstamp_refresh(tp); while ((skb = tcp_send_head(sk))) { unsigned int limit; + if (tcp_pacing_check(sk)) + break; + tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tp->tcp_mstamp; goto repair; /* Skip network transmission */ } @@ -2342,10 +2418,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) timeout = max_t(u32, timeout, msecs_to_jiffies(10)); /* If RTO is shorter, just schedule TLP in its place. */ - tlp_time_stamp = tcp_time_stamp + timeout; + tlp_time_stamp = tcp_jiffies32 + timeout; rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { - s32 delta = rto_time_stamp - tcp_time_stamp; + s32 delta = rto_time_stamp - tcp_jiffies32; if (delta > 0) timeout = delta; } @@ -2803,7 +2879,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tp->tcp_mstamp; nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; @@ -2878,6 +2954,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == tcp_send_head(sk)) break; + + if (tcp_pacing_check(sk)) + break; + /* we could do better than to assign each time */ if (!hole) tp->retransmit_skb_hint = skb; @@ -3015,7 +3095,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); - skb_mstamp_get(&skb->skb_mstamp); + tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -3111,10 +3191,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); + skb->skb_mstamp = cookie_init_timestamp(req); else #endif - skb_mstamp_get(&skb->skb_mstamp); + skb->skb_mstamp = tcp_clock_us(); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); @@ -3244,7 +3324,7 @@ static void tcp_connect_init(struct sock *sk) if (likely(!tp->repair)) tp->rcv_nxt = 0; else - tp->rcv_tstamp = tcp_time_stamp; + tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; @@ -3373,7 +3453,8 @@ int tcp_connect(struct sock *sk) return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - tp->retrans_stamp = tcp_time_stamp; + tcp_mstamp_refresh(tp); + tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); @@ -3492,7 +3573,6 @@ void tcp_send_ack(struct sock *sk) skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ - skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); } EXPORT_SYMBOL_GPL(tcp_send_ack); @@ -3526,15 +3606,16 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); - skb_mstamp_get(&skb->skb_mstamp); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } +/* Called from setsockopt( ... TCP_REPAIR ) */ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; + tcp_mstamp_refresh(tcp_sk(sk)); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c6a9fa894646..ad99569d4c1e 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -78,7 +78,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - if (!scb->tx.delivered_mstamp.v64) + if (!scb->tx.delivered_mstamp) return; if (!rs->prior_delivered || @@ -89,9 +89,9 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, rs->is_retrans = scb->sacked & TCPCB_RETRANS; /* Find the duration of the "send phase" of this window: */ - rs->interval_us = skb_mstamp_us_delta( - &skb->skb_mstamp, - &scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp_us_delta( + skb->skb_mstamp, + scb->tx.first_tx_mstamp); /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = skb->skb_mstamp; @@ -101,7 +101,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, * we don't need to reset since it'll be freed soon. */ if (scb->sacked & TCPCB_SACKED_ACKED) - scb->tx.delivered_mstamp.v64 = 0; + scb->tx.delivered_mstamp = 0; } /* Update the connection delivery information and generate a rate sample. */ @@ -125,7 +125,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp.v64) { + if (!rs->prior_mstamp) { rs->delivered = -1; rs->interval_us = -1; return; @@ -138,8 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp, - &rs->prior_mstamp); /* ack phase */ + ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); /* Normally we expect interval_us >= min-rtt. diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 362b8c75bfab..fe9a493d0208 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -17,12 +17,9 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } -static bool tcp_rack_sent_after(const struct skb_mstamp *t1, - const struct skb_mstamp *t2, - u32 seq1, u32 seq2) +static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { - return skb_mstamp_after(t1, t2) || - (t1->v64 == t2->v64 && after(seq1, seq2)); + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): @@ -72,14 +69,14 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) scb->sacked & TCPCB_SACKED_ACKED) continue; - if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, + if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, tp->rack.end_seq, scb->end_seq)) { /* Step 3 in draft-cheng-tcpm-rack-00.txt: * A packet is lost if its elapsed time is beyond * the recent RTT plus the reordering window. */ - u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp, - &skb->skb_mstamp); + u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, + skb->skb_mstamp); s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; if (remaining < 0) { @@ -127,16 +124,16 @@ void tcp_rack_mark_lost(struct sock *sk) * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time) + u64 xmit_time) { u32 rtt_us; - if (tp->rack.mstamp.v64 && - !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, + if (tp->rack.mstamp && + !tcp_rack_sent_after(xmit_time, tp->rack.mstamp, end_seq, tp->rack.end_seq)) return; - rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time); + rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); if (sacked & TCPCB_RETRANS) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior @@ -152,7 +149,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, return; } tp->rack.rtt_us = rtt_us; - tp->rack.mstamp = *xmit_time; + tp->rack.mstamp = xmit_time; tp->rack.end_seq = end_seq; tp->rack.advanced = 1; } @@ -166,7 +163,6 @@ void tcp_rack_reo_timeout(struct sock *sk) u32 timeout, prior_inflight; prior_inflight = tcp_packets_in_flight(tp); - skb_mstamp_get(&tp->tcp_mstamp); tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 14672543cf0b..c4a35ba7f8ed 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -63,7 +63,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. */ @@ -73,7 +73,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; @@ -115,7 +115,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct net *net = sock_net(sk); @@ -153,8 +153,8 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int timeout, bool syn_set) { - unsigned int linear_backoff_thresh, start_ts; unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -172,7 +172,7 @@ static bool retransmits_timed_out(struct sock *sk, timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; } - return (tcp_time_stamp - start_ts) >= timeout; + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); } /* A write timeout has occurred. Process the after effects. */ @@ -339,9 +339,10 @@ static void tcp_probe_timer(struct sock *sk) */ start_ts = tcp_skb_timestamp(tcp_send_head(sk)); if (!start_ts) - skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp); + tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) + (s32)(tcp_time_stamp(tp) - start_ts) > + jiffies_to_msecs(icsk->icsk_user_timeout)) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; @@ -451,7 +452,7 @@ void tcp_retransmit_timer(struct sock *sk) tp->snd_una, tp->snd_nxt); } #endif - if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { tcp_write_err(sk); goto out; } @@ -561,6 +562,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } + tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; switch (event) { @@ -710,4 +712,7 @@ void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); + hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); + tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; } diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 9775453b8d17..bec9cafbe3f9 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -68,7 +68,7 @@ static void tcp_westwood_init(struct sock *sk) w->cumul_ack = 0; w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; - w->rtt_win_sx = tcp_time_stamp; + w->rtt_win_sx = tcp_jiffies32; w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 1; } @@ -116,7 +116,7 @@ static void tcp_westwood_pkts_acked(struct sock *sk, static void westwood_update_window(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); - s32 delta = tcp_time_stamp - w->rtt_win_sx; + s32 delta = tcp_jiffies32 - w->rtt_win_sx; /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first @@ -140,7 +140,7 @@ static void westwood_update_window(struct sock *sk) westwood_filter(w, delta); w->bk = 0; - w->rtt_win_sx = tcp_time_stamp; + w->rtt_win_sx = tcp_jiffies32; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1d6219bf2d6b..fdcb7437cc15 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1164,22 +1164,32 @@ out: } /* fully reclaim rmem/fwd memory allocated for skb */ -static void udp_rmem_release(struct sock *sk, int size, int partial) +static void udp_rmem_release(struct sock *sk, int size, int partial, + bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); + struct sk_buff_head *sk_queue; int amt; if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; if (size < (sk->sk_rcvbuf >> 2) && - !skb_queue_empty(&sk->sk_receive_queue)) + !skb_queue_empty(&up->reader_queue)) return; } else { size += up->forward_deficit; } up->forward_deficit = 0; + /* acquire the sk_receive_queue for fwd allocated memory scheduling, + * if the called don't held it already + */ + sk_queue = &sk->sk_receive_queue; + if (!rx_queue_lock_held) + spin_lock(&sk_queue->lock); + + sk->sk_forward_alloc += size; amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1); sk->sk_forward_alloc -= amt; @@ -1188,19 +1198,31 @@ static void udp_rmem_release(struct sock *sk, int size, int partial) __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT); atomic_sub(size, &sk->sk_rmem_alloc); + + /* this can save us from acquiring the rx queue lock on next receive */ + skb_queue_splice_tail_init(sk_queue, &up->reader_queue); + + if (!rx_queue_lock_held) + spin_unlock(&sk_queue->lock); } -/* Note: called with sk_receive_queue.lock held. +/* Note: called with reader_queue.lock held. * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch * This avoids a cache line miss while receive_queue lock is held. * Look at __udp_enqueue_schedule_skb() to find where this copy is done. */ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) { - udp_rmem_release(sk, skb->dev_scratch, 1); + udp_rmem_release(sk, skb->dev_scratch, 1, false); } EXPORT_SYMBOL(udp_skb_destructor); +/* as above, but the caller held the rx queue lock, too */ +static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) +{ + udp_rmem_release(sk, skb->dev_scratch, 1, true); +} + /* Idea of busylocks is to let producers grab an extra spinlock * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line @@ -1306,14 +1328,16 @@ EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); void udp_destruct_sock(struct sock *sk) { /* reclaim completely the forward allocated memory */ + struct udp_sock *up = udp_sk(sk); unsigned int total = 0; struct sk_buff *skb; - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue); + while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) { total += skb->truesize; kfree_skb(skb); } - udp_rmem_release(sk, total, 0); + udp_rmem_release(sk, total, 0, true); inet_sock_destruct(sk); } @@ -1321,6 +1345,7 @@ EXPORT_SYMBOL_GPL(udp_destruct_sock); int udp_init_sock(struct sock *sk) { + skb_queue_head_init(&udp_sk(sk)->reader_queue); sk->sk_destruct = udp_destruct_sock; return 0; } @@ -1338,6 +1363,26 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) } EXPORT_SYMBOL_GPL(skb_consume_udp); +static struct sk_buff *__first_packet_length(struct sock *sk, + struct sk_buff_head *rcvq, + int *total) +{ + struct sk_buff *skb; + + while ((skb = skb_peek(rcvq)) != NULL && + udp_lib_checksum_complete(skb)) { + __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, + IS_UDPLITE(sk)); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + atomic_inc(&sk->sk_drops); + __skb_unlink(skb, rcvq); + *total += skb->truesize; + kfree_skb(skb); + } + return skb; +} + /** * first_packet_length - return length of first packet in receive queue * @sk: socket @@ -1347,26 +1392,24 @@ EXPORT_SYMBOL_GPL(skb_consume_udp); */ static int first_packet_length(struct sock *sk) { - struct sk_buff_head *rcvq = &sk->sk_receive_queue; + struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; + struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff *skb; int total = 0; int res; spin_lock_bh(&rcvq->lock); - while ((skb = skb_peek(rcvq)) != NULL && - udp_lib_checksum_complete(skb)) { - __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, - IS_UDPLITE(sk)); - __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); - __skb_unlink(skb, rcvq); - total += skb->truesize; - kfree_skb(skb); + skb = __first_packet_length(sk, rcvq, &total); + if (!skb && !skb_queue_empty(sk_queue)) { + spin_lock(&sk_queue->lock); + skb_queue_splice_tail_init(sk_queue, rcvq); + spin_unlock(&sk_queue->lock); + + skb = __first_packet_length(sk, rcvq, &total); } res = skb ? skb->len : -1; if (total) - udp_rmem_release(sk, total, 1); + udp_rmem_release(sk, total, 1, false); spin_unlock_bh(&rcvq->lock); return res; } @@ -1400,6 +1443,77 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) } EXPORT_SYMBOL(udp_ioctl); +struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, + int noblock, int *peeked, int *off, int *err) +{ + struct sk_buff_head *sk_queue = &sk->sk_receive_queue; + struct sk_buff_head *queue; + struct sk_buff *last; + long timeo; + int error; + + queue = &udp_sk(sk)->reader_queue; + flags |= noblock ? MSG_DONTWAIT : 0; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + do { + struct sk_buff *skb; + + error = sock_error(sk); + if (error) + break; + + error = -EAGAIN; + *peeked = 0; + do { + spin_lock_bh(&queue->lock); + skb = __skb_try_recv_from_queue(sk, queue, flags, + udp_skb_destructor, + peeked, off, err, + &last); + if (skb) { + spin_unlock_bh(&queue->lock); + return skb; + } + + if (skb_queue_empty(sk_queue)) { + spin_unlock_bh(&queue->lock); + goto busy_check; + } + + /* refill the reader queue and walk it again + * keep both queues locked to avoid re-acquiring + * the sk_receive_queue lock if fwd memory scheduling + * is needed. + */ + spin_lock(&sk_queue->lock); + skb_queue_splice_tail_init(sk_queue, queue); + + skb = __skb_try_recv_from_queue(sk, queue, flags, + udp_skb_dtor_locked, + peeked, off, err, + &last); + spin_unlock(&sk_queue->lock); + spin_unlock_bh(&queue->lock); + if (skb) + return skb; + +busy_check: + if (!sk_can_busy_loop(sk)) + break; + + sk_busy_loop(sk, flags & MSG_DONTWAIT); + } while (!skb_queue_empty(sk_queue)); + + /* sk_queue is empty, reader_queue may contain peeked packets */ + } while (timeo && + !__skb_wait_for_more_packets(sk, &error, &timeo, + (struct sk_buff *)sk_queue)); + + *err = error; + return NULL; +} +EXPORT_SYMBOL_GPL(__skb_recv_udp); + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -1490,7 +1604,8 @@ try_again: return err; csum_copy_err: - if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) { + if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, + udp_skb_destructor)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2325,6 +2440,9 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; + if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) + mask |= POLLIN | POLLRDNORM; + sock_rps_record_flow(sk); /* Check for false positives due to checksum errors */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6a4fb1e629fb..25443fd946a8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2280,7 +2280,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, cfg.fc_flags |= RTF_NONEXTHOP; #endif - ip6_route_add(&cfg); + ip6_route_add(&cfg, NULL); } @@ -2335,7 +2335,7 @@ static void addrconf_add_mroute(struct net_device *dev) ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); - ip6_route_add(&cfg); + ip6_route_add(&cfg, NULL); } static struct inet6_dev *addrconf_add_dev(struct net_device *dev) diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index 9ea249b9451e..6de3c04b0f30 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -14,6 +14,8 @@ #include <net/udp.h> #include <net/udp_tunnel.h> +#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL) + static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, struct flowi6 *fl6, u8 *protocol, __be16 sport) { @@ -33,8 +35,8 @@ static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, *protocol = IPPROTO_UDP; } -int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi6 *fl6) +static int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6) { __be16 sport; int err; @@ -49,10 +51,9 @@ int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } -EXPORT_SYMBOL(fou6_build_header); -int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi6 *fl6) +static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6) { __be16 sport; int err; @@ -67,9 +68,6 @@ int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } -EXPORT_SYMBOL(gue6_build_header); - -#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL) static const struct ip6_tnl_encap_ops fou_ip6tun_ops = { .encap_hlen = fou_encap_hlen, diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index b3df03e3faa0..f4a413aba423 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -91,7 +91,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); - return -EINVAL; + return err; } static int ila_input(struct sk_buff *skb) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d4bf2c68a545..deea901746c8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -473,7 +473,8 @@ out: static struct fib6_node *fib6_add_1(struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, - int replace_required, int sernum) + int replace_required, int sernum, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -497,6 +498,8 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { + NL_SET_ERR_MSG(extack, + "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } @@ -543,6 +546,8 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, * That would keep IPv6 consistent with IPv4 */ if (replace_required) { + NL_SET_ERR_MSG(extack, + "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } @@ -964,7 +969,8 @@ void fib6_force_start_gc(struct net *net) */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc) + struct nl_info *info, struct mx6_config *mxc, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; @@ -987,7 +993,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required, sernum); + replace_required, sernum, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -1028,7 +1034,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum); + allow_create, replace_required, sernum, + extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated @@ -1047,7 +1054,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum); + allow_create, replace_required, sernum, + extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index eedee5d108d9..f63b18e05c69 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -220,9 +220,6 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook) __be16 fo; u8 proto; - if (skb->csum_bad) - return false; - if (skb_csum_unnecessary(skb)) return true; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dc61b0b5e64e..80bda31ffbbe 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -938,14 +938,15 @@ EXPORT_SYMBOL(rt6_lookup); */ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, - struct mx6_config *mxc) + struct mx6_config *mxc, + struct netlink_ext_ack *extack) { int err; struct fib6_table *table; table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info, mxc); + err = fib6_add(&table->tb6_root, rt, info, mxc, extack); write_unlock_bh(&table->tb6_lock); return err; @@ -956,7 +957,7 @@ int ip6_ins_rt(struct rt6_info *rt) struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; struct mx6_config mxc = { .mx = NULL, }; - return __ip6_ins_rt(rt, &info, &mxc); + return __ip6_ins_rt(rt, &info, &mxc, NULL); } static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, @@ -1844,7 +1845,8 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, return rt; } -static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) +static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct rt6_info *rt = NULL; @@ -1855,14 +1857,25 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) int err = -EINVAL; /* RTF_PCPU is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_PCPU) + if (cfg->fc_flags & RTF_PCPU) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); goto out; + } - if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) + if (cfg->fc_dst_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + goto out; + } + if (cfg->fc_src_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid source address length"); goto out; + } #ifndef CONFIG_IPV6_SUBTREES - if (cfg->fc_src_len) + if (cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, + "Specifying source address requires IPV6_SUBTREES to be enabled"); goto out; + } #endif if (cfg->fc_ifindex) { err = -ENODEV; @@ -2013,9 +2026,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) err = -EINVAL; if (ipv6_chk_addr_and_flags(net, gw_addr, gwa_type & IPV6_ADDR_LINKLOCAL ? - dev : NULL, 0, 0)) + dev : NULL, 0, 0)) { + NL_SET_ERR_MSG(extack, "Invalid gateway address"); goto out; - + } rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { @@ -2031,8 +2045,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) addressing */ if (!(gwa_type & (IPV6_ADDR_UNICAST | - IPV6_ADDR_MAPPED))) + IPV6_ADDR_MAPPED))) { + NL_SET_ERR_MSG(extack, + "Invalid gateway address"); goto out; + } if (cfg->fc_table) { grt = ip6_nh_lookup_table(net, cfg, gw_addr); @@ -2072,8 +2089,14 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) goto out; } err = -EINVAL; - if (!dev || (dev->flags & IFF_LOOPBACK)) + if (!dev) { + NL_SET_ERR_MSG(extack, "Egress device not specified"); goto out; + } else if (dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, + "Egress device can not be loopback device for this route"); + goto out; + } } err = -ENODEV; @@ -2082,6 +2105,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!ipv6_addr_any(&cfg->fc_prefsrc)) { if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { + NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; goto out; } @@ -2111,13 +2135,14 @@ out: return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg) +int ip6_route_add(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct mx6_config mxc = { .mx = NULL, }; struct rt6_info *rt; int err; - rt = ip6_route_info_create(cfg); + rt = ip6_route_info_create(cfg, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -2128,7 +2153,7 @@ int ip6_route_add(struct fib6_config *cfg) if (err) goto out; - err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); kfree(mxc.mx); @@ -2222,7 +2247,8 @@ out_put: return err; } -static int ip6_route_del(struct fib6_config *cfg) +static int ip6_route_del(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct fib6_table *table; struct fib6_node *fn; @@ -2230,8 +2256,10 @@ static int ip6_route_del(struct fib6_config *cfg) int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); - if (!table) + if (!table) { + NL_SET_ERR_MSG(extack, "FIB table does not exist"); return err; + } read_lock_bh(&table->tb6_lock); @@ -2483,7 +2511,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; - ip6_route_add(&cfg); + ip6_route_add(&cfg, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } @@ -2529,7 +2557,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, cfg.fc_gateway = *gwaddr; - if (!ip6_route_add(&cfg)) { + if (!ip6_route_add(&cfg, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); @@ -2622,10 +2650,10 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - err = ip6_route_add(&cfg); + err = ip6_route_add(&cfg, NULL); break; case SIOCDELRT: - err = ip6_route_del(&cfg); + err = ip6_route_del(&cfg, NULL); break; default: err = -EINVAL; @@ -2903,7 +2931,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, - struct fib6_config *cfg) + struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; @@ -3097,7 +3126,8 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); } -static int ip6_route_multipath_add(struct fib6_config *cfg) +static int ip6_route_multipath_add(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct rt6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; @@ -3145,7 +3175,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) r_cfg.fc_encap_type = nla_get_u16(nla); } - rt = ip6_route_info_create(&r_cfg); + rt = ip6_route_info_create(&r_cfg, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -3170,7 +3200,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { rt_last = nh->rt6_info; - err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc); + err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); /* save reference to first route for notification */ if (!rt_notif && !err) rt_notif = nh->rt6_info; @@ -3212,7 +3242,7 @@ add_errout: list_for_each_entry(nh, &rt6_nh_list, next) { if (err_nh == nh) break; - ip6_route_del(&nh->r_cfg); + ip6_route_del(&nh->r_cfg, extack); } cleanup: @@ -3227,7 +3257,8 @@ cleanup: return err; } -static int ip6_route_multipath_del(struct fib6_config *cfg) +static int ip6_route_multipath_del(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { struct fib6_config r_cfg; struct rtnexthop *rtnh; @@ -3254,7 +3285,7 @@ static int ip6_route_multipath_del(struct fib6_config *cfg) r_cfg.fc_flags |= RTF_GATEWAY; } } - err = ip6_route_del(&r_cfg); + err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; @@ -3270,15 +3301,15 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config cfg; int err; - err = rtm_to_fib6_config(skb, nlh, &cfg); + err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_mp) - return ip6_route_multipath_del(&cfg); + return ip6_route_multipath_del(&cfg, extack); else { cfg.fc_delete_all_nh = 1; - return ip6_route_del(&cfg); + return ip6_route_del(&cfg, extack); } } @@ -3288,14 +3319,14 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config cfg; int err; - err = rtm_to_fib6_config(skb, nlh, &cfg); + err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_mp) - return ip6_route_multipath_add(&cfg); + return ip6_route_multipath_add(&cfg, extack); else - return ip6_route_add(&cfg); + return ip6_route_add(&cfg, extack); } static size_t rt6_nlmsg_size(struct rt6_info *rt) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 5f44ffed2576..15fba55e3da8 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -303,13 +303,9 @@ static int seg6_genl_dumphmac_done(struct netlink_callback *cb) static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; - struct net *net = sock_net(skb->sk); - struct seg6_pernet_data *sdata; struct seg6_hmac_info *hinfo; int ret; - sdata = seg6_pernet(net); - ret = rhashtable_walk_start(iter); if (ret && ret != -EAGAIN) goto done; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 5abc3692b901..971823359f5b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -211,7 +211,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack.v64 = 0; + treq->snt_synack = 0; treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; treq->ts_off = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4f4310a36a04..233edfabe1db 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -949,7 +949,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp + tcptw->tw_ts_offset, + tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); @@ -971,7 +971,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp + tcp_rsk(req)->ts_off, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 06ec39b79609..2e9b52bded2d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -455,7 +455,8 @@ try_again: return err; csum_copy_err: - if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) { + if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, + udp_skb_destructor)) { if (is_udp4) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index deca20fb2ce2..da49191f7ad0 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1985,7 +1985,7 @@ static int kcm_create(struct net *net, struct socket *sock, return 0; } -static struct net_proto_family kcm_family_ops = { +static const struct net_proto_family kcm_family_ops = { .family = PF_KCM, .create = kcm_create, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index a504e87c6ddf..49bd8bb16b18 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -152,7 +152,7 @@ void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; - opts->tsval = tcp_time_stamp & ~0x3f; + opts->tsval = tcp_time_stamp_raw() & ~0x3f; if (opts->options & XT_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 54e40fa47822..d3e594eb36d0 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -48,7 +48,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto, return rc; } -static struct net_proto_family nfc_sock_family_ops = { +static const struct net_proto_family nfc_sock_family_ops = { .owner = THIS_MODULE, .family = PF_NFC, .create = nfc_sock_create, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7b17da9a94a0..9ddc9f8412a2 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -453,7 +453,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, /* Complete checksum if needed */ if (skb->ip_summed == CHECKSUM_PARTIAL && - (err = skb_checksum_help(skb))) + (err = skb_csum_hwoffload_help(skb, 0))) goto out; /* Older versions of OVS user space enforce alignment of the last diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a90e8f355c00..0ecf2a858767 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -28,6 +28,31 @@ #include <net/act_api.h> #include <net/netlink.h> +static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) +{ + u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK; + + if (!tp) + return -EINVAL; + a->goto_chain = tcf_chain_get(tp->chain->block, chain_index); + if (!a->goto_chain) + return -ENOMEM; + return 0; +} + +static void tcf_action_goto_chain_fini(struct tc_action *a) +{ + tcf_chain_put(a->goto_chain); +} + +static void tcf_action_goto_chain_exec(const struct tc_action *a, + struct tcf_result *res) +{ + const struct tcf_chain *chain = a->goto_chain; + + res->goto_tp = rcu_dereference_bh(chain->filter_chain); +} + static void free_tcf(struct rcu_head *head) { struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); @@ -39,6 +64,8 @@ static void free_tcf(struct rcu_head *head) kfree(p->act_cookie->data); kfree(p->act_cookie); } + if (p->goto_chain) + tcf_action_goto_chain_fini(p); kfree(p); } @@ -465,6 +492,8 @@ repeat: else /* faulty graph, stop pipeline */ return TC_ACT_OK; } + } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { + tcf_action_goto_chain_exec(a, res); } if (ret != TC_ACT_PIPE) @@ -570,9 +599,9 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } -struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, - int bind) +struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, + struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind) { struct tc_action *a; struct tc_action_ops *a_o; @@ -657,6 +686,17 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, if (err != ACT_P_CREATED) module_put(a_o->owner); + if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { + err = tcf_action_goto_chain_init(a, tp); + if (err) { + LIST_HEAD(actions); + + list_add_tail(&a->list, &actions); + tcf_action_destroy(&actions, bind); + return ERR_PTR(err); + } + } + return a; err_mod: @@ -680,8 +720,9 @@ static void cleanup_a(struct list_head *actions, int ovr) a->tcfa_refcnt--; } -int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, struct list_head *actions) +int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, int bind, + struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -693,7 +734,7 @@ int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); + act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1020,7 +1061,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, int ret = 0; LIST_HEAD(actions); - ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions); if (ret) return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index ab6fdbd34db7..3317a2f579da 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -350,6 +350,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl, sctph->checksum = sctp_compute_cksum(skb, skb_network_offset(skb) + ihl); skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; return 1; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 22f88b35a546..4020b8d932a1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -106,13 +106,12 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, - struct tcf_proto __rcu **chain, int event) + struct tcf_chain *chain, int event) { - struct tcf_proto __rcu **it_chain; struct tcf_proto *tp; - for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL; - it_chain = &tp->next) + for (tp = rtnl_dereference(chain->filter_chain); + tp; tp = rtnl_dereference(tp->next)) tfilter_notify(net, oskb, n, tp, 0, event, false); } @@ -125,11 +124,12 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) if (tp) first = tp->prio - 1; - return first; + return TC_H_MAJ(first); } static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, - u32 prio, u32 parent, struct Qdisc *q) + u32 prio, u32 parent, struct Qdisc *q, + struct tcf_chain *chain) { struct tcf_proto *tp; int err; @@ -165,6 +165,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->prio = prio; tp->classid = parent; tp->q = q; + tp->chain = chain; err = tp->ops->init(tp); if (err) { @@ -185,16 +186,213 @@ static void tcf_proto_destroy(struct tcf_proto *tp) kfree_rcu(tp, rcu); } -void tcf_destroy_chain(struct tcf_proto __rcu **fl) +static struct tcf_chain *tcf_chain_create(struct tcf_block *block, + u32 chain_index) +{ + struct tcf_chain *chain; + + chain = kzalloc(sizeof(*chain), GFP_KERNEL); + if (!chain) + return NULL; + list_add_tail(&chain->list, &block->chain_list); + chain->block = block; + chain->index = chain_index; + chain->refcnt = 1; + return chain; +} + +static void tcf_chain_destroy(struct tcf_chain *chain) { struct tcf_proto *tp; - while ((tp = rtnl_dereference(*fl)) != NULL) { - RCU_INIT_POINTER(*fl, tp->next); + list_del(&chain->list); + while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { + RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp); } + kfree(chain); +} + +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain *chain; + + list_for_each_entry(chain, &block->chain_list, list) { + if (chain->index == chain_index) { + chain->refcnt++; + return chain; + } + } + return tcf_chain_create(block, chain_index); +} +EXPORT_SYMBOL(tcf_chain_get); + +void tcf_chain_put(struct tcf_chain *chain) +{ + /* Destroy unused chain, with exception of chain 0, which is the + * default one and has to be always present. + */ + if (--chain->refcnt == 0 && !chain->filter_chain && chain->index != 0) + tcf_chain_destroy(chain); +} +EXPORT_SYMBOL(tcf_chain_put); + +static void +tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, + struct tcf_proto __rcu **p_filter_chain) +{ + chain->p_filter_chain = p_filter_chain; +} + +int tcf_block_get(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain) +{ + struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); + struct tcf_chain *chain; + int err; + + if (!block) + return -ENOMEM; + INIT_LIST_HEAD(&block->chain_list); + /* Create chain 0 by default, it has to be always present. */ + chain = tcf_chain_create(block, 0); + if (!chain) { + err = -ENOMEM; + goto err_chain_create; + } + tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + *p_block = block; + return 0; + +err_chain_create: + kfree(block); + return err; +} +EXPORT_SYMBOL(tcf_block_get); + +void tcf_block_put(struct tcf_block *block) +{ + struct tcf_chain *chain, *tmp; + + if (!block) + return; + + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_destroy(chain); + kfree(block); +} +EXPORT_SYMBOL(tcf_block_put); + +/* Main classifier routine: scans classifier chain attached + * to this qdisc, (optionally) tests for protocol and asks + * specific classifiers. + */ +int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) +{ + __be16 protocol = tc_skb_protocol(skb); +#ifdef CONFIG_NET_CLS_ACT + const int max_reclassify_loop = 4; + const struct tcf_proto *old_tp = tp; + int limit = 0; + +reclassify: +#endif + for (; tp; tp = rcu_dereference_bh(tp->next)) { + int err; + + if (tp->protocol != protocol && + tp->protocol != htons(ETH_P_ALL)) + continue; + + err = tp->classify(skb, tp, res); +#ifdef CONFIG_NET_CLS_ACT + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { + goto reset; + } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { + old_tp = res->goto_tp; + goto reset; + } +#endif + if (err >= 0) + return err; + } + + return TC_ACT_UNSPEC; /* signal: continue lookup */ +#ifdef CONFIG_NET_CLS_ACT +reset: + if (unlikely(limit++ >= max_reclassify_loop)) { + net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", + tp->q->ops->id, tp->prio & 0xffff, + ntohs(tp->protocol)); + return TC_ACT_SHOT; + } + + tp = old_tp; + protocol = tc_skb_protocol(skb); + goto reclassify; +#endif +} +EXPORT_SYMBOL(tcf_classify); + +struct tcf_chain_info { + struct tcf_proto __rcu **pprev; + struct tcf_proto __rcu *next; +}; + +static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain_info *chain_info) +{ + return rtnl_dereference(*chain_info->pprev); +} + +static void tcf_chain_tp_insert(struct tcf_chain *chain, + struct tcf_chain_info *chain_info, + struct tcf_proto *tp) +{ + if (chain->p_filter_chain && + *chain_info->pprev == chain->filter_chain) + *chain->p_filter_chain = tp; + RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); + rcu_assign_pointer(*chain_info->pprev, tp); +} + +static void tcf_chain_tp_remove(struct tcf_chain *chain, + struct tcf_chain_info *chain_info, + struct tcf_proto *tp) +{ + struct tcf_proto *next = rtnl_dereference(chain_info->next); + + if (chain->p_filter_chain && tp == chain->filter_chain) + *chain->p_filter_chain = next; + RCU_INIT_POINTER(*chain_info->pprev, next); +} + +static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, + struct tcf_chain_info *chain_info, + u32 protocol, u32 prio, + bool prio_allocate) +{ + struct tcf_proto **pprev; + struct tcf_proto *tp; + + /* Check the chain for existence of proto-tcf with this priority */ + for (pprev = &chain->filter_chain; + (tp = rtnl_dereference(*pprev)); pprev = &tp->next) { + if (tp->prio >= prio) { + if (tp->prio == prio) { + if (prio_allocate || + (tp->protocol != protocol && protocol)) + return ERR_PTR(-EINVAL); + } else { + tp = NULL; + } + break; + } + } + chain_info->pprev = pprev; + chain_info->next = tp ? tp->next : NULL; + return tp; } -EXPORT_SYMBOL(tcf_destroy_chain); /* Add/change/delete/get a filter node */ @@ -206,13 +404,14 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct tcmsg *t; u32 protocol; u32 prio; - u32 nprio; + bool prio_allocate; u32 parent; + u32 chain_index; struct net_device *dev; struct Qdisc *q; - struct tcf_proto __rcu **back; - struct tcf_proto __rcu **chain; - struct tcf_proto *next; + struct tcf_chain_info chain_info; + struct tcf_chain *chain = NULL; + struct tcf_block *block; struct tcf_proto *tp; const struct Qdisc_class_ops *cops; unsigned long cl; @@ -234,7 +433,7 @@ replay: t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); - nprio = prio; + prio_allocate = false; parent = t->tcm_parent; cl = 0; @@ -250,6 +449,7 @@ replay: */ if (n->nlmsg_flags & NLM_F_CREATE) { prio = TC_H_MAKE(0x80000000U, 0U); + prio_allocate = true; break; } /* fall-through */ @@ -280,7 +480,7 @@ replay: if (!cops) return -EINVAL; - if (cops->tcf_chain == NULL) + if (!cops->tcf_block) return -EOPNOTSUPP; /* Do we search for filter, attached to class? */ @@ -291,34 +491,35 @@ replay: } /* And the last stroke */ - chain = cops->tcf_chain(q, cl); - if (chain == NULL) { + block = cops->tcf_block(q, cl); + if (!block) { + err = -EINVAL; + goto errout; + } + + chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { err = -EINVAL; goto errout; } + chain = tcf_chain_get(block, chain_index); + if (!chain) { + err = -ENOMEM; + goto errout; + } + if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); - tcf_destroy_chain(chain); + tcf_chain_destroy(chain); err = 0; goto errout; } - /* Check the chain for existence of proto-tcf with this priority */ - for (back = chain; - (tp = rtnl_dereference(*back)) != NULL; - back = &tp->next) { - if (tp->prio >= prio) { - if (tp->prio == prio) { - if (!nprio || - (tp->protocol != protocol && protocol)) { - err = -EINVAL; - goto errout; - } - } else { - tp = NULL; - } - break; - } + tp = tcf_chain_tp_find(chain, &chain_info, protocol, + prio, prio_allocate); + if (IS_ERR(tp)) { + err = PTR_ERR(tp); + goto errout; } if (tp == NULL) { @@ -335,11 +536,11 @@ replay: goto errout; } - if (!nprio) - nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); + if (prio_allocate) + prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info)); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, nprio, parent, q); + protocol, prio, parent, q, chain); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; @@ -354,8 +555,7 @@ replay: if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - next = rtnl_dereference(tp->next); - RCU_INIT_POINTER(*back, next); + tcf_chain_tp_remove(chain, &chain_info, tp); tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER, false); tcf_proto_destroy(tp); @@ -384,11 +584,10 @@ replay: err = tp->ops->delete(tp, fh, &last); if (err) goto errout; - next = rtnl_dereference(tp->next); tfilter_notify(net, skb, n, tp, t->tcm_handle, RTM_DELTFILTER, false); if (last) { - RCU_INIT_POINTER(*back, next); + tcf_chain_tp_remove(chain, &chain_info, tp); tcf_proto_destroy(tp); } goto errout; @@ -405,10 +604,8 @@ replay: err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); if (err == 0) { - if (tp_created) { - RCU_INIT_POINTER(tp->next, rtnl_dereference(*back)); - rcu_assign_pointer(*back, tp); - } + if (tp_created) + tcf_chain_tp_insert(chain, &chain_info, tp); tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); } else { if (tp_created) @@ -416,6 +613,8 @@ replay: } errout: + if (chain) + tcf_chain_put(chain); if (cl) cops->put(q, cl); if (err == -EAGAIN) @@ -444,6 +643,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index)) + goto nla_put_failure; tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; @@ -500,22 +701,76 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, RTM_NEWTFILTER); } +static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, + struct netlink_callback *cb, + long index_start, long *p_index) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct tcf_dump_args arg; + struct tcf_proto *tp; + + for (tp = rtnl_dereference(chain->filter_chain); + tp; tp = rtnl_dereference(tp->next), (*p_index)++) { + if (*p_index < index_start) + continue; + if (TC_H_MAJ(tcm->tcm_info) && + TC_H_MAJ(tcm->tcm_info) != tp->prio) + continue; + if (TC_H_MIN(tcm->tcm_info) && + TC_H_MIN(tcm->tcm_info) != tp->protocol) + continue; + if (*p_index > index_start) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (cb->args[1] == 0) { + if (tcf_fill_node(net, skb, tp, 0, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER) <= 0) + return false; + + cb->args[1] = 1; + } + if (!tp->ops->walk) + continue; + arg.w.fn = tcf_node_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1] - 1; + arg.w.count = 0; + tp->ops->walk(tp, &arg.w); + cb->args[1] = arg.w.count + 1; + if (arg.w.stop) + return false; + } + return true; +} + /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int t; - int s_t; + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q; - struct tcf_proto *tp, __rcu **chain; + struct tcf_block *block; + struct tcf_chain *chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; const struct Qdisc_class_ops *cops; - struct tcf_dump_args arg; + long index_start; + long index; + int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; + + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err) + return err; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return skb->len; @@ -529,56 +784,29 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) cops = q->ops->cl_ops; if (!cops) goto errout; - if (cops->tcf_chain == NULL) + if (!cops->tcf_block) goto errout; if (TC_H_MIN(tcm->tcm_parent)) { cl = cops->get(q, tcm->tcm_parent); if (cl == 0) goto errout; } - chain = cops->tcf_chain(q, cl); - if (chain == NULL) + block = cops->tcf_block(q, cl); + if (!block) goto errout; - s_t = cb->args[0]; + index_start = cb->args[0]; + index = 0; - for (tp = rtnl_dereference(*chain), t = 0; - tp; tp = rtnl_dereference(tp->next), t++) { - if (t < s_t) - continue; - if (TC_H_MAJ(tcm->tcm_info) && - TC_H_MAJ(tcm->tcm_info) != tp->prio) - continue; - if (TC_H_MIN(tcm->tcm_info) && - TC_H_MIN(tcm->tcm_info) != tp->protocol) + list_for_each_entry(chain, &block->chain_list, list) { + if (tca[TCA_CHAIN] && + nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; - if (t > s_t) - memset(&cb->args[1], 0, - sizeof(cb->args)-sizeof(cb->args[0])); - if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, 0, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER) <= 0) - break; - - cb->args[1] = 1; - } - if (tp->ops->walk == NULL) - continue; - arg.w.fn = tcf_node_dump; - arg.skb = skb; - arg.cb = cb; - arg.w.stop = 0; - arg.w.skip = cb->args[1] - 1; - arg.w.count = 0; - tp->ops->walk(tp, &arg.w); - cb->args[1] = arg.w.count + 1; - if (arg.w.stop) + if (!tcf_chain_dump(chain, skb, cb, index_start, &index)) break; } - cb->args[0] = t; + cb->args[0] = index; errout: if (cl) @@ -608,8 +836,9 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct tc_action *act; if (exts->police && tb[exts->police]) { - act = tcf_action_init_1(net, tb[exts->police], rate_tlv, - "police", ovr, TCA_ACT_BIND); + act = tcf_action_init_1(net, tp, tb[exts->police], + rate_tlv, "police", ovr, + TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); @@ -620,8 +849,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, LIST_HEAD(actions); int err, i = 0; - err = tcf_action_init(net, tb[exts->action], rate_tlv, - NULL, ovr, TCA_ACT_BIND, + err = tcf_action_init(net, tp, tb[exts->action], + rate_tlv, NULL, ovr, TCA_ACT_BIND, &actions); if (err) return err; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index e88342fde1bc..5d95401bbc02 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -163,7 +163,7 @@ int register_qdisc(struct Qdisc_ops *qops) if (!(cops->get && cops->put && cops->walk && cops->leaf)) goto out_einval; - if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf)) + if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf)) goto out_einval; } @@ -1878,54 +1878,6 @@ done: return skb->len; } -/* Main classifier routine: scans classifier chain attached - * to this qdisc, (optionally) tests for protocol and asks - * specific classifiers. - */ -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode) -{ - __be16 protocol = tc_skb_protocol(skb); -#ifdef CONFIG_NET_CLS_ACT - const int max_reclassify_loop = 4; - const struct tcf_proto *old_tp = tp; - int limit = 0; - -reclassify: -#endif - for (; tp; tp = rcu_dereference_bh(tp->next)) { - int err; - - if (tp->protocol != protocol && - tp->protocol != htons(ETH_P_ALL)) - continue; - - err = tp->classify(skb, tp, res); -#ifdef CONFIG_NET_CLS_ACT - if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) - goto reset; -#endif - if (err >= 0) - return err; - } - - return TC_ACT_UNSPEC; /* signal: continue lookup */ -#ifdef CONFIG_NET_CLS_ACT -reset: - if (unlikely(limit++ >= max_reclassify_loop)) { - net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", - tp->q->ops->id, tp->prio & 0xffff, - ntohs(tp->protocol)); - return TC_ACT_SHOT; - } - - tp = old_tp; - protocol = tc_skb_protocol(skb); - goto reclassify; -#endif -} -EXPORT_SYMBOL(tc_classify); - #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 40cbceed4de8..f435546c3864 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -43,6 +43,7 @@ struct atm_flow_data { struct Qdisc *q; /* FIFO, TBF, etc. */ struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); /* chaining */ @@ -143,7 +144,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl) list_del_init(&flow->list); pr_debug("atm_tc_put: qdisc %p\n", flow->q); qdisc_destroy(flow->q); - tcf_destroy_chain(&flow->filter_list); + tcf_block_put(flow->block); if (flow->sock) { pr_debug("atm_tc_put: f_count %ld\n", file_count(flow->sock->file)); @@ -274,7 +275,13 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, error = -ENOBUFS; goto err_out; } - RCU_INIT_POINTER(flow->filter_list, NULL); + + error = tcf_block_get(&flow->block, &flow->filter_list); + if (error) { + kfree(flow); + goto err_out; + } + flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!flow->q) flow->q = &noop_qdisc; @@ -346,14 +353,13 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - return flow ? &flow->filter_list : &p->link.filter_list; + return flow ? flow->block : p->link.block; } /* --------------------------- Qdisc operations ---------------------------- */ @@ -377,7 +383,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tc_classify(skb, fl, &res, true); + result = tcf_classify(skb, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; @@ -524,6 +530,7 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch) static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) { struct atm_qdisc_data *p = qdisc_priv(sch); + int err; pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); INIT_LIST_HEAD(&p->flows); @@ -534,7 +541,11 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) if (!p->link.q) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - RCU_INIT_POINTER(p->link.filter_list, NULL); + + err = tcf_block_get(&p->link.block, &p->link.filter_list); + if (err) + return err; + p->link.vcc = NULL; p->link.sock = NULL; p->link.classid = sch->handle; @@ -561,7 +572,7 @@ static void atm_tc_destroy(struct Qdisc *sch) pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); list_for_each_entry(flow, &p->flows, list) - tcf_destroy_chain(&flow->filter_list); + tcf_block_put(flow->block); list_for_each_entry_safe(flow, tmp, &p->flows, list) { if (flow->ref > 1) @@ -646,7 +657,7 @@ static const struct Qdisc_class_ops atm_class_ops = { .change = atm_tc_change, .delete = atm_tc_delete, .walk = atm_tc_walk, - .tcf_chain = atm_tc_find_tcf, + .tcf_block = atm_tc_tcf_block, .bind_tcf = atm_tc_bind_filter, .unbind_tcf = atm_tc_put, .dump = atm_tc_dump_class, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 7415859fd4c3..8dd6d0aca678 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -127,6 +127,7 @@ struct cbq_class { struct tc_cbq_xstats xstats; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; int refcnt; int filters; @@ -233,7 +234,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - result = tc_classify(skb, fl, &res, true); + result = tcf_classify(skb, fl, &res, true); if (!fl || result < 0) goto fallback; @@ -1405,7 +1406,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) WARN_ON(cl->filters); - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); qdisc_destroy(cl->q); qdisc_put_rtab(cl->R_tab); gen_kill_estimator(&cl->rate_est); @@ -1430,7 +1431,7 @@ static void cbq_destroy(struct Qdisc *sch) */ for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); } for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], @@ -1585,12 +1586,19 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; + err = tcf_block_get(&cl->block, &cl->filter_list); + if (err) { + kfree(cl); + return err; + } + if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { + tcf_block_put(cl->block); kfree(cl); goto failure; } @@ -1688,8 +1696,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) return 0; } -static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch, - unsigned long arg) +static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; @@ -1697,7 +1704,7 @@ static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch, if (cl == NULL) cl = &q->link; - return &cl->filter_list; + return cl->block; } static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, @@ -1756,7 +1763,7 @@ static const struct Qdisc_class_ops cbq_class_ops = { .change = cbq_change_class, .delete = cbq_delete, .walk = cbq_walk, - .tcf_chain = cbq_find_tcf, + .tcf_block = cbq_tcf_block, .bind_tcf = cbq_bind_filter, .unbind_tcf = cbq_unbind_filter, .dump = cbq_dump_class, diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 58a8c32eab23..5db2a2843c66 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -36,6 +36,7 @@ struct drr_class { struct drr_sched { struct list_head active; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct Qdisc_class_hash clhash; }; @@ -190,15 +191,14 @@ static void drr_put_class(struct Qdisc *sch, unsigned long arg) drr_destroy_class(sch, cl); } -static struct tcf_proto __rcu **drr_tcf_chain(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl) { struct drr_sched *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, @@ -333,7 +333,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res, false); + result = tcf_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -431,6 +431,9 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct drr_sched *q = qdisc_priv(sch); int err; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; @@ -462,7 +465,7 @@ static void drr_destroy_qdisc(struct Qdisc *sch) struct hlist_node *next; unsigned int i; - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], @@ -477,7 +480,7 @@ static const struct Qdisc_class_ops drr_class_ops = { .delete = drr_delete_class, .get = drr_get_class, .put = drr_put_class, - .tcf_chain = drr_tcf_chain, + .tcf_block = drr_tcf_block, .bind_tcf = drr_bind_tcf, .unbind_tcf = drr_unbind_tcf, .graft = drr_graft_class, diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 1c0f877f673a..7ccdd825d34e 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -44,6 +44,7 @@ struct mask_value { struct dsmark_qdisc_data { struct Qdisc *q; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct mask_value *mv; u16 indices; u8 set_tc_index; @@ -183,11 +184,11 @@ ignore: } } -static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl) { struct dsmark_qdisc_data *p = qdisc_priv(sch); - return &p->filter_list; + + return p->block; } /* --------------------------- Qdisc operations ---------------------------- */ @@ -234,7 +235,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tc_classify(skb, fl, &res, false); + int result = tcf_classify(skb, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); @@ -342,6 +343,10 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) goto errout; + err = tcf_block_get(&p->block, &p->filter_list); + if (err) + return err; + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL); if (err < 0) goto errout; @@ -400,7 +405,7 @@ static void dsmark_destroy(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - tcf_destroy_chain(&p->filter_list); + tcf_block_put(p->block); qdisc_destroy(p->q); if (p->mv != p->embedded) kfree(p->mv); @@ -468,7 +473,7 @@ static const struct Qdisc_class_ops dsmark_class_ops = { .change = dsmark_change, .delete = dsmark_delete, .walk = dsmark_walk, - .tcf_chain = dsmark_find_tcf, + .tcf_block = dsmark_tcf_block, .bind_tcf = dsmark_bind_filter, .unbind_tcf = dsmark_put, .dump = dsmark_dump_class, diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index b488721a0059..147fde73a0f5 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -390,9 +390,17 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->stat_tcp_retrans++; qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { + struct sock *sk = skb->sk; + fq_flow_add_tail(&q->new_flows, f); if (time_after(jiffies, f->age + q->flow_refill_delay)) f->credit = max_t(u32, f->credit, q->quantum); + if (sk && q->rate_enable) { + if (unlikely(smp_load_acquire(&sk->sk_pacing_status) != + SK_PACING_FQ)) + smp_store_release(&sk->sk_pacing_status, + SK_PACING_FQ); + } q->inactive_flows--; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9201abce928c..f201e73947fb 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -55,6 +55,7 @@ struct fq_codel_flow { struct fq_codel_sched_data { struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ u32 *backlogs; /* backlog table [flows_cnt] */ u32 flows_cnt; /* number of flows */ @@ -96,7 +97,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, filter, &res, false); + result = tcf_classify(skb, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -450,7 +451,7 @@ static void fq_codel_destroy(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); kvfree(q->backlogs); kvfree(q->flows); } @@ -459,6 +460,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) { struct fq_codel_sched_data *q = qdisc_priv(sch); int i; + int err; sch->limit = 10*1024; q->flows_cnt = 1024; @@ -478,6 +480,10 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) return err; } + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + if (!q->flows) { q->flows = kvzalloc(q->flows_cnt * sizeof(struct fq_codel_flow), GFP_KERNEL); @@ -589,14 +595,13 @@ static void fq_codel_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *fq_codel_tcf_block(struct Qdisc *sch, unsigned long cl) { struct fq_codel_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, @@ -679,7 +684,7 @@ static const struct Qdisc_class_ops fq_codel_class_ops = { .leaf = fq_codel_leaf, .get = fq_codel_get, .put = fq_codel_put, - .tcf_chain = fq_codel_find_tcf, + .tcf_block = fq_codel_tcf_block, .bind_tcf = fq_codel_bind, .unbind_tcf = fq_codel_put, .dump = fq_codel_dump_class, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 5cb82f6c1b06..a324f84b1ccd 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,6 +116,7 @@ struct hfsc_class { struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ + struct tcf_block *block; unsigned int filter_cnt; /* filter count */ unsigned int level; /* class level in hierarchy */ @@ -1040,12 +1041,19 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + err = tcf_block_get(&cl->block, &cl->filter_list); + if (err) { + kfree(cl); + return err; + } + if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { + tcf_block_put(cl->block); kfree(cl); return err; } @@ -1091,7 +1099,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) { struct hfsc_sched *q = qdisc_priv(sch); - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); qdisc_destroy(cl->qdisc); gen_kill_estimator(&cl->rate_est); if (cl != &q->root) @@ -1142,7 +1150,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: @@ -1261,8 +1269,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) cl->filter_cnt--; } -static struct tcf_proto __rcu ** -hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) +static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; @@ -1270,7 +1277,7 @@ hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) if (cl == NULL) cl = &q->root; - return &cl->filter_list; + return cl->block; } static int @@ -1515,7 +1522,7 @@ hfsc_destroy_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], @@ -1662,7 +1669,7 @@ static const struct Qdisc_class_ops hfsc_class_ops = { .put = hfsc_put_class, .bind_tcf = hfsc_bind_tcf, .unbind_tcf = hfsc_unbind_tcf, - .tcf_chain = hfsc_tcf_chain, + .tcf_block = hfsc_tcf_block, .dump = hfsc_dump_class, .dump_stats = hfsc_dump_class_stats, .walk = hfsc_walk diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 570ef3b0c09b..195bbca9eb0b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -105,6 +105,7 @@ struct htb_class { int quantum; /* but stored for parent-to-leaf return */ struct tcf_proto __rcu *filter_list; /* class attached filters */ + struct tcf_block *block; int filter_cnt; int refcnt; /* usage count of this class */ @@ -156,6 +157,7 @@ struct htb_sched { /* filters for qdisc itself */ struct tcf_proto __rcu *filter_list; + struct tcf_block *block; #define HTB_WARN_TOOMANYEVENTS 0x1 unsigned int warned; /* only one warning */ @@ -231,7 +233,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: @@ -1017,6 +1019,10 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL); if (err < 0) return err; @@ -1230,7 +1236,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) qdisc_destroy(cl->un.leaf.q); } gen_kill_estimator(&cl->rate_est); - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); kfree(cl); } @@ -1248,11 +1254,11 @@ static void htb_destroy(struct Qdisc *sch) * because filter need its target class alive to be able to call * unbind_filter on it (without Oops). */ - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) - tcf_destroy_chain(&cl->filter_list); + tcf_block_put(cl->block); } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], @@ -1396,6 +1402,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; + err = tcf_block_get(&cl->block, &cl->filter_list); + if (err) { + kfree(cl); + goto failure; + } if (htb_rate_est || tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, @@ -1403,6 +1414,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_root_sleeping_running(sch), tca[TCA_RATE] ? : &est.nla); if (err) { + tcf_block_put(cl->block); kfree(cl); goto failure; } @@ -1521,14 +1533,12 @@ failure: return err; } -static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch, - unsigned long arg) +static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - struct tcf_proto __rcu **fl = cl ? &cl->filter_list : &q->filter_list; - return fl; + return cl ? cl->block : q->block; } static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, @@ -1591,7 +1601,7 @@ static const struct Qdisc_class_ops htb_class_ops = { .change = htb_change_class, .delete = htb_delete, .walk = htb_walk, - .tcf_chain = htb_find_tcf, + .tcf_block = htb_tcf_block, .bind_tcf = htb_bind_filter, .unbind_tcf = htb_unbind_filter, .dump = htb_dump_class, diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 3bab5f66c392..d8a9bebcab90 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -18,6 +18,10 @@ #include <net/pkt_sched.h> #include <net/pkt_cls.h> +struct ingress_sched_data { + struct tcf_block *block; +}; + static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; @@ -47,16 +51,23 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { } -static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) { - struct net_device *dev = qdisc_dev(sch); + struct ingress_sched_data *q = qdisc_priv(sch); - return &dev->ingress_cl_list; + return q->block; } static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { + struct ingress_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err; + + err = tcf_block_get(&q->block, &dev->ingress_cl_list); + if (err) + return err; + net_inc_ingress_queue(); sch->flags |= TCQ_F_CPUSTATS; @@ -65,9 +76,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); + struct ingress_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&dev->ingress_cl_list); + tcf_block_put(q->block); net_dec_ingress_queue(); } @@ -91,7 +102,7 @@ static const struct Qdisc_class_ops ingress_class_ops = { .get = ingress_get, .put = ingress_put, .walk = ingress_walk, - .tcf_chain = ingress_find_tcf, + .tcf_block = ingress_tcf_block, .tcf_cl_offload = ingress_cl_offload, .bind_tcf = ingress_bind_filter, .unbind_tcf = ingress_put, @@ -100,12 +111,18 @@ static const struct Qdisc_class_ops ingress_class_ops = { static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", + .priv_size = sizeof(struct ingress_sched_data), .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, .owner = THIS_MODULE, }; +struct clsact_sched_data { + struct tcf_block *ingress_block; + struct tcf_block *egress_block; +}; + static unsigned long clsact_get(struct Qdisc *sch, u32 classid) { switch (TC_H_MIN(classid)) { @@ -128,16 +145,15 @@ static unsigned long clsact_bind_filter(struct Qdisc *sch, return clsact_get(sch, classid); } -static struct tcf_proto __rcu **clsact_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl) { - struct net_device *dev = qdisc_dev(sch); + struct clsact_sched_data *q = qdisc_priv(sch); switch (cl) { case TC_H_MIN(TC_H_MIN_INGRESS): - return &dev->ingress_cl_list; + return q->ingress_block; case TC_H_MIN(TC_H_MIN_EGRESS): - return &dev->egress_cl_list; + return q->egress_block; default: return NULL; } @@ -145,6 +161,18 @@ static struct tcf_proto __rcu **clsact_find_tcf(struct Qdisc *sch, static int clsact_init(struct Qdisc *sch, struct nlattr *opt) { + struct clsact_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err; + + err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list); + if (err) + return err; + + err = tcf_block_get(&q->egress_block, &dev->egress_cl_list); + if (err) + return err; + net_inc_ingress_queue(); net_inc_egress_queue(); @@ -155,10 +183,10 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) static void clsact_destroy(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); + struct clsact_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&dev->ingress_cl_list); - tcf_destroy_chain(&dev->egress_cl_list); + tcf_block_put(q->egress_block); + tcf_block_put(q->ingress_block); net_dec_ingress_queue(); net_dec_egress_queue(); @@ -169,7 +197,7 @@ static const struct Qdisc_class_ops clsact_class_ops = { .get = clsact_get, .put = ingress_put, .walk = ingress_walk, - .tcf_chain = clsact_find_tcf, + .tcf_block = clsact_tcf_block, .tcf_cl_offload = clsact_cl_offload, .bind_tcf = clsact_bind_filter, .unbind_tcf = ingress_put, @@ -178,6 +206,7 @@ static const struct Qdisc_class_ops clsact_class_ops = { static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .cl_ops = &clsact_class_ops, .id = "clsact", + .priv_size = sizeof(struct clsact_sched_data), .init = clsact_init, .destroy = clsact_destroy, .dump = ingress_dump, diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 43a3a10b3c81..604767482ad0 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -32,6 +32,7 @@ struct multiq_sched_data { u16 max_bands; u16 curband; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct Qdisc **queues; }; @@ -46,7 +47,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, fl, &res, false); + err = tcf_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -170,7 +171,7 @@ multiq_destroy(struct Qdisc *sch) int band; struct multiq_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (band = 0; band < q->bands; band++) qdisc_destroy(q->queues[band]); @@ -243,6 +244,10 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + q->max_bands = qdisc_dev(sch)->num_tx_queues; q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); @@ -367,14 +372,13 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto __rcu **multiq_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl) { struct multiq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static const struct Qdisc_class_ops multiq_class_ops = { @@ -383,7 +387,7 @@ static const struct Qdisc_class_ops multiq_class_ops = { .get = multiq_get, .put = multiq_put, .walk = multiq_walk, - .tcf_chain = multiq_find_tcf, + .tcf_block = multiq_tcf_block, .bind_tcf = multiq_bind, .unbind_tcf = multiq_put, .dump = multiq_dump_class, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 92c2e6d448d7..a2404688dd01 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -25,6 +25,7 @@ struct prio_sched_data { int bands; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; @@ -42,7 +43,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tc_classify(skb, fl, &res, false); + err = tcf_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -145,7 +146,7 @@ prio_destroy(struct Qdisc *sch) int prio; struct prio_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (prio = 0; prio < q->bands; prio++) qdisc_destroy(q->queues[prio]); } @@ -204,9 +205,16 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) static int prio_init(struct Qdisc *sch, struct nlattr *opt) { + struct prio_sched_data *q = qdisc_priv(sch); + int err; + if (!opt) return -EINVAL; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + return prio_tune(sch, opt); } @@ -317,14 +325,13 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto __rcu **prio_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl) { struct prio_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static const struct Qdisc_class_ops prio_class_ops = { @@ -333,7 +340,7 @@ static const struct Qdisc_class_ops prio_class_ops = { .get = prio_get, .put = prio_put, .walk = prio_walk, - .tcf_chain = prio_find_tcf, + .tcf_block = prio_tcf_block, .bind_tcf = prio_bind, .unbind_tcf = prio_put, .dump = prio_dump_class, diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 041eba3006cc..076ad032befb 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -182,6 +182,7 @@ struct qfq_group { struct qfq_sched { struct tcf_proto __rcu *filter_list; + struct tcf_block *block; struct Qdisc_class_hash clhash; u64 oldV, V; /* Precise virtual times. */ @@ -582,15 +583,14 @@ static void qfq_put_class(struct Qdisc *sch, unsigned long arg) qfq_destroy_class(sch, cl); } -static struct tcf_proto __rcu **qfq_tcf_chain(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl) { struct qfq_sched *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, @@ -720,7 +720,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res, false); + result = tcf_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -1438,6 +1438,10 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) int i, j, err; u32 max_cl_shift, maxbudg_shift, max_classes; + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; + err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; @@ -1492,7 +1496,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) struct hlist_node *next; unsigned int i; - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], @@ -1508,7 +1512,7 @@ static const struct Qdisc_class_ops qfq_class_ops = { .delete = qfq_delete_class, .get = qfq_get_class, .put = qfq_put_class, - .tcf_chain = qfq_tcf_chain, + .tcf_block = qfq_tcf_block, .bind_tcf = qfq_bind_tcf, .unbind_tcf = qfq_unbind_tcf, .graft = qfq_graft_class, diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 0f777273ba29..9756b1ccd345 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -56,6 +56,7 @@ struct sfb_bins { struct sfb_sched_data { struct Qdisc *qdisc; struct tcf_proto __rcu *filter_list; + struct tcf_block *block; unsigned long rehash_interval; unsigned long warmup_time; /* double buffering warmup time in jiffies */ u32 max; @@ -259,7 +260,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tc_classify(skb, fl, &res, false); + result = tcf_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -465,7 +466,7 @@ static void sfb_destroy(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); qdisc_destroy(q->qdisc); } @@ -549,6 +550,11 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) static int sfb_init(struct Qdisc *sch, struct nlattr *opt) { struct sfb_sched_data *q = qdisc_priv(sch); + int err; + + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; q->qdisc = &noop_qdisc; return sfb_change(sch, opt); @@ -657,14 +663,13 @@ static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto __rcu **sfb_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl) { struct sfb_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, @@ -682,7 +687,7 @@ static const struct Qdisc_class_ops sfb_class_ops = { .change = sfb_change_class, .delete = sfb_delete, .walk = sfb_walk, - .tcf_chain = sfb_find_tcf, + .tcf_block = sfb_tcf_block, .bind_tcf = sfb_bind, .unbind_tcf = sfb_put, .dump = sfb_dump_class, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 332d94be6e1c..66dfd15b7946 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -126,6 +126,7 @@ struct sfq_sched_data { u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct tcf_proto __rcu *filter_list; + struct tcf_block *block; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ @@ -180,7 +181,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, fl, &res, false); + result = tcf_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -697,7 +698,7 @@ static void sfq_destroy(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); + tcf_block_put(q->block); q->perturb_period = 0; del_timer_sync(&q->perturb_timer); sfq_free(q->ht); @@ -709,6 +710,11 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); int i; + int err; + + err = tcf_block_get(&q->block, &q->filter_list); + if (err) + return err; setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, (unsigned long)sch); @@ -815,14 +821,13 @@ static void sfq_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch, - unsigned long cl) +static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl) { struct sfq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; - return &q->filter_list; + return q->block; } static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, @@ -878,7 +883,7 @@ static const struct Qdisc_class_ops sfq_class_ops = { .leaf = sfq_leaf, .get = sfq_get, .put = sfq_put, - .tcf_chain = sfq_find_tcf, + .tcf_block = sfq_tcf_block, .bind_tcf = sfq_bind, .unbind_tcf = sfq_put, .dump = sfq_dump_class, diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 4f5a2b580aa5..275925b93b29 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -35,6 +35,7 @@ static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; return sctp_compute_cksum(skb, skb_transport_offset(skb)); } @@ -98,6 +99,11 @@ static const struct net_offload sctp6_offload = { }, }; +static const struct skb_checksum_ops crc32c_csum_ops = { + .update = sctp_csum_update, + .combine = sctp_csum_combine, +}; + int __init sctp_offload_init(void) { int ret; @@ -110,6 +116,7 @@ int __init sctp_offload_init(void) if (ret) goto ipv4; + crc32c_csum_stub = &crc32c_csum_ops; return ret; ipv4: diff --git a/net/sctp/output.c b/net/sctp/output.c index 1409a875ad8e..e2edf2ebbade 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -538,6 +538,7 @@ merge: } else { chksum: head->ip_summed = CHECKSUM_PARTIAL; + head->csum_not_inet = 1; head->csum_start = skb_transport_header(head) - head->head; head->csum_offset = offsetof(struct sctphdr, checksum); } diff --git a/net/socket.c b/net/socket.c index c2564eb25c6b..8f9dab330d57 100644 --- a/net/socket.c +++ b/net/socket.c @@ -461,7 +461,7 @@ EXPORT_SYMBOL(sock_from_file); * @err: pointer to an error code return * * The file handle passed in is locked and the socket it is bound - * too is returned. If an error occurs the err pointer is overwritten + * to is returned. If an error occurs the err pointer is overwritten * with a negative errno code and NULL is returned. The function checks * for both invalid handles and passing a handle which is not a socket. * @@ -662,6 +662,40 @@ static bool skb_is_err_queue(const struct sk_buff *skb) return skb->pkt_type == PACKET_OUTGOING; } +/* On transmit, software and hardware timestamps are returned independently. + * As the two skb clones share the hardware timestamp, which may be updated + * before the software timestamp is received, a hardware TX timestamp may be + * returned only if there is no software TX timestamp. Ignore false software + * timestamps, which may be made in the __sock_recv_timestamp() call when the + * option SO_TIMESTAMP(NS) is enabled on the socket, even when the skb has a + * hardware timestamp. + */ +static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) +{ + return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); +} + +static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct scm_ts_pktinfo ts_pktinfo; + struct net_device *orig_dev; + + if (!skb_mac_header_was_set(skb)) + return; + + memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); + + rcu_read_lock(); + orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); + if (orig_dev) + ts_pktinfo.if_index = orig_dev->ifindex; + rcu_read_unlock(); + + ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, + sizeof(ts_pktinfo), &ts_pktinfo); +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ @@ -670,14 +704,16 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); struct scm_timestamping tss; - int empty = 1; + int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ - if (need_software_tstamp && skb->tstamp == 0) + if (need_software_tstamp && skb->tstamp == 0) { __net_timestamp(skb); + false_tstamp = 1; + } if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { @@ -699,8 +735,13 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) + !skb_is_swtx_tstamp(skb, false_tstamp) && + ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { empty = 0; + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + !skb_is_err_queue(skb)) + put_ts_pktinfo(msg, skb); + } if (!empty) { put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); |