diff options
Diffstat (limited to 'net')
181 files changed, 5662 insertions, 2265 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 5e9950453955..dc4411165e43 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -330,6 +330,7 @@ static void vlan_transfer_features(struct net_device *dev, vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); + vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } @@ -357,6 +358,7 @@ static int __vlan_device_event(struct net_device *dev, unsigned long event) static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; @@ -459,7 +461,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) - dev_change_flags(vlandev, flgs | IFF_UP); + dev_change_flags(vlandev, flgs | IFF_UP, + extack); netif_stacked_transfer_operstate(dev, vlandev); } break; @@ -647,93 +650,6 @@ out: return err; } -static struct sk_buff *vlan_gro_receive(struct list_head *head, - struct sk_buff *skb) -{ - const struct packet_offload *ptype; - unsigned int hlen, off_vlan; - struct sk_buff *pp = NULL; - struct vlan_hdr *vhdr; - struct sk_buff *p; - __be16 type; - int flush = 1; - - off_vlan = skb_gro_offset(skb); - hlen = off_vlan + sizeof(*vhdr); - vhdr = skb_gro_header_fast(skb, off_vlan); - if (skb_gro_header_hard(skb, hlen)) { - vhdr = skb_gro_header_slow(skb, hlen, off_vlan); - if (unlikely(!vhdr)) - goto out; - } - - type = vhdr->h_vlan_encapsulated_proto; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) - goto out_unlock; - - flush = 0; - - list_for_each_entry(p, head, list) { - struct vlan_hdr *vhdr2; - - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); - if (compare_vlan_header(vhdr, vhdr2)) - NAPI_GRO_CB(p)->same_flow = 0; - } - - skb_gro_pull(skb, sizeof(*vhdr)); - skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); - pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); - -out_unlock: - rcu_read_unlock(); -out: - skb_gro_flush_final(skb, pp, flush); - - return pp; -} - -static int vlan_gro_complete(struct sk_buff *skb, int nhoff) -{ - struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); - __be16 type = vhdr->h_vlan_encapsulated_proto; - struct packet_offload *ptype; - int err = -ENOENT; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); - - rcu_read_unlock(); - return err; -} - -static struct packet_offload vlan_packet_offloads[] __read_mostly = { - { - .type = cpu_to_be16(ETH_P_8021Q), - .priority = 10, - .callbacks = { - .gro_receive = vlan_gro_receive, - .gro_complete = vlan_gro_complete, - }, - }, - { - .type = cpu_to_be16(ETH_P_8021AD), - .priority = 10, - .callbacks = { - .gro_receive = vlan_gro_receive, - .gro_complete = vlan_gro_complete, - }, - }, -}; - static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); @@ -761,7 +677,6 @@ static struct pernet_operations vlan_net_ops = { static int __init vlan_proto_init(void) { int err; - unsigned int i; pr_info("%s v%s\n", vlan_fullname, vlan_version); @@ -785,9 +700,6 @@ static int __init vlan_proto_init(void) if (err < 0) goto err5; - for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) - dev_add_offload(&vlan_packet_offloads[i]); - vlan_ioctl_set(vlan_ioctl_handler); return 0; @@ -805,13 +717,8 @@ err0: static void __exit vlan_cleanup_module(void) { - unsigned int i; - vlan_ioctl_set(NULL); - for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) - dev_remove_offload(&vlan_packet_offloads[i]); - vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 44df1c3df02d..c46daf09a501 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -92,6 +92,18 @@ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, return NULL; } +static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) +{ + netdev_features_t ret; + + ret = real_dev->hw_enc_features & + (NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO | NETIF_F_GSO_ENCAP_ALL); + + if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) + return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; + return 0; +} + #define vlan_group_for_each_dev(grp, i, dev) \ for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \ if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 4f60e86f4b8d..a313165e7a67 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -57,7 +57,7 @@ bool vlan_do_receive(struct sk_buff **skbp) } skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats); @@ -223,6 +223,33 @@ static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vi return -ENODEV; } +int vlan_for_each(struct net_device *dev, + int (*action)(struct net_device *dev, int vid, void *arg), + void *arg) +{ + struct vlan_vid_info *vid_info; + struct vlan_info *vlan_info; + struct net_device *vdev; + int ret; + + ASSERT_RTNL(); + + vlan_info = rtnl_dereference(dev->vlan_info); + if (!vlan_info) + return 0; + + list_for_each_entry(vid_info, &vlan_info->vid_list, list) { + vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto, + vid_info->vid); + ret = action(vdev, vid_info->vid, arg); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL(vlan_for_each); + int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto) { struct net_device *real_dev = vlan_info->real_dev; @@ -426,3 +453,102 @@ bool vlan_uses_dev(const struct net_device *dev) return vlan_info->grp.nr_vlan_devs ? true : false; } EXPORT_SYMBOL(vlan_uses_dev); + +static struct sk_buff *vlan_gro_receive(struct list_head *head, + struct sk_buff *skb) +{ + const struct packet_offload *ptype; + unsigned int hlen, off_vlan; + struct sk_buff *pp = NULL; + struct vlan_hdr *vhdr; + struct sk_buff *p; + __be16 type; + int flush = 1; + + off_vlan = skb_gro_offset(skb); + hlen = off_vlan + sizeof(*vhdr); + vhdr = skb_gro_header_fast(skb, off_vlan); + if (skb_gro_header_hard(skb, hlen)) { + vhdr = skb_gro_header_slow(skb, hlen, off_vlan); + if (unlikely(!vhdr)) + goto out; + } + + type = vhdr->h_vlan_encapsulated_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) + goto out_unlock; + + flush = 0; + + list_for_each_entry(p, head, list) { + struct vlan_hdr *vhdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); + if (compare_vlan_header(vhdr, vhdr2)) + NAPI_GRO_CB(p)->same_flow = 0; + } + + skb_gro_pull(skb, sizeof(*vhdr)); + skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + +out_unlock: + rcu_read_unlock(); +out: + skb_gro_flush_final(skb, pp, flush); + + return pp; +} + +static int vlan_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); + __be16 type = vhdr->h_vlan_encapsulated_proto; + struct packet_offload *ptype; + int err = -ENOENT; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); + + rcu_read_unlock(); + return err; +} + +static struct packet_offload vlan_packet_offloads[] __read_mostly = { + { + .type = cpu_to_be16(ETH_P_8021Q), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, + { + .type = cpu_to_be16(ETH_P_8021AD), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, +}; + +static int __init vlan_offload_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_add_offload(&vlan_packet_offloads[i]); + + return 0; +} + +fs_initcall(vlan_offload_init); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ff720f1ebf73..b2d9c8f27cd7 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -562,6 +562,7 @@ static int vlan_dev_init(struct net_device *dev) dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL | NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_ALL_FCOE; @@ -572,6 +573,7 @@ static int vlan_dev_init(struct net_device *dev) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE; + dev->hw_enc_features = vlan_tnl_features(real_dev); /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index f75816f58107..c386e6981416 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -22,7 +22,6 @@ config BATMAN_ADV tristate "B.A.T.M.A.N. Advanced Meshing Protocol" depends on NET - select CRC16 select LIBCRC32C help B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is @@ -48,6 +47,7 @@ config BATMAN_ADV_BATMAN_V config BATMAN_ADV_BLA bool "Bridge Loop Avoidance" depends on BATMAN_ADV && INET + select CRC16 default y help This option enables BLA (Bridge Loop Avoidance), a mechanism @@ -82,6 +82,7 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) + default y help This option enables the multicast optimisation which aims to reduce the air overhead while improving the reliability of @@ -100,12 +101,13 @@ config BATMAN_ADV_DEBUGFS config BATMAN_ADV_DEBUG bool "B.A.T.M.A.N. debugging" - depends on BATMAN_ADV_DEBUGFS + depends on BATMAN_ADV help This is an option for use by developers; most people should say N here. This enables compilation of support for - outputting debugging information to the kernel log. The - output is controlled via the module parameter debug. + outputting debugging information to the debugfs log or tracing + buffer. The output is controlled via the batadv netdev specific + log_level setting. config BATMAN_ADV_TRACING bool "B.A.T.M.A.N. tracing support" diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index d2227091029f..f97e566f0402 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -34,7 +34,6 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> @@ -2585,13 +2584,14 @@ static void batadv_iv_gw_print(struct batadv_priv *bat_priv, * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ -static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { @@ -2611,13 +2611,16 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + ret = -EMSGSIZE; if (curr_gw == gw_node) @@ -2668,13 +2671,15 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, int idx_skip = cb->args[0]; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { + spin_lock_bh(&bat_priv->gw.list_lock); + cb->seq = bat_priv->gw.generation << 1 | 1; + + hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; - if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, gw_node)) { + if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv, + gw_node)) { idx_skip = idx - 1; goto unlock; } @@ -2682,7 +2687,7 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, idx_skip = idx; unlock: - rcu_read_unlock(); + spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 6baec4e68898..90e33f84d37a 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -27,11 +27,13 @@ #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> +#include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> +#include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> @@ -915,13 +917,14 @@ static void batadv_v_gw_print(struct batadv_priv *bat_priv, * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ -static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { @@ -941,13 +944,16 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + ret = -EMSGSIZE; if (curr_gw == gw_node) { @@ -1018,13 +1024,15 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, int idx_skip = cb->args[0]; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { + spin_lock_bh(&bat_priv->gw.list_lock); + cb->seq = bat_priv->gw.generation << 1 | 1; + + hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; - if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, gw_node)) { + if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv, + gw_node)) { idx_skip = idx - 1; goto unlock; } @@ -1032,7 +1040,7 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, idx_skip = idx; unlock: - rcu_read_unlock(); + spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 5f1aeeded0e3..5fdde2947802 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -2094,14 +2094,15 @@ out: * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface * @claim: entry to dump * * Return: 0 or error code. */ static int -batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_claim *claim) { @@ -2111,13 +2112,16 @@ batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, void *hdr; int ret = -EINVAL; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_BLA_CLAIM); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); @@ -2153,28 +2157,33 @@ out: * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface - * @head: bucket to dump + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int -batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, - struct hlist_head *head, int *idx_skip) + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_skip) { struct batadv_bla_claim *claim; int idx = 0; int ret = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(claim, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; - ret = batadv_bla_claim_dump_entry(msg, portid, seq, + ret = batadv_bla_claim_dump_entry(msg, portid, cb, primary_if, claim); if (ret) { *idx_skip = idx - 1; @@ -2184,7 +2193,7 @@ batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, *idx_skip = 0; unlock: - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return ret; } @@ -2204,7 +2213,6 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -2230,11 +2238,8 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_bla_claim_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, - primary_if, head, &idx)) + if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if, + hash, bucket, &idx)) break; bucket++; } @@ -2325,14 +2330,15 @@ out: * netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface * @backbone_gw: entry to dump * * Return: 0 or error code. */ static int -batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_backbone_gw *backbone_gw) { @@ -2343,13 +2349,16 @@ batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, void *hdr; int ret = -EINVAL; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_BLA_BACKBONE); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); spin_lock_bh(&backbone_gw->crc_lock); @@ -2386,28 +2395,33 @@ out: * a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface - * @head: bucket to dump + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int -batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, - struct hlist_head *head, int *idx_skip) + struct batadv_hashtable *hash, + unsigned int bucket, int *idx_skip) { struct batadv_bla_backbone_gw *backbone_gw; int idx = 0; int ret = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; - ret = batadv_bla_backbone_dump_entry(msg, portid, seq, + ret = batadv_bla_backbone_dump_entry(msg, portid, cb, primary_if, backbone_gw); if (ret) { *idx_skip = idx - 1; @@ -2417,7 +2431,7 @@ batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, *idx_skip = 0; unlock: - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return ret; } @@ -2437,7 +2451,6 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -2463,11 +2476,8 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_bla_backbone_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, - primary_if, head, &idx)) + if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if, + hash, bucket, &idx)) break; bucket++; } diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 8b608a2e2653..d4a7702e48d8 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -19,6 +19,7 @@ #include "debugfs.h" #include "main.h" +#include <asm/current.h> #include <linux/dcache.h> #include <linux/debugfs.h> #include <linux/err.h> @@ -27,6 +28,7 @@ #include <linux/fs.h> #include <linux/netdevice.h> #include <linux/printk.h> +#include <linux/sched.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/stddef.h> diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index a60bacf7120b..b9ffe1826527 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -863,23 +863,27 @@ out: * netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @dat_entry: entry to dump * * Return: 0 or error code. */ static int -batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_dat_entry *dat_entry) { int msecs; void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_DAT_CACHE); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + msecs = jiffies_to_msecs(jiffies - dat_entry->last_update); if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS, @@ -901,27 +905,31 @@ batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message - * @head: bucket to dump + * @cb: Control block containing additional options + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int -batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, - struct hlist_head *head, int *idx_skip) +batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_skip) { struct batadv_dat_entry *dat_entry; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(dat_entry, &hash->table[bucket], hash_entry) { if (idx < *idx_skip) goto skip; - if (batadv_dat_cache_dump_entry(msg, portid, seq, - dat_entry)) { - rcu_read_unlock(); + if (batadv_dat_cache_dump_entry(msg, portid, cb, dat_entry)) { + spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; @@ -930,7 +938,7 @@ batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, skip: idx++; } - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return 0; } @@ -951,7 +959,6 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -977,10 +984,7 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_dat_cache_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, head, + if (batadv_dat_cache_dump_bucket(msg, portid, cb, hash, bucket, &idx)) break; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 140c61a3f1ec..9d8e5eda2314 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -377,6 +377,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); + bat_priv->gw.generation++; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", @@ -472,6 +473,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, if (!hlist_unhashed(&gw_node->list)) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); + bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); @@ -518,6 +520,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) &bat_priv->gw.gateway_list, list) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); + bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 781c5b6e6e8e..508f4416dfc9 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -951,6 +951,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) batadv_check_known_mac_addr(hard_iface->net_dev); kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); + batadv_hardif_generation++; return hard_iface; @@ -993,6 +994,7 @@ void batadv_hardif_remove_interfaces(void) list_for_each_entry_safe(hard_iface, hard_iface_tmp, &batadv_hardif_list, list) { list_del_rcu(&hard_iface->list); + batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); } rtnl_unlock(); @@ -1054,6 +1056,7 @@ static int batadv_hard_if_event(struct notifier_block *this, case NETDEV_UNREGISTER: case NETDEV_PRE_TYPE_CHANGE: list_del_rcu(&hard_iface->list); + batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); break; diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 7b49e4001778..9194f4d891b1 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -32,6 +32,8 @@ static void batadv_hash_init(struct batadv_hashtable *hash) INIT_HLIST_HEAD(&hash->table[i]); spin_lock_init(&hash->list_locks[i]); } + + atomic_set(&hash->generation, 0); } /** diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 9490a7ca2ba6..0e36fa1c7c3e 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -21,6 +21,7 @@ #include "main.h" +#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/list.h> #include <linux/rculist.h> @@ -58,6 +59,9 @@ struct batadv_hashtable { /** @size: size of hashtable */ u32 size; + + /** @generation: current (generation) sequence number */ + atomic_t generation; }; /* allocates and clears the hash */ @@ -112,6 +116,7 @@ static inline int batadv_hash_add(struct batadv_hashtable *hash, /* no duplicate found in list, add new element */ hlist_add_head_rcu(data_node, head); + atomic_inc(&hash->generation); ret = 0; @@ -154,6 +159,7 @@ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, data_save = node; hlist_del_rcu(node); + atomic_inc(&hash->generation); break; } spin_unlock_bh(&hash->list_locks[index]); diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 6beb5f067810..02e55b78132f 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -43,6 +43,8 @@ #include "debugfs.h" #include "trace.h" +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + #define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; @@ -92,33 +94,6 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, return 0; } -/** - * batadv_debug_log() - Add debug log entry - * @bat_priv: the bat priv with all the soft interface information - * @fmt: format string - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", - jiffies_to_msecs(jiffies), &vaf); - - trace_batadv_dbg(bat_priv, &vaf); - - va_end(args); - - return 0; -} - static int batadv_log_open(struct inode *inode, struct file *file) { if (!try_module_get(THIS_MODULE)) @@ -259,3 +234,34 @@ void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) kfree(bat_priv->debug_log); bat_priv->debug_log = NULL; } + +#endif /* CONFIG_BATMAN_ADV_DEBUGFS */ + +/** + * batadv_debug_log() - Add debug log entry + * @bat_priv: the bat priv with all the soft interface information + * @fmt: format string + * + * Return: 0 on success or negative error number in case of failure + */ +int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", + jiffies_to_msecs(jiffies), &vaf); +#endif + + trace_batadv_dbg(bat_priv, &vaf); + + va_end(args); + + return 0; +} diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 69c0d85bceb3..d1ed839fd32b 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -74,6 +74,7 @@ * list traversals just rcu-locked */ struct list_head batadv_hardif_list; +unsigned int batadv_hardif_generation; static int (*batadv_rx_handler[256])(struct sk_buff *skb, struct batadv_hard_iface *recv_if); @@ -186,6 +187,8 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); INIT_HLIST_HEAD(&bat_priv->tp_list); + bat_priv->gw.generation = 0; + ret = batadv_v_mesh_init(bat_priv); if (ret < 0) goto err; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2002b70e18db..b572066325e4 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.4" +#define BATADV_SOURCE_VERSION "2019.0" #endif /* B.A.T.M.A.N. parameters */ @@ -247,6 +247,7 @@ static inline int batadv_print_vid(unsigned short vid) } extern struct list_head batadv_hardif_list; +extern unsigned int batadv_hardif_generation; extern unsigned char batadv_broadcast_addr[]; extern struct workqueue_struct *batadv_event_workqueue; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 86725d792e15..69244e4598f5 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1365,22 +1365,26 @@ int batadv_mcast_mesh_info_put(struct sk_buff *msg, * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @orig_node: originator to dump the multicast flags of * * Return: 0 or error code. */ static int -batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_orig_node *orig_node) { void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_MCAST_FLAGS); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig)) { genlmsg_cancel(msg, hdr); @@ -1405,21 +1409,26 @@ batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * table to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message - * @head: bucket to dump + * @cb: Control block containing additional options + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int -batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, - struct hlist_head *head, long *idx_skip) +batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, + struct batadv_hashtable *hash, + unsigned int bucket, long *idx_skip) { struct batadv_orig_node *orig_node; long idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; @@ -1427,9 +1436,8 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, if (idx < *idx_skip) goto skip; - if (batadv_mcast_flags_dump_entry(msg, portid, seq, - orig_node)) { - rcu_read_unlock(); + if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) { + spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; @@ -1438,7 +1446,7 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, skip: idx++; } - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return 0; } @@ -1447,7 +1455,7 @@ skip: * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: the bat priv with all the soft interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump @@ -1455,19 +1463,17 @@ skip: * Return: 0 or error code. */ static int -__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq, +__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, long *bucket, long *idx) { struct batadv_hashtable *hash = bat_priv->orig_hash; long bucket_tmp = *bucket; - struct hlist_head *head; long idx_tmp = *idx; while (bucket_tmp < hash->size) { - head = &hash->table[bucket_tmp]; - - if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head, - &idx_tmp)) + if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, + *bucket, &idx_tmp)) break; bucket_tmp++; @@ -1550,8 +1556,7 @@ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) return ret; bat_priv = netdev_priv(primary_if->soft_iface); - ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, bucket, idx); + ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); return ret; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 0d9459b69bdb..2dc3304cee54 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -29,11 +29,11 @@ #include <linux/if_ether.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> @@ -445,23 +445,27 @@ out: * batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @hard_iface: Hard interface to dump * * Return: error code, or 0 on success */ static int -batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *hard_iface) { struct net_device *net_dev = hard_iface->net_dev; void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_HARDIFS); if (!hdr) return -EMSGSIZE; + genl_dump_check_consistent(cb, hdr); + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, @@ -498,7 +502,6 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hard_iface *hard_iface; int ifindex; int portid = NETLINK_CB(cb->skb).portid; - int seq = cb->nlh->nlmsg_seq; int skip = cb->args[0]; int i = 0; @@ -516,23 +519,24 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) return -ENODEV; } - rcu_read_lock(); + rtnl_lock(); + cb->seq = batadv_hardif_generation << 1 | 1; - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; if (i++ < skip) continue; - if (batadv_netlink_dump_hardif_entry(msg, portid, seq, + if (batadv_netlink_dump_hardif_entry(msg, portid, cb, hard_iface)) { i--; break; } } - rcu_read_unlock(); + rtnl_unlock(); dev_put(soft_iface); diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 3d57f9981f25..8e1024217cff 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -16,7 +16,5 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#include <linux/module.h> - #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 3acda26a30ca..104784be94d7 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -21,7 +21,13 @@ #include "main.h" +#include <linux/bug.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/percpu.h> +#include <linux/printk.h> #include <linux/tracepoint.h> +#include <linux/types.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM batadv diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index d21624c44665..8dcd4968cde7 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1145,14 +1145,15 @@ out: * batadv_tt_local_dump_entry() - Dump one TT local entry into a message * @msg :Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @common: tt local & tt global common data * * Return: Error code, or 0 on success */ static int -batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common) { @@ -1173,12 +1174,14 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, batadv_softif_vlan_put(vlan); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_LOCAL); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || @@ -1201,34 +1204,39 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information - * @head: Pointer to the list containing the local tt entries + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_s: Number of entries to skip * * Return: Error code, or 0 on success */ static int -batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, - struct hlist_head *head, int *idx_s) + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_s) { struct batadv_tt_common_entry *common; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(common, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(common, &hash->table[bucket], hash_entry) { if (idx++ < *idx_s) continue; - if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv, + if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv, common)) { - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = idx - 1; return -EMSGSIZE; } } - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = 0; return 0; @@ -1248,7 +1256,6 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; - struct hlist_head *head; int ret; int ifindex; int bucket = cb->args[0]; @@ -1276,10 +1283,8 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) hash = bat_priv->tt.local_hash; while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, head, &idx)) + if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv, + hash, bucket, &idx)) break; bucket++; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 45b5592de816..cbe17da36fcb 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1096,12 +1096,15 @@ struct batadv_priv_gw { /** @gateway_list: list of available gateway nodes */ struct hlist_head gateway_list; - /** @list_lock: lock protecting gateway_list & curr_gw */ + /** @list_lock: lock protecting gateway_list, curr_gw, generation */ spinlock_t list_lock; /** @curr_gw: pointer to currently selected gateway node */ struct batadv_gw_node __rcu *curr_gw; + /** @generation: current (generation) sequence number */ + unsigned int generation; + /** * @mode: gateway operation: off, client or server (see batadv_gw_modes) */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 828e87fe8027..9d79c7de234a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -607,7 +607,7 @@ static void ifup(struct net_device *netdev) int err; rtnl_lock(); - err = dev_open(netdev); + err = dev_open(netdev, NULL); if (err < 0) BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); rtnl_unlock(); diff --git a/net/bridge/br.c b/net/bridge/br.c index 360ad66c21e9..4e7cd993ce94 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -175,6 +175,82 @@ static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; +/* br_boolopt_toggle - change user-controlled boolean option + * + * @br: bridge device + * @opt: id of the option to change + * @on: new option value + * @extack: extack for error messages + * + * Changes the value of the respective boolean option to @on taking care of + * any internal option value mapping and configuration. + */ +int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, + struct netlink_ext_ack *extack) +{ + switch (opt) { + case BR_BOOLOPT_NO_LL_LEARN: + br_opt_toggle(br, BROPT_NO_LL_LEARN, on); + break; + default: + /* shouldn't be called with unsupported options */ + WARN_ON(1); + break; + } + + return 0; +} + +int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) +{ + switch (opt) { + case BR_BOOLOPT_NO_LL_LEARN: + return br_opt_get(br, BROPT_NO_LL_LEARN); + default: + /* shouldn't be called with unsupported options */ + WARN_ON(1); + break; + } + + return 0; +} + +int br_boolopt_multi_toggle(struct net_bridge *br, + struct br_boolopt_multi *bm, + struct netlink_ext_ack *extack) +{ + unsigned long bitmap = bm->optmask; + int err = 0; + int opt_id; + + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { + bool on = !!(bm->optval & BIT(opt_id)); + + err = br_boolopt_toggle(br, opt_id, on, extack); + if (err) { + br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", + opt_id, br_boolopt_get(br, opt_id), on, err); + break; + } + } + + return err; +} + +void br_boolopt_multi_get(const struct net_bridge *br, + struct br_boolopt_multi *bm) +{ + u32 optval = 0; + int opt_id; + + for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) + optval |= (br_boolopt_get(br, opt_id) << opt_id); + + bm->optval = optval; + bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0); +} + +/* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index c6abf927f0c9..9f41a5d4da3f 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -131,9 +131,17 @@ static int br_dev_init(struct net_device *dev) return err; } + err = br_mdb_hash_init(br); + if (err) { + free_percpu(br->stats); + br_fdb_hash_fini(br); + return err; + } + err = br_vlan_init(br); if (err) { free_percpu(br->stats); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); return err; } @@ -142,6 +150,7 @@ static int br_dev_init(struct net_device *dev) if (err) { free_percpu(br->stats); br_vlan_flush(br); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); } br_set_lockdep_class(dev); @@ -156,6 +165,7 @@ static void br_dev_uninit(struct net_device *dev) br_multicast_dev_del(br); br_multicast_uninit_stats(br); br_vlan_flush(br); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); free_percpu(br->stats); } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e56ba3912a90..38b1d0dd0529 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1164,3 +1164,23 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, spin_unlock_bh(&br->hash_lock); } + +void br_fdb_clear_offload(const struct net_device *dev, u16 vid) +{ + struct net_bridge_fdb_entry *f; + struct net_bridge_port *p; + + ASSERT_RTNL(); + + p = br_port_get_rtnl(dev); + if (!p) + return; + + spin_lock_bh(&p->br->hash_lock); + hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { + if (f->dst == p && f->key.vlan_id == vid) + f->offloaded = 0; + } + spin_unlock_bh(&p->br->hash_lock); +} +EXPORT_SYMBOL_GPL(br_fdb_clear_offload); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 9b46d2dc4c22..d4863f5679ac 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -741,3 +741,15 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) if (mask & BR_NEIGH_SUPPRESS) br_recalculate_neigh_suppress_enabled(br); } + +bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag) +{ + struct net_bridge_port *p; + + p = br_port_get_rtnl_rcu(dev); + if (!p) + return false; + + return p->flags & flag; +} +EXPORT_SYMBOL_GPL(br_port_flag_is_set); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 3ddca11f44c2..5ea7e56119c1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -188,7 +188,9 @@ static void __br_handle_local_finish(struct sk_buff *skb) u16 vid = 0; /* check if vlan is allowed, to avoid spoofing */ - if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) + if ((p->flags & BR_LEARNING) && + !br_opt_get(p->br, BROPT_NO_LL_LEARN) && + br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index a7ea2d431714..79d4c9d253e0 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -78,82 +78,72 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { + int idx = 0, s_idx = cb->args[1], err = 0; struct net_bridge *br = netdev_priv(dev); - struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; - int i, err = 0; - int idx = 0, s_idx = cb->args[1]; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; - mdb = rcu_dereference(br->mdb); - if (!mdb) - return 0; - nest = nla_nest_start(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; - for (i = 0; i < mdb->max; i++) { - struct net_bridge_mdb_entry *mp; + hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct net_bridge_port *port; - hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) { - if (idx < s_idx) - goto skip; + if (idx < s_idx) + goto skip; - nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY); - if (nest2 == NULL) { - err = -EMSGSIZE; - goto out; - } + nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY); + if (!nest2) { + err = -EMSGSIZE; + break; + } - for (pp = &mp->ports; - (p = rcu_dereference(*pp)) != NULL; - pp = &p->next) { - struct nlattr *nest_ent; - struct br_mdb_entry e; - - port = p->port; - if (!port) - continue; - - memset(&e, 0, sizeof(e)); - e.ifindex = port->dev->ifindex; - e.vid = p->addr.vid; - __mdb_entry_fill_flags(&e, p->flags); - if (p->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = p->addr.u.ip4; + for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; + pp = &p->next) { + struct nlattr *nest_ent; + struct br_mdb_entry e; + + port = p->port; + if (!port) + continue; + + memset(&e, 0, sizeof(e)); + e.ifindex = port->dev->ifindex; + e.vid = p->addr.vid; + __mdb_entry_fill_flags(&e, p->flags); + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (p->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif - e.addr.proto = p->addr.proto; - nest_ent = nla_nest_start(skb, - MDBA_MDB_ENTRY_INFO); - if (!nest_ent) { - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } - if (nla_put_nohdr(skb, sizeof(e), &e) || - nla_put_u32(skb, - MDBA_MDB_EATTR_TIMER, - br_timer_value(&p->timer))) { - nla_nest_cancel(skb, nest_ent); - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } - nla_nest_end(skb, nest_ent); + e.addr.proto = p->addr.proto; + nest_ent = nla_nest_start(skb, MDBA_MDB_ENTRY_INFO); + if (!nest_ent) { + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; } - nla_nest_end(skb, nest2); - skip: - idx++; + if (nla_put_nohdr(skb, sizeof(e), &e) || + nla_put_u32(skb, + MDBA_MDB_EATTR_TIMER, + br_timer_value(&p->timer))) { + nla_nest_cancel(skb, nest_ent); + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; + } + nla_nest_end(skb, nest_ent); } + nla_nest_end(skb, nest2); +skip: + idx++; } out: @@ -203,8 +193,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); - /* In theory this could be wrapped to 0... */ - cb->seq = net->dev_base_seq + br_mdb_rehash_seq; + cb->seq = net->dev_base_seq; for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_EBRIDGE) { @@ -297,7 +286,6 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv) struct br_mdb_complete_info *data = priv; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; @@ -306,8 +294,7 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv) goto err; spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, &data->ip); + mp = br_mdb_ip_get(br, &data->ip); if (!mp) goto out; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; @@ -588,14 +575,12 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct net_bridge_mdb_htable *mdb; unsigned long now = jiffies; int err; - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, group); + mp = br_mdb_ip_get(br, group); if (!mp) { - mp = br_multicast_new_group(br, port, group); + mp = br_multicast_new_group(br, group); err = PTR_ERR_OR_ZERO(mp); if (err) return err; @@ -696,7 +681,6 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; @@ -709,9 +693,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) __mdb_entry_to_br_ip(entry, &ip); spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - - mp = br_mdb_ip_get(mdb, &ip); + mp = br_mdb_ip_get(br, &ip); if (!mp) goto unlock; @@ -728,7 +710,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); err = 0; if (!mp->ports && !mp->host_joined && diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 6bac0d6b7b94..879cd2315769 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -37,6 +37,14 @@ #include "br_private.h" +static const struct rhashtable_params br_mdb_rht_params = { + .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode), + .key_offset = offsetof(struct net_bridge_mdb_entry, addr), + .key_len = sizeof(struct br_ip), + .automatic_shrinking = true, + .locks_mul = 1, +}; + static void br_multicast_start_querier(struct net_bridge *br, struct bridge_mcast_own_query *query); static void br_multicast_add_router(struct net_bridge *br, @@ -54,7 +62,6 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif -unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) { @@ -73,89 +80,58 @@ static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) return 0; } -static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip, - __u16 vid) -{ - return jhash_2words((__force u32)ip, vid, mdb->secret) & (mdb->max - 1); -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb, - const struct in6_addr *ip, - __u16 vid) -{ - return jhash_2words(ipv6_addr_hash(ip), vid, - mdb->secret) & (mdb->max - 1); -} -#endif - -static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb, - struct br_ip *ip) +static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br, + struct br_ip *dst) { - switch (ip->proto) { - case htons(ETH_P_IP): - return __br_ip4_hash(mdb, ip->u.ip4, ip->vid); -#if IS_ENABLED(CONFIG_IPV6) - case htons(ETH_P_IPV6): - return __br_ip6_hash(mdb, &ip->u.ip6, ip->vid); -#endif - } - return 0; + return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); } -static struct net_bridge_mdb_entry *__br_mdb_ip_get( - struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash) +struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br, + struct br_ip *dst) { - struct net_bridge_mdb_entry *mp; + struct net_bridge_mdb_entry *ent; - hlist_for_each_entry_rcu(mp, &mdb->mhash[hash], hlist[mdb->ver]) { - if (br_ip_equal(&mp->addr, dst)) - return mp; - } - - return NULL; -} + lockdep_assert_held_once(&br->multicast_lock); -struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, - struct br_ip *dst) -{ - if (!mdb) - return NULL; + rcu_read_lock(); + ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); + rcu_read_unlock(); - return __br_mdb_ip_get(mdb, dst, br_ip_hash(mdb, dst)); + return ent; } -static struct net_bridge_mdb_entry *br_mdb_ip4_get( - struct net_bridge_mdb_htable *mdb, __be32 dst, __u16 vid) +static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br, + __be32 dst, __u16 vid) { struct br_ip br_dst; + memset(&br_dst, 0, sizeof(br_dst)); br_dst.u.ip4 = dst; br_dst.proto = htons(ETH_P_IP); br_dst.vid = vid; - return br_mdb_ip_get(mdb, &br_dst); + return br_mdb_ip_get(br, &br_dst); } #if IS_ENABLED(CONFIG_IPV6) -static struct net_bridge_mdb_entry *br_mdb_ip6_get( - struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst, - __u16 vid) +static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, + const struct in6_addr *dst, + __u16 vid) { struct br_ip br_dst; + memset(&br_dst, 0, sizeof(br_dst)); br_dst.u.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); br_dst.vid = vid; - return br_mdb_ip_get(mdb, &br_dst); + return br_mdb_ip_get(br, &br_dst); } #endif struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb); struct br_ip ip; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) @@ -164,6 +140,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, if (BR_INPUT_SKB_CB(skb)->igmp) return NULL; + memset(&ip, 0, sizeof(ip)); ip.proto = skb->protocol; ip.vid = vid; @@ -180,70 +157,13 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return NULL; } - return br_mdb_ip_get(mdb, &ip); -} - -static void br_mdb_free(struct rcu_head *head) -{ - struct net_bridge_mdb_htable *mdb = - container_of(head, struct net_bridge_mdb_htable, rcu); - struct net_bridge_mdb_htable *old = mdb->old; - - mdb->old = NULL; - kfree(old->mhash); - kfree(old); -} - -static int br_mdb_copy(struct net_bridge_mdb_htable *new, - struct net_bridge_mdb_htable *old, - int elasticity) -{ - struct net_bridge_mdb_entry *mp; - int maxlen; - int len; - int i; - - for (i = 0; i < old->max; i++) - hlist_for_each_entry(mp, &old->mhash[i], hlist[old->ver]) - hlist_add_head(&mp->hlist[new->ver], - &new->mhash[br_ip_hash(new, &mp->addr)]); - - if (!elasticity) - return 0; - - maxlen = 0; - for (i = 0; i < new->max; i++) { - len = 0; - hlist_for_each_entry(mp, &new->mhash[i], hlist[new->ver]) - len++; - if (len > maxlen) - maxlen = len; - } - - return maxlen > elasticity ? -EINVAL : 0; -} - -void br_multicast_free_pg(struct rcu_head *head) -{ - struct net_bridge_port_group *p = - container_of(head, struct net_bridge_port_group, rcu); - - kfree(p); -} - -static void br_multicast_free_group(struct rcu_head *head) -{ - struct net_bridge_mdb_entry *mp = - container_of(head, struct net_bridge_mdb_entry, rcu); - - kfree(mp); + return br_mdb_ip_get_rcu(br, &ip); } static void br_multicast_group_expired(struct timer_list *t) { struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer); struct net_bridge *br = mp->br; - struct net_bridge_mdb_htable *mdb; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&mp->timer)) @@ -255,12 +175,11 @@ static void br_multicast_group_expired(struct timer_list *t) if (mp->ports) goto out; - mdb = mlock_dereference(br->mdb, br); + rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + hlist_del_rcu(&mp->mdb_node); - hlist_del_rcu(&mp->hlist[mdb->ver]); - mdb->size--; - - call_rcu_bh(&mp->rcu, br_multicast_free_group); + kfree_rcu(mp, rcu); out: spin_unlock(&br->multicast_lock); @@ -269,14 +188,11 @@ out: static void br_multicast_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - mdb = mlock_dereference(br->mdb, br); - - mp = br_mdb_ip_get(mdb, &pg->addr); + mp = br_mdb_ip_get(br, &pg->addr); if (WARN_ON(!mp)) return; @@ -291,7 +207,7 @@ static void br_multicast_del_pg(struct net_bridge *br, del_timer(&p->timer); br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, p->flags); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) @@ -319,53 +235,6 @@ out: spin_unlock(&br->multicast_lock); } -static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max, - int elasticity) -{ - struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1); - struct net_bridge_mdb_htable *mdb; - int err; - - mdb = kmalloc(sizeof(*mdb), GFP_ATOMIC); - if (!mdb) - return -ENOMEM; - - mdb->max = max; - mdb->old = old; - - mdb->mhash = kcalloc(max, sizeof(*mdb->mhash), GFP_ATOMIC); - if (!mdb->mhash) { - kfree(mdb); - return -ENOMEM; - } - - mdb->size = old ? old->size : 0; - mdb->ver = old ? old->ver ^ 1 : 0; - - if (!old || elasticity) - get_random_bytes(&mdb->secret, sizeof(mdb->secret)); - else - mdb->secret = old->secret; - - if (!old) - goto out; - - err = br_mdb_copy(mdb, old, elasticity); - if (err) { - kfree(mdb->mhash); - kfree(mdb); - return err; - } - - br_mdb_rehash_seq++; - call_rcu_bh(&mdb->rcu, br_mdb_free); - -out: - rcu_assign_pointer(*mdbp, mdb); - - return 0; -} - static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, __be32 group, u8 *igmp_type) @@ -589,111 +458,19 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, return NULL; } -static struct net_bridge_mdb_entry *br_multicast_get_group( - struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, int hash) -{ - struct net_bridge_mdb_htable *mdb; - struct net_bridge_mdb_entry *mp; - unsigned int count = 0; - unsigned int max; - int elasticity; - int err; - - mdb = rcu_dereference_protected(br->mdb, 1); - hlist_for_each_entry(mp, &mdb->mhash[hash], hlist[mdb->ver]) { - count++; - if (unlikely(br_ip_equal(group, &mp->addr))) - return mp; - } - - elasticity = 0; - max = mdb->max; - - if (unlikely(count > br->hash_elasticity && count)) { - if (net_ratelimit()) - br_info(br, "Multicast hash table " - "chain limit reached: %s\n", - port ? port->dev->name : br->dev->name); - - elasticity = br->hash_elasticity; - } - - if (mdb->size >= max) { - max *= 2; - if (unlikely(max > br->hash_max)) { - br_warn(br, "Multicast hash table maximum of %d " - "reached, disabling snooping: %s\n", - br->hash_max, - port ? port->dev->name : br->dev->name); - err = -E2BIG; -disable: - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); - goto err; - } - } - - if (max > mdb->max || elasticity) { - if (mdb->old) { - if (net_ratelimit()) - br_info(br, "Multicast hash table " - "on fire: %s\n", - port ? port->dev->name : br->dev->name); - err = -EEXIST; - goto err; - } - - err = br_mdb_rehash(&br->mdb, max, elasticity); - if (err) { - br_warn(br, "Cannot rehash multicast " - "hash table, disabling snooping: %s, %d, %d\n", - port ? port->dev->name : br->dev->name, - mdb->size, err); - goto disable; - } - - err = -EAGAIN; - goto err; - } - - return NULL; - -err: - mp = ERR_PTR(err); - return mp; -} - struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, - struct net_bridge_port *p, struct br_ip *group) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; - int hash; int err; - mdb = rcu_dereference_protected(br->mdb, 1); - if (!mdb) { - err = br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0); - if (err) - return ERR_PTR(err); - goto rehash; - } - - hash = br_ip_hash(mdb, group); - mp = br_multicast_get_group(br, p, group, hash); - switch (PTR_ERR(mp)) { - case 0: - break; - - case -EAGAIN: -rehash: - mdb = rcu_dereference_protected(br->mdb, 1); - hash = br_ip_hash(mdb, group); - break; + mp = br_mdb_ip_get(br, group); + if (mp) + return mp; - default: - goto out; + if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); + return ERR_PTR(-E2BIG); } mp = kzalloc(sizeof(*mp), GFP_ATOMIC); @@ -703,11 +480,15 @@ rehash: mp->br = br; mp->addr = *group; timer_setup(&mp->timer, br_multicast_group_expired, 0); + err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + if (err) { + kfree(mp); + mp = ERR_PTR(err); + } else { + hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list); + } - hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]); - mdb->size++; - -out: return mp; } @@ -768,7 +549,7 @@ static int br_multicast_add_group(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - mp = br_multicast_new_group(br, port, group); + mp = br_multicast_new_group(br, group); err = PTR_ERR(mp); if (IS_ERR(mp)) goto err; @@ -837,6 +618,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, if (ipv6_addr_is_ll_all_nodes(group)) return 0; + memset(&br_group, 0, sizeof(br_group)); br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; @@ -1483,7 +1265,7 @@ static void br_ip4_multicast_query(struct net_bridge *br, goto out; } - mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid); + mp = br_mdb_ip4_get(br, group, vid); if (!mp) goto out; @@ -1567,7 +1349,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, goto out; } - mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid); + mp = br_mdb_ip6_get(br, group, vid); if (!mp) goto out; @@ -1601,7 +1383,6 @@ br_multicast_leave_group(struct net_bridge *br, struct bridge_mcast_own_query *own_query, const unsigned char *src) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; unsigned long now; @@ -1612,8 +1393,7 @@ br_multicast_leave_group(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, group); + mp = br_mdb_ip_get(br, group); if (!mp) goto out; @@ -1629,7 +1409,7 @@ br_multicast_leave_group(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); br_mdb_notify(br->dev, port, group, RTM_DELMDB, p->flags); @@ -1961,8 +1741,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t) void br_multicast_init(struct net_bridge *br) { - br->hash_elasticity = 4; - br->hash_max = 512; + br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; br->multicast_last_member_count = 2; @@ -1999,6 +1778,7 @@ void br_multicast_init(struct net_bridge *br) timer_setup(&br->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif + INIT_HLIST_HEAD(&br->mdb_list); } static void __br_multicast_open(struct net_bridge *br, @@ -2033,40 +1813,20 @@ void br_multicast_stop(struct net_bridge *br) void br_multicast_dev_del(struct net_bridge *br) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; - struct hlist_node *n; - u32 ver; - int i; + struct hlist_node *tmp; spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - if (!mdb) - goto out; - - br->mdb = NULL; - - ver = mdb->ver; - for (i = 0; i < mdb->max; i++) { - hlist_for_each_entry_safe(mp, n, &mdb->mhash[i], - hlist[ver]) { - del_timer(&mp->timer); - call_rcu_bh(&mp->rcu, br_multicast_free_group); - } + hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) { + del_timer(&mp->timer); + rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + hlist_del_rcu(&mp->mdb_node); + kfree_rcu(mp, rcu); } - - if (mdb->old) { - spin_unlock_bh(&br->multicast_lock); - rcu_barrier_bh(); - spin_lock_bh(&br->multicast_lock); - WARN_ON(mdb->old); - } - - mdb->old = mdb; - call_rcu_bh(&mdb->rcu, br_mdb_free); - -out: spin_unlock_bh(&br->multicast_lock); + + rcu_barrier(); } int br_multicast_set_router(struct net_bridge *br, unsigned long val) @@ -2176,7 +1936,6 @@ static void br_multicast_start_querier(struct net_bridge *br, int br_multicast_toggle(struct net_bridge *br, unsigned long val) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_port *port; int err = 0; @@ -2192,21 +1951,6 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) if (!netif_running(br->dev)) goto unlock; - mdb = mlock_dereference(br->mdb, br); - if (mdb) { - if (mdb->old) { - err = -EEXIST; -rollback: - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); - goto unlock; - } - - err = br_mdb_rehash(&br->mdb, mdb->max, - br->hash_elasticity); - if (err) - goto rollback; - } - br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) __br_multicast_enable_port(port); @@ -2271,45 +2015,6 @@ unlock: return 0; } -int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val) -{ - int err = -EINVAL; - u32 old; - struct net_bridge_mdb_htable *mdb; - - spin_lock_bh(&br->multicast_lock); - if (!is_power_of_2(val)) - goto unlock; - - mdb = mlock_dereference(br->mdb, br); - if (mdb && val < mdb->size) - goto unlock; - - err = 0; - - old = br->hash_max; - br->hash_max = val; - - if (mdb) { - if (mdb->old) { - err = -EEXIST; -rollback: - br->hash_max = old; - goto unlock; - } - - err = br_mdb_rehash(&br->mdb, br->hash_max, - br->hash_elasticity); - if (err) - goto rollback; - } - -unlock: - spin_unlock_bh(&br->multicast_lock); - - return err; -} - int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) { /* Currently we support only version 2 and 3 */ @@ -2646,3 +2351,13 @@ void br_multicast_get_stats(const struct net_bridge *br, } memcpy(dest, &tdst, sizeof(*dest)); } + +int br_mdb_hash_init(struct net_bridge *br) +{ + return rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params); +} + +void br_mdb_hash_fini(struct net_bridge *br) +{ + rhashtable_destroy(&br->mdb_hash_tbl); +} diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index b1b5e8516724..c9383c470a83 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -671,10 +671,8 @@ static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff return 0; } - if (data->vlan_tci) { - skb->vlan_tci = data->vlan_tci; - skb->vlan_proto = data->vlan_proto; - } + if (data->vlan_proto) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); @@ -740,8 +738,13 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff data = this_cpu_ptr(&brnf_frag_data_storage); - data->vlan_tci = skb->vlan_tci; - data->vlan_proto = skb->vlan_proto; + if (skb_vlan_tag_present(skb)) { + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + } else { + data->vlan_proto = 0; + } + data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3345f1984542..ff2c10d47529 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1035,6 +1035,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, + [IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_boolopt_multi) }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1187,19 +1189,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } - if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) { - u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]); + if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) + br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", + RHT_ELASTICITY); - br->hash_elasticity = val; - } - - if (data[IFLA_BR_MCAST_HASH_MAX]) { - u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); - - err = br_multicast_set_hash_max(br, hash_max); - if (err) - return err; - } + if (data[IFLA_BR_MCAST_HASH_MAX]) + br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); @@ -1296,6 +1291,15 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], } #endif + if (data[IFLA_BR_MULTI_BOOLOPT]) { + struct br_boolopt_multi *bm; + + bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]); + err = br_boolopt_multi_toggle(br, bm, extack); + if (err) + return err; + } + return 0; } @@ -1374,6 +1378,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ #endif + nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */ 0; } @@ -1387,6 +1392,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u8 vlan_enabled = br_vlan_enabled(br->dev); + struct br_boolopt_multi bm; u64 clockval; clockval = br_timer_value(&br->hello_timer); @@ -1403,6 +1409,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; + br_boolopt_multi_get(br, &bm); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || @@ -1420,7 +1427,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || - nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr)) + nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) || + nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING @@ -1442,8 +1450,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) br_opt_get(br, BROPT_MULTICAST_QUERIER)) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || - nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, - br->hash_elasticity) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, br->multicast_last_member_count) || diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 04c19a37e500..5719b4d3e466 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -31,6 +31,8 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) +#define BR_MULTICAST_DEFAULT_HASH_MAX 4096 + #define BR_VERSION "2.3" /* Control of forwarding link local multicast */ @@ -213,23 +215,14 @@ struct net_bridge_port_group { }; struct net_bridge_mdb_entry { - struct hlist_node hlist[2]; + struct rhash_head rhnode; struct net_bridge *br; struct net_bridge_port_group __rcu *ports; struct rcu_head rcu; struct timer_list timer; struct br_ip addr; bool host_joined; -}; - -struct net_bridge_mdb_htable { - struct hlist_head *mhash; - struct rcu_head rcu; - struct net_bridge_mdb_htable *old; - u32 size; - u32 max; - u32 secret; - u32 ver; + struct hlist_node mdb_node; }; struct net_bridge_port { @@ -328,6 +321,7 @@ enum net_bridge_opts { BROPT_NEIGH_SUPPRESS_ENABLED, BROPT_MTU_SET_BY_USER, BROPT_VLAN_STATS_PER_PORT, + BROPT_NO_LL_LEARN, }; struct net_bridge { @@ -380,7 +374,6 @@ struct net_bridge { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - u32 hash_elasticity; u32 hash_max; u32 multicast_last_member_count; @@ -399,7 +392,9 @@ struct net_bridge { unsigned long multicast_query_response_interval; unsigned long multicast_startup_query_interval; - struct net_bridge_mdb_htable __rcu *mdb; + struct rhashtable mdb_hash_tbl; + + struct hlist_head mdb_list; struct hlist_head router_list; struct timer_list multicast_router_timer; @@ -507,6 +502,14 @@ static inline int br_opt_get(const struct net_bridge *br, return test_bit(opt, &br->options); } +int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, + struct netlink_ext_ack *extack); +int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt); +int br_boolopt_multi_toggle(struct net_bridge *br, + struct br_boolopt_multi *bm, + struct netlink_ext_ack *extack); +void br_boolopt_multi_get(const struct net_bridge *br, + struct br_boolopt_multi *bm); void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on); /* br_device.c */ @@ -650,7 +653,6 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING -extern unsigned int br_mdb_rehash_seq; int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid); struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, @@ -675,17 +677,15 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val); #endif struct net_bridge_mdb_entry * -br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst); +br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst); struct net_bridge_mdb_entry * -br_multicast_new_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group); -void br_multicast_free_pg(struct rcu_head *head); +br_multicast_new_group(struct net_bridge *br, struct br_ip *group); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src); -void br_mdb_init(void); -void br_mdb_uninit(void); +int br_mdb_hash_init(struct net_bridge *br); +void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_ip *group, int type, u8 flags); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, @@ -697,6 +697,8 @@ void br_multicast_uninit_stats(struct net_bridge *br); void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest); +void br_mdb_init(void); +void br_mdb_uninit(void); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -822,6 +824,15 @@ static inline void br_mdb_uninit(void) { } +static inline int br_mdb_hash_init(struct net_bridge *br) +{ + return 0; +} + +static inline void br_mdb_hash_fini(struct net_bridge *br) +{ +} + static inline void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, @@ -912,7 +923,7 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) int err = 0; if (skb_vlan_tag_present(skb)) { - *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK; + *vid = skb_vlan_tag_get_id(skb); } else { *vid = 0; err = -EINVAL; diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 60182bef6341..b05b94e9c595 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -328,6 +328,27 @@ static ssize_t flush_store(struct device *d, } static DEVICE_ATTR_WO(flush); +static ssize_t no_linklocal_learn_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); +} + +static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val) +{ + return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, NULL); +} + +static ssize_t no_linklocal_learn_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_no_linklocal_learn); +} +static DEVICE_ATTR_RW(no_linklocal_learn); + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) @@ -403,13 +424,13 @@ static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { - struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->hash_elasticity); + return sprintf(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val) { - br->hash_elasticity = val; + br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", + RHT_ELASTICITY); return 0; } @@ -428,10 +449,16 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, return sprintf(buf, "%u\n", br->hash_max); } +static int set_hash_max(struct net_bridge *br, unsigned long val) +{ + br->hash_max = val; + return 0; +} + static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_hash_max); + return store_bridge_parm(d, buf, len, set_hash_max); } static DEVICE_ATTR_RW(hash_max); @@ -841,6 +868,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, &dev_attr_flush.attr, + &dev_attr_no_linklocal_learn.attr, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 7c87a2fe5248..88715edb119a 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -320,9 +320,6 @@ static ssize_t brport_store(struct kobject *kobj, if (!rtnl_trylock()) return restart_syscall(); - if (!p->dev || !p->br) - goto out_unlock; - if (brport_attr->store_raw) { char *buf_copy; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index e84be08b8285..48f50d7ac624 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -421,7 +421,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, } if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && br_handle_egress_vlan_tunnel(skb, v)) { @@ -494,8 +494,8 @@ static bool __allowed_ingress(const struct net_bridge *br, __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. - * At this point, We know that skb->vlan_tci had - * VLAN_TAG_PRESENT bit and its VID field was 0x000. + * At this point, we know that skb->vlan_tci VID + * field was 0. * We update only VID field and preserve PCP field. */ skb->vlan_tci |= pvid; @@ -1217,9 +1217,13 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; + struct net_bridge_port *p; ASSERT_RTNL(); - if (netif_is_bridge_master(dev)) + p = br_port_get_check_rtnl(dev); + if (p) + vg = nbp_vlan_group(p); + else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; diff --git a/net/core/datagram.c b/net/core/datagram.c index 57f3a6fcfc1e..4bf62b1afa3b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -728,49 +728,6 @@ fault: return -EFAULT; } -__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) -{ - __sum16 sum; - - sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - if (!skb_shared(skb)) - skb->csum_valid = !sum; - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete_head); - -__sum16 __skb_checksum_complete(struct sk_buff *skb) -{ - __wsum csum; - __sum16 sum; - - csum = skb_checksum(skb, 0, skb->len, 0); - - /* skb->csum holds pseudo checksum */ - sum = csum_fold(csum_add(skb->csum, csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - - if (!skb_shared(skb)) { - /* Save full packet checksum */ - skb->csum = csum; - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - skb->csum_valid = !sum; - } - - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete); - /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff @@ -810,7 +767,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(NULL); + netdev_rx_csum_fault(NULL, skb); } return 0; fault: diff --git a/net/core/dev.c b/net/core/dev.c index 722d50dbf8a4..754284873355 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -162,6 +162,9 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); static struct napi_struct *napi_by_id(unsigned int napi_id); /* @@ -1361,7 +1364,7 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1377,7 +1380,7 @@ static int __dev_open(struct net_device *dev) */ netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; @@ -1406,7 +1409,8 @@ static int __dev_open(struct net_device *dev) /** * dev_open - prepare an interface for use. - * @dev: device to open + * @dev: device to open + * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally @@ -1416,14 +1420,14 @@ static int __dev_open(struct net_device *dev) * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ -int dev_open(struct net_device *dev) +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; @@ -1733,6 +1737,18 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1744,11 +1760,7 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info = { - .dev = dev, - }; - - return call_netdevice_notifiers_info(val, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -3096,10 +3108,17 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + if (dev) + pr_err("dev features: %pNF\n", &dev->features); + pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", + skb->len, skb->data_len, skb->pkt_type, + skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, + skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level); dump_stack(); } } @@ -4525,9 +4544,14 @@ static int netif_rx_internal(struct sk_buff *skb) int netif_rx(struct sk_buff *skb) { + int ret; + trace_netif_rx_entry(skb); - return netif_rx_internal(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_rx); @@ -4542,6 +4566,7 @@ int netif_rx_ni(struct sk_buff *skb) if (local_softirq_pending()) do_softirq(); preempt_enable(); + trace_netif_rx_ni_exit(err); return err; } @@ -4894,7 +4919,7 @@ skip_classify: * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; @@ -5227,9 +5252,14 @@ static void netif_receive_skb_list_internal(struct list_head *head) */ int netif_receive_skb(struct sk_buff *skb) { + int ret; + trace_netif_receive_skb_entry(skb); - return netif_receive_skb_internal(skb); + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -5249,9 +5279,12 @@ void netif_receive_skb_list(struct list_head *head) if (list_empty(head)) return; - list_for_each_entry(skb, head, list) - trace_netif_receive_skb_list_entry(skb); + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + } netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); @@ -5362,11 +5395,13 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - u32 i; + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - if (test_bit(i, &napi->gro_bitmask)) - __napi_gro_flush_chain(napi, i, flush_old); + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); @@ -5391,7 +5426,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5639,12 +5676,17 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + gro_result_t ret; + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + trace_napi_gro_receive_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_receive); @@ -5657,7 +5699,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; @@ -5762,6 +5804,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) gro_result_t napi_gro_frags(struct napi_struct *napi) { + gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); if (!skb) @@ -5769,7 +5812,10 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_frags); @@ -5785,10 +5831,11 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; @@ -7467,7 +7514,8 @@ unsigned int dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -7504,7 +7552,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if (old_flags & IFF_UP) __dev_close(dev); else - ret = __dev_open(dev); + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { @@ -7564,16 +7612,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, * dev_change_flags - change device settings * @dev: device * @flags: device state flags + * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index d884d8f5f0e5..81a8cd4ea3bd 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -278,6 +278,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, EXPORT_SYMBOL(__hw_addr_sync_dev); /** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. + **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. + **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + +/** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 90e8aa36881e..da273ec3cc57 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -234,7 +234,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a4b29a13d31..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, + .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) diff --git a/net/core/filter.c b/net/core/filter.c index 8d2c629501e2..8659b40172d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -296,22 +296,18 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_VLAN_TAG: - case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); - if (skb_field == SKF_AD_VLAN_TAG) { - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, - ~VLAN_TAG_PRESENT); - } else { - /* dst_reg >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); - /* dst_reg &= 1 */ + break; + case SKF_AD_VLAN_TAG_PRESENT: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); - } break; } @@ -467,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); - *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { @@ -2428,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { .arg4_type = ARG_ANYTHING, }; +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the rigth free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. + */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + sge->length = a; + pop -= (sge->length - a); + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3908,6 +4073,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +static const struct bpf_func_proto bpf_sockopt_event_output_proto = { + .func = bpf_sockopt_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -4825,37 +5010,31 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) + int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; - int dif = 0; - - if (skb->dev) - dif = skb->dev->ifindex; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; - int sdif = inet_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, skb); + dif, sdif, &udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); @@ -4864,7 +5043,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, - &udp_table, skb); + &udp_table, NULL); #endif } @@ -4881,31 +5060,33 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, * callers to satisfy BPF_CALL declarations. */ static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { - struct net *caller_net; struct sock *sk = NULL; u8 family = AF_UNSPEC; struct net *net; + int sdif; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; if (unlikely(family == AF_UNSPEC || flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; - if (skb->dev) - caller_net = dev_net(skb->dev); + if (family == AF_INET) + sdif = inet_sdif(skb); else - caller_net = sock_net(skb->sk); + sdif = inet6_sdif(skb); + if ((s32)netns_id < 0) { net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } @@ -4915,6 +5096,25 @@ out: return (unsigned long) sk; } +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, + proto, netns_id, flags); +} + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { @@ -4964,6 +5164,87 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -4986,6 +5267,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || + func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5070,6 +5352,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -5214,6 +5504,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5240,6 +5538,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_sockopt_event_output_proto; default: return bpf_base_func_proto(func_id); } @@ -5264,6 +5564,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; default: return bpf_base_func_proto(func_id); } @@ -5440,6 +5742,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) + return false; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5467,6 +5773,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5505,6 +5812,10 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; default: return false; } @@ -5532,6 +5843,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5741,6 +6053,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): break; default: return false; @@ -5960,6 +6273,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -6046,6 +6360,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -6140,19 +6455,19 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - case offsetof(struct __sk_buff, vlan_tci): - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + break; + case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); - if (si->off == offsetof(struct __sk_buff, vlan_tci)) { - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, - ~VLAN_TAG_PRESENT); - } else { - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); - } break; case offsetof(struct __sk_buff, cb[0]) ... @@ -6355,6 +6670,22 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); } return insn - insn_buf; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 588f475019d4..2e8d91e54179 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -952,8 +952,7 @@ proto_again: if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); - key_vlan->vlan_priority = - (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 41954e42a2de..c3b58712e98b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,6 +118,34 @@ unsigned long neigh_rand_reach_time(unsigned long base) } EXPORT_SYMBOL(neigh_rand_reach_time); +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } +} + +static void neigh_change_state(struct neighbour *n, u8 new) +{ + bool on_gc_list = !list_empty(&n->gc_list); + bool new_is_perm = new & NUD_PERMANENT; + + n->nud_state = new; + + /* remove from the gc list if new state is permanent; + * add to the gc list if new state is not permanent + */ + if (new_is_perm && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!new_is_perm && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } +} static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, struct neighbour __rcu **np, struct neigh_table *tbl) @@ -132,7 +160,7 @@ static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); - n->dead = 1; + neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); @@ -166,32 +194,31 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { + int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + unsigned long tref = jiffies - 5 * HZ; + u8 flags = NTF_EXT_LEARNED; + struct neighbour *n, *tmp; + u8 state = NUD_PERMANENT; int shrunk = 0; - int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - /* Neighbour record may be discarded if: - * - nobody refers to it. - * - it is not permanent - */ - if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, - tbl)) { - shrunk = 1; - continue; - } - np = &n->next; + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + + write_lock(&n->lock); + if (!(n->nud_state & state) && !(n->flags & flags) && + time_after(tref, n->updated)) + remove = true; + write_unlock(&n->lock); + + if (remove && neigh_remove_one(n, tbl)) + shrunk++; + if (shrunk >= max_clean) + break; } } @@ -260,8 +287,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); - n->dead = 1; - + neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, @@ -321,13 +347,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + bool permanent) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - entries = atomic_inc_return(&tbl->entries) - 1; + if (permanent) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -340,6 +371,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device } } +do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -358,11 +390,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; + + if (!permanent) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + else + INIT_LIST_HEAD(&n->gc_list); + + atomic_inc(&tbl->entries); out: return n; out_entries: - atomic_dec(&tbl->entries); + if (!permanent) + atomic_dec(&tbl->gc_entries); goto out; } @@ -505,13 +545,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, bool want_ref) +static struct neighbour *___neigh_create(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev, + bool permanent, bool want_ref) { + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, permanent); u32 hash_val; unsigned int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -591,6 +633,12 @@ out_neigh_release: neigh_release(n); goto out; } + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + return ___neigh_create(tbl, pkey, dev, false, want_ref); +} EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) @@ -854,7 +902,7 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; - n->dead = 1; + neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; @@ -1137,8 +1185,9 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. */ -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags, u32 nlmsg_pid) +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) { u8 old; int err; @@ -1155,8 +1204,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - if (neigh->dead) + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); goto out; + } neigh_update_ext_learned(neigh, flags, ¬ify); @@ -1164,7 +1215,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); - neigh->nud_state = new; + neigh_change_state(neigh, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1193,8 +1244,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, use it, otherwise discard the request. */ err = -EINVAL; - if (!(old & NUD_VALID)) + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; + } lladdr = neigh->ha; } @@ -1241,7 +1294,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - neigh->nud_state = new; + neigh_change_state(neigh, new); notify = 1; } @@ -1307,6 +1360,12 @@ out: return err; } + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is @@ -1571,6 +1630,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1678,8 +1738,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); - if (dst_attr == NULL) + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1694,8 +1756,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < (int)tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); @@ -1711,10 +1775,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh, tbl); @@ -1744,8 +1807,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -EINVAL; - if (tb[NDA_DST] == NULL) + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1755,16 +1820,21 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; + } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } + dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; @@ -1780,8 +1850,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (dev == NULL) + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); goto out; + } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { @@ -1790,7 +1862,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - neigh = __neigh_lookup_errno(tbl, dst, dev); + neigh = ___neigh_create(tbl, dst, dev, + ndm->ndm_state & NUD_PERMANENT, + true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; @@ -1817,8 +1891,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); neigh_release(neigh); out: @@ -2631,7 +2705,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); - n->dead = 1; + neigh_mark_dead(n); } else np = &n->next; write_unlock(&n->lock); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bd67c4d0fcfd..ff9fd2bb4ce4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -337,7 +337,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(dev, (unsigned int)new_flags); + return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..05b23b285058 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -735,23 +736,38 @@ static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } -static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, int nsid) +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; + bool add_ref; + int ref_nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nla_put_s32(skb, NETNSA_NSID, nsid)) + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) + goto nla_put_failure; + + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -767,10 +783,15 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; + struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; - struct net *peer; - int err, id; + int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); @@ -782,6 +803,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; @@ -793,15 +819,29 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); + } + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } - id = peernet2id(net, peer); - err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, net, id); + fillargs.nsid = peernet2id(target, peer); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -811,14 +851,17 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: nlmsg_free(msg); out: + if (fillargs.add_ref) + put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { - struct net *net; + struct net *tgt_net; + struct net *ref_net; struct sk_buff *skb; - struct netlink_callback *cb; + struct net_fill_args fillargs; int idx; int s_idx; }; @@ -831,9 +874,10 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) if (net_cb->idx < net_cb->s_idx) goto cont; - ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, - net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, id); + net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -842,33 +886,96 @@ cont: return 0; } +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; + net_cb->tgt_net = net; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { - .net = net, + .tgt_net = sock_net(skb->sk), .skb = skb, - .cb = cb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, .idx = 0, .s_idx = cb->args[0], }; + int err = 0; - if (cb->strict_check && - nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { - NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); - return -EINVAL; + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; } - spin_lock_bh(&net->nsid_lock); - idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_bh(&net->nsid_lock); + spin_lock_bh(&net_cb.tgt_net->nsid_lock); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net) && + !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + err = -EAGAIN; + goto end; + } + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net)) + spin_unlock_bh(&net_cb.ref_net->nsid_lock); + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; - return skb->len; +end: + if (net_cb.fillargs.add_ref) + put_net(net_cb.tgt_net); + return err < 0 ? err : skb->len; } static void rtnl_net_notifyid(struct net *net, int cmd, int id) { + struct net_fill_args fillargs = { + .cmd = cmd, + .nsid = id, + }; struct sk_buff *msg; int err = -ENOMEM; @@ -876,7 +983,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2b9fdbc43205..36a2b63ffd6d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -663,7 +663,7 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - err = dev_open(ndev); + err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7819f7804eeb..c9c0407a7ee0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> -#define RTNL_MAX_TYPE 49 +#define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { @@ -2489,7 +2489,8 @@ static int do_setlink(const struct sk_buff *skb, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } @@ -2870,7 +2871,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { - err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); if (err < 0) return err; } @@ -2885,9 +2887,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *net, - const char *ifname, unsigned char name_assign_type, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; @@ -2903,11 +2907,15 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - if (num_tx_queues < 1 || num_tx_queues > 4096) + if (num_tx_queues < 1 || num_tx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); + } - if (num_rx_queues < 1 || num_rx_queues > 4096) + if (num_rx_queues < 1 || num_rx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); + } dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); @@ -2965,20 +2973,24 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attr, struct netlink_ext_ack *extack) { + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; + unsigned char name_assign_type = NET_NAME_USER; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + const struct rtnl_link_ops *m_ops = NULL; + struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *tb[IFLA_MAX + 1]; + struct net *dest_net, *link_net; + struct nlattr **slave_data; + char kind[MODULE_NAME_LEN]; struct net_device *dev; - struct net_device *master_dev = NULL; struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; - unsigned char name_assign_type = NET_NAME_USER; + struct nlattr **data; int err; #ifdef CONFIG_MODULES @@ -3034,193 +3046,200 @@ replay: ops = NULL; } - if (1) { - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; - struct nlattr **data = NULL; - struct nlattr **slave_data = NULL; - struct net *dest_net, *link_net = NULL; - - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + data = NULL; + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, NULL); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data, extack); + if (err < 0) + return err; } + } - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; + slave_data = NULL; + if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested(slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - NULL); - if (err < 0) - return err; - slave_data = slave_attr; - } + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + slave_data = slave_attr; } + } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + if (dev) { + int status = 0; - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } - err = m_ops->slave_changelink(master_dev, dev, - tb, slave_data, - extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; - return do_setlink(skb, dev, ifm, extack, tb, ifname, - status); + err = m_ops->slave_changelink(master_dev, dev, tb, + slave_data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, + return do_setlink(skb, dev, ifm, extack, tb, ifname, status); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); - return -ENODEV; - } + return -ENODEV; + } - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; - if (!ops) { + if (!ops) { #ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - return -EOPNOTSUPP; + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; } +#endif + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } - if (!ops->setup) - return -EOPNOTSUPP; - - if (!ifname[0]) { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } + if (!ops->setup) + return -EOPNOTSUPP; - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); + if (!ifname[0]) { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } - dev->ifindex = ifm->ifi_index; + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } - if (ops->newlink) { - err = ops->newlink(link_net ? : net, dev, tb, data, - extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(dev); - goto out; - } - } else { - err = register_netdevice(dev); - if (err < 0) { + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) { + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. + */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) free_netdev(dev); - goto out; - } + goto out; + } + } else { + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; } - err = rtnl_configure_link(dev, ifm); + } + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), - extack); - if (err) - goto out_unregister; - } + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); + if (ops->newlink) { + LIST_HEAD(list_kill); - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); } + goto out; +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr **attr; + int ret; + + attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = __rtnl_newlink(skb, nlh, attr, extack); + kfree(attr); + return ret; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a8217e221e19..40552547c69a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -void sock_zerocopy_put_abort(struct ubuf_info *uarg) +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1097,7 +1097,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - sock_zerocopy_put(uarg); + if (have_uref) + sock_zerocopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); @@ -1105,6 +1106,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) @@ -1131,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1151,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } - skb_zcopy_set(nskb, skb_uarg(orig)); + skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } @@ -1925,8 +1932,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - BUG_ON(!list); - if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; @@ -2366,19 +2371,6 @@ error: } EXPORT_SYMBOL_GPL(skb_send_sock_locked); -/* Send skb data on a socket. */ -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) -{ - int ret = 0; - - lock_sock(sk); - ret = skb_send_sock_locked(sk, skb, offset, len); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL_GPL(skb_send_sock); - /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer @@ -2645,6 +2637,65 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. The new checksum is stored back into skb->csum unless the skb is + * shared. + */ +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) { net_warn_ratelimited( @@ -2962,28 +3013,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head } EXPORT_SYMBOL(skb_append); -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} -EXPORT_SYMBOL(skb_insert); - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) @@ -4856,7 +4885,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; - skb->offload_mr_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; #endif if (!xnet) @@ -5128,7 +5157,7 @@ int skb_vlan_pop(struct sk_buff *skb) int err; if (likely(skb_vlan_tag_present(skb))) { - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 080a880a1761..f00902c532cc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); @@ -698,6 +700,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); @@ -950,10 +953,12 @@ set_rcvbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; - else + } else if (val != sk->sk_mark) { sk->sk_mark = val; + sk_dst_reset(sk); + } break; case SO_RXQ_OVFL: @@ -1014,7 +1019,10 @@ set_rcvbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index ba5cba56f574..d8fe3e549373 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } +EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { diff --git a/net/core/stream.c b/net/core/stream.c index 7d329fb1f553..e94bb02a5629 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_is_writeable(sk) && sock) { + if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8e08cea6f178..26a21d97b6b0 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -231,7 +231,7 @@ EXPORT_SYMBOL(dccp_req_err); * check at all. A more general error queue to queue errors for later handling * is probably better. */ -static void dccp_v4_err(struct sk_buff *skb, u32 info) +static int dccp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (struct iphdr *)skb->data; const u8 offset = iph->ihl << 2; @@ -259,16 +259,18 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == DCCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) - return dccp_req_err(sk, seq); + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + dccp_req_err(sk, seq); + return 0; + } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -357,6 +359,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) out: bh_unlock_sock(sk); sock_put(sk); + return 0; } static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6344f1b18a6a..d5740bad5b18 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -68,7 +68,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) } -static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; @@ -96,16 +96,18 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == DCCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) - return dccp_req_err(sk, seq); + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + dccp_req_err(sk, seq); + return 0; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -183,6 +185,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, out: bh_unlock_sock(sk); sock_put(sk); + return 0; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 43733accf58e..658cd32bb7b3 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -948,6 +948,7 @@ int inet_dccp_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; + sk->sk_max_ack_backlog = backlog; /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ @@ -960,7 +961,6 @@ int inet_dccp_listen(struct socket *sock, int backlog) if (err) goto out; } - sk->sk_max_ack_backlog = backlog; err = 0; out: diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 7d6ff983ba2c..7aab5d088c72 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -192,7 +192,7 @@ static int check_port(__le16 port) static unsigned short port_alloc(struct sock *sk) { struct dn_scp *scp = DN_SK(sk); -static unsigned short port = 0x2000; + static unsigned short port = 0x2000; unsigned short i_port = port; while(check_port(cpu_to_le16(++port)) != 0) { diff --git a/net/dsa/master.c b/net/dsa/master.c index 5e8c9bef78bd..71bb15f491c8 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -179,10 +179,38 @@ static const struct attribute_group dsa_group = { .attrs = dsa_slave_attrs, }; +static void dsa_master_set_mtu(struct net_device *dev, struct dsa_port *cpu_dp) +{ + unsigned int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + int err; + + rtnl_lock(); + if (mtu <= dev->max_mtu) { + err = dev_set_mtu(dev, mtu); + if (err) + netdev_dbg(dev, "Unable to set MTU to include for DSA overheads\n"); + } + rtnl_unlock(); +} + +static void dsa_master_reset_mtu(struct net_device *dev) +{ + int err; + + rtnl_lock(); + err = dev_set_mtu(dev, ETH_DATA_LEN); + if (err) + netdev_dbg(dev, + "Unable to reset MTU to exclude DSA overheads\n"); + rtnl_unlock(); +} + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int ret; + dsa_master_set_mtu(dev, cpu_dp); + /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -206,6 +234,7 @@ void dsa_master_teardown(struct net_device *dev) { sysfs_remove_group(&dev->dev.kobj, &dsa_group); dsa_master_ethtool_teardown(dev); + dsa_master_reset_mtu(dev); dev->dsa_ptr = NULL; diff --git a/net/dsa/port.c b/net/dsa/port.c index ed0595459df1..2d7e01b23572 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -252,9 +252,6 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - if (netif_is_bridge_master(vlan->obj.orig_dev)) - return -EOPNOTSUPP; - if (br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index aec78f5aca72..a3fcc1d01615 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1050,8 +1050,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = { static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_attr_set = dsa_slave_port_attr_set, - .switchdev_port_obj_add = dsa_slave_port_obj_add, - .switchdev_port_obj_del = dsa_slave_port_obj_del, }; static struct device_type dsa_type = { @@ -1529,6 +1527,44 @@ err_fdb_work_init: return NOTIFY_BAD; } +static int +dsa_slave_switchdev_port_obj_event(unsigned long event, + struct net_device *netdev, + struct switchdev_notifier_port_obj_info *port_obj_info) +{ + int err = -EOPNOTSUPP; + + switch (event) { + case SWITCHDEV_PORT_OBJ_ADD: + err = dsa_slave_port_obj_add(netdev, port_obj_info->obj, + port_obj_info->trans); + break; + case SWITCHDEV_PORT_OBJ_DEL: + err = dsa_slave_port_obj_del(netdev, port_obj_info->obj); + break; + } + + port_obj_info->handled = true; + return notifier_from_errno(err); +} + +static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + + if (!dsa_slave_dev_check(dev)) + return NOTIFY_DONE; + + switch (event) { + case SWITCHDEV_PORT_OBJ_ADD: /* fall through */ + case SWITCHDEV_PORT_OBJ_DEL: + return dsa_slave_switchdev_port_obj_event(event, dev, ptr); + } + + return NOTIFY_DONE; +} + static struct notifier_block dsa_slave_nb __read_mostly = { .notifier_call = dsa_slave_netdevice_event, }; @@ -1537,8 +1573,13 @@ static struct notifier_block dsa_slave_switchdev_notifier = { .notifier_call = dsa_slave_switchdev_event, }; +static struct notifier_block dsa_slave_switchdev_blocking_notifier = { + .notifier_call = dsa_slave_switchdev_blocking_event, +}; + int dsa_slave_register_notifier(void) { + struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_slave_nb); @@ -1549,8 +1590,15 @@ int dsa_slave_register_notifier(void) if (err) goto err_switchdev_nb; + nb = &dsa_slave_switchdev_blocking_notifier; + err = register_switchdev_blocking_notifier(nb); + if (err) + goto err_switchdev_blocking_nb; + return 0; +err_switchdev_blocking_nb: + unregister_switchdev_notifier(&dsa_slave_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_slave_nb); return err; @@ -1558,8 +1606,14 @@ err_switchdev_nb: void dsa_slave_unregister_notifier(void) { + struct notifier_block *nb; int err; + nb = &dsa_slave_switchdev_blocking_notifier; + err = unregister_switchdev_blocking_notifier(nb); + if (err) + pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); + err = unregister_switchdev_notifier(&dsa_slave_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 2b06bb91318b..4aa1d368a5ae 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -174,6 +174,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops brcm_netdev_ops = { .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, + .overhead = BRCM_TAG_LEN, }; #endif @@ -196,5 +197,6 @@ static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, const struct dsa_device_ops brcm_prepend_netdev_ops = { .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, + .overhead = BRCM_TAG_LEN, }; #endif diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index cd13cfc542ce..8b2f92e3f3a2 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -149,4 +149,5 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops dsa_netdev_ops = { .xmit = dsa_xmit, .rcv = dsa_rcv, + .overhead = DSA_HLEN, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 4083326b806e..f5b87ee5c94e 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -168,4 +168,5 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops edsa_netdev_ops = { .xmit = edsa_xmit, .rcv = edsa_rcv, + .overhead = EDSA_HLEN, }; diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 49e9b73f1be3..cb6f82ffe5eb 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -106,4 +106,5 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, const struct dsa_device_ops gswip_netdev_ops = { .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, + .overhead = GSWIP_RX_HEADER_LEN, }; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0f62effad88f..96411f70ab9f 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -99,4 +99,5 @@ static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops ksz_netdev_ops = { .xmit = ksz_xmit, .rcv = ksz_rcv, + .overhead = KSZ_INGRESS_TAG_LEN, }; diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 548c00254c07..f48889e46ff7 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -140,4 +140,5 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops lan9303_netdev_ops = { .xmit = lan9303_xmit, .rcv = lan9303_rcv, + .overhead = LAN9303_TAG_LEN, }; diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 11535bc70743..f39f4dfeda34 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -109,4 +109,5 @@ const struct dsa_device_ops mtk_netdev_ops = { .xmit = mtk_tag_xmit, .rcv = mtk_tag_rcv, .flow_dissect = mtk_tag_flow_dissect, + .overhead = MTK_HDR_LEN, }; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 613f4ee97771..ed4f6dc26365 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -101,4 +101,5 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops qca_netdev_ops = { .xmit = qca_tag_xmit, .rcv = qca_tag_rcv, + .overhead = QCA_HDR_LEN, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 56197f0d9608..b40756ed6e57 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -84,4 +84,5 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops trailer_netdev_ops = { .xmit = trailer_xmit, .rcv = trailer_rcv, + .overhead = 4, }; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index fd8faa0dfa61..4c520110b04f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -47,6 +47,7 @@ #include <linux/inet.h> #include <linux/ip.h> #include <linux/netdevice.h> +#include <linux/nvmem-consumer.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> @@ -165,15 +166,17 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { - if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) - skb->pkt_type = PACKET_BROADCAST; - else - skb->pkt_type = PACKET_MULTICAST; + if (unlikely(!ether_addr_equal_64bits(eth->h_dest, + dev->dev_addr))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { + if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + skb->pkt_type = PACKET_OTHERHOST; + } } - else if (unlikely(!ether_addr_equal_64bits(eth->h_dest, - dev->dev_addr))) - skb->pkt_type = PACKET_OTHERHOST; /* * Some variants of DSA tagging don't have an ethertype field @@ -548,3 +551,40 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) return 0; } EXPORT_SYMBOL(eth_platform_get_mac_address); + +/** + * Obtain the MAC address from an nvmem cell named 'mac-address' associated + * with given device. + * + * @dev: Device with which the mac-address cell is associated. + * @addrbuf: Buffer to which the MAC address will be copied on success. + * + * Returns 0 on success or a negative error number on failure. + */ +int nvmem_get_mac_address(struct device *dev, void *addrbuf) +{ + struct nvmem_cell *cell; + const void *mac; + size_t len; + + cell = nvmem_cell_get(dev, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; + } + + ether_addr_copy(addrbuf, mac); + kfree(mac); + + return 0; +} +EXPORT_SYMBOL(nvmem_get_mac_address); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1fbe2f815474..326c422c22f8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,6 +208,7 @@ int inet_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; + sk->sk_max_ack_backlog = backlog; /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ @@ -231,7 +232,6 @@ int inet_listen(struct socket *sock, int backlog) goto out; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } - sk->sk_max_ack_backlog = backlog; err = 0; out: @@ -1964,6 +1964,8 @@ static int __init inet_init(void) /* Add UDP-Lite (RFC 3828) */ udplite4_register(); + raw_init(); + ping_init(); /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a34602ae27de..5b9b6d497f71 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1100,7 +1100,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) inet_del_ifa(in_dev, ifap, 1); break; } - ret = dev_change_flags(dev, ifr->ifr_flags); + ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b5c3937ca6ec..5022bc63863a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (!fi) goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, - cfg->fc_mx_len); + cfg->fc_mx_len, extack); if (unlikely(IS_ERR(fi->fib_metrics))) { err = PTR_ERR(fi->fib_metrics); kfree(fi); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 500a59906b87..0d0ad19ecb87 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -3,6 +3,7 @@ #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/ip.h> +#include <linux/icmp.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> @@ -1003,15 +1004,82 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } +static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info) +{ + const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]); + + if (ipprot && ipprot->err_handler) { + if (!ipprot->err_handler(skb, info)) + return 0; + } + + return -ENOENT; +} + +static int gue_err(struct sk_buff *skb, u32 info) +{ + int transport_offset = skb_transport_offset(skb); + struct guehdr *guehdr; + size_t optlen; + int ret; + + if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + + switch (((struct iphdr *)guehdr)->version) { + case 4: + ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info); + goto out; +#if IS_ENABLED(CONFIG_IPV6) + case 6: + ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info); + goto out; +#endif + default: + ret = -EOPNOTSUPP; + goto out; + } + } + default: /* Undefined version */ + return -EOPNOTSUPP; + } + + if (guehdr->control) + return -ENOENT; + + optlen = guehdr->hlen << 2; + + if (validate_gue_flags(guehdr, optlen)) + return -EINVAL; + + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); + +out: + skb_set_transport_header(skb, transport_offset); + return ret; +} + static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, + .err_handler = gue_err, }; static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, + .err_handler = gue_err, }; static int ip_tunnel_encap_add_fou_ops(void) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7efe740c06eb..a4bf22ee3aed 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -151,20 +151,25 @@ drop: return NET_RX_DROP; } -static void gre_err(struct sk_buff *skb, u32 info) +static int gre_err(struct sk_buff *skb, u32 info) { const struct gre_protocol *proto; const struct iphdr *iph = (const struct iphdr *)skb->data; u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; + int err = 0; if (ver >= GREPROTO_MAX) - return; + return -EINVAL; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (proto && proto->err_handler) proto->err_handler(skb, info); + else + err = -EPROTONOSUPPORT; rcu_read_unlock(); + + return err; } static const struct net_protocol net_gre_protocol = { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d832beed6e3a..065997f414e6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1079,7 +1079,7 @@ error: goto drop; } -void icmp_err(struct sk_buff *skb, u32 info) +int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; @@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info) */ if (icmph->type != ICMP_ECHOREPLY) { ping_err(skb, offset, info); - return; + return 0; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP); else if (type == ICMP_REDIRECT) ipv4_redirect(skb, net, 0, IPPROTO_ICMP); + + return 0; } /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 15e7f7915a21..6ea523d71947 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -183,7 +183,9 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * int i, low, high, attempt_half; struct inet_bind_bucket *tb; u32 remaining, offset; + int l3mdev; + l3mdev = inet_sk_bound_l3mdev(sk); attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_get_local_port_range(net, &low, &high); @@ -219,7 +221,8 @@ other_parity_scan: hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (!inet_csk_bind_conflict(sk, tb, false, false)) goto success; goto next_port; @@ -293,6 +296,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct net *net = sock_net(sk); struct inet_bind_bucket *tb = NULL; kuid_t uid = sock_i_uid(sk); + int l3mdev; + + l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { head = inet_csk_find_open_port(sk, &tb, &port); @@ -306,11 +312,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) goto tb_found; tb_not_found: tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); + net, head, port, l3mdev); if (!tb) goto fail_unlock; tb_found: @@ -874,7 +881,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog) reqsk_queue_alloc(&icsk->icsk_accept_queue); - sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 411dd7a90046..13890d5bfc34 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk) struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum) + const unsigned short snum, + int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); + tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; @@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; + int l3mdev; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; @@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) return -ENOENT; } if (tb->port != port) { + l3mdev = inet_sk_bound_l3mdev(sk); + /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same @@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), sock_net(sk)) && - tb->port == port) + tb->l3mdev == l3mdev && tb->port == port) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, - sock_net(sk), head, port); + sock_net(sk), head, port, + l3mdev); if (!tb) { spin_unlock(&head->lock); return -ENOMEM; @@ -229,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net *net, { int score = -1; struct inet_sock *inet = inet_sk(sk); + bool dev_match; if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { @@ -239,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if (!dev_match) + return -1; + score += 4; - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } @@ -675,6 +679,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 remaining, offset; int ret, i, low, high; static u32 hint; + int l3mdev; if (port) { head = &hinfo->bhash[inet_bhashfn(net, port, @@ -693,6 +698,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return ret; } + l3mdev = inet_sk_bound_l3mdev(sk); + inet_get_local_port_range(net, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; @@ -719,7 +726,8 @@ other_parity_scan: * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && tb->port == port) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; @@ -732,7 +740,7 @@ other_parity_scan: } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); + net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 32662e9e5d21..06ee4696703c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -69,6 +69,13 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); +#ifdef CONFIG_NET_SWITCHDEV + if (skb->offload_l3_fwd_mark) { + consume_skb(skb); + return 0; + } +#endif + if (unlikely(opt->optlen)) ip_forward_options(skb); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 38befe829caf..76a9a5f7a40e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; static unsigned int erspan_net_id __read_mostly; -static void ipgre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) +static int ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only @@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info, unsigned int data_len = 0; struct ip_tunnel *t; + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else if (tpi->proto == htons(ETH_P_ERSPAN) || + tpi->proto == htons(ETH_P_ERSPAN2)) + itn = net_generic(net, erspan_net_id); + else + itn = net_generic(net, ipgre_net_id); + + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->daddr, iph->saddr, tpi->key); + + if (!t) + return -ENOENT; + switch (type) { default: case ICMP_PARAMETERPROB: - return; + return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return; + return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return; + return 0; data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; @@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info, break; } - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else if (tpi->proto == htons(ETH_P_ERSPAN) || - tpi->proto == htons(ETH_P_ERSPAN2)) - itn = net_generic(net, erspan_net_id); - else - itn = net_generic(net, ipgre_net_id); - - iph = (const struct iphdr *)(icmp_hdr(skb) + 1); - t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, - iph->daddr, iph->saddr, tpi->key); - - if (!t) - return; - #if IS_ENABLED(CONFIG_IPV6) if (tpi->proto == htons(ETH_P_IPV6) && !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type, data_len)) - return; + return 0; #endif if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return; + return 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return; + return 0; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; + + return 0; } static void gre_err(struct sk_buff *skb, u32 info) @@ -1601,7 +1603,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, memset(&tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, - &ipgre_tap_ops, tb); + &ipgre_tap_ops, tb, NULL); if (IS_ERR(dev)) return dev; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e609b08c9df4..26921f6b3b92 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { - __skb_pull(skb, skb_network_header_len(skb)); - - rcu_read_lock(); - { - int protocol = ip_hdr(skb)->protocol; - const struct net_protocol *ipprot; - int raw; + const struct net_protocol *ipprot; + int raw, ret; - resubmit: - raw = raw_local_deliver(skb, protocol); +resubmit: + raw = raw_local_deliver(skb, protocol); - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot) { - int ret; - - if (!ipprot->no_policy) { - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - kfree_skb(skb); - goto out; - } - nf_reset(skb); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot) { + if (!ipprot->no_policy) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return; } - ret = ipprot->handler(skb); - if (ret < 0) { - protocol = -ret; - goto resubmit; + nf_reset(skb); + } + ret = ipprot->handler(skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + } else { + if (!raw) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); } - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); } else { - if (!raw) { - if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_PROT_UNREACH, 0); - } - kfree_skb(skb); - } else { - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); - consume_skb(skb); - } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + consume_skb(skb); } } - out: +} + +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + __skb_pull(skb, skb_network_header_len(skb)); + + rcu_read_lock(); + ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5dbec21856f4..ab6618036afe 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; @@ -880,8 +881,8 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; + bool paged, extra_uref; u32 tskey = 0; - bool paged; skb = skb_peek_tail(queue); @@ -916,6 +917,20 @@ static int __ip_append_data(struct sock *sk, (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + extra_uref = true; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + } + } + cork->length += length; /* So, what's going on in the loop below? @@ -1001,12 +1016,6 @@ alloc_new_skb: skb->csum = 0; skb_reserve(skb, hh_len); - /* only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - /* * Find where to start putting bytes. */ @@ -1039,6 +1048,13 @@ alloc_new_skb: exthdrlen = 0; csummode = CHECKSUM_NONE; + /* only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1068,7 +1084,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1098,6 +1114,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1110,6 +1130,8 @@ alloc_new_skb: error_efault: err = -EFAULT; error: + if (uarg) + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c248e0dccbe1..c857ec6b9784 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, } skb_clear_hash_if_not_l4(skb); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, xnet); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 88212615bf4c..55757764c381 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -220,7 +220,7 @@ static int __init ic_open_devs(void) for_each_netdev(&init_net, dev) { if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev)) continue; - if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0) pr_err("IP-Config: Failed to open %s\n", dev->name); } @@ -238,7 +238,7 @@ static int __init ic_open_devs(void) if (ic_proto_enabled && !able) continue; oflags = dev->flags; - if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) { pr_err("IP-Config: Failed to open %s\n", dev->name); continue; @@ -315,7 +315,7 @@ static void __init ic_close_devs(void) dev = d->dev; if (d != ic_dev && !netdev_uses_dsa(dev)) { pr_debug("IP-Config: Downing %s\n", dev->name); - dev_change_flags(dev, d->flags); + dev_change_flags(dev, d->flags, NULL); } kfree(d); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e65287c27e3d..57c5dd283a2c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) struct ip_tunnel *t; int err = 0; + t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (!t) { + err = -ENOENT; + goto out; + } + switch (type) { case ICMP_DEST_UNREACH: switch (code) { @@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info) goto out; } - t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->daddr, iph->saddr, 0); - if (!t) { - err = -ENOENT; - goto out; - } - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a6defbec4f1b..ea04e38f56e9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -506,7 +506,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) dev->flags |= IFF_MULTICAST; if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); } @@ -589,7 +589,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); @@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; - if (!skb->offload_mr_fwd_mark) + if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 6d218f5a2e71..ca9a5fefdefa 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -6,7 +6,8 @@ #include <net/tcp.h> static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, - int fc_mx_len, u32 *metrics) + int fc_mx_len, u32 *metrics, + struct netlink_ext_ack *extack) { bool ecn_ca = false; struct nlattr *nla; @@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, if (!type) continue; - if (type > RTAX_MAX) + if (type > RTAX_MAX) { + NL_SET_ERR_MSG(extack, "Invalid metric type"); return -EINVAL; + } if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) + if (val == TCP_CA_UNSPEC) { + NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; + } } else { - if (nla_len(nla) != sizeof(u32)) + if (nla_len(nla) != sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Invalid attribute in metrics"); return -EINVAL; + } val = nla_get_u32(nla); } if (type == RTAX_ADVMSS && val > 65535 - 40) @@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, val = 65535 - 15; if (type == RTAX_HOPLIMIT && val > 255) val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) { + NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute"); return -EINVAL; + } metrics[type - 1] = val; } @@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, } struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, - int fc_mx_len) + int fc_mx_len, + struct netlink_ext_ack *extack) { struct dst_metrics *fib_metrics; int err; @@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); - err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics); + err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics, + extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); } else { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 70289682a670..c3610b37bb4c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), + SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 32a691b7ce2c..92d249e053be 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -29,6 +29,7 @@ #include <net/protocol.h> struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet_protos); const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet_offloads); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8ca3eb06ba04..076f51646d26 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) goto found; /* gotcha */ } sk = NULL; @@ -805,7 +804,7 @@ out: return copied; } -static int raw_init(struct sock *sk) +static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); @@ -970,7 +969,7 @@ struct proto raw_prot = { .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .ioctl = raw_ioctl, - .init = raw_init, + .init = raw_sk_init, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, @@ -1134,3 +1133,27 @@ void __init raw_proc_exit(void) unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ + +static void raw_sysctl_init_net(struct net *net) +{ +#ifdef CONFIG_NET_L3_MASTER_DEV + net->ipv4.sysctl_raw_l3mdev_accept = 1; +#endif +} + +static int __net_init raw_sysctl_init(struct net *net) +{ + raw_sysctl_init_net(net); + return 0; +} + +static struct pernet_operations __net_initdata raw_sysctl_ops = { + .init = raw_sysctl_init, +}; + +void __init raw_init(void) +{ + raw_sysctl_init_net(&init_net); + if (register_pernet_subsys(&raw_sysctl_ops)) + panic("RAW: failed to init sysctl parameters.\n"); +} diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c0a9d26c06ce..c4ddbc5f01fc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1677,7 +1677,7 @@ static void ip_handle_martian_source(struct net_device *dev, print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), - dev->hard_header_len, true); + dev->hard_header_len, false); } } #endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 891ed2f91467..ba0fc4b18465 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -602,6 +602,17 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_ping_group_range, }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "raw_l3mdev_accept", + .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9e6bc4d6daa7..27e2f6837062 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1423,7 +1423,7 @@ do_error: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && @@ -2088,7 +2088,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, } continue; - found_ok_skb: +found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) @@ -2147,7 +2147,7 @@ skip_copy: sk_eat_skb(sk, skb); continue; - found_fin_ok: +found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags & MSG_PEEK)) @@ -2241,10 +2241,6 @@ void tcp_set_state(struct sock *sk, int state) * socket sitting in hash tables. */ inet_sk_state_store(sk, state); - -#ifdef STATE_TRACE - SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); -#endif } EXPORT_SYMBOL_GPL(tcp_set_state); @@ -3246,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ 0; } @@ -3299,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); + nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); return stats; } @@ -3658,8 +3656,11 @@ bool tcp_alloc_md5sig_pool(void) if (unlikely(!tcp_md5sig_pool_populated)) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool_populated) + if (!tcp_md5sig_pool_populated) { __tcp_alloc_md5sig_pool(); + if (tcp_md5sig_pool_populated) + static_key_slow_inc(&tcp_md5_needed); + } mutex_unlock(&tcp_md5sig_mutex); } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 9277abdd822a..0f497fc49c3f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; -/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. + */ static const int bbr_pacing_margin_percent = 1; /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain @@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); } -/* Pace using current bw estimate and a gain factor. In order to help drive the - * network toward lower queues while maintaining high utilization and low - * latency, the average pacing rate aims to be slightly (~1%) lower than the - * estimated bandwidth. This is an important aspect of the design. In this - * implementation this slightly lower pacing rate is achieved implicitly by not - * including link-layer headers in the packet size used for the pacing rate. - */ +/* Pace using current bw estimate and a gain factor. */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3b45fe530f91..a47c1cdf90fc 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -289,12 +289,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; - u32 tosend; + u32 tosend, delta = 0; int ret; more_data: - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + /* Track delta in msg size to add/subtract it on SK_DROP from + * returned to user copied size. This ensures user doesn't + * get a positive return code with msg_cut_data and SK_DROP + * verdict. + */ + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->sg.size < delta) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { @@ -350,7 +361,7 @@ more_data: default: sk_msg_free_partial(sk, msg, tosend); sk_msg_apply_bytes(psock, tosend); - *copied -= tosend; + *copied -= (tosend + delta); return -EACCES; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a9d9555a973f..76858b14ebe9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1865,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack) { - struct tcp_sock *tp = tcp_sk(sk); - u32 prior_sacked = tp->sacked_out; + if (num_dupack) { + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + s32 delivered; - tp->sacked_out++; - tcp_check_reno_reordering(sk, 0); - if (tp->sacked_out > prior_sacked) - tp->delivered++; /* Some out-of-order packet is delivered */ - tcp_verify_left_out(tp); + tp->sacked_out += num_dupack; + tcp_check_reno_reordering(sk, 0); + delivered = tp->sacked_out - prior_sacked; + if (delivered > 0) + tp->delivered += delivered; + tcp_verify_left_out(tp); + } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ @@ -2459,8 +2463,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else if ((flag & FLAG_RETRANS_DATA_ACKED) && - !(flag & FLAG_LOST_RETRANS)) { + } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == + FLAG_RETRANS_DATA_ACKED) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); @@ -2636,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); @@ -2655,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, return; if (after(tp->snd_nxt, tp->high_seq)) { - if (flag & FLAG_DATA_SACKED || is_dupack) + if (flag & FLAG_DATA_SACKED || num_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; @@ -2681,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)tranmissions. */ - if (after(tp->snd_nxt, tp->high_seq) && is_dupack) - tcp_add_reno_sack(sk); + if (after(tp->snd_nxt, tp->high_seq) && num_dupack) + tcp_add_reno_sack(sk, num_dupack); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2759,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, - bool is_dupack, int *ack_flag, int *rexmit) + int num_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; - bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - tcp_force_fast_retransmit(sk)); + bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && + tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; @@ -2812,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { - if (tcp_is_reno(tp) && is_dupack) - tcp_add_reno_sack(sk); + if (tcp_is_reno(tp)) + tcp_add_reno_sack(sk, num_dupack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2828,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tcp_identify_packet_loss(sk, ack_flag); break; case TCP_CA_Loss: - tcp_process_loss(sk, flag, is_dupack, rexmit); + tcp_process_loss(sk, flag, num_dupack, rexmit); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) @@ -2839,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - if (is_dupack) - tcp_add_reno_sack(sk); + tcp_add_reno_sack(sk, num_dupack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -3562,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; - bool is_dupack = false; + int num_dupack = 0; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; @@ -3614,7 +3617,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == + FLAG_SND_UNA_ADVANCED) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. @@ -3672,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_set_xmit_timer(sk); if (tcp_ack_is_dubious(sk, flag)) { - is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { + num_dupack = 1; + /* Consider if pure acks were aggregated in tcp_add_backlog() */ + if (!(flag & FLAG_DATA)) + num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); + } + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); } @@ -3691,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); } @@ -3716,7 +3725,7 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); tcp_xmit_recovery(sk, rexmit); @@ -4606,13 +4615,12 @@ end: } } -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, - bool *fragstolen) +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, + bool *fragstolen) { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); - __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; @@ -4663,7 +4671,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; - if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { + if (tcp_queue_rcv(sk, skb, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } @@ -4723,7 +4731,7 @@ queue_and_out: goto drop; } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); + eaten = tcp_queue_rcv(sk, skb, &fragstolen); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -5599,8 +5607,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); + __skb_pull(skb, tcp_header_len); + eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index de47038afdf0..efc6fef692ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err); * */ -void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); @@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) inet_iif(icmp_skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = ntohl(th->seq); - if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq, - type == ICMP_PARAMETERPROB || - type == ICMP_TIME_EXCEEDED || - (type == ICMP_DEST_UNREACH && - (code == ICMP_NET_UNREACH || - code == ICMP_HOST_UNREACH))); + if (sk->sk_state == TCP_NEW_SYN_RECV) { + tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); + return 0; + } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -541,7 +542,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); skb = tcp_rtx_queue_head(sk); - BUG_ON(!skb); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); @@ -613,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) out: bh_unlock_sock(sk); sock_put(sk); + return 0; } void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) @@ -969,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) * We need to maintain these in the sk structure. */ +struct static_key tcp_md5_needed __read_mostly; +EXPORT_SYMBOL(tcp_md5_needed); + /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, - const union tcp_md5_addr *addr, - int family) +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, + const union tcp_md5_addr *addr, + int family) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1010,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, } return best_match; } -EXPORT_SYMBOL(tcp_md5_do_lookup); +EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, @@ -1618,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; - - /* Only socket owner can try to collapse/prune rx queues - * to reduce memory overhead, so add a little headroom here. - * Few sockets backlog are possibly concurrently non empty. - */ - limit += 64*1024; + struct skb_shared_info *shinfo; + const struct tcphdr *th; + struct tcphdr *thtail; + struct sk_buff *tail; + unsigned int hdrlen; + bool fragstolen; + u32 gso_segs; + int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. @@ -1633,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ skb_condense(skb); + skb_dst_drop(skb); + + if (unlikely(tcp_checksum_complete(skb))) { + bh_unlock_sock(sk); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + return true; + } + + /* Attempt coalescing to last skb in backlog, even if we are + * above the limits. + * This is okay because skb capacity is limited to MAX_SKB_FRAGS. + */ + th = (const struct tcphdr *)skb->data; + hdrlen = th->doff * 4; + shinfo = skb_shinfo(skb); + + if (!shinfo->gso_size) + shinfo->gso_size = skb->len - hdrlen; + + if (!shinfo->gso_segs) + shinfo->gso_segs = 1; + + tail = sk->sk_backlog.tail; + if (!tail) + goto no_coalesce; + thtail = (struct tcphdr *)tail->data; + + if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || + TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || + ((TCP_SKB_CB(tail)->tcp_flags | + TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) || + ((TCP_SKB_CB(tail)->tcp_flags ^ + TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || +#ifdef CONFIG_TLS_DEVICE + tail->decrypted != skb->decrypted || +#endif + thtail->doff != th->doff || + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + goto no_coalesce; + + __skb_pull(skb, hdrlen); + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { + thtail->window = th->window; + + TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; + + if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) + TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + + TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + TCP_SKB_CB(tail)->has_rxtstamp = true; + tail->tstamp = skb->tstamp; + skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; + } + + /* Not as strict as GRO. We only need to carry mss max value */ + skb_shinfo(tail)->gso_size = max(shinfo->gso_size, + skb_shinfo(tail)->gso_size); + + gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; + skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + + sk->sk_backlog.len += delta; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPBACKLOGCOALESCE); + kfree_skb_partial(skb, fragstolen); + return false; + } + __skb_push(skb, hdrlen); + +no_coalesce: + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. + */ + limit += 64*1024; + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); @@ -2573,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; - /* Default TSQ limit of four TSO segments */ - net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* Default TSQ limit of 16 TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d1676d8a6ed7..c31badfee806 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - (*rcv_wscale) = 0; + *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); - while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { - space >>= 1; - (*rcv_wscale)++; - } + *rcv_wscale = clamp_t(int, ilog2(space) - 15, + 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); @@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + if (static_key_false(&tcp_md5_needed) && + rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; @@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + if (static_key_false(&tcp_md5_needed) && + rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; @@ -1909,18 +1909,22 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); - u32 age, send_win, cong_win, limit, in_flight; + u32 send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; + s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer - * only if the last write was recent. + * only if the last write was recent (1 ms). + * Note that tp->tcp_wstamp_ns can be in the future if we have + * packets waiting in a qdisc or device for EDT delivery. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) + delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; + if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1943,6 +1947,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; + /* If this packet won't get more data, do not wait. */ + if (TCP_SKB_CB(skb)->eor) + goto send_now; + win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -1967,9 +1975,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head)); + delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ - if (age < (tp->srtt_us >> 4)) + if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; /* Ok, it looks like it is advisable to defer. @@ -2228,8 +2236,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit = max_t(unsigned long, 2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); - limit = min_t(unsigned long, limit, - sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); + if (sk->sk_pacing_status == SK_PACING_NONE) + limit = min_t(unsigned long, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c0630013c1ae..33bf8e9c8663 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -149,34 +149,40 @@ drop: } #endif -static void tunnel4_err(struct sk_buff *skb, u32 info) +static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) -static void tunnel64_err(struct sk_buff *skb, u32 info) +static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) -static void tunnelmpls4_err(struct sk_buff *skb, u32 info) +static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1976fddb9e00..aff2a8e99e01 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -105,6 +105,7 @@ #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> +#include <net/ip_tunnels.h> #include <net/route.h> #include <net/checksum.h> #include <net/xfrm.h> @@ -115,6 +116,7 @@ #include "udp_impl.h" #include <net/sock_reuseport.h> #include <net/addrconf.h> +#include <net/udp_tunnel.h> struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -371,6 +373,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -398,15 +401,11 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if (!dev_match) + return -1; + score += 4; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -585,6 +584,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, return true; } +DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +void udp_encap_enable(void) +{ + static_branch_inc(&udp_encap_needed_key); +} +EXPORT_SYMBOL(udp_encap_enable); + +/* Handler for tunnels with arbitrary destination ports: no socket lookup, go + * through error handlers in encapsulations looking for a match. + */ +static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) +{ + int i; + + for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { + int (*handler)(struct sk_buff *skb, u32 info); + + if (!iptun_encaps[i]) + continue; + handler = rcu_dereference(iptun_encaps[i]->err_handler); + if (handler && !handler(skb, info)) + return 0; + } + + return -ENOENT; +} + +/* Try to match ICMP errors to UDP tunnels by looking up a socket without + * reversing source and destination port: this will match tunnels that force the + * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that + * lwtunnels might actually break this assumption by being configured with + * different destination ports on endpoints, in this case we won't be able to + * trace ICMP messages back to them. + * + * If this doesn't match any socket, probe tunnels with arbitrary destination + * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port + * we've sent packets to won't necessarily match the local destination port. + * + * Then ask the tunnel implementation to match the error against a valid + * association. + * + * Return an error if we can't find a match, the socket if we need further + * processing, zero otherwise. + */ +static struct sock *__udp4_lib_err_encap(struct net *net, + const struct iphdr *iph, + struct udphdr *uh, + struct udp_table *udptable, + struct sk_buff *skb, u32 info) +{ + int network_offset, transport_offset; + struct sock *sk; + + network_offset = skb_network_offset(skb); + transport_offset = skb_transport_offset(skb); + + /* Network header needs to point to the outer IPv4 header inside ICMP */ + skb_reset_network_header(skb); + + /* Transport header needs to point to the UDP header */ + skb_set_transport_header(skb, iph->ihl << 2); + + sk = __udp4_lib_lookup(net, iph->daddr, uh->source, + iph->saddr, uh->dest, skb->dev->ifindex, 0, + udptable, NULL); + if (sk) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); + struct udp_sock *up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (!lookup || lookup(sk, skb)) + sk = NULL; + } + + if (!sk) + sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); + + skb_set_transport_header(skb, transport_offset); + skb_set_network_header(skb, network_offset); + + return sk; +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -596,13 +678,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, * to find the appropriate port. */ -void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) +int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + bool tunnel = false; struct sock *sk; int harderr; int err; @@ -612,8 +695,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); if (!sk) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; /* No socket for error */ + /* No socket for error: try tunnels before discarding */ + sk = ERR_PTR(-ENOENT); + if (static_branch_unlikely(&udp_encap_needed_key)) { + sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb, + info); + if (!sk) + return 0; + } + + if (IS_ERR(sk)) { + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); + return PTR_ERR(sk); + } + + tunnel = true; } err = 0; @@ -656,6 +752,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ + if (tunnel) { + /* ...not for tunnels though: we don't have a sending socket */ + goto out; + } if (!inet->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; @@ -665,12 +765,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - return; + return 0; } -void udp_err(struct sk_buff *skb, u32 info) +int udp_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, &udp_table); + return __udp4_lib_err(skb, info, &udp_table); } /* @@ -1713,6 +1813,10 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } + + if (udp_sk(sk)->gro_enabled) + udp_cmsg_recv(msg, sk, skb); + if (inet->cmsg_flags) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); @@ -1889,13 +1993,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); -void udp_encap_enable(void) -{ - static_branch_enable(&udp_encap_needed_key); -} -EXPORT_SYMBOL(udp_encap_enable); - /* returns: * -1: error * 0: success @@ -1904,7 +2001,7 @@ EXPORT_SYMBOL(udp_encap_enable); * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -2007,6 +2104,27 @@ drop: return -1; } +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *next, *segs; + int ret; + + if (likely(!udp_unexpected_gso(sk, skb))) + return udp_queue_rcv_one_skb(sk, skb); + + BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); + __skb_push(skb, -skb_mac_offset(skb)); + segs = udp_rcv_segment(sk, skb, true); + for (skb = segs; skb; skb = next) { + next = skb->next; + __skb_pull(skb, skb_transport_offset(skb)); + ret = udp_queue_rcv_one_skb(sk, skb); + if (ret > 0) + ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); + } + return 0; +} + /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ @@ -2398,11 +2516,15 @@ void udp_destroy_sock(struct sock *sk) bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); - if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { - void (*encap_destroy)(struct sock *sk); - encap_destroy = READ_ONCE(up->encap_destroy); - if (encap_destroy) - encap_destroy(sk); + if (static_branch_unlikely(&udp_encap_needed_key)) { + if (up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = READ_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + if (up->encap_enabled) + static_branch_dec(&udp_encap_needed_key); } } @@ -2447,7 +2569,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; - udp_encap_enable(); + lock_sock(sk); + udp_tunnel_encap_enable(sk->sk_socket); + release_sock(sk); break; default: err = -ENOPROTOOPT; @@ -2469,6 +2593,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, up->gso_size = val; break; + case UDP_GRO: + lock_sock(sk); + if (valbool) + udp_tunnel_encap_enable(sk->sk_socket); + up->gro_enabled = valbool; + release_sock(sk); + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index e7d18b140287..322672655419 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -7,7 +7,7 @@ #include <net/inet_common.h> int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 802f2bc00d69..0646d61f4fa8 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,6 +343,54 @@ out: return segs; } +#define UDP_GRO_CNT_MAX 64 +static struct sk_buff *udp_gro_receive_segment(struct list_head *head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + struct sk_buff *pp = NULL; + struct udphdr *uh2; + struct sk_buff *p; + + /* requires non zero csum, for symmetry with GSO */ + if (!uh->check) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + /* pull encapsulating udp header */ + skb_gro_pull(skb, sizeof(struct udphdr)); + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + + list_for_each_entry(p, head, list) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = udp_hdr(p); + + /* Match ports only, as csum is always non zero */ + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* Terminate the flow on len mismatch or if it grow "too much". + * Under small packet flood GRO count could elsewhere grow a lot + * leading to execessive truesize values + */ + if (!skb_gro_receive(p, skb) && + NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) + pp = p; + else if (uh->len != uh2->len) + pp = p; + + return pp; + } + + /* mismatch, but we never need to flush */ + return NULL; +} + struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, udp_lookup_t lookup) { @@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, int flush = 1; struct sock *sk; + rcu_read_lock(); + sk = (*lookup)(skb, uh->source, uh->dest); + if (!sk) + goto out_unlock; + + if (udp_sk(sk)->gro_enabled) { + pp = call_gro_receive(udp_gro_receive_segment, head, skb); + rcu_read_unlock(); + return pp; + } + if (NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid)) - goto out; + !NAPI_GRO_CB(skb)->csum_valid) || + !udp_sk(sk)->gro_receive) + goto out_unlock; /* mark that this skb passed once through the tunnel gro layer */ NAPI_GRO_CB(skb)->encap_mark = 1; - rcu_read_lock(); - sk = (*lookup)(skb, uh->source, uh->dest); - - if (sk && udp_sk(sk)->gro_receive) - goto unflush; - goto out_unlock; - -unflush: flush = 0; list_for_each_entry(p, head, list) { @@ -394,7 +446,6 @@ unflush: out_unlock: rcu_read_unlock(); -out: skb_gro_flush_final(skb, pp, flush); return pp; } @@ -427,6 +478,19 @@ flush: return NULL; } +static int udp_gro_complete_segment(struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + + skb->csum_start = (unsigned char *)uh - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; + return 0; +} + int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup) { @@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, uh->len = newlen; - /* Set encapsulation before calling into inner gro_complete() functions - * to make them set up the inner offsets. - */ - skb->encapsulation = 1; - rcu_read_lock(); sk = (*lookup)(skb, uh->source, uh->dest); - if (sk && udp_sk(sk)->gro_complete) + if (sk && udp_sk(sk)->gro_enabled) { + err = udp_gro_complete_segment(skb); + } else if (sk && udp_sk(sk)->gro_complete) { + skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM + : SKB_GSO_UDP_TUNNEL; + + /* Set encapsulation before calling into inner gro_complete() + * functions to make them set up the inner offsets. + */ + skb->encapsulation = 1; err = udp_sk(sk)->gro_complete(sk, skb, nhoff + sizeof(struct udphdr)); + } rcu_read_unlock(); if (skb->remcsum_offload) @@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (uh->check) { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + if (uh->check) uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); - } else { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; - } return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb); } diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 6539ff15e9a3..be8b5b2157d8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; + if (cfg->bind_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, cfg->bind_ifindex); + if (!dev) { + err = -ENODEV; + goto error; + } + + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, + dev->name, strlen(dev->name) + 1); + dev_put(dev); + + if (err < 0) + goto error; + } + udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -68,6 +85,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 8545457752fb..39c7f17d916f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb) return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } -static void udplite_err(struct sk_buff *skb, u32 info) +static int udplite_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, &udplite_table); + return __udp4_lib_err(skb, info, &udplite_table); } static const struct net_protocol udplite_protocol = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8dd0e6ab8606..35c54865dc42 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +static int xfrm4_esp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(esp4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static int xfrm4_ah_rcv(struct sk_buff *skb) @@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +static int xfrm4_ah_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ah4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static int xfrm4_ipcomp_rcv(struct sk_buff *skb) @@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ipcomp4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static const struct net_protocol esp4_protocol = { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 045597b9a7c0..521e471f1cf9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2820,7 +2820,7 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) dev = __dev_get_by_name(net, p.name); if (!dev) goto err_exit; - err = dev_open(dev); + err = dev_open(dev, NULL); } } #endif diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 94999058e110..cca3b3603c42 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -433,7 +433,6 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { - unsigned int hash = inet6_acaddr_hash(net, addr); struct net_device *nh_dev; struct ifacaddr6 *aca; bool found = false; @@ -441,7 +440,9 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); - else + else { + unsigned int hash = inet6_acaddr_hash(net, addr); + hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], aca_addr_lst) { nh_dev = fib6_info_nh_dev(aca->aca_rt); @@ -452,6 +453,7 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, break; } } + } rcu_read_unlock(); return found; } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1ede7a16a0be..bde08aa549f3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -772,6 +772,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, case IPV6_2292PKTINFO: { struct net_device *dev = NULL; + int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; @@ -779,12 +780,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + src_idx = src_info->ipi6_ifindex; - if (src_info->ipi6_ifindex) { + if (src_idx) { if (fl6->flowi6_oif && - src_info->ipi6_ifindex != fl6->flowi6_oif) + src_idx != fl6->flowi6_oif && + (sk->sk_bound_dev_if != fl6->flowi6_oif || + !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; - fl6->flowi6_oif = src_info->ipi6_ifindex; + fl6->flowi6_oif = src_idx; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index 6de3c04b0f30..bd675c61deb1 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -4,6 +4,7 @@ #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/udp.h> +#include <linux/icmpv6.h> #include <linux/types.h> #include <linux/kernel.h> #include <net/fou.h> @@ -69,14 +70,87 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } +static int gue6_err_proto_handler(int proto, struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info) +{ + const struct inet6_protocol *ipprot; + + ipprot = rcu_dereference(inet6_protos[proto]); + if (ipprot && ipprot->err_handler) { + if (!ipprot->err_handler(skb, opt, type, code, offset, info)) + return 0; + } + + return -ENOENT; +} + +static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + int transport_offset = skb_transport_offset(skb); + struct guehdr *guehdr; + size_t optlen; + int ret; + + if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); + + switch (((struct iphdr *)guehdr)->version) { + case 4: + ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt, + type, code, offset, info); + goto out; + case 6: + ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt, + type, code, offset, info); + goto out; + default: + ret = -EOPNOTSUPP; + goto out; + } + } + default: /* Undefined version */ + return -EOPNOTSUPP; + } + + if (guehdr->control) + return -ENOENT; + + optlen = guehdr->hlen << 2; + + if (validate_gue_flags(guehdr, optlen)) + return -EINVAL; + + skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); + ret = gue6_err_proto_handler(guehdr->proto_ctype, skb, + opt, type, code, offset, info); + +out: + skb_set_transport_header(skb, transport_offset); + return ret; +} + + static const struct ip6_tnl_encap_ops fou_ip6tun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou6_build_header, + .err_handler = gue6_err, }; static const struct ip6_tnl_encap_ops gue_ip6tun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue6_build_header, + .err_handler = gue6_err, }; static int ip6_tnl_encap_add_fou_ops(void) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index c9c53ade55c3..5d7aa2c2770c 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -84,7 +84,7 @@ static inline struct sock *icmpv6_sk(struct net *net) return net->ipv6.icmp_sk[smp_processor_id()]; } -static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ @@ -100,6 +100,8 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) ping_err(skb, offset, ntohl(info)); + + return 0; } static int icmpv6_rcv(struct sk_buff *skb); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3d7c7460a0c5..5eeeba7181a1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; + bool dev_match; if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { @@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if (!dev_match) + return -1; + score++; - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score++; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 515adbdba1d2..81b69bcee714 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -423,7 +423,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) } -static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); @@ -433,13 +433,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6), offset) < 0) - return; + return -EINVAL; ipv6h = (const struct ipv6hdr *)skb->data; t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, tpi.key, tpi.proto); if (!t) - return; + return -ENOENT; switch (type) { struct ipv6_tlv_tnl_enc_lim *tel; @@ -449,14 +449,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, t->parms.name); if (code != ICMPV6_PORT_UNREACH) break; - return; + return 0; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); break; } - return; + return 0; case ICMPV6_PARAMPROB: teli = 0; if (code == ICMPV6_HDR_FIELD) @@ -472,14 +472,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } - return; + return 0; case ICMPV6_PKT_TOOBIG: ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); - return; + return 0; case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); - return; + return 0; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) @@ -487,6 +487,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, else t->err_count = 1; t->err_time = jiffies; + + return 0; } static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index c1d85830c906..c7ed2b6d5a1d 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -319,28 +319,26 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, /* * Deliver the packet to the host */ - - -static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, + bool have_final) { const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; - int nexthdr; bool raw; - bool have_final = false; /* * Parse extension headers */ - rcu_read_lock(); resubmit: idev = ip6_dst_idev(skb_dst(skb)); - if (!pskb_pull(skb, skb_transport_offset(skb))) - goto discard; nhoff = IP6CB(skb)->nhoff; - nexthdr = skb_network_header(skb)[nhoff]; + if (!have_final) { + if (!pskb_pull(skb, skb_transport_offset(skb))) + goto discard; + nexthdr = skb_network_header(skb)[nhoff]; + } resubmit_final: raw = raw6_local_deliver(skb, nexthdr); @@ -359,6 +357,8 @@ resubmit_final: } } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; + int sdif = inet6_sdif(skb); + struct net_device *dev; /* Only do this once for first final protocol */ have_final = true; @@ -371,9 +371,19 @@ resubmit_final: skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); + + /* skb->dev passed may be master dev for vrfs. */ + if (sdif) { + dev = dev_get_by_index_rcu(net, sdif); + if (!dev) + goto discard; + } else { + dev = skb->dev; + } + if (ipv6_addr_is_multicast(&hdr->daddr) && - !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, - &hdr->saddr) && + !ipv6_chk_mcast_addr(dev, &hdr->daddr, + &hdr->saddr) && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } @@ -411,13 +421,19 @@ resubmit_final: consume_skb(skb); } } - rcu_read_unlock(); - return 0; + return; discard: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); - rcu_read_unlock(); kfree_skb(skb); +} + +static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + rcu_read_lock(); + ip6_protocol_deliver_rcu(net, skb, 0, false); + rcu_read_unlock(); + return 0; } @@ -432,15 +448,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; + struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); + /* skb->dev passed may be master dev for vrfs. */ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + hdr = ipv6_hdr(skb); - deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); + deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL); + if (sdif) + rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index c7e495f12011..70f525c33cb6 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -229,14 +229,21 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head, * XXX skbs on the gro_list have all been parsed and pulled * already so we don't need to compare nlen * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) - * memcmp() alone below is suffcient, right? + * memcmp() alone below is sufficient, right? */ if ((first_word & htonl(0xF00FFFFF)) || - memcmp(&iph->nexthdr, &iph2->nexthdr, - nlen - offsetof(struct ipv6hdr, nexthdr))) { + !ipv6_addr_equal(&iph->saddr, &iph2->saddr) || + !ipv6_addr_equal(&iph->daddr, &iph2->daddr) || + *(u16 *)&iph->nexthdr != *(u16 *)&iph2->nexthdr) { +not_same_flow: NAPI_GRO_CB(p)->same_flow = 0; continue; } + if (unlikely(nlen > sizeof(struct ipv6hdr))) { + if (memcmp(iph + 1, iph2 + 1, + nlen - sizeof(struct ipv6hdr))) + goto not_same_flow; + } /* flush if Traffic Class fields are different */ NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); NAPI_GRO_CB(p)->flush |= flush; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fcd3c66ded16..9d55ee33b7f9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -378,6 +378,13 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); +#ifdef CONFIG_NET_SWITCHDEV + if (skb->offload_l3_fwd_mark) { + consume_skb(skb); + return 0; + } +#endif + return dst_output(net, sk, skb); } @@ -1245,6 +1252,7 @@ static int __ip6_append_data(struct sock *sk, { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; @@ -1257,7 +1265,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged; + bool paged, extra_uref; skb = skb_peek_tail(queue); if (!skb) { @@ -1322,6 +1330,20 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + extra_uref = true; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + } + } + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1440,12 +1462,6 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - /* * Find where to start putting bytes */ @@ -1477,6 +1493,13 @@ alloc_new_skb: exthdrlen = 0; dst_exthdrlen = 0; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1506,7 +1529,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1536,6 +1559,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1548,6 +1575,8 @@ alloc_new_skb: error_efault: err = -EFAULT; error: + if (uarg) + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index b283f293ee4a..3965d5396b0a 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -31,6 +31,22 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; } + if (cfg->bind_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, cfg->bind_ifindex); + if (!dev) { + err = -ENODEV; + goto error; + } + + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, + dev->name, strlen(dev->name) + 1); + dev_put(dev); + + if (err < 0) + goto error; + } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e2ea691e42c6..8c63494400c4 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -655,7 +655,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) return NULL; } - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 381ce38940ae..973e215c3114 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -486,7 +486,7 @@ sticky_done: retv = -EFAULT; break; } - if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5e0efd3954e9..aed7eb5c2123 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif) + if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif)) continue; if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 059f0531f7c1..194bc162866d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2977,7 +2977,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!rt) goto out; - rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len); + rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, + extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); /* Do not leave garbage there. */ @@ -3710,7 +3711,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, if (!f6i) return ERR_PTR(-ENOMEM); - f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0); + f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL); f6i->dst_nocount = true; f6i->dst_host = true; f6i->fib6_protocol = RTPROT_KERNEL; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 03e6b7a2bc53..a3f559162521 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -349,7 +349,7 @@ static void tcp_v6_mtu_reduced(struct sock *sk) } } -static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; @@ -371,17 +371,19 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); - if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq, fatal); + if (sk->sk_state == TCP_NEW_SYN_RECV) { + tcp_req_err(sk, seq, fatal); + return 0; + } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -467,6 +469,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, out: bh_unlock_sock(sk); sock_put(sk); + return 0; } diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index dae25cad05cd..1991dede7367 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -134,24 +134,28 @@ drop: return 0; } -static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } -static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static const struct inet6_protocol tunnel6_protocol = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d2d97d07ef27..09cba4cfe31f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -45,6 +45,7 @@ #include <net/raw.h> #include <net/tcp_states.h> #include <net/ip6_checksum.h> +#include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> @@ -117,6 +118,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -144,15 +146,10 @@ static int compute_score(struct sock *sk, struct net *net, score++; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score++; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); + if (!dev_match) + return -1; + score++; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -329,6 +326,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int err; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; + struct udp_mib *mib; int is_udp4; if (flags & MSG_ERRQUEUE) @@ -352,6 +350,7 @@ try_again: msg->msg_flags |= MSG_TRUNC; is_udp4 = (skb->protocol == htons(ETH_P_IP)); + mib = __UDPX_MIB(sk, is_udp4); /* * If checksum is needed at all, try to do it while copying the @@ -380,24 +379,13 @@ try_again: if (unlikely(err)) { if (!peeked) { atomic_inc(&sk->sk_drops); - if (is_udp4) - UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - is_udplite); - else - UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - is_udplite); + SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); return err; } - if (!peeked) { - if (is_udp4) - UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, - is_udplite); - else - UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, - is_udplite); - } + if (!peeked) + SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS); sock_recv_ts_and_drops(msg, sk, skb); @@ -421,6 +409,9 @@ try_again: *addr_len = sizeof(*sin6); } + if (udp_sk(sk)->gro_enabled) + udp_cmsg_recv(msg, sk, skb); + if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); @@ -443,17 +434,8 @@ try_again: csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { - if (is_udp4) { - UDP_INC_STATS(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP_INC_STATS(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); - } else { - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); - } + SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS); + SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -463,15 +445,106 @@ csum_copy_err: goto try_again; } -void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info, - struct udp_table *udptable) +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +void udpv6_encap_enable(void) +{ + static_branch_inc(&udpv6_encap_needed_key); +} +EXPORT_SYMBOL(udpv6_encap_enable); + +/* Handler for tunnels with arbitrary destination ports: no socket lookup, go + * through error handlers in encapsulations looking for a match. + */ +static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info) +{ + int i; + + for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { + int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info); + + if (!ip6tun_encaps[i]) + continue; + handler = rcu_dereference(ip6tun_encaps[i]->err_handler); + if (handler && !handler(skb, opt, type, code, offset, info)) + return 0; + } + + return -ENOENT; +} + +/* Try to match ICMP errors to UDP tunnels by looking up a socket without + * reversing source and destination port: this will match tunnels that force the + * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that + * lwtunnels might actually break this assumption by being configured with + * different destination ports on endpoints, in this case we won't be able to + * trace ICMP messages back to them. + * + * If this doesn't match any socket, probe tunnels with arbitrary destination + * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port + * we've sent packets to won't necessarily match the local destination port. + * + * Then ask the tunnel implementation to match the error against a valid + * association. + * + * Return an error if we can't find a match, the socket if we need further + * processing, zero otherwise. + */ +static struct sock *__udp6_lib_err_encap(struct net *net, + const struct ipv6hdr *hdr, int offset, + struct udphdr *uh, + struct udp_table *udptable, + struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, __be32 info) +{ + int network_offset, transport_offset; + struct sock *sk; + + network_offset = skb_network_offset(skb); + transport_offset = skb_transport_offset(skb); + + /* Network header needs to point to the outer IPv6 header inside ICMP */ + skb_reset_network_header(skb); + + /* Transport header needs to point to the UDP header */ + skb_set_transport_header(skb, offset); + + sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source, + &hdr->saddr, uh->dest, + inet6_iif(skb), 0, udptable, skb); + if (sk) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); + struct udp_sock *up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (!lookup || lookup(sk, skb)) + sk = NULL; + } + + if (!sk) { + sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code, + offset, info)); + } + + skb_set_transport_header(skb, transport_offset); + skb_set_network_header(skb, network_offset); + + return sk; +} + +int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info, + struct udp_table *udptable) { struct ipv6_pinfo *np; const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct in6_addr *saddr = &hdr->saddr; const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr *)(skb->data+offset); + bool tunnel = false; struct sock *sk; int harderr; int err; @@ -480,9 +553,23 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, skb); if (!sk) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return; + /* No socket for error: try tunnels before discarding */ + sk = ERR_PTR(-ENOENT); + if (static_branch_unlikely(&udpv6_encap_needed_key)) { + sk = __udp6_lib_err_encap(net, hdr, offset, uh, + udptable, skb, + opt, type, code, info); + if (!sk) + return 0; + } + + if (IS_ERR(sk)) { + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); + return PTR_ERR(sk); + } + + tunnel = true; } harderr = icmpv6_err_convert(type, code, &err); @@ -496,10 +583,19 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, harderr = 1; } if (type == NDISC_REDIRECT) { - ip6_sk_redirect(skb, sk); + if (tunnel) { + ip6_redirect(skb, sock_net(sk), inet6_iif(skb), + sk->sk_mark, sk->sk_uid); + } else { + ip6_sk_redirect(skb, sk); + } goto out; } + /* Tunnels don't have an application socket: don't pass errors back */ + if (tunnel) + goto out; + if (!np->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; @@ -510,7 +606,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk->sk_err = err; sk->sk_error_report(sk); out: - return; + return 0; } static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -541,21 +637,14 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static __inline__ void udpv6_err(struct sk_buff *skb, - struct inet6_skb_parm *opt, u8 type, - u8 code, int offset, __be32 info) +static __inline__ int udpv6_err(struct sk_buff *skb, + struct inet6_skb_parm *opt, u8 type, + u8 code, int offset, __be32 info) { - __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); + return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); -void udpv6_encap_enable(void) -{ - static_branch_enable(&udpv6_encap_needed_key); -} -EXPORT_SYMBOL(udpv6_encap_enable); - -static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -638,10 +727,32 @@ drop: return -1; } +static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *next, *segs; + int ret; + + if (likely(!udp_unexpected_gso(sk, skb))) + return udpv6_queue_rcv_one_skb(sk, skb); + + __skb_push(skb, -skb_mac_offset(skb)); + segs = udp_rcv_segment(sk, skb, false); + for (skb = segs; skb; skb = next) { + next = skb->next; + __skb_pull(skb, skb_transport_offset(skb)); + + ret = udpv6_queue_rcv_one_skb(sk, skb); + if (ret > 0) + ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret, + true); + } + return 0; +} + static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -653,7 +764,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; @@ -687,6 +798,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); + int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -701,7 +813,8 @@ start_lookup: sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, + hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it. @@ -1458,11 +1571,15 @@ void udpv6_destroy_sock(struct sock *sk) udp_v6_flush_pending_frames(sk); release_sock(sk); - if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { - void (*encap_destroy)(struct sock *sk); - encap_destroy = READ_ONCE(up->encap_destroy); - if (encap_destroy) - encap_destroy(sk); + if (static_branch_unlikely(&udpv6_encap_needed_key)) { + if (up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = READ_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + if (up->encap_enabled) + static_branch_dec(&udpv6_encap_needed_key); } inet6_destroy_sock(sk); diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 7903e21c178b..5730e6503cb4 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -9,8 +9,8 @@ #include <net/transp_v6.h> int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, - __be32, struct udp_table *); +int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, + __be32, struct udp_table *); int udp_v6_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 1b8e161ac527..828b2457f97b 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (uh->check) { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + if (uh->check) uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, &ipv6h->daddr, 0); - } else { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; - } return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb); } diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 5000ad6878e6..a125aebc29e5 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -20,11 +20,12 @@ static int udplitev6_rcv(struct sk_buff *skb) return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } -static void udplitev6_err(struct sk_buff *skb, +static int udplitev6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table); + return __udp6_lib_err(skb, opt, type, code, offset, info, + &udplite_table); } static const struct inet6_protocol udplitev6_protocol = { diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index b2dc8ce49378..cc979b702c89 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -80,14 +80,16 @@ static int xfrm6_esp_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(esp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static int xfrm6_ah_rcv(struct sk_buff *skb) @@ -107,14 +109,16 @@ static int xfrm6_ah_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ah6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static int xfrm6_ipcomp_rcv(struct sk_buff *skb) @@ -134,14 +138,16 @@ static int xfrm6_ipcomp_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ipcomp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static const struct inet6_protocol esp6_protocol = { diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 0bed4cc20603..78ea5a739d10 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1873,30 +1873,26 @@ static void iucv_callback_txdone(struct iucv_path *path, struct sock *sk = path->private; struct sk_buff *this = NULL; struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q; - struct sk_buff *list_skb = list->next; + struct sk_buff *list_skb; unsigned long flags; bh_lock_sock(sk); - if (!skb_queue_empty(list)) { - spin_lock_irqsave(&list->lock, flags); - while (list_skb != (struct sk_buff *)list) { - if (msg->tag == IUCV_SKB_CB(list_skb)->tag) { - this = list_skb; - break; - } - list_skb = list_skb->next; + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk(list, list_skb) { + if (msg->tag == IUCV_SKB_CB(list_skb)->tag) { + this = list_skb; + break; } - if (this) - __skb_unlink(this, list); - - spin_unlock_irqrestore(&list->lock, flags); + } + if (this) + __skb_unlink(this, list); + spin_unlock_irqrestore(&list->lock, flags); - if (this) { - kfree_skb(this); - /* wake up any process waiting for sending */ - iucv_sock_wake_msglim(sk); - } + if (this) { + kfree_skb(this); + /* wake up any process waiting for sending */ + iucv_sock_wake_msglim(sk); } if (sk->sk_state == IUCV_CLOSING) { @@ -2284,11 +2280,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, list = &iucv->send_skb_q; spin_lock_irqsave(&list->lock, flags); - if (skb_queue_empty(list)) - goto out_unlock; - list_skb = list->next; - nskb = list_skb->next; - while (list_skb != (struct sk_buff *)list) { + skb_queue_walk_safe(list, list_skb, nskb) { if (skb_shinfo(list_skb) == skb_shinfo(skb)) { switch (n) { case TX_NOTIFY_OK: @@ -2321,10 +2313,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, } break; } - list_skb = nskb; - nskb = nskb->next; } -out_unlock: spin_unlock_irqrestore(&list->lock, flags); if (sk->sk_state == IUCV_CLOSING) { diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 8da86ceca33d..309dee76724e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -47,6 +47,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev) EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); /** + * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master + * device + * @net: network namespace for device index lookup + * @ifindex: targeted interface + */ +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, ifindex); + while (dev && !netif_is_l3_master(dev)) + dev = netdev_master_upper_dev_get(dev); + + return dev ? dev->ifindex : 0; +} +EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); + +/** * l3mdev_fib_table - get FIB table id associated with an L3 * master interface * @dev: targeted interface diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 1dae77c54009..87505600dbb2 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -73,10 +73,15 @@ enum { #define NCSI_OEM_MFR_BCM_ID 0x113d /* Broadcom specific OEM Command */ #define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */ +/* Mellanox specific OEM Command */ +#define NCSI_OEM_MLX_CMD_GMA 0x00 /* CMD ID for Get MAC */ +#define NCSI_OEM_MLX_CMD_GMA_PARAM 0x1b /* Parameter for GMA */ /* OEM Command payload lengths*/ #define NCSI_OEM_BCM_CMD_GMA_LEN 12 +#define NCSI_OEM_MLX_CMD_GMA_LEN 8 /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 +#define MLX_MAC_ADDR_OFFSET 8 struct ncsi_channel_version { @@ -222,6 +227,10 @@ struct ncsi_package { unsigned int channel_num; /* Number of channels */ struct list_head channels; /* List of chanels */ struct list_head node; /* Form list of packages */ + + bool multi_channel; /* Enable multiple channels */ + u32 channel_whitelist; /* Channels to configure */ + struct ncsi_channel *preferred_channel; /* Primary channel */ }; struct ncsi_request { @@ -287,16 +296,16 @@ struct ncsi_dev_priv { #define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */ #define NCSI_DEV_HWA 2 /* Enabled HW arbitration */ #define NCSI_DEV_RESHUFFLE 4 +#define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ spinlock_t lock; /* Protect the NCSI device */ #if IS_ENABLED(CONFIG_IPV6) unsigned int inet6_addr_num; /* Number of IPv6 addresses */ #endif + unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ struct ncsi_channel *hot_channel; /* Channel was ever active */ - struct ncsi_package *force_package; /* Force a specific package */ - struct ncsi_channel *force_channel; /* Force a specific channel */ struct ncsi_request requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ #define NCSI_REQ_START_IDX 1 @@ -309,6 +318,9 @@ struct ncsi_dev_priv { struct list_head node; /* Form NCSI device list */ #define NCSI_MAX_VLAN_VIDS 15 struct list_head vlan_vids; /* List of active VLAN IDs */ + + bool multi_package; /* Enable multiple packages */ + u32 package_whitelist; /* Packages to configure */ }; struct ncsi_cmd_arg { @@ -341,6 +353,7 @@ extern spinlock_t ncsi_dev_lock; list_for_each_entry_rcu(nc, &np->channels, node) /* Resources */ +int ncsi_reset_dev(struct ncsi_dev *nd); void ncsi_start_channel_monitor(struct ncsi_channel *nc); void ncsi_stop_channel_monitor(struct ncsi_channel *nc); struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, @@ -361,6 +374,13 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, void ncsi_free_request(struct ncsi_request *nr); struct ncsi_dev *ncsi_find_dev(struct net_device *dev); int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); +bool ncsi_channel_has_link(struct ncsi_channel *channel); +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp, + struct ncsi_channel *channel); +int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp, + struct ncsi_package *np, + struct ncsi_channel *disable, + struct ncsi_channel *enable); /* Packet handlers */ u32 ncsi_calculate_checksum(unsigned char *data, int len); diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 25e483e8278b..26d67e27551f 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -50,13 +50,15 @@ static int ncsi_validate_aen_pkt(struct ncsi_aen_pkt_hdr *h, static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, struct ncsi_aen_pkt_hdr *h) { - struct ncsi_aen_lsc_pkt *lsc; - struct ncsi_channel *nc; + struct ncsi_channel *nc, *tmp; struct ncsi_channel_mode *ncm; - bool chained; - int state; unsigned long old_data, data; + struct ncsi_aen_lsc_pkt *lsc; + struct ncsi_package *np; + bool had_link, has_link; unsigned long flags; + bool chained; + int state; /* Find the NCSI channel */ ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); @@ -73,6 +75,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, ncm->data[2] = data; ncm->data[4] = ntohl(lsc->oem_status); + had_link = !!(old_data & 0x1); + has_link = !!(data & 0x1); + netdev_dbg(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n", nc->id, data & 0x1 ? "up" : "down"); @@ -80,22 +85,60 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, state = nc->state; spin_unlock_irqrestore(&nc->lock, flags); - if (!((old_data ^ data) & 0x1) || chained) - return 0; - if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) && - !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1))) + if (state == NCSI_CHANNEL_INACTIVE) + netdev_warn(ndp->ndev.dev, + "NCSI: Inactive channel %u received AEN!\n", + nc->id); + + if ((had_link == has_link) || chained) return 0; - if (!(ndp->flags & NCSI_DEV_HWA) && - state == NCSI_CHANNEL_ACTIVE) - ndp->flags |= NCSI_DEV_RESHUFFLE; + if (!ndp->multi_package && !nc->package->multi_channel) { + if (had_link) { + ndp->flags |= NCSI_DEV_RESHUFFLE; + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + return ncsi_process_next_channel(ndp); + } + /* Configured channel came up */ + return 0; + } - ncsi_stop_channel_monitor(nc); - spin_lock_irqsave(&ndp->lock, flags); - list_add_tail_rcu(&nc->link, &ndp->channel_queue); - spin_unlock_irqrestore(&ndp->lock, flags); + if (had_link) { + ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; + if (ncsi_channel_is_last(ndp, nc)) { + /* No channels left, reconfigure */ + return ncsi_reset_dev(&ndp->ndev); + } else if (ncm->enable) { + /* Need to failover Tx channel */ + ncsi_update_tx_channel(ndp, nc->package, nc, NULL); + } + } else if (has_link && nc->package->preferred_channel == nc) { + /* Return Tx to preferred channel */ + ncsi_update_tx_channel(ndp, nc->package, NULL, nc); + } else if (has_link) { + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, tmp) { + /* Enable Tx on this channel if the current Tx + * channel is down. + */ + ncm = &tmp->modes[NCSI_MODE_TX_ENABLE]; + if (ncm->enable && + !ncsi_channel_has_link(tmp)) { + ncsi_update_tx_channel(ndp, nc->package, + tmp, nc); + break; + } + } + } + } - return ncsi_process_next_channel(ndp); + /* Leave configured channels active in a multi-channel scenario so + * AEN events are still received. + */ + return 0; } static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp, diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index bfc43b28c7a6..31359d5e14ad 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -28,6 +28,29 @@ LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); +bool ncsi_channel_has_link(struct ncsi_channel *channel) +{ + return !!(channel->modes[NCSI_MODE_LINK].data[2] & 0x1); +} + +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp, + struct ncsi_channel *channel) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + + NCSI_FOR_EACH_PACKAGE(ndp, np) + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (nc == channel) + continue; + if (nc->state == NCSI_CHANNEL_ACTIVE && + ncsi_channel_has_link(nc)) + return false; + } + + return true; +} + static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) { struct ncsi_dev *nd = &ndp->ndev; @@ -52,7 +75,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) continue; } - if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { + if (ncsi_channel_has_link(nc)) { spin_unlock_irqrestore(&nc->lock, flags); nd->link_up = 1; goto report; @@ -113,10 +136,8 @@ static void ncsi_channel_monitor(struct timer_list *t) default: netdev_err(ndp->ndev.dev, "NCSI Channel %d timed out!\n", nc->id); - if (!(ndp->flags & NCSI_DEV_HWA)) { - ncsi_report_link(ndp, true); - ndp->flags |= NCSI_DEV_RESHUFFLE; - } + ncsi_report_link(ndp, true); + ndp->flags |= NCSI_DEV_RESHUFFLE; ncsi_stop_channel_monitor(nc); @@ -269,6 +290,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, np->ndp = ndp; spin_lock_init(&np->lock); INIT_LIST_HEAD(&np->channels); + np->channel_whitelist = UINT_MAX; spin_lock_irqsave(&ndp->lock, flags); tmp = ncsi_find_package(ndp, id); @@ -442,12 +464,14 @@ static void ncsi_request_timeout(struct timer_list *t) static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; - struct ncsi_package *np = ndp->active_package; - struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_package *np; + struct ncsi_channel *nc, *tmp; struct ncsi_cmd_arg nca; unsigned long flags; int ret; + np = ndp->active_package; + nc = ndp->active_channel; nca.ndp = ndp; nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { @@ -523,6 +547,15 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) if (ret) goto error; + NCSI_FOR_EACH_CHANNEL(np, tmp) { + /* If there is another channel active on this package + * do not deselect the package. + */ + if (tmp != nc && tmp->state == NCSI_CHANNEL_ACTIVE) { + nd->state = ncsi_dev_state_suspend_done; + break; + } + } break; case ncsi_dev_state_suspend_deselect: ndp->pending_req_num = 1; @@ -541,8 +574,10 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); - ncsi_process_next_channel(ndp); - + if (ndp->flags & NCSI_DEV_RESET) + ncsi_reset_dev(nd); + else + ncsi_process_next_channel(ndp); break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n", @@ -675,12 +710,38 @@ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca) +{ + union { + u8 data_u8[NCSI_OEM_MLX_CMD_GMA_LEN]; + u32 data_u32[NCSI_OEM_MLX_CMD_GMA_LEN / sizeof(u32)]; + } u; + int ret = 0; + + nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN; + + memset(&u, 0, sizeof(u)); + u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID); + u.data_u8[5] = NCSI_OEM_MLX_CMD_GMA; + u.data_u8[6] = NCSI_OEM_MLX_CMD_GMA_PARAM; + + nca->data = u.data_u8; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during configure\n", + nca->type); + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; int (*handler)(struct ncsi_cmd_arg *nca); } ncsi_oem_gma_handlers[] = { - { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm } + { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }, + { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx } }; static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) @@ -717,13 +778,144 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) #endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ +/* Determine if a given channel from the channel_queue should be used for Tx */ +static bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp, + struct ncsi_channel *nc) +{ + struct ncsi_channel_mode *ncm; + struct ncsi_channel *channel; + struct ncsi_package *np; + + /* Check if any other channel has Tx enabled; a channel may have already + * been configured and removed from the channel queue. + */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (!ndp->multi_package && np != nc->package) + continue; + NCSI_FOR_EACH_CHANNEL(np, channel) { + ncm = &channel->modes[NCSI_MODE_TX_ENABLE]; + if (ncm->enable) + return false; + } + } + + /* This channel is the preferred channel and has link */ + list_for_each_entry_rcu(channel, &ndp->channel_queue, link) { + np = channel->package; + if (np->preferred_channel && + ncsi_channel_has_link(np->preferred_channel)) { + return np->preferred_channel == nc; + } + } + + /* This channel has link */ + if (ncsi_channel_has_link(nc)) + return true; + + list_for_each_entry_rcu(channel, &ndp->channel_queue, link) + if (ncsi_channel_has_link(channel)) + return false; + + /* No other channel has link; default to this one */ + return true; +} + +/* Change the active Tx channel in a multi-channel setup */ +int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp, + struct ncsi_package *package, + struct ncsi_channel *disable, + struct ncsi_channel *enable) +{ + struct ncsi_cmd_arg nca; + struct ncsi_channel *nc; + struct ncsi_package *np; + int ret = 0; + + if (!package->multi_channel && !ndp->multi_package) + netdev_warn(ndp->ndev.dev, + "NCSI: Trying to update Tx channel in single-channel mode\n"); + nca.ndp = ndp; + nca.req_flags = 0; + + /* Find current channel with Tx enabled */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (disable) + break; + if (!ndp->multi_package && np != package) + continue; + + NCSI_FOR_EACH_CHANNEL(np, nc) + if (nc->modes[NCSI_MODE_TX_ENABLE].enable) { + disable = nc; + break; + } + } + + /* Find a suitable channel for Tx */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (enable) + break; + if (!ndp->multi_package && np != package) + continue; + if (!(ndp->package_whitelist & (0x1 << np->id))) + continue; + + if (np->preferred_channel && + ncsi_channel_has_link(np->preferred_channel)) { + enable = np->preferred_channel; + break; + } + + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!(np->channel_whitelist & 0x1 << nc->id)) + continue; + if (nc->state != NCSI_CHANNEL_ACTIVE) + continue; + if (ncsi_channel_has_link(nc)) { + enable = nc; + break; + } + } + } + + if (disable == enable) + return -1; + + if (!enable) + return -1; + + if (disable) { + nca.channel = disable->id; + nca.package = disable->package->id; + nca.type = NCSI_PKT_CMD_DCNT; + ret = ncsi_xmit_cmd(&nca); + if (ret) + netdev_err(ndp->ndev.dev, + "Error %d sending DCNT\n", + ret); + } + + netdev_info(ndp->ndev.dev, "NCSI: channel %u enables Tx\n", enable->id); + + nca.channel = enable->id; + nca.package = enable->package->id; + nca.type = NCSI_PKT_CMD_ECNT; + ret = ncsi_xmit_cmd(&nca); + if (ret) + netdev_err(ndp->ndev.dev, + "Error %d sending ECNT\n", + ret); + + return ret; +} + static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_dev *nd = &ndp->ndev; - struct net_device *dev = nd->dev; struct ncsi_package *np = ndp->active_package; struct ncsi_channel *nc = ndp->active_channel; struct ncsi_channel *hot_nc = NULL; + struct ncsi_dev *nd = &ndp->ndev; + struct net_device *dev = nd->dev; struct ncsi_cmd_arg nca; unsigned char index; unsigned long flags; @@ -845,20 +1037,29 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else if (nd->state == ncsi_dev_state_config_ebf) { nca.type = NCSI_PKT_CMD_EBF; nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; - nd->state = ncsi_dev_state_config_ecnt; + if (ncsi_channel_is_tx(ndp, nc)) + nd->state = ncsi_dev_state_config_ecnt; + else + nd->state = ncsi_dev_state_config_ec; #if IS_ENABLED(CONFIG_IPV6) if (ndp->inet6_addr_num > 0 && (nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) nd->state = ncsi_dev_state_config_egmf; - else - nd->state = ncsi_dev_state_config_ecnt; } else if (nd->state == ncsi_dev_state_config_egmf) { nca.type = NCSI_PKT_CMD_EGMF; nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - nd->state = ncsi_dev_state_config_ecnt; + if (ncsi_channel_is_tx(ndp, nc)) + nd->state = ncsi_dev_state_config_ecnt; + else + nd->state = ncsi_dev_state_config_ec; #endif /* CONFIG_IPV6 */ } else if (nd->state == ncsi_dev_state_config_ecnt) { + if (np->preferred_channel && + nc != np->preferred_channel) + netdev_info(ndp->ndev.dev, + "NCSI: Tx failed over to channel %u\n", + nc->id); nca.type = NCSI_PKT_CMD_ECNT; nd->state = ncsi_dev_state_config_ec; } else if (nd->state == ncsi_dev_state_config_ec) { @@ -889,6 +1090,16 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) netdev_dbg(ndp->ndev.dev, "NCSI: channel %u config done\n", nc->id); spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_ACTIVE; + + if (ndp->flags & NCSI_DEV_RESET) { + /* A reset event happened during config, start it now */ + nc->reconfigure_needed = false; + spin_unlock_irqrestore(&nc->lock, flags); + ncsi_reset_dev(nd); + break; + } + if (nc->reconfigure_needed) { /* This channel's configuration has been updated * part-way during the config state - start the @@ -909,10 +1120,8 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { hot_nc = nc; - nc->state = NCSI_CHANNEL_ACTIVE; } else { hot_nc = NULL; - nc->state = NCSI_CHANNEL_INACTIVE; netdev_dbg(ndp->ndev.dev, "NCSI: channel %u link down after config\n", nc->id); @@ -940,43 +1149,35 @@ error: static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_package *np, *force_package; - struct ncsi_channel *nc, *found, *hot_nc, *force_channel; + struct ncsi_channel *nc, *found, *hot_nc; struct ncsi_channel_mode *ncm; - unsigned long flags; + unsigned long flags, cflags; + struct ncsi_package *np; + bool with_link; spin_lock_irqsave(&ndp->lock, flags); hot_nc = ndp->hot_channel; - force_channel = ndp->force_channel; - force_package = ndp->force_package; spin_unlock_irqrestore(&ndp->lock, flags); - /* Force a specific channel whether or not it has link if we have been - * configured to do so - */ - if (force_package && force_channel) { - found = force_channel; - ncm = &found->modes[NCSI_MODE_LINK]; - if (!(ncm->data[2] & 0x1)) - netdev_info(ndp->ndev.dev, - "NCSI: Channel %u forced, but it is link down\n", - found->id); - goto out; - } - - /* The search is done once an inactive channel with up - * link is found. + /* By default the search is done once an inactive channel with up + * link is found, unless a preferred channel is set. + * If multi_package or multi_channel are configured all channels in the + * whitelist are added to the channel queue. */ found = NULL; + with_link = false; NCSI_FOR_EACH_PACKAGE(ndp, np) { - if (ndp->force_package && np != ndp->force_package) + if (!(ndp->package_whitelist & (0x1 << np->id))) continue; NCSI_FOR_EACH_CHANNEL(np, nc) { - spin_lock_irqsave(&nc->lock, flags); + if (!(np->channel_whitelist & (0x1 << nc->id))) + continue; + + spin_lock_irqsave(&nc->lock, cflags); if (!list_empty(&nc->link) || nc->state != NCSI_CHANNEL_INACTIVE) { - spin_unlock_irqrestore(&nc->lock, flags); + spin_unlock_irqrestore(&nc->lock, cflags); continue; } @@ -988,32 +1189,49 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) ncm = &nc->modes[NCSI_MODE_LINK]; if (ncm->data[2] & 0x1) { - spin_unlock_irqrestore(&nc->lock, flags); found = nc; - goto out; + with_link = true; } - spin_unlock_irqrestore(&nc->lock, flags); + /* If multi_channel is enabled configure all valid + * channels whether or not they currently have link + * so they will have AENs enabled. + */ + if (with_link || np->multi_channel) { + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, + &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + netdev_dbg(ndp->ndev.dev, + "NCSI: Channel %u added to queue (link %s)\n", + nc->id, + ncm->data[2] & 0x1 ? "up" : "down"); + } + + spin_unlock_irqrestore(&nc->lock, cflags); + + if (with_link && !np->multi_channel) + break; } + if (with_link && !ndp->multi_package) + break; } - if (!found) { + if (list_empty(&ndp->channel_queue) && found) { + netdev_info(ndp->ndev.dev, + "NCSI: No channel with link found, configuring channel %u\n", + found->id); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&found->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + } else if (!found) { netdev_warn(ndp->ndev.dev, - "NCSI: No channel found with link\n"); + "NCSI: No channel found to configure!\n"); ncsi_report_link(ndp, true); return -ENODEV; } - ncm = &found->modes[NCSI_MODE_LINK]; - netdev_dbg(ndp->ndev.dev, - "NCSI: Channel %u added to queue (link %s)\n", - found->id, ncm->data[2] & 0x1 ? "up" : "down"); - -out: - spin_lock_irqsave(&ndp->lock, flags); - list_add_tail_rcu(&found->link, &ndp->channel_queue); - spin_unlock_irqrestore(&ndp->lock, flags); - return ncsi_process_next_channel(ndp); } @@ -1050,35 +1268,6 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) return false; } -static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) -{ - struct ncsi_package *np; - struct ncsi_channel *nc; - unsigned long flags; - - /* Move all available channels to processing queue */ - spin_lock_irqsave(&ndp->lock, flags); - NCSI_FOR_EACH_PACKAGE(ndp, np) { - NCSI_FOR_EACH_CHANNEL(np, nc) { - WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE || - !list_empty(&nc->link)); - ncsi_stop_channel_monitor(nc); - list_add_tail_rcu(&nc->link, &ndp->channel_queue); - } - } - spin_unlock_irqrestore(&ndp->lock, flags); - - /* We can have no channels in extremely case */ - if (list_empty(&ndp->channel_queue)) { - netdev_err(ndp->ndev.dev, - "NCSI: No available channels for HWA\n"); - ncsi_report_link(ndp, false); - return -ENOENT; - } - - return ncsi_process_next_channel(ndp); -} - static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -1110,70 +1299,28 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_package; break; case ncsi_dev_state_probe_package: - ndp->pending_req_num = 16; + ndp->pending_req_num = 1; - /* Select all possible packages */ nca.type = NCSI_PKT_CMD_SP; nca.bytes[0] = 1; + nca.package = ndp->package_probe_id; nca.channel = NCSI_RESERVED_CHANNEL; - for (index = 0; index < 8; index++) { - nca.package = index; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } - - /* Disable all possible packages */ - nca.type = NCSI_PKT_CMD_DP; - for (index = 0; index < 8; index++) { - nca.package = index; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } - + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; nd->state = ncsi_dev_state_probe_channel; break; case ncsi_dev_state_probe_channel: - if (!ndp->active_package) - ndp->active_package = list_first_or_null_rcu( - &ndp->packages, struct ncsi_package, node); - else if (list_is_last(&ndp->active_package->node, - &ndp->packages)) - ndp->active_package = NULL; - else - ndp->active_package = list_next_entry( - ndp->active_package, node); - - /* All available packages and channels are enumerated. The - * enumeration happens for once when the NCSI interface is - * started. So we need continue to start the interface after - * the enumeration. - * - * We have to choose an active channel before configuring it. - * Note that we possibly don't have active channel in extreme - * situation. - */ + ndp->active_package = ncsi_find_package(ndp, + ndp->package_probe_id); if (!ndp->active_package) { - ndp->flags |= NCSI_DEV_PROBED; - if (ncsi_check_hwa(ndp)) - ncsi_enable_hwa(ndp); - else - ncsi_choose_active_channel(ndp); - return; + /* No response */ + nd->state = ncsi_dev_state_probe_dp; + schedule_work(&ndp->work); + break; } - - /* Select the active package */ - ndp->pending_req_num = 1; - nca.type = NCSI_PKT_CMD_SP; - nca.bytes[0] = 1; - nca.package = ndp->active_package->id; - nca.channel = NCSI_RESERVED_CHANNEL; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - nd->state = ncsi_dev_state_probe_cis; + schedule_work(&ndp->work); break; case ncsi_dev_state_probe_cis: ndp->pending_req_num = NCSI_RESERVED_CHANNEL; @@ -1222,22 +1369,35 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) case ncsi_dev_state_probe_dp: ndp->pending_req_num = 1; - /* Deselect the active package */ + /* Deselect the current package */ nca.type = NCSI_PKT_CMD_DP; - nca.package = ndp->active_package->id; + nca.package = ndp->package_probe_id; nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; - /* Scan channels in next package */ - nd->state = ncsi_dev_state_probe_channel; + /* Probe next package */ + ndp->package_probe_id++; + if (ndp->package_probe_id >= 8) { + /* Probe finished */ + ndp->flags |= NCSI_DEV_PROBED; + break; + } + nd->state = ncsi_dev_state_probe_package; + ndp->active_package = NULL; break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n", nd->state); } + if (ndp->flags & NCSI_DEV_PROBED) { + /* Check if all packages have HWA support */ + ncsi_check_hwa(ndp); + ncsi_choose_active_channel(ndp); + } + return; error: netdev_err(ndp->ndev.dev, @@ -1556,6 +1716,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, INIT_LIST_HEAD(&ndp->channel_queue); INIT_LIST_HEAD(&ndp->vlan_vids); INIT_WORK(&ndp->work, ncsi_dev_work); + ndp->package_whitelist = UINT_MAX; /* Initialize private NCSI device */ spin_lock_init(&ndp->lock); @@ -1592,26 +1753,19 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev); int ncsi_start_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); - int ret; if (nd->state != ncsi_dev_state_registered && nd->state != ncsi_dev_state_functional) return -ENOTTY; if (!(ndp->flags & NCSI_DEV_PROBED)) { + ndp->package_probe_id = 0; nd->state = ncsi_dev_state_probe; schedule_work(&ndp->work); return 0; } - if (ndp->flags & NCSI_DEV_HWA) { - netdev_info(ndp->ndev.dev, "NCSI: Enabling HWA mode\n"); - ret = ncsi_enable_hwa(ndp); - } else { - ret = ncsi_choose_active_channel(ndp); - } - - return ret; + return ncsi_reset_dev(nd); } EXPORT_SYMBOL_GPL(ncsi_start_dev); @@ -1624,7 +1778,10 @@ void ncsi_stop_dev(struct ncsi_dev *nd) int old_state; unsigned long flags; - /* Stop the channel monitor and reset channel's state */ + /* Stop the channel monitor on any active channels. Don't reset the + * channel state so we know which were active when ncsi_start_dev() + * is next called. + */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { ncsi_stop_channel_monitor(nc); @@ -1632,7 +1789,6 @@ void ncsi_stop_dev(struct ncsi_dev *nd) spin_lock_irqsave(&nc->lock, flags); chained = !list_empty(&nc->link); old_state = nc->state; - nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); WARN_ON_ONCE(chained || @@ -1645,6 +1801,92 @@ void ncsi_stop_dev(struct ncsi_dev *nd) } EXPORT_SYMBOL_GPL(ncsi_stop_dev); +int ncsi_reset_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_channel *nc, *active, *tmp; + struct ncsi_package *np; + unsigned long flags; + + spin_lock_irqsave(&ndp->lock, flags); + + if (!(ndp->flags & NCSI_DEV_RESET)) { + /* Haven't been called yet, check states */ + switch (nd->state & ncsi_dev_state_major) { + case ncsi_dev_state_registered: + case ncsi_dev_state_probe: + /* Not even probed yet - do nothing */ + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + case ncsi_dev_state_suspend: + case ncsi_dev_state_config: + /* Wait for the channel to finish its suspend/config + * operation; once it finishes it will check for + * NCSI_DEV_RESET and reset the state. + */ + ndp->flags |= NCSI_DEV_RESET; + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + } + } else { + switch (nd->state) { + case ncsi_dev_state_suspend_done: + case ncsi_dev_state_config_done: + case ncsi_dev_state_functional: + /* Ok */ + break; + default: + /* Current reset operation happening */ + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + } + } + + if (!list_empty(&ndp->channel_queue)) { + /* Clear any channel queue we may have interrupted */ + list_for_each_entry_safe(nc, tmp, &ndp->channel_queue, link) + list_del_init(&nc->link); + } + spin_unlock_irqrestore(&ndp->lock, flags); + + active = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + + if (nc->state == NCSI_CHANNEL_ACTIVE) { + active = nc; + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); + ncsi_stop_channel_monitor(nc); + break; + } + + spin_unlock_irqrestore(&nc->lock, flags); + } + if (active) + break; + } + + if (!active) { + /* Done */ + spin_lock_irqsave(&ndp->lock, flags); + ndp->flags &= ~NCSI_DEV_RESET; + spin_unlock_irqrestore(&ndp->lock, flags); + return ncsi_choose_active_channel(ndp); + } + + spin_lock_irqsave(&ndp->lock, flags); + ndp->flags |= NCSI_DEV_RESET; + ndp->active_channel = active; + ndp->active_package = active->package; + spin_unlock_irqrestore(&ndp->lock, flags); + + nd->state = ncsi_dev_state_suspend; + schedule_work(&ndp->work); + return 0; +} + void ncsi_unregister_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index 33314381b4f5..5d782445d2fc 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -30,6 +30,9 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 }, + [NCSI_ATTR_MULTI_FLAG] = { .type = NLA_FLAG }, + [NCSI_ATTR_PACKAGE_MASK] = { .type = NLA_U32 }, + [NCSI_ATTR_CHANNEL_MASK] = { .type = NLA_U32 }, }; static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) @@ -69,7 +72,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb, nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]); if (nc->state == NCSI_CHANNEL_ACTIVE) nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE); - if (ndp->force_channel == nc) + if (nc == nc->package->preferred_channel) nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version); @@ -114,7 +117,7 @@ static int ncsi_write_package_info(struct sk_buff *skb, if (!pnest) return -ENOMEM; nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); - if (ndp->force_package == np) + if ((0x1 << np->id) == ndp->package_whitelist) nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST); if (!cnest) { @@ -290,49 +293,58 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); package = NULL; - spin_lock_irqsave(&ndp->lock, flags); - NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == package_id) package = np; if (!package) { /* The user has set a package that does not exist */ - spin_unlock_irqrestore(&ndp->lock, flags); return -ERANGE; } channel = NULL; - if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { - /* Allow any channel */ - channel_id = NCSI_RESERVED_CHANNEL; - } else { + if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); NCSI_FOR_EACH_CHANNEL(package, nc) - if (nc->id == channel_id) + if (nc->id == channel_id) { channel = nc; + break; + } + if (!channel) { + netdev_info(ndp->ndev.dev, + "NCSI: Channel %u does not exist!\n", + channel_id); + return -ERANGE; + } } - if (channel_id != NCSI_RESERVED_CHANNEL && !channel) { - /* The user has set a channel that does not exist on this - * package - */ - spin_unlock_irqrestore(&ndp->lock, flags); - netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", - channel_id); - return -ERANGE; - } - - ndp->force_package = package; - ndp->force_channel = channel; + spin_lock_irqsave(&ndp->lock, flags); + ndp->package_whitelist = 0x1 << package->id; + ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); - netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n", - package_id, channel_id, - channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : ""); + spin_lock_irqsave(&package->lock, flags); + package->multi_channel = false; + if (channel) { + package->channel_whitelist = 0x1 << channel->id; + package->preferred_channel = channel; + } else { + /* Allow any channel */ + package->channel_whitelist = UINT_MAX; + package->preferred_channel = NULL; + } + spin_unlock_irqrestore(&package->lock, flags); + + if (channel) + netdev_info(ndp->ndev.dev, + "Set package 0x%x, channel 0x%x as preferred\n", + package_id, channel_id); + else + netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n", + package_id); - /* Bounce the NCSI channel to set changes */ - ncsi_stop_dev(&ndp->ndev); - ncsi_start_dev(&ndp->ndev); + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); return 0; } @@ -340,6 +352,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; + struct ncsi_package *np; unsigned long flags; if (!info || !info->attrs) @@ -353,16 +366,24 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) if (!ndp) return -ENODEV; - /* Clear any override */ + /* Reset any whitelists and disable multi mode */ spin_lock_irqsave(&ndp->lock, flags); - ndp->force_package = NULL; - ndp->force_channel = NULL; + ndp->package_whitelist = UINT_MAX; + ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); + + NCSI_FOR_EACH_PACKAGE(ndp, np) { + spin_lock_irqsave(&np->lock, flags); + np->multi_channel = false; + np->channel_whitelist = UINT_MAX; + np->preferred_channel = NULL; + spin_unlock_irqrestore(&np->lock, flags); + } netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n"); - /* Bounce the NCSI channel to set changes */ - ncsi_stop_dev(&ndp->ndev); - ncsi_start_dev(&ndp->ndev); + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); return 0; } @@ -563,6 +584,138 @@ int ncsi_send_netlink_err(struct net_device *dev, return nlmsg_unicast(net->genl_sock, skb, snd_portid); } +static int ncsi_set_package_mask_nl(struct sk_buff *msg, + struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned long flags; + int rc; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_MASK]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + spin_lock_irqsave(&ndp->lock, flags); + if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { + if (ndp->flags & NCSI_DEV_HWA) { + ndp->multi_package = true; + rc = 0; + } else { + netdev_err(ndp->ndev.dev, + "NCSI: Can't use multiple packages without HWA\n"); + rc = -EPERM; + } + } else { + ndp->multi_package = false; + rc = 0; + } + + if (!rc) + ndp->package_whitelist = + nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]); + spin_unlock_irqrestore(&ndp->lock, flags); + + if (!rc) { + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); + } + + return rc; +} + +static int ncsi_set_channel_mask_nl(struct sk_buff *msg, + struct genl_info *info) +{ + struct ncsi_package *np, *package; + struct ncsi_channel *nc, *channel; + u32 package_id, channel_id; + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_CHANNEL_MASK]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + package = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) { + package = np; + break; + } + if (!package) + return -ERANGE; + + spin_lock_irqsave(&package->lock, flags); + + channel = NULL; + if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { + channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); + NCSI_FOR_EACH_CHANNEL(np, nc) + if (nc->id == channel_id) { + channel = nc; + break; + } + if (!channel) { + spin_unlock_irqrestore(&package->lock, flags); + return -ERANGE; + } + netdev_dbg(ndp->ndev.dev, + "NCSI: Channel %u set as preferred channel\n", + channel->id); + } + + package->channel_whitelist = + nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]); + if (package->channel_whitelist == 0) + netdev_dbg(ndp->ndev.dev, + "NCSI: Package %u set to all channels disabled\n", + package->id); + + package->preferred_channel = channel; + + if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { + package->multi_channel = true; + netdev_info(ndp->ndev.dev, + "NCSI: Multi-channel enabled on package %u\n", + package_id); + } else { + package->multi_channel = false; + } + + spin_unlock_irqrestore(&package->lock, flags); + + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); + + return 0; +} + static const struct genl_ops ncsi_ops[] = { { .cmd = NCSI_CMD_PKG_INFO, @@ -589,6 +742,18 @@ static const struct genl_ops ncsi_ops[] = { .doit = ncsi_send_cmd_nl, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NCSI_CMD_SET_PACKAGE_MASK, + .policy = ncsi_genl_policy, + .doit = ncsi_set_package_mask_nl, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NCSI_CMD_SET_CHANNEL_MASK, + .policy = ncsi_genl_policy, + .doit = ncsi_set_channel_mask_nl, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family ncsi_genl_family __ro_after_init = { diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 4d3f06be38bd..2a6d83a596c9 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -165,6 +165,15 @@ struct ncsi_rsp_oem_pkt { unsigned char data[]; /* Payload data */ }; +/* Mellanox Response Data */ +struct ncsi_rsp_oem_mlx_pkt { + unsigned char cmd_rev; /* Command Revision */ + unsigned char cmd; /* Command ID */ + unsigned char param; /* Parameter */ + unsigned char optional; /* Optional data */ + unsigned char data[]; /* Data */ +}; + /* Broadcom Response Data */ struct ncsi_rsp_oem_bcm_pkt { unsigned char ver; /* Payload Version */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 77e07ba3f493..dc07fcc7938e 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -256,7 +256,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr) if (!ncm->enable) return 0; - ncm->enable = 1; + ncm->enable = 0; return 0; } @@ -611,6 +611,45 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) return 0; } +/* Response handler for Mellanox command Get Mac Address */ +static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct net_device *ndev = ndp->ndev.dev; + const struct net_device_ops *ops = ndev->netdev_ops; + struct ncsi_rsp_oem_pkt *rsp; + struct sockaddr saddr; + int ret = 0; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + + saddr.sa_family = ndev->type; + ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN); + ret = ops->ndo_set_mac_address(ndev, &saddr); + if (ret < 0) + netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); + + return ret; +} + +/* Response handler for Mellanox card */ +static int ncsi_rsp_handler_oem_mlx(struct ncsi_request *nr) +{ + struct ncsi_rsp_oem_mlx_pkt *mlx; + struct ncsi_rsp_oem_pkt *rsp; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + mlx = (struct ncsi_rsp_oem_mlx_pkt *)(rsp->data); + + if (mlx->cmd == NCSI_OEM_MLX_CMD_GMA && + mlx->param == NCSI_OEM_MLX_CMD_GMA_PARAM) + return ncsi_rsp_handler_oem_mlx_gma(nr); + return 0; +} + /* Response handler for Broadcom command Get Mac Address */ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) { @@ -655,7 +694,7 @@ static struct ncsi_rsp_oem_handler { unsigned int mfr_id; int (*handler)(struct ncsi_request *nr); } ncsi_rsp_oem_handlers[] = { - { NCSI_OEM_MFR_MLX_ID, NULL }, + { NCSI_OEM_MFR_MLX_ID, ncsi_rsp_handler_oem_mlx }, { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm } }; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 43041f087eb3..1ce30efe6854 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1148,8 +1148,9 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) return -EINVAL; - entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])); - entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]); + __vlan_hwaccel_put_tag(entry->skb, + nla_get_be16(tb[NFQA_VLAN_PROTO]), + ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]))); } if (nfqa[NFQA_L2HDR]) { diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 85ae53d8fd09..e47ebbbe71b8 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -301,7 +301,7 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, key->eth.vlan.tpid = vlan->vlan_tpid; } return skb_vlan_push(skb, vlan->vlan_tpid, - ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); + ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK); } /* 'src' is already properly masked. */ @@ -822,8 +822,10 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk __skb_dst_copy(skb, data->dst); *OVS_CB(skb) = data->cb; skb->inner_protocol = data->inner_protocol; - skb->vlan_tci = data->vlan_tci; - skb->vlan_proto = data->vlan_proto; + if (data->vlan_tci & VLAN_CFI_MASK) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci & ~VLAN_CFI_MASK); + else + __vlan_hwaccel_clear_tag(skb); /* Reconstruct the MAC header. */ skb_push(skb, data->l2_len); @@ -867,7 +869,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb, data->cb = *OVS_CB(skb); data->inner_protocol = skb->inner_protocol; data->network_offset = orig_network_offset; - data->vlan_tci = skb->vlan_tci; + if (skb_vlan_tag_present(skb)) + data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK; + else + data->vlan_tci = 0; data->vlan_proto = skb->vlan_proto; data->mac_proto = mac_proto; data->l2_len = hlen; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 35966da84769..57e07768c9d1 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -325,7 +325,7 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, return -ENOMEM; vh = (struct vlan_head *)skb->data; - key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); + key_vh->tci = vh->tci | htons(VLAN_CFI_MASK); key_vh->tpid = vh->tpid; if (unlikely(untag_vlan)) { @@ -358,7 +358,7 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) int res; if (skb_vlan_tag_present(skb)) { - key->eth.vlan.tci = htons(skb->vlan_tci); + key->eth.vlan.tci = htons(skb->vlan_tci) | htons(VLAN_CFI_MASK); key->eth.vlan.tpid = skb->vlan_proto; } else { /* Parse outer vlan tag in the non-accelerated case. */ @@ -597,7 +597,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * skb_vlan_pop(), which will later shift the ethertype into * skb->protocol. */ - if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT)) + if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK)) skb->protocol = key->eth.cvlan.tpid; else skb->protocol = key->eth.type; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index c670dd24b8b7..ba01fc4270bd 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -60,7 +60,7 @@ struct ovs_tunnel_info { struct vlan_head { __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/ - __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + __be16 tci; /* 0 if no VLAN, VLAN_CFI_MASK set otherwise. */ }; #define OVS_SW_FLOW_KEY_METADATA_SIZE \ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 865ecef68196..435a4bdf8f89 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -990,9 +990,9 @@ static int validate_vlan_from_nlattrs(const struct sw_flow_match *match, if (a[OVS_KEY_ATTR_VLAN]) tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (!(tci & htons(VLAN_CFI_MASK))) { if (tci) { - OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.", + OVS_NLERR(log, "%s TCI does not have VLAN_CFI_MASK bit set.", (inner) ? "C-VLAN" : "VLAN"); return -EINVAL; } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) { @@ -1013,9 +1013,9 @@ static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, __be16 tci = 0; __be16 tpid = 0; bool encap_valid = !!(match->key->eth.vlan.tci & - htons(VLAN_TAG_PRESENT)); + htons(VLAN_CFI_MASK)); bool i_encap_valid = !!(match->key->eth.cvlan.tci & - htons(VLAN_TAG_PRESENT)); + htons(VLAN_CFI_MASK)); if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) { /* Not a VLAN. */ @@ -1039,8 +1039,8 @@ static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, (inner) ? "C-VLAN" : "VLAN", ntohs(tpid)); return -EINVAL; } - if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.", + if (!(tci & htons(VLAN_CFI_MASK))) { + OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_CFI_MASK bit.", (inner) ? "C-VLAN" : "VLAN"); return -EINVAL; } @@ -1095,7 +1095,7 @@ static int parse_vlan_from_nlattrs(struct sw_flow_match *match, if (err) return err; - encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT)); + encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_CFI_MASK)); if (encap_valid) { err = __parse_vlan_from_nlattrs(match, key_attrs, true, a, is_mask, log); @@ -2943,7 +2943,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, vlan = nla_data(a); if (!eth_type_vlan(vlan->vlan_tpid)) return -EINVAL; - if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) + if (!(vlan->vlan_tci & htons(VLAN_CFI_MASK))) return -EINVAL; vlan_tci = vlan->vlan_tci; break; @@ -2959,7 +2959,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, /* Prohibit push MPLS other than to a white list * for packets that have a known tag order. */ - if (vlan_tci & htons(VLAN_TAG_PRESENT) || + if (vlan_tci & htons(VLAN_CFI_MASK) || (eth_type != htons(ETH_P_IP) && eth_type != htons(ETH_P_IPV6) && eth_type != htons(ETH_P_ARP) && @@ -2971,7 +2971,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, } case OVS_ACTION_ATTR_POP_MPLS: - if (vlan_tci & htons(VLAN_TAG_PRESENT) || + if (vlan_tci & htons(VLAN_CFI_MASK) || !eth_p_mpls(eth_type)) return -EINVAL; @@ -3036,7 +3036,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_POP_ETH: if (mac_proto != MAC_PROTO_ETHERNET) return -EINVAL; - if (vlan_tci & htons(VLAN_TAG_PRESENT)) + if (vlan_tci & htons(VLAN_CFI_MASK)) return -EINVAL; mac_proto = MAC_PROTO_NONE; break; diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 5aaf3babfc3f..acb6077b7478 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -93,7 +93,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 0e72d95b0e8f..c38a62464b85 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -68,7 +68,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 2e5e7a41d8ef..9bec22e3e9e8 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -84,7 +84,6 @@ static struct net_device *get_dpdev(const struct datapath *dp) struct vport *local; local = ovs_vport_ovsl(dp, OVSP_LOCAL); - BUG_ON(!local); return local->dev; } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 7e6301b2ec4d..8f16f11f7ad3 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -131,7 +131,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 4cca8f274662..c3b90fadaff6 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -210,9 +210,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; + __be64 key_id = 0; int opts_len = 0; - __be64 key_id; - __be16 flags; + __be16 flags = 0; u8 tos, ttl; int ret = 0; int err; @@ -246,15 +246,15 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, case TCA_TUNNEL_KEY_ACT_RELEASE: break; case TCA_TUNNEL_KEY_ACT_SET: - if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { - NL_SET_ERR_MSG(extack, "Missing tunnel key id"); - ret = -EINVAL; - goto err_out; - } + if (tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + __be32 key32; - key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); + key_id = key32_to_tunnel_id(key32); + flags = TUNNEL_KEY; + } - flags = TUNNEL_KEY | TUNNEL_CSUM; + flags |= TUNNEL_CSUM; if (tb[TCA_TUNNEL_KEY_NO_CSUM] && nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) flags &= ~TUNNEL_CSUM; @@ -508,10 +508,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); - if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || + if (((key->tun_flags & TUNNEL_KEY) && + nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || - nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || + (key->tp_dst && + nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, + key->tp_dst)) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, !(key->tun_flags & TUNNEL_CSUM)) || tunnel_key_opts_dump(skb, info)) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ba677d54a7af..93fdaf707313 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -63,7 +63,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* extract existing tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { /* in-payload vlan tag, pop it */ err = __skb_vlan_pop(skb, &tci); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f427a1e00e7e..d92f44ac4c39 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -25,6 +25,7 @@ #include <linux/kmod.h> #include <linux/slab.h> #include <linux/idr.h> +#include <linux/rhashtable.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -365,6 +366,245 @@ static void tcf_chain_flush(struct tcf_chain *chain) } } +static struct tcf_block *tc_dev_ingress_block(struct net_device *dev) +{ + const struct Qdisc_class_ops *cops; + struct Qdisc *qdisc; + + if (!dev_ingress_queue(dev)) + return NULL; + + qdisc = dev_ingress_queue(dev)->qdisc_sleeping; + if (!qdisc) + return NULL; + + cops = qdisc->ops->cl_ops; + if (!cops) + return NULL; + + if (!cops->tcf_block) + return NULL; + + return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL); +} + +static struct rhashtable indr_setup_block_ht; + +struct tc_indr_block_dev { + struct rhash_head ht_node; + struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; + struct tcf_block *block; +}; + +struct tc_indr_block_cb { + struct list_head list; + void *cb_priv; + tc_indr_block_bind_cb_t *cb; + void *cb_ident; +}; + +static const struct rhashtable_params tc_indr_setup_block_ht_params = { + .key_offset = offsetof(struct tc_indr_block_dev, dev), + .head_offset = offsetof(struct tc_indr_block_dev, ht_node), + .key_len = sizeof(struct net_device *), +}; + +static struct tc_indr_block_dev * +tc_indr_block_dev_lookup(struct net_device *dev) +{ + return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, + tc_indr_setup_block_ht_params); +} + +static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev) +{ + struct tc_indr_block_dev *indr_dev; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (indr_dev) + goto inc_ref; + + indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + INIT_LIST_HEAD(&indr_dev->cb_list); + indr_dev->dev = dev; + indr_dev->block = tc_dev_ingress_block(dev); + if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, + tc_indr_setup_block_ht_params)) { + kfree(indr_dev); + return NULL; + } + +inc_ref: + indr_dev->refcnt++; + return indr_dev; +} + +static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev) +{ + if (--indr_dev->refcnt) + return; + + rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, + tc_indr_setup_block_ht_params); + kfree(indr_dev); +} + +static struct tc_indr_block_cb * +tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + + list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) + if (indr_block_cb->cb == cb && + indr_block_cb->cb_ident == cb_ident) + return indr_block_cb; + return NULL; +} + +static struct tc_indr_block_cb * +tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + + indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident); + if (indr_block_cb) + return ERR_PTR(-EEXIST); + + indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); + if (!indr_block_cb) + return ERR_PTR(-ENOMEM); + + indr_block_cb->cb_priv = cb_priv; + indr_block_cb->cb = cb; + indr_block_cb->cb_ident = cb_ident; + list_add(&indr_block_cb->list, &indr_dev->cb_list); + + return indr_block_cb; +} + +static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb) +{ + list_del(&indr_block_cb->list); + kfree(indr_block_cb); +} + +static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev, + struct tc_indr_block_cb *indr_block_cb, + enum tc_block_command command) +{ + struct tc_block_offload bo = { + .command = command, + .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS, + .block = indr_dev->block, + }; + + if (!indr_dev->block) + return; + + indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, + &bo); +} + +int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + int err; + + indr_dev = tc_indr_block_dev_get(dev); + if (!indr_dev) + return -ENOMEM; + + indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); + err = PTR_ERR_OR_ZERO(indr_block_cb); + if (err) + goto err_dev_put; + + tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND); + return 0; + +err_dev_put: + tc_indr_block_dev_put(indr_dev); + return err; +} +EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register); + +int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + int err; + + rtnl_lock(); + err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident); + rtnl_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(tc_indr_block_cb_register); + +void __tc_indr_block_cb_unregister(struct net_device *dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (!indr_dev) + return; + + indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident); + if (!indr_block_cb) + return; + + /* Send unbind message if required to free any block cbs. */ + tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND); + tc_indr_block_cb_del(indr_block_cb); + tc_indr_block_dev_put(indr_dev); +} +EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister); + +void tc_indr_block_cb_unregister(struct net_device *dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + rtnl_lock(); + __tc_indr_block_cb_unregister(dev, cb, cb_ident); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister); + +static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, + struct tcf_block_ext_info *ei, + enum tc_block_command command, + struct netlink_ext_ack *extack) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + struct tc_block_offload bo = { + .command = command, + .binder_type = ei->binder_type, + .block = block, + .extack = extack, + }; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (!indr_dev) + return; + + indr_dev->block = command == TC_BLOCK_BIND ? block : NULL; + + list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) + indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, + &bo); +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; @@ -406,12 +646,17 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; - return err; + if (err) + return err; + + tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack); + return 0; no_offload_dev_inc: if (tcf_block_offload_in_use(block)) return -EOPNOTSUPP; block->nooffloaddevcnt++; + tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack); return 0; } @@ -421,6 +666,8 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, struct net_device *dev = q->dev_queue->dev; int err; + tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL); + if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); @@ -2355,6 +2602,11 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; + err = rhashtable_init(&indr_setup_block_ht, + &tc_indr_setup_block_ht_params); + if (err) + goto err_rhash_setup_block_ht; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, @@ -2366,6 +2618,8 @@ static int __init tc_filter_init(void) return 0; +err_rhash_setup_block_ht: + unregister_pernet_subsys(&tcf_net_ops); err_register_pernet_subsys: destroy_workqueue(tc_filter_wq); return err; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 71312d7bd8f4..544811dded60 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -55,6 +55,8 @@ struct fl_flow_key { struct flow_dissector_key_ip ip; struct flow_dissector_key_ip enc_ip; struct flow_dissector_key_enc_opts enc_opts; + struct flow_dissector_key_ports tp_min; + struct flow_dissector_key_ports tp_max; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -65,6 +67,7 @@ struct fl_flow_mask_range { struct fl_flow_mask { struct fl_flow_key key; struct fl_flow_mask_range range; + u32 flags; struct rhash_head ht_node; struct rhashtable ht; struct rhashtable_params filter_ht_params; @@ -179,13 +182,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key, memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey) +static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter, + struct fl_flow_key *key, + struct fl_flow_key *mkey) +{ + __be16 min_mask, max_mask, min_val, max_val; + + min_mask = htons(filter->mask->key.tp_min.dst); + max_mask = htons(filter->mask->key.tp_max.dst); + min_val = htons(filter->key.tp_min.dst); + max_val = htons(filter->key.tp_max.dst); + + if (min_mask && max_mask) { + if (htons(key->tp.dst) < min_val || + htons(key->tp.dst) > max_val) + return false; + + /* skb does not have min and max values */ + mkey->tp_min.dst = filter->mkey.tp_min.dst; + mkey->tp_max.dst = filter->mkey.tp_max.dst; + } + return true; +} + +static bool fl_range_port_src_cmp(struct cls_fl_filter *filter, + struct fl_flow_key *key, + struct fl_flow_key *mkey) +{ + __be16 min_mask, max_mask, min_val, max_val; + + min_mask = htons(filter->mask->key.tp_min.src); + max_mask = htons(filter->mask->key.tp_max.src); + min_val = htons(filter->key.tp_min.src); + max_val = htons(filter->key.tp_max.src); + + if (min_mask && max_mask) { + if (htons(key->tp.src) < min_val || + htons(key->tp.src) > max_val) + return false; + + /* skb does not have min and max values */ + mkey->tp_min.src = filter->mkey.tp_min.src; + mkey->tp_max.src = filter->mkey.tp_max.src; + } + return true; +} + +static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask, + struct fl_flow_key *mkey) { return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask), mask->filter_ht_params); } +static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, + struct fl_flow_key *mkey, + struct fl_flow_key *key) +{ + struct cls_fl_filter *filter, *f; + + list_for_each_entry_rcu(filter, &mask->filters, list) { + if (!fl_range_port_dst_cmp(filter, key, mkey)) + continue; + + if (!fl_range_port_src_cmp(filter, key, mkey)) + continue; + + f = __fl_lookup(mask, mkey); + if (f) + return f; + } + return NULL; +} + +static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, + struct fl_flow_key *mkey, + struct fl_flow_key *key) +{ + if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) + return fl_lookup_range(mask, mkey, key); + + return __fl_lookup(mask, mkey); +} + static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -208,7 +287,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, fl_set_masked_key(&skb_mkey, &skb_key, mask); - f = fl_lookup(mask, &skb_mkey); + f = fl_lookup(mask, &skb_mkey, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); @@ -514,6 +593,31 @@ static void fl_set_key_val(struct nlattr **tb, memcpy(mask, nla_data(tb[mask_type]), len); } +static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask) +{ + fl_set_key_val(tb, &key->tp_min.dst, + TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst)); + fl_set_key_val(tb, &key->tp_max.dst, + TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst)); + fl_set_key_val(tb, &key->tp_min.src, + TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src)); + fl_set_key_val(tb, &key->tp_max.src, + TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src)); + + if ((mask->tp_min.dst && mask->tp_max.dst && + htons(key->tp_max.dst) <= htons(key->tp_min.dst)) || + (mask->tp_min.src && mask->tp_max.src && + htons(key->tp_max.src) <= htons(key->tp_min.src))) + return -EINVAL; + + return 0; +} + static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask) @@ -921,6 +1025,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, sizeof(key->arp.tha)); } + if (key->basic.ip_proto == IPPROTO_TCP || + key->basic.ip_proto == IPPROTO_UDP || + key->basic.ip_proto == IPPROTO_SCTP) { + ret = fl_set_key_port_range(tb, key, mask); + if (ret) + return ret; + } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -1038,8 +1150,9 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_MASKED(mask, keys, cnt, - FLOW_DISSECTOR_KEY_PORTS, tp); + if (FL_KEY_IS_MASKED(mask, tp) || + FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max)) + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -1086,6 +1199,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, fl_mask_copy(newmask, mask); + if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) || + (newmask->key.tp_min.src && newmask->key.tp_max.src)) + newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE; + err = fl_init_mask_hashtable(newmask); if (err) goto errout_free; @@ -1238,7 +1355,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout_idr; - if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) { + if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) { err = -EEXIST; goto errout_mask; } @@ -1473,6 +1590,26 @@ static int fl_dump_key_val(struct sk_buff *skb, return 0; } +static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, + struct fl_flow_key *mask) +{ + if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN, + &mask->tp_min.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_min.dst)) || + fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX, + &mask->tp_max.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_max.dst)) || + fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN, + &mask->tp_min.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_min.src)) || + fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX, + &mask->tp_max.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_max.src))) + return -1; + + return 0; +} + static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) @@ -1809,6 +1946,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->arp.tha)))) goto nla_put_failure; + if ((key->basic.ip_proto == IPPROTO_TCP || + key->basic.ip_proto == IPPROTO_UDP || + key->basic.ip_proto == IPPROTO_SCTP) && + fl_dump_key_port_range(skb, key, mask)) + goto nla_put_failure; + if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4b28fd44576d..4c54bc440798 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -558,6 +558,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; + cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; @@ -1206,6 +1207,7 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; + cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ca3b0f46de53..9c88cec7e8a2 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -810,6 +810,71 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, } EXPORT_SYMBOL(qdisc_tree_reduce_backlog); +int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type, + void *type_data) +{ + struct net_device *dev = qdisc_dev(sch); + int err; + + sch->flags &= ~TCQ_F_OFFLOADED; + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return 0; + + err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); + if (err == -EOPNOTSUPP) + return 0; + + if (!err) + sch->flags |= TCQ_F_OFFLOADED; + + return err; +} +EXPORT_SYMBOL(qdisc_offload_dump_helper); + +void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, + struct Qdisc *new, struct Qdisc *old, + enum tc_setup_type type, void *type_data, + struct netlink_ext_ack *extack) +{ + bool any_qdisc_is_offloaded; + int err; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); + + /* Don't report error if the graft is part of destroy operation. */ + if (!err || !new || new == &noop_qdisc) + return; + + /* Don't report error if the parent, the old child and the new + * one are not offloaded. + */ + any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED; + any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED; + any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED; + + if (any_qdisc_is_offloaded) + NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); +} +EXPORT_SYMBOL(qdisc_offload_graft_helper); + +static void qdisc_offload_graft_root(struct net_device *dev, + struct Qdisc *new, struct Qdisc *old, + struct netlink_ext_ack *extack) +{ + struct tc_root_qopt_offload graft_offload = { + .command = TC_ROOT_GRAFT, + .handle = new ? new->handle : 0, + .ingress = (new && new->flags & TCQ_F_INGRESS) || + (old && old->flags & TCQ_F_INGRESS), + }; + + qdisc_offload_graft_helper(dev, NULL, new, old, + TC_SETUP_ROOT_QDISC, &graft_offload, extack); +} + static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { @@ -957,7 +1022,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, { struct Qdisc *q = old; struct net *net = dev_net(dev); - int err = 0; if (parent == NULL) { unsigned int i, num_q, ingress; @@ -977,6 +1041,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, if (dev->flags & IFF_UP) dev_deactivate(dev); + qdisc_offload_graft_root(dev, new, old, extack); + if (new && new->ops->attach) goto skip; @@ -1012,28 +1078,29 @@ skip: dev_activate(dev); } else { const struct Qdisc_class_ops *cops = parent->ops->cl_ops; + unsigned long cl; + int err; /* Only support running class lockless if parent is lockless */ if (new && (new->flags & TCQ_F_NOLOCK) && parent && !(parent->flags & TCQ_F_NOLOCK)) new->flags &= ~TCQ_F_NOLOCK; - err = -EOPNOTSUPP; - if (cops && cops->graft) { - unsigned long cl = cops->find(parent, classid); + if (!cops || !cops->graft) + return -EOPNOTSUPP; - if (cl) { - err = cops->graft(parent, cl, new, &old, - extack); - } else { - NL_SET_ERR_MSG(extack, "Specified class not found"); - err = -ENOENT; - } + cl = cops->find(parent, classid); + if (!cl) { + NL_SET_ERR_MSG(extack, "Specified class not found"); + return -ENOENT; } - if (!err) - notify_and_destroy(net, skb, n, classid, old, new); + + err = cops->graft(parent, cl, new, &old, extack); + if (err) + return err; + notify_and_destroy(net, skb, n, classid, old, new); } - return err; + return 0; } static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 1538d6fa8165..1150f22983df 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -30,7 +30,7 @@ struct etf_sched_data { int queue; s32 delta; /* in ns */ ktime_t last; /* The txtime of the last skb sent to the netdevice. */ - struct rb_root head; + struct rb_root_cached head; struct qdisc_watchdog watchdog; ktime_t (*get_time)(void); }; @@ -104,7 +104,7 @@ static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p; - p = rb_first(&q->head); + p = rb_first_cached(&q->head); if (!p) return NULL; @@ -117,8 +117,10 @@ static void reset_watchdog(struct Qdisc *sch) struct sk_buff *skb = etf_peek_timesortedlist(sch); ktime_t next; - if (!skb) + if (!skb) { + qdisc_watchdog_cancel(&q->watchdog); return; + } next = ktime_sub_ns(skb->tstamp, q->delta); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); @@ -154,8 +156,9 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { struct etf_sched_data *q = qdisc_priv(sch); - struct rb_node **p = &q->head.rb_node, *parent = NULL; + struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; + bool leftmost = true; if (!is_packet_valid(sch, nskb)) { report_sock_error(nskb, EINVAL, @@ -168,13 +171,15 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, parent = *p; skb = rb_to_skb(parent); - if (ktime_after(txtime, skb->tstamp)) + if (ktime_after(txtime, skb->tstamp)) { p = &parent->rb_right; - else + leftmost = false; + } else { p = &parent->rb_left; + } } rb_link_node(&nskb->rbnode, parent, p); - rb_insert_color(&nskb->rbnode, &q->head); + rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost); qdisc_qstats_backlog_inc(sch, nskb); sch->q.qlen++; @@ -185,12 +190,42 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } -static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, - bool drop) +static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb, + ktime_t now) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *to_free = NULL; + struct sk_buff *tmp = NULL; + + skb_rbtree_walk_from_safe(skb, tmp) { + if (ktime_after(skb->tstamp, now)) + break; + + rb_erase_cached(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + + qdisc_qstats_backlog_dec(sch, skb); + qdisc_drop(skb, sch, &to_free); + qdisc_qstats_overlimit(sch); + sch->q.qlen--; + } + + kfree_skb_list(to_free); +} + +static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); - rb_erase(&skb->rbnode, &q->head); + rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. @@ -201,19 +236,9 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, qdisc_qstats_backlog_dec(sch, skb); - if (drop) { - struct sk_buff *to_free = NULL; + qdisc_bstats_update(sch, skb); - report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); - - qdisc_drop(skb, sch, &to_free); - kfree_skb_list(to_free); - qdisc_qstats_overlimit(sch); - } else { - qdisc_bstats_update(sch, skb); - - q->last = skb->tstamp; - } + q->last = skb->tstamp; sch->q.qlen--; } @@ -232,7 +257,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) /* Drop if packet has expired while in queue. */ if (ktime_before(skb->tstamp, now)) { - timesortedlist_erase(sch, skb, true); + timesortedlist_drop(sch, skb, now); skb = NULL; goto out; } @@ -241,7 +266,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) * txtime from deadline to (now + delta). */ if (q->deadline_mode) { - timesortedlist_erase(sch, skb, false); + timesortedlist_remove(sch, skb); skb->tstamp = now; goto out; } @@ -250,7 +275,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) /* Dequeue only if now is within the [txtime - delta, txtime] range. */ if (ktime_after(now, next)) - timesortedlist_erase(sch, skb, false); + timesortedlist_remove(sch, skb); else skb = NULL; @@ -386,14 +411,14 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, static void timesortedlist_clear(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); - struct rb_node *p = rb_first(&q->head); + struct rb_node *p = rb_first_cached(&q->head); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); - rb_erase(&skb->rbnode, &q->head); + rb_erase_cached(&skb->rbnode, &q->head); rtnl_kfree_skbs(skb, skb); sch->q.qlen--; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 25a7cf6d380f..1a662f2bb7bb 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -94,6 +94,7 @@ struct fq_sched_data { u32 flow_refill_delay; u32 flow_plimit; /* max packets per flow */ unsigned long flow_max_rate; /* optional max rate per flow */ + u64 ce_threshold; u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; @@ -107,6 +108,7 @@ struct fq_sched_data { u64 stat_gc_flows; u64 stat_internal_packets; u64 stat_throttled; + u64 stat_ce_mark; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; @@ -412,16 +414,21 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_get_ns(); struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; unsigned long rate; u32 plen; + u64 now; + + if (!sch->q.qlen) + return NULL; skb = fq_dequeue_head(sch, &q->internal); if (skb) goto out; + + now = ktime_get_ns(); fq_check_throttled(q, now); begin: head = &q->new_flows; @@ -454,6 +461,11 @@ begin: fq_flow_set_throttled(q, f); goto begin; } + if (time_next_packet && + (s64)(now - time_next_packet - q->ce_threshold) > 0) { + INET_ECN_set_ce(skb); + q->stat_ce_mark++; + } } skb = fq_dequeue_head(sch, f); @@ -657,6 +669,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, + [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -736,6 +749,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_ORPHAN_MASK]) q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); + if (tb[TCA_FQ_CE_THRESHOLD]) + q->ce_threshold = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + if (!err) { sch_tree_unlock(sch); err = fq_resize(sch, fq_log); @@ -786,6 +803,10 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; q->low_rate_threshold = 550000 / 8; + + /* Default ce_threshold of 4294 seconds */ + q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; + qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); if (opt) @@ -799,6 +820,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); + u64 ce_threshold = q->ce_threshold; struct nlattr *opts; opts = nla_nest_start(skb, TCA_OPTIONS); @@ -807,6 +829,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + do_div(ce_threshold, NSEC_PER_USEC); + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || @@ -819,6 +843,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, q->low_rate_threshold) || + nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; @@ -848,6 +873,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.throttled_flows = q->throttled_flows; st.unthrottle_latency_ns = min_t(unsigned long, q->unthrottle_latency_ns, ~0U); + st.ce_mark = q->stat_ce_mark; sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 4a042abf844c..234afbf9115b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -23,19 +23,23 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/red.h> #define GRED_DEF_PRIO (MAX_DPs / 2) #define GRED_VQ_MASK (MAX_DPs - 1) +#define GRED_VQ_RED_FLAGS (TC_RED_ECN | TC_RED_HARDDROP) + struct gred_sched_data; struct gred_sched; struct gred_sched_data { u32 limit; /* HARD maximal queue length */ u32 DP; /* the drop parameters */ - u32 bytesin; /* bytes seen on virtualQ so far*/ + u32 red_flags; /* virtualQ version of red_flags */ + u64 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ u8 prio; /* the prio of this vq */ @@ -139,14 +143,27 @@ static inline void gred_store_wred_set(struct gred_sched *table, table->wred_set.qidlestart = q->vars.qidlestart; } -static inline int gred_use_ecn(struct gred_sched *t) +static int gred_use_ecn(struct gred_sched_data *q) +{ + return q->red_flags & TC_RED_ECN; +} + +static int gred_use_harddrop(struct gred_sched_data *q) { - return t->red_flags & TC_RED_ECN; + return q->red_flags & TC_RED_HARDDROP; } -static inline int gred_use_harddrop(struct gred_sched *t) +static bool gred_per_vq_red_flags_used(struct gred_sched *table) { - return t->red_flags & TC_RED_HARDDROP; + unsigned int i; + + /* Local per-vq flags couldn't have been set unless global are 0 */ + if (table->red_flags) + return false; + for (i = 0; i < MAX_DPs; i++) + if (table->tab[i] && table->tab[i]->red_flags) + return true; + return false; } static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -212,7 +229,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_PROB_MARK: qdisc_qstats_overlimit(sch); - if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + if (!gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; } @@ -222,7 +239,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_HARD_MARK: qdisc_qstats_overlimit(sch); - if (gred_use_harddrop(t) || !gred_use_ecn(t) || + if (gred_use_harddrop(q) || !gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; goto congestion_drop; @@ -295,15 +312,103 @@ static void gred_reset(struct Qdisc *sch) } } +static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) +{ + struct gred_sched *table = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_gred_qopt_offload opt = { + .command = command, + .handle = sch->handle, + .parent = sch->parent, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + if (command == TC_GRED_REPLACE) { + unsigned int i; + + opt.set.grio_on = gred_rio_mode(table); + opt.set.wred_on = gred_wred_mode(table); + opt.set.dp_cnt = table->DPs; + opt.set.dp_def = table->def; + + for (i = 0; i < table->DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + + if (!q) + continue; + opt.set.tab[i].present = true; + opt.set.tab[i].limit = q->limit; + opt.set.tab[i].prio = q->prio; + opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; + opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; + opt.set.tab[i].is_ecn = gred_use_ecn(q); + opt.set.tab[i].is_harddrop = gred_use_harddrop(q); + opt.set.tab[i].probability = q->parms.max_P; + opt.set.tab[i].backlog = &q->backlog; + } + opt.set.qstats = &sch->qstats; + } + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); +} + +static int gred_offload_dump_stats(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + struct tc_gred_qopt_offload *hw_stats; + unsigned int i; + int ret; + + hw_stats = kzalloc(sizeof(*hw_stats), GFP_KERNEL); + if (!hw_stats) + return -ENOMEM; + + hw_stats->command = TC_GRED_STATS; + hw_stats->handle = sch->handle; + hw_stats->parent = sch->parent; + + for (i = 0; i < MAX_DPs; i++) + if (table->tab[i]) + hw_stats->stats.xstats[i] = &table->tab[i]->stats; + + ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); + /* Even if driver returns failure adjust the stats - in case offload + * ended but driver still wants to adjust the values. + */ + for (i = 0; i < MAX_DPs; i++) { + if (!table->tab[i]) + continue; + table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; + table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; + table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; + + _bstats_update(&sch->bstats, + hw_stats->stats.bstats[i].bytes, + hw_stats->stats.bstats[i].packets); + sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; + sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; + sch->qstats.drops += hw_stats->stats.qstats[i].drops; + sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; + sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; + } + + kfree(hw_stats); + return ret; +} + static inline void gred_destroy_vq(struct gred_sched_data *q) { kfree(q); } -static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) +static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps, + struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_sopt *sopt; + bool red_flags_changed; int i; if (!dps) @@ -311,13 +416,28 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) sopt = nla_data(dps); - if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || - sopt->def_DP >= sopt->DPs) + if (sopt->DPs > MAX_DPs) { + NL_SET_ERR_MSG_MOD(extack, "number of virtual queues too high"); + return -EINVAL; + } + if (sopt->DPs == 0) { + NL_SET_ERR_MSG_MOD(extack, + "number of virtual queues can't be 0"); + return -EINVAL; + } + if (sopt->def_DP >= sopt->DPs) { + NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count"); return -EINVAL; + } + if (sopt->flags && gred_per_vq_red_flags_used(table)) { + NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used"); + return -EINVAL; + } sch_tree_lock(sch); table->DPs = sopt->DPs; table->def = sopt->def_DP; + red_flags_changed = table->red_flags != sopt->flags; table->red_flags = sopt->flags; /* @@ -337,6 +457,12 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) gred_disable_wred_mode(table); } + if (red_flags_changed) + for (i = 0; i < table->DPs; i++) + if (table->tab[i]) + table->tab[i]->red_flags = + table->red_flags & GRED_VQ_RED_FLAGS; + for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", @@ -346,25 +472,30 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) } } + gred_offload(sch, TC_GRED_REPLACE); return 0; } static inline int gred_change_vq(struct Qdisc *sch, int dp, struct tc_gred_qopt *ctl, int prio, u8 *stab, u32 max_P, - struct gred_sched_data **prealloc) + struct gred_sched_data **prealloc, + struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) { + NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; + } if (!q) { table->tab[dp] = q = *prealloc; *prealloc = NULL; if (!q) return -ENOMEM; + q->red_flags = table->red_flags & GRED_VQ_RED_FLAGS; } q->DP = dp; @@ -384,14 +515,127 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, return 0; } +static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = { + [TCA_GRED_VQ_DP] = { .type = NLA_U32 }, + [TCA_GRED_VQ_FLAGS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = { + [TCA_GRED_VQ_ENTRY] = { .type = NLA_NESTED }, +}; + static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, [TCA_GRED_LIMIT] = { .type = NLA_U32 }, + [TCA_GRED_VQ_LIST] = { .type = NLA_NESTED }, }; +static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry) +{ + struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; + u32 dp; + + nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL); + + dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); + + if (tb[TCA_GRED_VQ_FLAGS]) + table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); +} + +static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs) +{ + const struct nlattr *attr; + int rem; + + nla_for_each_nested(attr, vqs, rem) { + switch (nla_type(attr)) { + case TCA_GRED_VQ_ENTRY: + gred_vq_apply(table, attr); + break; + } + } +} + +static int gred_vq_validate(struct gred_sched *table, u32 cdp, + const struct nlattr *entry, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; + int err; + u32 dp; + + err = nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, + extack); + if (err < 0) + return err; + + if (!tb[TCA_GRED_VQ_DP]) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified"); + return -EINVAL; + } + dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); + if (dp >= table->DPs) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds"); + return -EINVAL; + } + if (dp != cdp && !table->tab[dp]) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated"); + return -EINVAL; + } + + if (tb[TCA_GRED_VQ_FLAGS]) { + u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); + + if (table->red_flags && table->red_flags != red_flags) { + NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used"); + return -EINVAL; + } + if (red_flags & ~GRED_VQ_RED_FLAGS) { + NL_SET_ERR_MSG_MOD(extack, + "invalid RED flags specified"); + return -EINVAL; + } + } + + return 0; +} + +static int gred_vqs_validate(struct gred_sched *table, u32 cdp, + struct nlattr *vqs, struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int rem, err; + + err = nla_validate_nested(vqs, TCA_GRED_VQ_ENTRY_MAX, + gred_vqe_policy, extack); + if (err < 0) + return err; + + nla_for_each_nested(attr, vqs, rem) { + switch (nla_type(attr)) { + case TCA_GRED_VQ_ENTRY: + err = gred_vq_validate(table, cdp, attr, extack); + if (err) + return err; + break; + default: + NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes"); + return -EINVAL; + } + } + + if (rem > 0) { + NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list"); + return -EINVAL; + } + + return 0; +} + static int gred_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -406,29 +650,39 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { if (tb[TCA_GRED_LIMIT] != NULL) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); - return gred_change_table_def(sch, tb[TCA_GRED_DPS]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } if (tb[TCA_GRED_PARMS] == NULL || tb[TCA_GRED_STAB] == NULL || - tb[TCA_GRED_LIMIT] != NULL) + tb[TCA_GRED_LIMIT] != NULL) { + NL_SET_ERR_MSG_MOD(extack, "can't configure Qdisc and virtual queue at the same time"); return -EINVAL; + } max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; - err = -EINVAL; ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); - if (ctl->DP >= table->DPs) - goto errout; + if (ctl->DP >= table->DPs) { + NL_SET_ERR_MSG_MOD(extack, "virtual queue index above virtual queue count"); + return -EINVAL; + } + + if (tb[TCA_GRED_VQ_LIST]) { + err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST], + extack); + if (err) + return err; + } if (gred_rio_mode(table)) { if (ctl->prio == 0) { @@ -448,9 +702,13 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); sch_tree_lock(sch); - err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc); + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc, + extack); if (err < 0) - goto errout_locked; + goto err_unlock_free; + + if (tb[TCA_GRED_VQ_LIST]) + gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]); if (gred_rio_mode(table)) { gred_disable_wred_mode(table); @@ -458,12 +716,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, gred_enable_wred_mode(table); } - err = 0; + sch_tree_unlock(sch); + kfree(prealloc); + + gred_offload(sch, TC_GRED_REPLACE); + return 0; -errout_locked: +err_unlock_free: sch_tree_unlock(sch); kfree(prealloc); -errout: return err; } @@ -476,12 +737,15 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; - if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) + if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) { + NL_SET_ERR_MSG_MOD(extack, + "virtual queue configuration can't be specified at initialization time"); return -EINVAL; + } if (tb[TCA_GRED_LIMIT]) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); @@ -489,13 +753,13 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, sch->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); - return gred_change_table_def(sch, tb[TCA_GRED_DPS]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) { struct gred_sched *table = qdisc_priv(sch); - struct nlattr *parms, *opts = NULL; + struct nlattr *parms, *vqs, *opts = NULL; int i; u32 max_p[MAX_DPs]; struct tc_gred_sopt sopt = { @@ -505,6 +769,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) .flags = table->red_flags, }; + if (gred_offload_dump_stats(sch)) + goto nla_put_failure; + opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; @@ -522,6 +789,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) goto nla_put_failure; + /* Old style all-in-one dump of VQs */ parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; @@ -572,6 +840,58 @@ append_opt: nla_nest_end(skb, parms); + /* Dump the VQs again, in more structured way */ + vqs = nla_nest_start(skb, TCA_GRED_VQ_LIST); + if (!vqs) + goto nla_put_failure; + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + struct nlattr *vq; + + if (!q) + continue; + + vq = nla_nest_start(skb, TCA_GRED_VQ_ENTRY); + if (!vq) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags)) + goto nla_put_failure; + + /* Stats */ + if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin, + TCA_GRED_VQ_PAD)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG, + gred_backlog(table, q, sch))) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP, + q->stats.prob_drop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK, + q->stats.prob_mark)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP, + q->stats.forced_drop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK, + q->stats.forced_mark)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_OTHER, q->stats.other)) + goto nla_put_failure; + + nla_nest_end(skb, vq); + } + nla_nest_end(skb, vqs); + return nla_nest_end(skb, opts); nla_put_failure: @@ -588,6 +908,7 @@ static void gred_destroy(struct Qdisc *sch) if (table->tab[i]) gred_destroy_vq(table->tab[i]); } + gred_offload(sch, TC_GRED_DESTROY); } static struct Qdisc_ops gred_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index f20f3a0f8424..203659bc3906 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -38,9 +38,8 @@ static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd) return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt); } -static void mq_offload_stats(struct Qdisc *sch) +static int mq_offload_stats(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); struct tc_mq_qopt_offload opt = { .command = TC_MQ_STATS, .handle = sch->handle, @@ -50,8 +49,7 @@ static void mq_offload_stats(struct Qdisc *sch) }, }; - if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt); + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt); } static void mq_destroy(struct Qdisc *sch) @@ -171,9 +169,8 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) spin_unlock_bh(qdisc_lock(qdisc)); } - mq_offload_stats(sch); - return 0; + return mq_offload_stats(sch); } static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) @@ -196,6 +193,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + struct tc_mq_qopt_offload graft_offload; struct net_device *dev = qdisc_dev(sch); if (dev->flags & IFF_UP) @@ -206,6 +204,14 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); + + graft_offload.handle = sch->handle; + graft_offload.graft_params.queue = cl - 1; + graft_offload.graft_params.child_handle = new ? new->handle : 0; + graft_offload.command = TC_MQ_GRAFT; + + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, + TC_SETUP_QDISC_MQ, &graft_offload, extack); return 0; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 22cd46a60057..75046ec72144 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -77,6 +77,10 @@ struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; + /* a linear queue; reduces rbtree rebalancing when jitter is low */ + struct sk_buff *t_head; + struct sk_buff *t_tail; + /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; @@ -369,26 +373,39 @@ static void tfifo_reset(struct Qdisc *sch) rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } + + rtnl_kfree_skbs(q->t_head, q->t_tail); + q->t_head = NULL; + q->t_tail = NULL; } static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); u64 tnext = netem_skb_cb(nskb)->time_to_send; - struct rb_node **p = &q->t_root.rb_node, *parent = NULL; - while (*p) { - struct sk_buff *skb; - - parent = *p; - skb = rb_to_skb(parent); - if (tnext >= netem_skb_cb(skb)->time_to_send) - p = &parent->rb_right; + if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) { + if (q->t_tail) + q->t_tail->next = nskb; else - p = &parent->rb_left; + q->t_head = nskb; + q->t_tail = nskb; + } else { + struct rb_node **p = &q->t_root.rb_node, *parent = NULL; + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (tnext >= netem_skb_cb(skb)->time_to_send) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->t_root); } - rb_link_node(&nskb->rbnode, parent, p); - rb_insert_color(&nskb->rbnode, &q->t_root); sch->q.qlen++; } @@ -533,9 +550,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || - t_last->time_to_send > last->time_to_send) { + t_last->time_to_send > last->time_to_send) + last = t_last; + } + if (q->t_tail) { + struct netem_skb_cb *t_last = + netem_skb_cb(q->t_tail); + + if (!last || + t_last->time_to_send > last->time_to_send) last = t_last; - } } if (last) { @@ -614,11 +638,38 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) q->slot.bytes_left = q->slot_config.max_bytes; } +static struct sk_buff *netem_peek(struct netem_sched_data *q) +{ + struct sk_buff *skb = skb_rb_first(&q->t_root); + u64 t1, t2; + + if (!skb) + return q->t_head; + if (!q->t_head) + return skb; + + t1 = netem_skb_cb(skb)->time_to_send; + t2 = netem_skb_cb(q->t_head)->time_to_send; + if (t1 < t2) + return skb; + return q->t_head; +} + +static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb) +{ + if (skb == q->t_head) { + q->t_head = skb->next; + if (!q->t_head) + q->t_tail = NULL; + } else { + rb_erase(&skb->rbnode, &q->t_root); + } +} + static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - struct rb_node *p; tfifo_dequeue: skb = __qdisc_dequeue_head(&sch->q); @@ -628,20 +679,18 @@ deliver: qdisc_bstats_update(sch, skb); return skb; } - p = rb_first(&q->t_root); - if (p) { + skb = netem_peek(q); + if (skb) { u64 time_to_send; u64 now = ktime_get_ns(); - skb = rb_to_skb(p); - /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; if (q->slot.slot_next && q->slot.slot_next < time_to_send) get_slot_next(q, now); - if (time_to_send <= now && q->slot.slot_next <= now) { - rb_erase(p, &q->t_root); + if (time_to_send <= now && q->slot.slot_next <= now) { + netem_erase_head(q, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index f8af98621179..cdf68706e40f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -220,7 +220,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); - qdisc_put(child); } for (i = oldbands; i < q->bands; i++) { @@ -230,6 +229,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, } sch_tree_unlock(sch); + + for (i = q->bands; i < oldbands; i++) + qdisc_put(q->queues[i]); return 0; } @@ -251,7 +253,6 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt, static int prio_dump_offload(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); struct tc_prio_qopt_offload hw_stats = { .command = TC_PRIO_STATS, .handle = sch->handle, @@ -263,21 +264,8 @@ static int prio_dump_offload(struct Qdisc *sch) }, }, }; - int err; - - sch->flags &= ~TCQ_F_OFFLOADED; - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) - return 0; - - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, - &hw_stats); - if (err == -EOPNOTSUPP) - return 0; - if (!err) - sch->flags |= TCQ_F_OFFLOADED; - - return err; + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats); } static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -309,43 +297,22 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, { struct prio_sched_data *q = qdisc_priv(sch); struct tc_prio_qopt_offload graft_offload; - struct net_device *dev = qdisc_dev(sch); unsigned long band = arg - 1; - bool any_qdisc_is_offloaded; - int err; if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->queues[band]); - if (!tc_can_offload(dev)) - return 0; - graft_offload.handle = sch->handle; graft_offload.parent = sch->parent; graft_offload.graft_params.band = band; graft_offload.graft_params.child_handle = new->handle; graft_offload.command = TC_PRIO_GRAFT; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, - &graft_offload); - - /* Don't report error if the graft is part of destroy operation. */ - if (err && new != &noop_qdisc) { - /* Don't report error if the parent, the old child and the new - * one are not offloaded. - */ - any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; - any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED; - if (*old) - any_qdisc_is_offloaded |= (*old)->flags & - TCQ_F_OFFLOADED; - - if (any_qdisc_is_offloaded) - NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); - } - + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, + TC_SETUP_QDISC_PRIO, &graft_offload, + extack); return 0; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 3ce6c0a2c493..9df9942340ea 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -166,7 +166,9 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.set.min = q->parms.qth_min >> q->parms.Wlog; opt.set.max = q->parms.qth_max >> q->parms.Wlog; opt.set.probability = q->parms.max_P; + opt.set.limit = q->limit; opt.set.is_ecn = red_use_ecn(q); + opt.set.is_harddrop = red_use_harddrop(q); opt.set.qstats = &sch->qstats; } else { opt.command = TC_RED_DESTROY; @@ -193,10 +195,10 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { static int red_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct Qdisc *old_child = NULL, *child = NULL; struct red_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_RED_MAX + 1]; struct tc_red_qopt *ctl; - struct Qdisc *child = NULL; int err; u32 max_P; @@ -233,7 +235,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (child) { qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, q->qdisc->qstats.backlog); - qdisc_put(q->qdisc); + old_child = q->qdisc; q->qdisc = child; } @@ -252,7 +254,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); + red_offload(sch, true); + + if (old_child) + qdisc_put(old_child); return 0; } @@ -279,9 +285,8 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, return red_change(sch, opt, extack); } -static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) +static int red_dump_offload_stats(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); struct tc_red_qopt_offload hw_stats = { .command = TC_RED_STATS, .handle = sch->handle, @@ -291,22 +296,8 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) .stats.qstats = &sch->qstats, }, }; - int err; - - sch->flags &= ~TCQ_F_OFFLOADED; - - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) - return 0; - - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, - &hw_stats); - if (err == -EOPNOTSUPP) - return 0; - if (!err) - sch->flags |= TCQ_F_OFFLOADED; - - return err; + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_RED, &hw_stats); } static int red_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -324,7 +315,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) }; int err; - err = red_dump_offload_stats(sch, &opt); + err = red_dump_offload_stats(sch); if (err) goto nla_put_failure; @@ -377,6 +368,21 @@ static int red_dump_class(struct Qdisc *sch, unsigned long cl, return 0; } +static void red_graft_offload(struct Qdisc *sch, + struct Qdisc *new, struct Qdisc *old, + struct netlink_ext_ack *extack) +{ + struct tc_red_qopt_offload graft_offload = { + .handle = sch->handle, + .parent = sch->parent, + .child_handle = new->handle, + .command = TC_RED_GRAFT, + }; + + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old, + TC_SETUP_QDISC_RED, &graft_offload, extack); +} + static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { @@ -386,6 +392,8 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); + + red_graft_offload(sch, new, *old, extack); return 0; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 914750b819b2..201c888604e4 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -132,6 +132,8 @@ static struct sctp_association *sctp_association_init( */ asoc->max_burst = sp->max_burst; + asoc->subscribe = sp->subscribe; + /* initialize association timers */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 7df3704982f5..ebf28adba789 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp, return match; } +int sctp_bind_addrs_check(struct sctp_sock *sp, + struct sctp_sock *sp2, int cnt2) +{ + struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; + struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; + struct sctp_sockaddr_entry *laddr, *laddr2; + bool exist = false; + int cnt = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { + if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && + laddr->valid && laddr2->valid) { + exist = true; + goto next; + } + } + cnt = 0; + break; +next: + cnt++; + } + rcu_read_unlock(); + + return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); +} + /* Does the address 'addr' conflict with any addresses in * the bp. */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index d2048de86e7c..64bef313d436 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -86,11 +86,10 @@ void sctp_datamsg_free(struct sctp_datamsg *msg) /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { + struct sctp_association *asoc = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; - struct sctp_sock *sp; struct sctp_ulpevent *ev; - struct sctp_association *asoc = NULL; int error = 0, notify; /* If we failed, we may need to notify. */ @@ -108,9 +107,8 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) else error = asoc->outqueue.error; - sp = sctp_sk(asoc->base.sk); - notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED, - &sp->subscribe); + notify = sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED); } /* Generate a SEND FAILED event only if enabled. */ diff --git a/net/sctp/input.c b/net/sctp/input.c index 5c36a99882ed..d7a649d240e5 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -57,6 +57,7 @@ #include <net/sctp/checksum.h> #include <net/net_namespace.h> #include <linux/rhashtable.h> +#include <net/sock_reuseport.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); @@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp); -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, - const union sctp_addr *laddr); +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( + struct net *net, struct sk_buff *skb, + const union sctp_addr *laddr, + const union sctp_addr *daddr); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, @@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb) asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport); if (!asoc) - ep = __sctp_rcv_lookup_endpoint(net, &dest); + ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; @@ -574,7 +577,7 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) * is probably better. * */ -void sctp_v4_err(struct sk_buff *skb, __u32 info) +int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int ihlen = iph->ihl * 4; @@ -599,7 +602,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) skb->transport_header = savesctp; if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } /* Warning: The sock lock is held. Remember to call * sctp_err_finish! @@ -653,6 +656,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) out_unlock: sctp_err_finish(sk, transport); + return 0; } /* @@ -720,43 +724,87 @@ discard: } /* Insert endpoint into the hash table. */ -static void __sctp_hash_endpoint(struct sctp_endpoint *ep) +static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { - struct net *net = sock_net(ep->base.sk); - struct sctp_ep_common *epb; + struct sock *sk = ep->base.sk; + struct net *net = sock_net(sk); struct sctp_hashbucket *head; + struct sctp_ep_common *epb; epb = &ep->base; - epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; + if (sk->sk_reuseport) { + bool any = sctp_is_ep_boundall(sk); + struct sctp_ep_common *epb2; + struct list_head *list; + int cnt = 0, err = 1; + + list_for_each(list, &ep->base.bind_addr.address_list) + cnt++; + + sctp_for_each_hentry(epb2, &head->chain) { + struct sock *sk2 = epb2->sk; + + if (!net_eq(sock_net(sk2), net) || sk2 == sk || + !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) || + !sk2->sk_reuseport) + continue; + + err = sctp_bind_addrs_check(sctp_sk(sk2), + sctp_sk(sk), cnt); + if (!err) { + err = reuseport_add_sock(sk, sk2, any); + if (err) + return err; + break; + } else if (err < 0) { + return err; + } + } + + if (err) { + err = reuseport_alloc(sk, any); + if (err) + return err; + } + } + write_lock(&head->lock); hlist_add_head(&epb->node, &head->chain); write_unlock(&head->lock); + return 0; } /* Add an endpoint to the hash. Local BH-safe. */ -void sctp_hash_endpoint(struct sctp_endpoint *ep) +int sctp_hash_endpoint(struct sctp_endpoint *ep) { + int err; + local_bh_disable(); - __sctp_hash_endpoint(ep); + err = __sctp_hash_endpoint(ep); local_bh_enable(); + + return err; } /* Remove endpoint from the hash table. */ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { - struct net *net = sock_net(ep->base.sk); + struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; struct sctp_ep_common *epb; epb = &ep->base; - epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); + epb->hashent = sctp_ep_hashfn(sock_net(sk), epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + write_lock(&head->lock); hlist_del_init(&epb->node); write_unlock(&head->lock); @@ -770,16 +818,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep) local_bh_enable(); } +static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, + const union sctp_addr *paddr, __u32 seed) +{ + __u32 addr; + + if (paddr->sa.sa_family == AF_INET6) + addr = jhash(&paddr->v6.sin6_addr, 16, seed); + else + addr = (__force __u32)paddr->v4.sin_addr.s_addr; + + return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | + (__force __u32)lport, net_hash_mix(net), seed); +} + /* Look up an endpoint. */ -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, - const union sctp_addr *laddr) +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( + struct net *net, struct sk_buff *skb, + const union sctp_addr *laddr, + const union sctp_addr *paddr) { struct sctp_hashbucket *head; struct sctp_ep_common *epb; struct sctp_endpoint *ep; + struct sock *sk; + __be16 lport; int hash; - hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port)); + lport = laddr->v4.sin_port; + hash = sctp_ep_hashfn(net, ntohs(lport)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(epb, &head->chain) { @@ -791,6 +858,15 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, ep = sctp_sk(net->sctp.ctl_sock)->ep; hit: + sk = ep->base.sk; + if (sk->sk_reuseport) { + __u32 phash = sctp_hashfn(net, lport, paddr, 0); + + sk = reuseport_select_sock(sk, phash, skb, + sizeof(struct sctphdr)); + if (sk) + ep = sctp_sk(sk)->ep; + } sctp_endpoint_hold(ep); read_unlock(&head->lock); return ep; @@ -829,35 +905,17 @@ out: static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; - const union sctp_addr *paddr = &t->ipaddr; - const struct net *net = sock_net(t->asoc->base.sk); - __be16 lport = htons(t->asoc->base.bind_addr.port); - __u32 addr; - - if (paddr->sa.sa_family == AF_INET6) - addr = jhash(&paddr->v6.sin6_addr, 16, seed); - else - addr = (__force __u32)paddr->v4.sin_addr.s_addr; - return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | - (__force __u32)lport, net_hash_mix(net), seed); + return sctp_hashfn(sock_net(t->asoc->base.sk), + htons(t->asoc->base.bind_addr.port), + &t->ipaddr, seed); } static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; - const union sctp_addr *paddr = x->paddr; - const struct net *net = x->net; - __be16 lport = x->lport; - __u32 addr; - if (paddr->sa.sa_family == AF_INET6) - addr = jhash(&paddr->v6.sin6_addr, 16, seed); - else - addr = (__force __u32)paddr->v4.sin_addr.s_addr; - - return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | - (__force __u32)lport, net_hash_mix(net), seed); + return sctp_hashfn(x->net, x->lport, x->paddr, seed); } static const struct rhashtable_params sctp_hash_params = { diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index fc6c5e4bffa5..6e27c62646e9 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -138,7 +138,7 @@ static struct notifier_block sctp_inet6addr_notifier = { }; /* ICMP error handler. */ -static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct inet6_dev *idev; @@ -147,7 +147,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct sctp_transport *transport; struct ipv6_pinfo *np; __u16 saveip, savesctp; - int err; + int err, ret = 0; struct net *net = dev_net(skb->dev); idev = in6_dev_get(skb->dev); @@ -163,6 +163,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb->transport_header = savesctp; if (!sk) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS); + ret = -ENOENT; goto out; } @@ -202,6 +203,8 @@ out_unlock: out: if (likely(idev != NULL)) in6_dev_put(idev); + + return ret; } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index c0817f7a8964..a8c4c33377bc 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -53,7 +53,7 @@ int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \ void *arg) { \ int error = 0; \ - enum sctp_event event_type; union sctp_subtype subtype; \ + enum sctp_event_type event_type; union sctp_subtype subtype; \ enum sctp_state state; \ struct sctp_endpoint *ep; \ \ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 85d393090238..1d143bc3f73d 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -52,7 +52,7 @@ #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> -static int sctp_cmd_interpreter(enum sctp_event event_type, +static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); -static int sctp_side_effects(enum sctp_event event_type, +static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -623,7 +623,7 @@ static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands, /* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, - enum sctp_event event_type, + enum sctp_event_type event_type, union sctp_subtype subtype, struct sctp_chunk *chunk, unsigned int error) @@ -1162,7 +1162,7 @@ static void sctp_cmd_send_asconf(struct sctp_association *asoc) * If you want to understand all of lksctp, this is a * good place to start. */ -int sctp_do_sm(struct net *net, enum sctp_event event_type, +int sctp_do_sm(struct net *net, enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) @@ -1199,7 +1199,7 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, /***************************************************************** * This the master state function side effect processing function. *****************************************************************/ -static int sctp_side_effects(enum sctp_event event_type, +static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -1285,7 +1285,7 @@ bail: ********************************************************************/ /* This is the side-effect interpreter. */ -static int sctp_cmd_interpreter(enum sctp_event event_type, +static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 691d9dc620e3..d239b94aa48c 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -79,7 +79,7 @@ static const struct sctp_sm_table_entry bug = { const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, - enum sctp_event event_type, + enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b8cebd5a87e5..f93c3cf9e567 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2230,7 +2230,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (sp->recvrcvinfo) sctp_ulpevent_read_rcvinfo(event, msg); /* Check if we allow SCTP_SNDRCVINFO. */ - if (sp->subscribe.sctp_data_io_event) + if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_DATA_IO_EVENT)) sctp_ulpevent_read_sndrcvinfo(event, msg); err = copied; @@ -2304,22 +2304,33 @@ static int sctp_setsockopt_disable_fragments(struct sock *sk, static int sctp_setsockopt_events(struct sock *sk, char __user *optval, unsigned int optlen) { + struct sctp_event_subscribe subscribe; + __u8 *sn_type = (__u8 *)&subscribe; + struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_ulpevent *event; + int i; if (optlen > sizeof(struct sctp_event_subscribe)) return -EINVAL; - if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) + + if (copy_from_user(&subscribe, optval, optlen)) return -EFAULT; + for (i = 0; i < optlen; i++) + sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i, + sn_type[i]); + + list_for_each_entry(asoc, &sp->ep->asocs, asocs) + asoc->subscribe = sctp_sk(sk)->subscribe; + /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. */ - if (sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT, - &sctp_sk(sk)->subscribe)) { - asoc = sctp_id2assoc(sk, 0); + if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) { + struct sctp_ulpevent *event; + asoc = sctp_id2assoc(sk, 0); if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_USER | __GFP_NOWARN); @@ -4260,6 +4271,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, return 0; } +static int sctp_setsockopt_event(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_ulpevent *event; + struct sctp_event param; + int retval = 0; + + if (optlen < sizeof(param)) { + retval = -EINVAL; + goto out; + } + + optlen = sizeof(param); + if (copy_from_user(¶m, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (param.se_type < SCTP_SN_TYPE_BASE || + param.se_type > SCTP_SN_TYPE_MAX) { + retval = -EINVAL; + goto out; + } + + asoc = sctp_id2assoc(sk, param.se_assoc_id); + if (!asoc) { + sctp_ulpevent_type_set(&sctp_sk(sk)->subscribe, + param.se_type, param.se_on); + goto out; + } + + sctp_ulpevent_type_set(&asoc->subscribe, param.se_type, param.se_on); + + if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) { + if (sctp_outq_is_empty(&asoc->outqueue)) { + event = sctp_ulpevent_make_sender_dry_event(asoc, + GFP_USER | __GFP_NOWARN); + if (!event) { + retval = -ENOMEM; + goto out; + } + + asoc->stream.si->enqueue_event(&asoc->ulpq, event); + } + } + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4457,6 +4519,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_REUSE_PORT: retval = sctp_setsockopt_reuse_port(sk, optval, optlen); break; + case SCTP_EVENT: + retval = sctp_setsockopt_event(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4705,7 +4770,7 @@ static int sctp_init_sock(struct sock *sk) /* Initialize default event subscriptions. By default, all the * options are off. */ - memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe)); + sp->subscribe = 0; /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS @@ -5250,14 +5315,24 @@ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len, static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, int __user *optlen) { + struct sctp_event_subscribe subscribe; + __u8 *sn_type = (__u8 *)&subscribe; + int i; + if (len == 0) return -EINVAL; if (len > sizeof(struct sctp_event_subscribe)) len = sizeof(struct sctp_event_subscribe); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len)) + + for (i = 0; i < len; i++) + sn_type[i] = sctp_ulpevent_type_enabled(sctp_sk(sk)->subscribe, + SCTP_SN_TYPE_BASE + i); + + if (copy_to_user(optval, &subscribe, len)) return -EFAULT; + return 0; } @@ -7392,6 +7467,37 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval, + int __user *optlen) +{ + struct sctp_association *asoc; + struct sctp_event param; + __u16 subscribe; + + if (len < sizeof(param)) + return -EINVAL; + + len = sizeof(param); + if (copy_from_user(¶m, optval, len)) + return -EFAULT; + + if (param.se_type < SCTP_SN_TYPE_BASE || + param.se_type > SCTP_SN_TYPE_MAX) + return -EINVAL; + + asoc = sctp_id2assoc(sk, param.se_assoc_id); + subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe; + param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type); + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, ¶m, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7590,6 +7696,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_REUSE_PORT: retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); break; + case SCTP_EVENT: + retval = sctp_getsockopt_event(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7627,8 +7736,10 @@ static struct sctp_bind_bucket *sctp_bucket_create( static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { - bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse); + struct sctp_sock *sp = sctp_sk(sk); + bool reuse = (sk->sk_reuse || sp->reuse); struct sctp_bind_hashbucket *head; /* hash list */ + kuid_t uid = sock_i_uid(sk); struct sctp_bind_bucket *pp; unsigned short snum; int ret; @@ -7704,7 +7815,10 @@ pp_found: pr_debug("%s: found a possible match\n", __func__); - if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) + if ((pp->fastreuse && reuse && + sk->sk_state != SCTP_SS_LISTENING) || + (pp->fastreuseport && sk->sk_reuseport && + uid_eq(pp->fastuid, uid))) goto success; /* Run through the list of sockets bound to the port @@ -7718,16 +7832,18 @@ pp_found: * in an endpoint. */ sk_for_each_bound(sk2, &pp->owner) { - struct sctp_endpoint *ep2; - ep2 = sctp_sk(sk2)->ep; + struct sctp_sock *sp2 = sctp_sk(sk2); + struct sctp_endpoint *ep2 = sp2->ep; if (sk == sk2 || - (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) && - sk2->sk_state != SCTP_SS_LISTENING)) + (reuse && (sk2->sk_reuse || sp2->reuse) && + sk2->sk_state != SCTP_SS_LISTENING) || + (sk->sk_reuseport && sk2->sk_reuseport && + uid_eq(uid, sock_i_uid(sk2)))) continue; - if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, - sctp_sk(sk2), sctp_sk(sk))) { + if (sctp_bind_addr_conflict(&ep2->base.bind_addr, + addr, sp2, sp)) { ret = (long)sk2; goto fail_unlock; } @@ -7750,19 +7866,32 @@ pp_not_found: pp->fastreuse = 1; else pp->fastreuse = 0; - } else if (pp->fastreuse && - (!reuse || sk->sk_state == SCTP_SS_LISTENING)) - pp->fastreuse = 0; + + if (sk->sk_reuseport) { + pp->fastreuseport = 1; + pp->fastuid = uid; + } else { + pp->fastreuseport = 0; + } + } else { + if (pp->fastreuse && + (!reuse || sk->sk_state == SCTP_SS_LISTENING)) + pp->fastreuse = 0; + + if (pp->fastreuseport && + (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid))) + pp->fastreuseport = 0; + } /* We are set, so fill up all the data in the hash table * entry, tie the socket list information with the rest of the * sockets FIXME: Blurry, NPI (ipg). */ success: - if (!sctp_sk(sk)->bind_hash) { + if (!sp->bind_hash) { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &pp->owner); - sctp_sk(sk)->bind_hash = pp; + sp->bind_hash = pp; } ret = 0; @@ -7835,8 +7964,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) } sk->sk_max_ack_backlog = backlog; - sctp_hash_endpoint(ep); - return 0; + return sctp_hash_endpoint(ep); } /* diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 0a78cdf86463..a6bf21579466 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -140,7 +140,7 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; - struct sk_buff *pos; + struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { @@ -166,23 +166,30 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, return; } + loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && - MID_lt(event->mid, cevent->mid))) + MID_lt(event->mid, cevent->mid))) { + loc = pos; break; - + } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || - event->fsn < cevent->fsn)) + event->fsn < cevent->fsn)) { + loc = pos; break; + } } - __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); + if (!loc) + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + else + __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( @@ -383,7 +390,7 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; - struct sk_buff *pos; + struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { @@ -403,18 +410,25 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, return; } + loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; - if (cevent->stream > event->stream) + if (cevent->stream > event->stream) { + loc = pos; break; - + } if (cevent->stream == event->stream && - MID_lt(event->mid, cevent->mid)) + MID_lt(event->mid, cevent->mid)) { + loc = pos; break; + } } - __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); + if (!loc) + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + else + __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, @@ -489,7 +503,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq, sk_incoming_cpu_update(sk); } - if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) + if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; if (skb_list) @@ -980,17 +994,19 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; - if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, - &sctp_sk(sk)->subscribe)) + if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, + SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { + struct sctp_sock *sp = sctp_sk(sk); + __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); - if (!sctp_sk(sk)->data_ready_signalled) { - sctp_sk(sk)->data_ready_signalled = 1; + if (!sp->data_ready_signalled) { + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 331cc734e3db..5dde92101743 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -219,7 +219,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sk_incoming_cpu_update(sk); } /* Check if the user wishes to receive this event. */ - if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) + if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; /* If we are in partial delivery mode, post to the lobby until @@ -1129,16 +1129,16 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *ev = NULL; - struct sock *sk; struct sctp_sock *sp; + struct sock *sk; if (!ulpq->pd_mode) return; sk = ulpq->asoc->base.sk; sp = sctp_sk(sk); - if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, - &sctp_sk(sk)->subscribe)) + if (sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, + SCTP_PARTIAL_DELIVERY_EVENT)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, 0, 0, 0, gfp); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5fbaf1901571..63f08b4e51d6 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -301,14 +301,17 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, optionally send confirm_rkey msg to register with peer */ +/* register a new rmb, send confirm_rkey msg to register with peer */ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, bool conf_rkey) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { - rmb_desc->regerr = 1; - return -EFAULT; + if (!rmb_desc->wr_reg) { + /* register memory region for new rmb */ + if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { + rmb_desc->regerr = 1; + return -EFAULT; + } + rmb_desc->wr_reg = 1; } if (!conf_rkey) return 0; @@ -337,8 +340,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } if (link->llc_confirm_rc) @@ -365,8 +368,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; } /* send add link reject message, only one link supported for now */ @@ -535,7 +538,8 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type, if (rc) return rc; /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT); + return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT, + CLC_WAIT_TIME); } /* setup for RDMA connection of client */ @@ -583,8 +587,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, local_contact); } else { - if (!smc->conn.rmb_desc->reused && - smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, local_contact); } @@ -968,8 +971,8 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } if (link->llc_confirm_resp_rc) @@ -989,8 +992,8 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; } smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); @@ -1145,10 +1148,8 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; if (local_contact != SMC_FIRST_CONTACT) { - if (!new_smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) - return SMC_CLC_DECL_ERR_REGRMB; - } + if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1184,7 +1185,6 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, return 0; decline: - mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); return reason_code; } @@ -1225,7 +1225,7 @@ static void smc_listen_work(struct work_struct *work) */ pclc = (struct smc_clc_msg_proposal *)&buf; reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, - SMC_CLC_PROPOSAL); + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (reason_code) { smc_listen_decline(new_smc, reason_code, 0); return; @@ -1275,7 +1275,7 @@ static void smc_listen_work(struct work_struct *work) /* receive SMC Confirm CLC message */ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM); + SMC_CLC_CONFIRM, CLC_WAIT_TIME); if (reason_code) { mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); @@ -1284,8 +1284,10 @@ static void smc_listen_work(struct work_struct *work) /* finish worker */ if (!ism_supported) { - if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) + if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) { + mutex_unlock(&smc_create_lgr_pending); return; + } } smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); @@ -1357,7 +1359,6 @@ static int smc_listen(struct socket *sock, int backlog) sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; - INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); sock_hold(sk); /* sock_hold in tcp_listen_worker */ if (!schedule_work(&smc->tcp_listen_work)) sock_put(sk); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 89c3a8c7859a..776e9dfc915d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -265,7 +265,7 @@ out: * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. */ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, - u8 expected_type) + u8 expected_type, unsigned long timeout) { long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; struct sock *clc_sk = smc->clcsock->sk; @@ -285,7 +285,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, * sizeof(struct smc_clc_msg_hdr) */ krflags = MSG_PEEK | MSG_WAITALL; - smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + clc_sk->sk_rcvtimeo = timeout; iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); @@ -297,7 +297,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clc_sk->sk_err) { reason_code = -clc_sk->sk_err; - smc->sk.sk_err = clc_sk->sk_err; + if (clc_sk->sk_err == EAGAIN && + expected_type == SMC_CLC_DECLINE) + clc_sk->sk_err = 0; /* reset for fallback usage */ + else + smc->sk.sk_err = clc_sk->sk_err; goto out; } if (!len) { /* peer has performed orderly shutdown */ @@ -306,7 +310,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, goto out; } if (len < 0) { - smc->sk.sk_err = -len; + if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE) + smc->sk.sk_err = -len; reason_code = len; goto out; } @@ -346,7 +351,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } out: - smc->clcsock->sk->sk_rcvtimeo = rcvtimeo; + clc_sk->sk_rcvtimeo = rcvtimeo; return reason_code; } @@ -374,10 +379,8 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(struct smc_clc_msg_decline)); if (len < sizeof(struct smc_clc_msg_decline)) - smc->sk.sk_err = EPROTO; - if (len < 0) - smc->sk.sk_err = -len; - return sock_error(&smc->sk); + len = -EPROTO; + return len > 0 ? 0 : len; } /* send CLC PROPOSAL message across internal TCP socket */ @@ -536,7 +539,6 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) struct smc_link *link; struct msghdr msg; struct kvec vec; - int rc = 0; int len; memset(&aclc, 0, sizeof(aclc)); @@ -589,13 +591,8 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) vec.iov_len = ntohs(aclc.hdr.length); len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, ntohs(aclc.hdr.length)); - if (len < ntohs(aclc.hdr.length)) { - if (len >= 0) - new_smc->sk.sk_err = EPROTO; - else - new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err; - rc = sock_error(&new_smc->sk); - } + if (len < ntohs(aclc.hdr.length)) + len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; - return rc; + return len > 0 ? 0 : len; } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 18da89b681c2..24658e8c0de4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -27,6 +27,7 @@ #define SMC_TYPE_D 1 /* SMC-D only */ #define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ +#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ #define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */ #define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */ @@ -182,7 +183,7 @@ struct smcd_dev; int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, - u8 expected_type); + u8 expected_type, unsigned long timeout); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1c9fa7f0261a..35c1cdc93e1c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -149,6 +149,8 @@ static int smc_link_send_delete(struct smc_link *lnk) return -ENOTCONN; } +static void smc_lgr_free(struct smc_link_group *lgr); + static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), @@ -171,8 +173,11 @@ free: spin_unlock_bh(&smc_lgr_list.lock); if (!lgr->is_smcd && !lgr->terminating) { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* try to send del link msg, on error free lgr immediately */ - if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) { + if (lnk->state == SMC_LNK_ACTIVE && + !smc_link_send_delete(lnk)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); return; @@ -295,8 +300,13 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - conn->rmb_desc->reused = 1; conn->rmb_desc->used = 0; + if (!lgr->is_smcd) { + /* unregister rmb with peer */ + smc_llc_do_delete_rkey( + &lgr->lnk[SMC_SINGLE_LINK], + conn->rmb_desc); + } } else { /* buf registration failed, reuse not possible */ write_lock_bh(&lgr->rmbs_lock); @@ -410,7 +420,7 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) } /* remove a link group */ -void smc_lgr_free(struct smc_link_group *lgr) +static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); if (lgr->is_smcd) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index cf98f4d6093e..b00287989a3d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -109,6 +109,9 @@ struct smc_link { int llc_testlink_time; /* testlink interval */ struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ + struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ + int llc_delete_rkey_rc; /* rc from del rkey msg */ + struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -127,7 +130,7 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 reused : 1; /* new created / reused */ + u8 wr_reg : 1; /* mem region registered */ u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ @@ -243,7 +246,6 @@ struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; -void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9c916c709ca7..a6d3623d06f4 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -238,6 +238,29 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, return rc; } +/* send LLC delete rkey request */ +static int smc_llc_send_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_llc_msg_delete_rkey *rkeyllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; + memset(rkeyllc, 0, sizeof(*rkeyllc)); + rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; + rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); + rkeyllc->num_rkeys = 1; + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + /* prepare an add link message */ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, struct smc_link *link, u8 mac[], u8 gid[], @@ -509,7 +532,9 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, int i, max; if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ + link->llc_delete_rkey_rc = llc->hd.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_delete_rkey); } else { max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { @@ -610,6 +635,8 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_add); init_completion(&link->llc_add_resp); init_completion(&link->llc_confirm_rkey); + init_completion(&link->llc_delete_rkey); + mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; @@ -650,8 +677,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, { int rc; + /* protected by mutex smc_create_lgr_pending */ reinit_completion(&link->llc_confirm_rkey); - smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_send_confirm_rkey(link, rmb_desc); + if (rc) + return rc; /* receive CONFIRM RKEY response from server over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, SMC_LLC_WAIT_TIME); @@ -660,6 +690,29 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, return 0; } +/* unregister an rtoken at the remote peer */ +int smc_llc_do_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + int rc; + + mutex_lock(&link->llc_delete_rkey_mutex); + reinit_completion(&link->llc_delete_rkey); + rc = smc_llc_send_delete_rkey(link, rmb_desc); + if (rc) + goto out; + /* receive DELETE RKEY response from server over RoCE fabric */ + rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, + SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_delete_rkey_rc) + rc = -EFAULT; + else + rc = 0; +out: + mutex_unlock(&link->llc_delete_rkey_mutex); + return rc; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 9e2ff088e301..461c0c3ef76e 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -49,6 +49,8 @@ void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smc_llc_do_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 9062967575c4..7e55cfc69697 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -175,7 +175,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); return 0; no_checksum: if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 74b9d916a58b..fe23fac4dc4b 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -353,30 +353,29 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) return 0; } -static int __switchdev_port_obj_add(struct net_device *dev, - const struct switchdev_obj *obj, - struct switchdev_trans *trans) +static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, + struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans) { - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct list_head *iter; - int err = -EOPNOTSUPP; - - if (ops && ops->switchdev_port_obj_add) - return ops->switchdev_port_obj_add(dev, obj, trans); + int rc; + int err; - /* Switch device port(s) may be stacked under - * bond/team/vlan dev, so recurse down to add object on - * each port. - */ + struct switchdev_notifier_port_obj_info obj_info = { + .obj = obj, + .trans = trans, + .handled = false, + }; - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = __switchdev_port_obj_add(lower_dev, obj, trans); - if (err) - break; + rc = call_switchdev_blocking_notifiers(nt, dev, &obj_info.info); + err = notifier_to_errno(rc); + if (err) { + WARN_ON(!obj_info.handled); + return err; } - - return err; + if (!obj_info.handled) + return -EOPNOTSUPP; + return 0; } static int switchdev_port_obj_add_now(struct net_device *dev, @@ -397,7 +396,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev, */ trans.ph_prepare = true; - err = __switchdev_port_obj_add(dev, obj, &trans); + err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, &trans); if (err) { /* Prepare phase failed: abort the transaction. Any * resources reserved in the prepare phase are @@ -416,7 +416,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev, */ trans.ph_prepare = false; - err = __switchdev_port_obj_add(dev, obj, &trans); + err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, &trans); WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); switchdev_trans_items_warn_destroy(dev, &trans); @@ -471,26 +472,8 @@ EXPORT_SYMBOL_GPL(switchdev_port_obj_add); static int switchdev_port_obj_del_now(struct net_device *dev, const struct switchdev_obj *obj) { - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct list_head *iter; - int err = -EOPNOTSUPP; - - if (ops && ops->switchdev_port_obj_del) - return ops->switchdev_port_obj_del(dev, obj); - - /* Switch device port(s) may be stacked under - * bond/team/vlan dev, so recurse down to delete object on - * each port. - */ - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_obj_del_now(lower_dev, obj); - if (err) - break; - } - - return err; + return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL, + dev, obj, NULL); } static void switchdev_port_obj_del_deferred(struct net_device *dev, @@ -535,6 +518,7 @@ int switchdev_port_obj_del(struct net_device *dev, EXPORT_SYMBOL_GPL(switchdev_port_obj_del); static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); +static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain); /** * register_switchdev_notifier - Register notifier @@ -576,6 +560,31 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); +int register_switchdev_blocking_notifier(struct notifier_block *nb) +{ + struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + + return blocking_notifier_chain_register(chain, nb); +} +EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier); + +int unregister_switchdev_blocking_notifier(struct notifier_block *nb) +{ + struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + + return blocking_notifier_chain_unregister(chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier); + +int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info) +{ + info->dev = dev; + return blocking_notifier_call_chain(&switchdev_blocking_notif_chain, + val, info); +} +EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); + bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { @@ -595,3 +604,103 @@ bool switchdev_port_same_parent_id(struct net_device *a, return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); } EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); + +static int __switchdev_handle_port_obj_add(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*add_cb)(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans)) +{ + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (check_cb(dev)) { + /* This flag is only checked if the return value is success. */ + port_obj_info->handled = true; + return add_cb(dev, port_obj_info->obj, port_obj_info->trans); + } + + /* Switch ports might be stacked under e.g. a LAG. Ignore the + * unsupported devices, another driver might be able to handle them. But + * propagate to the callers any hard errors. + * + * If the driver does its own bookkeeping of stacked ports, it's not + * necessary to go through this helper. + */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info, + check_cb, add_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return err; +} + +int switchdev_handle_port_obj_add(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*add_cb)(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans)) +{ + int err; + + err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb, + add_cb); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add); + +static int __switchdev_handle_port_obj_del(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*del_cb)(struct net_device *dev, + const struct switchdev_obj *obj)) +{ + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (check_cb(dev)) { + /* This flag is only checked if the return value is success. */ + port_obj_info->handled = true; + return del_cb(dev, port_obj_info->obj); + } + + /* Switch ports might be stacked under e.g. a LAG. Ignore the + * unsupported devices, another driver might be able to handle them. But + * propagate to the callers any hard errors. + * + * If the driver does its own bookkeeping of stacked ports, it's not + * necessary to go through this helper. + */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info, + check_cb, del_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return err; +} + +int switchdev_handle_port_obj_del(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*del_cb)(struct net_device *dev, + const struct switchdev_obj *obj)) +{ + int err; + + err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb, + del_cb); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del); diff --git a/net/tipc/link.c b/net/tipc/link.c index 836727e363c4..9e265eb89726 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -105,7 +105,7 @@ struct tipc_stats { * @transmitq: queue for sent, non-acked messages * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages - * @last_retransmitted: sequence number of most recently retransmitted message + * @prev_from: sequence number of most previous retransmission request * @stale_cnt: counter for number of identical retransmit attempts * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released @@ -163,7 +163,7 @@ struct tipc_link { u16 limit; } backlog[5]; u16 snd_nxt; - u16 last_retransm; + u16 prev_from; u16 window; u16 stale_cnt; unsigned long stale_limit; @@ -186,9 +186,6 @@ struct tipc_link { u16 acked; struct tipc_link *bc_rcvlink; struct tipc_link *bc_sndlink; - unsigned long prev_retr; - u16 prev_from; - u16 prev_to; u8 nack_state; bool bc_peer_is_up; @@ -210,7 +207,7 @@ enum { BC_NACK_SND_SUPPRESS, }; -#define TIPC_BC_RETR_LIMIT 10 /* [ms] */ +#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */ /* * Interval between NACKs when packets arrive out of order @@ -1036,10 +1033,12 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, if (!skb) return 0; + if (less(to, from)) + return 0; /* Detect repeated retransmit failures on same packet */ - if (r->last_retransm != buf_seqno(skb)) { - r->last_retransm = buf_seqno(skb); + if (r->prev_from != from) { + r->prev_from = from; r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); r->stale_cnt = 0; } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { @@ -1055,6 +1054,11 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, continue; if (more(msg_seqno(hdr), to)) break; + if (link_is_bc_sndlink(l)) { + if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) + continue; + TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + } _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; @@ -1737,42 +1741,6 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) l->rcv_nxt = peers_snd_nxt; } -/* link_bc_retr eval()- check if the indicated range can be retransmitted now - * - Adjust permitted range if there is overlap with previous retransmission - */ -static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to) -{ - unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr); - - if (less(*to, *from)) - return false; - - /* New retransmission request */ - if ((elapsed > TIPC_BC_RETR_LIMIT) || - less(*to, l->prev_from) || more(*from, l->prev_to)) { - l->prev_from = *from; - l->prev_to = *to; - l->prev_retr = jiffies; - return true; - } - - /* Inside range of previous retransmit */ - if (!less(*from, l->prev_from) && !more(*to, l->prev_to)) - return false; - - /* Fully or partially outside previous range => exclude overlap */ - if (less(*from, l->prev_from)) { - *to = l->prev_from - 1; - l->prev_from = *from; - } - if (more(*to, l->prev_to)) { - *from = l->prev_to + 1; - l->prev_to = *to; - } - l->prev_retr = jiffies; - return true; -} - /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state */ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, @@ -1803,8 +1771,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - if (link_bc_retr_eval(snd_l, &from, &to)) - rc = tipc_link_retrans(snd_l, l, from, to, xmitq); + rc = tipc_link_retrans(snd_l, l, from, to, xmitq); l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index a2879e6ec5b6..a0924956bb61 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -105,6 +105,7 @@ struct tipc_skb_cb { u32 bytes_read; u32 orig_member; struct sk_buff *tail; + unsigned long nxt_retr; bool validated; u16 chain_imp; u16 ackers; diff --git a/net/tipc/node.c b/net/tipc/node.c index 488019766433..32556f480a60 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -624,6 +624,12 @@ static void tipc_node_timeout(struct timer_list *t) __skb_queue_head_init(&xmitq); + /* Initial node interval to value larger (10 seconds), then it will be + * recalculated with link lowest tolerance + */ + tipc_node_read_lock(n); + n->keepalive_intv = 10000; + tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7b1af8b59cd2..d4ecc66464e6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -687,6 +687,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sock *sk_redir; struct tls_rec *rec; int err = 0, send; + u32 delta = 0; bool enospc; psock = sk_psock_get(sk); @@ -694,8 +695,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, return tls_push_record(sk, flags, record_type); more_data: enospc = sk_msg_full(msg); - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (delta < msg->sg.size) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { err = -ENOSPC; @@ -743,7 +750,7 @@ more_data: msg->apply_bytes -= send; if (msg->sg.size == 0) tls_free_open_rec(sk); - *copied -= send; + *copied -= (send + delta); err = -EACCES; } |