diff options
Diffstat (limited to 'net')
369 files changed, 12808 insertions, 5914 deletions
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index 24915e0bb9ea..6c152f9ea26e 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -232,18 +232,7 @@ static int lowpan_context_show(struct seq_file *file, void *offset) return 0; } - -static int lowpan_context_open(struct inode *inode, struct file *file) -{ - return single_open(file, lowpan_context_show, inode->i_private); -} - -static const struct file_operations lowpan_context_fops = { - .open = lowpan_context_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(lowpan_context); static int lowpan_short_addr_get(void *data, u64 *val) { diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 5e9950453955..dc4411165e43 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -330,6 +330,7 @@ static void vlan_transfer_features(struct net_device *dev, vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); + vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } @@ -357,6 +358,7 @@ static int __vlan_device_event(struct net_device *dev, unsigned long event) static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; @@ -459,7 +461,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) - dev_change_flags(vlandev, flgs | IFF_UP); + dev_change_flags(vlandev, flgs | IFF_UP, + extack); netif_stacked_transfer_operstate(dev, vlandev); } break; @@ -647,93 +650,6 @@ out: return err; } -static struct sk_buff *vlan_gro_receive(struct list_head *head, - struct sk_buff *skb) -{ - const struct packet_offload *ptype; - unsigned int hlen, off_vlan; - struct sk_buff *pp = NULL; - struct vlan_hdr *vhdr; - struct sk_buff *p; - __be16 type; - int flush = 1; - - off_vlan = skb_gro_offset(skb); - hlen = off_vlan + sizeof(*vhdr); - vhdr = skb_gro_header_fast(skb, off_vlan); - if (skb_gro_header_hard(skb, hlen)) { - vhdr = skb_gro_header_slow(skb, hlen, off_vlan); - if (unlikely(!vhdr)) - goto out; - } - - type = vhdr->h_vlan_encapsulated_proto; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) - goto out_unlock; - - flush = 0; - - list_for_each_entry(p, head, list) { - struct vlan_hdr *vhdr2; - - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); - if (compare_vlan_header(vhdr, vhdr2)) - NAPI_GRO_CB(p)->same_flow = 0; - } - - skb_gro_pull(skb, sizeof(*vhdr)); - skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); - pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); - -out_unlock: - rcu_read_unlock(); -out: - skb_gro_flush_final(skb, pp, flush); - - return pp; -} - -static int vlan_gro_complete(struct sk_buff *skb, int nhoff) -{ - struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); - __be16 type = vhdr->h_vlan_encapsulated_proto; - struct packet_offload *ptype; - int err = -ENOENT; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); - - rcu_read_unlock(); - return err; -} - -static struct packet_offload vlan_packet_offloads[] __read_mostly = { - { - .type = cpu_to_be16(ETH_P_8021Q), - .priority = 10, - .callbacks = { - .gro_receive = vlan_gro_receive, - .gro_complete = vlan_gro_complete, - }, - }, - { - .type = cpu_to_be16(ETH_P_8021AD), - .priority = 10, - .callbacks = { - .gro_receive = vlan_gro_receive, - .gro_complete = vlan_gro_complete, - }, - }, -}; - static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); @@ -761,7 +677,6 @@ static struct pernet_operations vlan_net_ops = { static int __init vlan_proto_init(void) { int err; - unsigned int i; pr_info("%s v%s\n", vlan_fullname, vlan_version); @@ -785,9 +700,6 @@ static int __init vlan_proto_init(void) if (err < 0) goto err5; - for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) - dev_add_offload(&vlan_packet_offloads[i]); - vlan_ioctl_set(vlan_ioctl_handler); return 0; @@ -805,13 +717,8 @@ err0: static void __exit vlan_cleanup_module(void) { - unsigned int i; - vlan_ioctl_set(NULL); - for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) - dev_remove_offload(&vlan_packet_offloads[i]); - vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 44df1c3df02d..c46daf09a501 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -92,6 +92,18 @@ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, return NULL; } +static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) +{ + netdev_features_t ret; + + ret = real_dev->hw_enc_features & + (NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO | NETIF_F_GSO_ENCAP_ALL); + + if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) + return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; + return 0; +} + #define vlan_group_for_each_dev(grp, i, dev) \ for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \ if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 4f60e86f4b8d..a313165e7a67 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -57,7 +57,7 @@ bool vlan_do_receive(struct sk_buff **skbp) } skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats); @@ -223,6 +223,33 @@ static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vi return -ENODEV; } +int vlan_for_each(struct net_device *dev, + int (*action)(struct net_device *dev, int vid, void *arg), + void *arg) +{ + struct vlan_vid_info *vid_info; + struct vlan_info *vlan_info; + struct net_device *vdev; + int ret; + + ASSERT_RTNL(); + + vlan_info = rtnl_dereference(dev->vlan_info); + if (!vlan_info) + return 0; + + list_for_each_entry(vid_info, &vlan_info->vid_list, list) { + vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto, + vid_info->vid); + ret = action(vdev, vid_info->vid, arg); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL(vlan_for_each); + int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto) { struct net_device *real_dev = vlan_info->real_dev; @@ -426,3 +453,102 @@ bool vlan_uses_dev(const struct net_device *dev) return vlan_info->grp.nr_vlan_devs ? true : false; } EXPORT_SYMBOL(vlan_uses_dev); + +static struct sk_buff *vlan_gro_receive(struct list_head *head, + struct sk_buff *skb) +{ + const struct packet_offload *ptype; + unsigned int hlen, off_vlan; + struct sk_buff *pp = NULL; + struct vlan_hdr *vhdr; + struct sk_buff *p; + __be16 type; + int flush = 1; + + off_vlan = skb_gro_offset(skb); + hlen = off_vlan + sizeof(*vhdr); + vhdr = skb_gro_header_fast(skb, off_vlan); + if (skb_gro_header_hard(skb, hlen)) { + vhdr = skb_gro_header_slow(skb, hlen, off_vlan); + if (unlikely(!vhdr)) + goto out; + } + + type = vhdr->h_vlan_encapsulated_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) + goto out_unlock; + + flush = 0; + + list_for_each_entry(p, head, list) { + struct vlan_hdr *vhdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); + if (compare_vlan_header(vhdr, vhdr2)) + NAPI_GRO_CB(p)->same_flow = 0; + } + + skb_gro_pull(skb, sizeof(*vhdr)); + skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + +out_unlock: + rcu_read_unlock(); +out: + skb_gro_flush_final(skb, pp, flush); + + return pp; +} + +static int vlan_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); + __be16 type = vhdr->h_vlan_encapsulated_proto; + struct packet_offload *ptype; + int err = -ENOENT; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); + + rcu_read_unlock(); + return err; +} + +static struct packet_offload vlan_packet_offloads[] __read_mostly = { + { + .type = cpu_to_be16(ETH_P_8021Q), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, + { + .type = cpu_to_be16(ETH_P_8021AD), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, +}; + +static int __init vlan_offload_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_add_offload(&vlan_packet_offloads[i]); + + return 0; +} + +fs_initcall(vlan_offload_init); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ff720f1ebf73..b2d9c8f27cd7 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -562,6 +562,7 @@ static int vlan_dev_init(struct net_device *dev) dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL | NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_ALL_FCOE; @@ -572,6 +573,7 @@ static int vlan_dev_init(struct net_device *dev) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE; + dev->hw_enc_features = vlan_tnl_features(real_dev); /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; diff --git a/net/Kconfig b/net/Kconfig index f235edb593ba..5cb9de1aaf88 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -51,6 +51,9 @@ config NET_INGRESS config NET_EGRESS bool +config SKB_EXTENSIONS + bool + menu "Networking options" source "net/packet/Kconfig" @@ -184,6 +187,7 @@ config BRIDGE_NETFILTER depends on NETFILTER && INET depends on NETFILTER_ADVANCED select NETFILTER_FAMILY_BRIDGE + select SKB_EXTENSIONS default m ---help--- Enabling this option will let arptables resp. iptables see bridged diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index f75816f58107..c386e6981416 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -22,7 +22,6 @@ config BATMAN_ADV tristate "B.A.T.M.A.N. Advanced Meshing Protocol" depends on NET - select CRC16 select LIBCRC32C help B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is @@ -48,6 +47,7 @@ config BATMAN_ADV_BATMAN_V config BATMAN_ADV_BLA bool "Bridge Loop Avoidance" depends on BATMAN_ADV && INET + select CRC16 default y help This option enables BLA (Bridge Loop Avoidance), a mechanism @@ -82,6 +82,7 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) + default y help This option enables the multicast optimisation which aims to reduce the air overhead while improving the reliability of @@ -100,12 +101,13 @@ config BATMAN_ADV_DEBUGFS config BATMAN_ADV_DEBUG bool "B.A.T.M.A.N. debugging" - depends on BATMAN_ADV_DEBUGFS + depends on BATMAN_ADV help This is an option for use by developers; most people should say N here. This enables compilation of support for - outputting debugging information to the kernel log. The - output is controlled via the module parameter debug. + outputting debugging information to the debugfs log or tracing + buffer. The output is controlled via the batadv netdev specific + log_level setting. config BATMAN_ADV_TRACING bool "B.A.T.M.A.N. tracing support" diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index d2227091029f..f97e566f0402 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -34,7 +34,6 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> @@ -2585,13 +2584,14 @@ static void batadv_iv_gw_print(struct batadv_priv *bat_priv, * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ -static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { @@ -2611,13 +2611,16 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + ret = -EMSGSIZE; if (curr_gw == gw_node) @@ -2668,13 +2671,15 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, int idx_skip = cb->args[0]; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { + spin_lock_bh(&bat_priv->gw.list_lock); + cb->seq = bat_priv->gw.generation << 1 | 1; + + hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; - if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, gw_node)) { + if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv, + gw_node)) { idx_skip = idx - 1; goto unlock; } @@ -2682,7 +2687,7 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, idx_skip = idx; unlock: - rcu_read_unlock(); + spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 6baec4e68898..90e33f84d37a 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -27,11 +27,13 @@ #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> +#include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> +#include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> @@ -915,13 +917,14 @@ static void batadv_v_gw_print(struct batadv_priv *bat_priv, * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ -static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { @@ -941,13 +944,16 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + ret = -EMSGSIZE; if (curr_gw == gw_node) { @@ -1018,13 +1024,15 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, int idx_skip = cb->args[0]; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { + spin_lock_bh(&bat_priv->gw.list_lock); + cb->seq = bat_priv->gw.generation << 1 | 1; + + hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; - if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, gw_node)) { + if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv, + gw_node)) { idx_skip = idx - 1; goto unlock; } @@ -1032,7 +1040,7 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, idx_skip = idx; unlock: - rcu_read_unlock(); + spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 5f1aeeded0e3..5fdde2947802 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -2094,14 +2094,15 @@ out: * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface * @claim: entry to dump * * Return: 0 or error code. */ static int -batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_claim *claim) { @@ -2111,13 +2112,16 @@ batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, void *hdr; int ret = -EINVAL; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_BLA_CLAIM); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); @@ -2153,28 +2157,33 @@ out: * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface - * @head: bucket to dump + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int -batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, - struct hlist_head *head, int *idx_skip) + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_skip) { struct batadv_bla_claim *claim; int idx = 0; int ret = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(claim, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; - ret = batadv_bla_claim_dump_entry(msg, portid, seq, + ret = batadv_bla_claim_dump_entry(msg, portid, cb, primary_if, claim); if (ret) { *idx_skip = idx - 1; @@ -2184,7 +2193,7 @@ batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, *idx_skip = 0; unlock: - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return ret; } @@ -2204,7 +2213,6 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -2230,11 +2238,8 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_bla_claim_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, - primary_if, head, &idx)) + if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if, + hash, bucket, &idx)) break; bucket++; } @@ -2325,14 +2330,15 @@ out: * netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface * @backbone_gw: entry to dump * * Return: 0 or error code. */ static int -batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_backbone_gw *backbone_gw) { @@ -2343,13 +2349,16 @@ batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, void *hdr; int ret = -EINVAL; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_BLA_BACKBONE); if (!hdr) { ret = -ENOBUFS; goto out; } + genl_dump_check_consistent(cb, hdr); + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); spin_lock_bh(&backbone_gw->crc_lock); @@ -2386,28 +2395,33 @@ out: * a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @primary_if: primary interface - * @head: bucket to dump + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int -batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *primary_if, - struct hlist_head *head, int *idx_skip) + struct batadv_hashtable *hash, + unsigned int bucket, int *idx_skip) { struct batadv_bla_backbone_gw *backbone_gw; int idx = 0; int ret = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; - ret = batadv_bla_backbone_dump_entry(msg, portid, seq, + ret = batadv_bla_backbone_dump_entry(msg, portid, cb, primary_if, backbone_gw); if (ret) { *idx_skip = idx - 1; @@ -2417,7 +2431,7 @@ batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, *idx_skip = 0; unlock: - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return ret; } @@ -2437,7 +2451,6 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -2463,11 +2476,8 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_bla_backbone_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, - primary_if, head, &idx)) + if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if, + hash, bucket, &idx)) break; bucket++; } diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 8b608a2e2653..d4a7702e48d8 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -19,6 +19,7 @@ #include "debugfs.h" #include "main.h" +#include <asm/current.h> #include <linux/dcache.h> #include <linux/debugfs.h> #include <linux/err.h> @@ -27,6 +28,7 @@ #include <linux/fs.h> #include <linux/netdevice.h> #include <linux/printk.h> +#include <linux/sched.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/stddef.h> diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index a60bacf7120b..b9ffe1826527 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -863,23 +863,27 @@ out: * netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @dat_entry: entry to dump * * Return: 0 or error code. */ static int -batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_dat_entry *dat_entry) { int msecs; void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_DAT_CACHE); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + msecs = jiffies_to_msecs(jiffies - dat_entry->last_update); if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS, @@ -901,27 +905,31 @@ batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message - * @head: bucket to dump + * @cb: Control block containing additional options + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int -batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, - struct hlist_head *head, int *idx_skip) +batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_skip) { struct batadv_dat_entry *dat_entry; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(dat_entry, &hash->table[bucket], hash_entry) { if (idx < *idx_skip) goto skip; - if (batadv_dat_cache_dump_entry(msg, portid, seq, - dat_entry)) { - rcu_read_unlock(); + if (batadv_dat_cache_dump_entry(msg, portid, cb, dat_entry)) { + spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; @@ -930,7 +938,7 @@ batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, skip: idx++; } - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return 0; } @@ -951,7 +959,6 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; - struct hlist_head *head; int idx = cb->args[1]; int ifindex; int ret = 0; @@ -977,10 +984,7 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) } while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_dat_cache_dump_bucket(msg, portid, - cb->nlh->nlmsg_seq, head, + if (batadv_dat_cache_dump_bucket(msg, portid, cb, hash, bucket, &idx)) break; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 140c61a3f1ec..9d8e5eda2314 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -377,6 +377,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); + bat_priv->gw.generation++; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", @@ -472,6 +473,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, if (!hlist_unhashed(&gw_node->list)) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); + bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); @@ -518,6 +520,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) &bat_priv->gw.gateway_list, list) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); + bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 781c5b6e6e8e..508f4416dfc9 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -951,6 +951,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) batadv_check_known_mac_addr(hard_iface->net_dev); kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); + batadv_hardif_generation++; return hard_iface; @@ -993,6 +994,7 @@ void batadv_hardif_remove_interfaces(void) list_for_each_entry_safe(hard_iface, hard_iface_tmp, &batadv_hardif_list, list) { list_del_rcu(&hard_iface->list); + batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); } rtnl_unlock(); @@ -1054,6 +1056,7 @@ static int batadv_hard_if_event(struct notifier_block *this, case NETDEV_UNREGISTER: case NETDEV_PRE_TYPE_CHANGE: list_del_rcu(&hard_iface->list); + batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); break; diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 7b49e4001778..9194f4d891b1 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -32,6 +32,8 @@ static void batadv_hash_init(struct batadv_hashtable *hash) INIT_HLIST_HEAD(&hash->table[i]); spin_lock_init(&hash->list_locks[i]); } + + atomic_set(&hash->generation, 0); } /** diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 9490a7ca2ba6..0e36fa1c7c3e 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -21,6 +21,7 @@ #include "main.h" +#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/list.h> #include <linux/rculist.h> @@ -58,6 +59,9 @@ struct batadv_hashtable { /** @size: size of hashtable */ u32 size; + + /** @generation: current (generation) sequence number */ + atomic_t generation; }; /* allocates and clears the hash */ @@ -112,6 +116,7 @@ static inline int batadv_hash_add(struct batadv_hashtable *hash, /* no duplicate found in list, add new element */ hlist_add_head_rcu(data_node, head); + atomic_inc(&hash->generation); ret = 0; @@ -154,6 +159,7 @@ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, data_save = node; hlist_del_rcu(node); + atomic_inc(&hash->generation); break; } spin_unlock_bh(&hash->list_locks[index]); diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 6beb5f067810..02e55b78132f 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -43,6 +43,8 @@ #include "debugfs.h" #include "trace.h" +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + #define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; @@ -92,33 +94,6 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, return 0; } -/** - * batadv_debug_log() - Add debug log entry - * @bat_priv: the bat priv with all the soft interface information - * @fmt: format string - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", - jiffies_to_msecs(jiffies), &vaf); - - trace_batadv_dbg(bat_priv, &vaf); - - va_end(args); - - return 0; -} - static int batadv_log_open(struct inode *inode, struct file *file) { if (!try_module_get(THIS_MODULE)) @@ -259,3 +234,34 @@ void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) kfree(bat_priv->debug_log); bat_priv->debug_log = NULL; } + +#endif /* CONFIG_BATMAN_ADV_DEBUGFS */ + +/** + * batadv_debug_log() - Add debug log entry + * @bat_priv: the bat priv with all the soft interface information + * @fmt: format string + * + * Return: 0 on success or negative error number in case of failure + */ +int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV", + jiffies_to_msecs(jiffies), &vaf); +#endif + + trace_batadv_dbg(bat_priv, &vaf); + + va_end(args); + + return 0; +} diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 69c0d85bceb3..d1ed839fd32b 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -74,6 +74,7 @@ * list traversals just rcu-locked */ struct list_head batadv_hardif_list; +unsigned int batadv_hardif_generation; static int (*batadv_rx_handler[256])(struct sk_buff *skb, struct batadv_hard_iface *recv_if); @@ -186,6 +187,8 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); INIT_HLIST_HEAD(&bat_priv->tp_list); + bat_priv->gw.generation = 0; + ret = batadv_v_mesh_init(bat_priv); if (ret < 0) goto err; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2002b70e18db..b572066325e4 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.4" +#define BATADV_SOURCE_VERSION "2019.0" #endif /* B.A.T.M.A.N. parameters */ @@ -247,6 +247,7 @@ static inline int batadv_print_vid(unsigned short vid) } extern struct list_head batadv_hardif_list; +extern unsigned int batadv_hardif_generation; extern unsigned char batadv_broadcast_addr[]; extern struct workqueue_struct *batadv_event_workqueue; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 86725d792e15..69244e4598f5 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1365,22 +1365,26 @@ int batadv_mcast_mesh_info_put(struct sk_buff *msg, * to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @orig_node: originator to dump the multicast flags of * * Return: 0 or error code. */ static int -batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_orig_node *orig_node) { void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_MCAST_FLAGS); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig)) { genlmsg_cancel(msg, hdr); @@ -1405,21 +1409,26 @@ batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * table to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message - * @head: bucket to dump + * @cb: Control block containing additional options + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int -batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, - struct hlist_head *head, long *idx_skip) +batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, + struct batadv_hashtable *hash, + unsigned int bucket, long *idx_skip) { struct batadv_orig_node *orig_node; long idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; @@ -1427,9 +1436,8 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, if (idx < *idx_skip) goto skip; - if (batadv_mcast_flags_dump_entry(msg, portid, seq, - orig_node)) { - rcu_read_unlock(); + if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) { + spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; @@ -1438,7 +1446,7 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, skip: idx++; } - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); return 0; } @@ -1447,7 +1455,7 @@ skip: * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @portid: netlink port - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: the bat priv with all the soft interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump @@ -1455,19 +1463,17 @@ skip: * Return: 0 or error code. */ static int -__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq, +__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, long *bucket, long *idx) { struct batadv_hashtable *hash = bat_priv->orig_hash; long bucket_tmp = *bucket; - struct hlist_head *head; long idx_tmp = *idx; while (bucket_tmp < hash->size) { - head = &hash->table[bucket_tmp]; - - if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head, - &idx_tmp)) + if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, + *bucket, &idx_tmp)) break; bucket_tmp++; @@ -1550,8 +1556,7 @@ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) return ret; bat_priv = netdev_priv(primary_if->soft_iface); - ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, bucket, idx); + ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); return ret; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 0d9459b69bdb..2dc3304cee54 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -29,11 +29,11 @@ #include <linux/if_ether.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> @@ -445,23 +445,27 @@ out: * batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @hard_iface: Hard interface to dump * * Return: error code, or 0 on success */ static int -batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_hard_iface *hard_iface) { struct net_device *net_dev = hard_iface->net_dev; void *hdr; - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_HARDIFS); if (!hdr) return -EMSGSIZE; + genl_dump_check_consistent(cb, hdr); + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, @@ -498,7 +502,6 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_hard_iface *hard_iface; int ifindex; int portid = NETLINK_CB(cb->skb).portid; - int seq = cb->nlh->nlmsg_seq; int skip = cb->args[0]; int i = 0; @@ -516,23 +519,24 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) return -ENODEV; } - rcu_read_lock(); + rtnl_lock(); + cb->seq = batadv_hardif_generation << 1 | 1; - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; if (i++ < skip) continue; - if (batadv_netlink_dump_hardif_entry(msg, portid, seq, + if (batadv_netlink_dump_hardif_entry(msg, portid, cb, hard_iface)) { i--; break; } } - rcu_read_unlock(); + rtnl_unlock(); dev_put(soft_iface); diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 3d57f9981f25..8e1024217cff 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -16,7 +16,5 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#include <linux/module.h> - #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 3acda26a30ca..104784be94d7 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -21,7 +21,13 @@ #include "main.h" +#include <linux/bug.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/percpu.h> +#include <linux/printk.h> #include <linux/tracepoint.h> +#include <linux/types.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM batadv diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index d21624c44665..8dcd4968cde7 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1145,14 +1145,15 @@ out: * batadv_tt_local_dump_entry() - Dump one TT local entry into a message * @msg :Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @common: tt local & tt global common data * * Return: Error code, or 0 on success */ static int -batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, +batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common) { @@ -1173,12 +1174,14 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, batadv_softif_vlan_put(vlan); - hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, - NLM_F_MULTI, + hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, + &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_LOCAL); if (!hdr) return -ENOBUFS; + genl_dump_check_consistent(cb, hdr); + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || @@ -1201,34 +1204,39 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request - * @seq: Sequence number of netlink message + * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information - * @head: Pointer to the list containing the local tt entries + * @hash: hash to dump + * @bucket: bucket index to dump * @idx_s: Number of entries to skip * * Return: Error code, or 0 on success */ static int -batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, +batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, + struct netlink_callback *cb, struct batadv_priv *bat_priv, - struct hlist_head *head, int *idx_s) + struct batadv_hashtable *hash, unsigned int bucket, + int *idx_s) { struct batadv_tt_common_entry *common; int idx = 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(common, head, hash_entry) { + spin_lock_bh(&hash->list_locks[bucket]); + cb->seq = atomic_read(&hash->generation) << 1 | 1; + + hlist_for_each_entry(common, &hash->table[bucket], hash_entry) { if (idx++ < *idx_s) continue; - if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv, + if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv, common)) { - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = idx - 1; return -EMSGSIZE; } } - rcu_read_unlock(); + spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = 0; return 0; @@ -1248,7 +1256,6 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; - struct hlist_head *head; int ret; int ifindex; int bucket = cb->args[0]; @@ -1276,10 +1283,8 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) hash = bat_priv->tt.local_hash; while (bucket < hash->size) { - head = &hash->table[bucket]; - - if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, - bat_priv, head, &idx)) + if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv, + hash, bucket, &idx)) break; bucket++; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 45b5592de816..cbe17da36fcb 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1096,12 +1096,15 @@ struct batadv_priv_gw { /** @gateway_list: list of available gateway nodes */ struct hlist_head gateway_list; - /** @list_lock: lock protecting gateway_list & curr_gw */ + /** @list_lock: lock protecting gateway_list, curr_gw, generation */ spinlock_t list_lock; /** @curr_gw: pointer to currently selected gateway node */ struct batadv_gw_node __rcu *curr_gw; + /** @generation: current (generation) sequence number */ + unsigned int generation; + /** * @mode: gateway operation: off, client or server (see batadv_gw_modes) */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 828e87fe8027..9d79c7de234a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -607,7 +607,7 @@ static void ifup(struct net_device *netdev) int err; rtnl_lock(); - err = dev_open(netdev); + err = dev_open(netdev, NULL); if (err < 0) BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); rtnl_unlock(); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ef9928d7b4fb..ac2826ce162b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5711,6 +5711,12 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, return true; } + /* Check if request ended in Command Status - no way to retreive + * any extra parameters in this case. + */ + if (hdr->evt == HCI_EV_CMD_STATUS) + return false; + if (hdr->evt != HCI_EV_CMD_COMPLETE) { bt_dev_err(hdev, "last event is not cmd complete (0x%2.2x)", hdr->evt); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index e8c9ef1e1922..ca73d36cc149 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1556,7 +1556,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || mgmt_get_connectable(hdev); - if (!is_advertising_allowed(hdev, connectable)) + if (!is_advertising_allowed(hdev, connectable)) return -EPERM; /* Set require_privacy to true only when non-connectable diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 2146e0f3b6f8..2a7fb517d460 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7650,17 +7650,7 @@ static int l2cap_debugfs_show(struct seq_file *f, void *p) return 0; } -static int l2cap_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, l2cap_debugfs_show, inode->i_private); -} - -static const struct file_operations l2cap_debugfs_fops = { - .open = l2cap_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(l2cap_debugfs); static struct dentry *l2cap_debugfs; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index b98225d65e87..1a635df80643 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -2166,17 +2166,7 @@ static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x) return 0; } -static int rfcomm_dlc_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, rfcomm_dlc_debugfs_show, inode->i_private); -} - -static const struct file_operations rfcomm_dlc_debugfs_fops = { - .open = rfcomm_dlc_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(rfcomm_dlc_debugfs); static struct dentry *rfcomm_dlc_debugfs; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index d606e9212291..aa0db1d1bd9b 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1020,17 +1020,7 @@ static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p) return 0; } -static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, rfcomm_sock_debugfs_show, inode->i_private); -} - -static const struct file_operations rfcomm_sock_debugfs_fops = { - .open = rfcomm_sock_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(rfcomm_sock_debugfs); static struct dentry *rfcomm_sock_debugfs; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 8f0f9279eac9..529b38996d8b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1173,17 +1173,7 @@ static int sco_debugfs_show(struct seq_file *f, void *p) return 0; } -static int sco_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, sco_debugfs_show, inode->i_private); -} - -static const struct file_operations sco_debugfs_fops = { - .open = sco_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(sco_debugfs); static struct dentry *sco_debugfs; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index c822e626761b..621146d04c03 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1390,7 +1390,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) if (!smp) return NULL; - smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + smp->tfm_aes = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(smp->tfm_aes)) { BT_ERR("Unable to create AES crypto context"); goto zfree_smp; @@ -3233,7 +3233,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) if (!smp) return ERR_PTR(-ENOMEM); - tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(tfm_aes)) { BT_ERR("Unable to create AES crypto context"); kzfree(smp); @@ -3906,13 +3906,13 @@ int __init bt_selftest_smp(void) struct crypto_kpp *tfm_ecdh; int err; - tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(tfm_aes)) { BT_ERR("Unable to create AES crypto context"); return PTR_ERR(tfm_aes); } - tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); crypto_free_cipher(tfm_aes); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c89c22c49015..fa2644d276ef 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -28,12 +28,13 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, return ret; } -static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, + u32 *time) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; - u32 ret = 0, i; + u32 i; for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); @@ -49,7 +50,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) repeat = 1; time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - ret = bpf_test_run_one(prog, ctx, storage); + *ret = bpf_test_run_one(prog, ctx, storage); if (need_resched()) { if (signal_pending(current)) break; @@ -65,7 +66,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); - return ret; + return 0; } static int bpf_test_finish(const union bpf_attr *kattr, @@ -74,8 +75,18 @@ static int bpf_test_finish(const union bpf_attr *kattr, { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); int err = -EFAULT; + u32 copy_size = size; + + /* Clamp copy if the user has provided a size hint, but copy the full + * buffer if not to retain old behaviour. + */ + if (kattr->test.data_size_out && + copy_size > kattr->test.data_size_out) { + copy_size = kattr->test.data_size_out; + err = -ENOSPC; + } - if (data_out && copy_to_user(data_out, data, size)) + if (data_out && copy_to_user(data_out, data, copy_size)) goto out; if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) goto out; @@ -83,7 +94,8 @@ static int bpf_test_finish(const union bpf_attr *kattr, goto out; if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) goto out; - err = 0; + if (err != -ENOSPC) + err = 0; out: return err; } @@ -165,7 +177,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); - retval = bpf_test_run(prog, skb, repeat, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + if (ret) { + kfree_skb(skb); + kfree(sk); + return ret; + } if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -212,11 +229,14 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - retval = bpf_test_run(prog, &xdp, repeat, &duration); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + if (ret) + goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); +out: kfree(data); return ret; } diff --git a/net/bridge/br.c b/net/bridge/br.c index 360ad66c21e9..a5174e5001d8 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -31,6 +31,8 @@ */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); + struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; @@ -56,6 +58,17 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v br_mtu_auto_adjust(br); break; + case NETDEV_PRE_CHANGEADDR: + if (br->dev->addr_assign_type == NET_ADDR_SET) + break; + prechaddr_info = ptr; + err = dev_pre_changeaddr_notify(br->dev, + prechaddr_info->dev_addr, + extack); + if (err) + return notifier_from_errno(err); + break; + case NETDEV_CHANGEADDR: spin_lock_bh(&br->lock); br_fdb_changeaddr(p, dev->dev_addr); @@ -175,6 +188,82 @@ static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; +/* br_boolopt_toggle - change user-controlled boolean option + * + * @br: bridge device + * @opt: id of the option to change + * @on: new option value + * @extack: extack for error messages + * + * Changes the value of the respective boolean option to @on taking care of + * any internal option value mapping and configuration. + */ +int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, + struct netlink_ext_ack *extack) +{ + switch (opt) { + case BR_BOOLOPT_NO_LL_LEARN: + br_opt_toggle(br, BROPT_NO_LL_LEARN, on); + break; + default: + /* shouldn't be called with unsupported options */ + WARN_ON(1); + break; + } + + return 0; +} + +int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) +{ + switch (opt) { + case BR_BOOLOPT_NO_LL_LEARN: + return br_opt_get(br, BROPT_NO_LL_LEARN); + default: + /* shouldn't be called with unsupported options */ + WARN_ON(1); + break; + } + + return 0; +} + +int br_boolopt_multi_toggle(struct net_bridge *br, + struct br_boolopt_multi *bm, + struct netlink_ext_ack *extack) +{ + unsigned long bitmap = bm->optmask; + int err = 0; + int opt_id; + + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { + bool on = !!(bm->optval & BIT(opt_id)); + + err = br_boolopt_toggle(br, opt_id, on, extack); + if (err) { + br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", + opt_id, br_boolopt_get(br, opt_id), on, err); + break; + } + } + + return err; +} + +void br_boolopt_multi_get(const struct net_bridge *br, + struct br_boolopt_multi *bm) +{ + u32 optval = 0; + int opt_id; + + for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) + optval |= (br_boolopt_get(br, opt_id) << opt_id); + + bm->optval = optval; + bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0); +} + +/* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index c6abf927f0c9..013323b6dbe4 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -131,9 +131,17 @@ static int br_dev_init(struct net_device *dev) return err; } + err = br_mdb_hash_init(br); + if (err) { + free_percpu(br->stats); + br_fdb_hash_fini(br); + return err; + } + err = br_vlan_init(br); if (err) { free_percpu(br->stats); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); return err; } @@ -142,6 +150,7 @@ static int br_dev_init(struct net_device *dev) if (err) { free_percpu(br->stats); br_vlan_flush(br); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); } br_set_lockdep_class(dev); @@ -156,6 +165,7 @@ static void br_dev_uninit(struct net_device *dev) br_multicast_dev_del(br); br_multicast_uninit_stats(br); br_vlan_flush(br); + br_mdb_hash_fini(br); br_fdb_hash_fini(br); free_percpu(br->stats); } @@ -393,6 +403,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, .ndo_fdb_dump = br_fdb_dump, + .ndo_fdb_get = br_fdb_get, .ndo_bridge_getlink = br_getlink, .ndo_bridge_setlink = br_setlink, .ndo_bridge_dellink = br_dellink, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e56ba3912a90..fe3c758791ca 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -773,6 +773,32 @@ skip: return err; } +int br_fdb_get(struct sk_buff *skb, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, + u16 vid, u32 portid, u32 seq, + struct netlink_ext_ack *extack) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_fdb_entry *f; + int err = 0; + + rcu_read_lock(); + f = br_fdb_find_rcu(br, addr, vid); + if (!f) { + NL_SET_ERR_MSG(extack, "Fdb entry not found"); + err = -ENOENT; + goto errout; + } + + err = fdb_fill_info(skb, br, f, portid, seq, + RTM_NEWNEIGH, 0); +errout: + rcu_read_unlock(); + return err; +} + /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, u16 state, u16 flags, u16 vid, @@ -1164,3 +1190,23 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, spin_unlock_bh(&br->hash_lock); } + +void br_fdb_clear_offload(const struct net_device *dev, u16 vid) +{ + struct net_bridge_fdb_entry *f; + struct net_bridge_port *p; + + ASSERT_RTNL(); + + p = br_port_get_rtnl(dev); + if (!p) + return; + + spin_lock_bh(&p->br->hash_lock); + hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { + if (f->dst == p && f->key.vlan_id == vid) + f->offloaded = 0; + } + spin_unlock_bh(&p->br->hash_lock); +} +EXPORT_SYMBOL_GPL(br_fdb_clear_offload); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 9b46d2dc4c22..41f0a696a65f 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -650,7 +650,16 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); - err = nbp_vlan_init(p); + if (br->dev->addr_assign_type != NET_ADDR_SET) { + /* Ask for permission to use this MAC address now, even if we + * don't end up choosing it below. + */ + err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); + if (err) + goto err7; + } + + err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); goto err7; @@ -741,3 +750,15 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) if (mask & BR_NEIGH_SUPPRESS) br_recalculate_neigh_suppress_enabled(br); } + +bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag) +{ + struct net_bridge_port *p; + + p = br_port_get_rtnl_rcu(dev); + if (!p) + return false; + + return p->flags & flag; +} +EXPORT_SYMBOL_GPL(br_port_flag_is_set); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 3ddca11f44c2..5ea7e56119c1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -188,7 +188,9 @@ static void __br_handle_local_finish(struct sk_buff *skb) u16 vid = 0; /* check if vlan is allowed, to avoid spoofing */ - if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) + if ((p->flags & BR_LEARNING) && + !br_opt_get(p->br, BROPT_NO_LL_LEARN) && + br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index a7ea2d431714..f69c8d91dc81 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -78,82 +78,72 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { + int idx = 0, s_idx = cb->args[1], err = 0; struct net_bridge *br = netdev_priv(dev); - struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; - int i, err = 0; - int idx = 0, s_idx = cb->args[1]; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; - mdb = rcu_dereference(br->mdb); - if (!mdb) - return 0; - nest = nla_nest_start(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; - for (i = 0; i < mdb->max; i++) { - struct net_bridge_mdb_entry *mp; + hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct net_bridge_port *port; - hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) { - if (idx < s_idx) - goto skip; + if (idx < s_idx) + goto skip; - nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY); - if (nest2 == NULL) { - err = -EMSGSIZE; - goto out; - } + nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY); + if (!nest2) { + err = -EMSGSIZE; + break; + } - for (pp = &mp->ports; - (p = rcu_dereference(*pp)) != NULL; - pp = &p->next) { - struct nlattr *nest_ent; - struct br_mdb_entry e; - - port = p->port; - if (!port) - continue; - - memset(&e, 0, sizeof(e)); - e.ifindex = port->dev->ifindex; - e.vid = p->addr.vid; - __mdb_entry_fill_flags(&e, p->flags); - if (p->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = p->addr.u.ip4; + for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; + pp = &p->next) { + struct nlattr *nest_ent; + struct br_mdb_entry e; + + port = p->port; + if (!port) + continue; + + memset(&e, 0, sizeof(e)); + e.ifindex = port->dev->ifindex; + e.vid = p->addr.vid; + __mdb_entry_fill_flags(&e, p->flags); + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (p->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif - e.addr.proto = p->addr.proto; - nest_ent = nla_nest_start(skb, - MDBA_MDB_ENTRY_INFO); - if (!nest_ent) { - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } - if (nla_put_nohdr(skb, sizeof(e), &e) || - nla_put_u32(skb, - MDBA_MDB_EATTR_TIMER, - br_timer_value(&p->timer))) { - nla_nest_cancel(skb, nest_ent); - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } - nla_nest_end(skb, nest_ent); + e.addr.proto = p->addr.proto; + nest_ent = nla_nest_start(skb, MDBA_MDB_ENTRY_INFO); + if (!nest_ent) { + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; } - nla_nest_end(skb, nest2); - skip: - idx++; + if (nla_put_nohdr(skb, sizeof(e), &e) || + nla_put_u32(skb, + MDBA_MDB_EATTR_TIMER, + br_timer_value(&p->timer))) { + nla_nest_cancel(skb, nest_ent); + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; + } + nla_nest_end(skb, nest_ent); } + nla_nest_end(skb, nest2); +skip: + idx++; } out: @@ -203,8 +193,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); - /* In theory this could be wrapped to 0... */ - cb->seq = net->dev_base_seq + br_mdb_rehash_seq; + cb->seq = net->dev_base_seq; for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_EBRIDGE) { @@ -297,7 +286,6 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv) struct br_mdb_complete_info *data = priv; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; @@ -306,8 +294,7 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv) goto err; spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, &data->ip); + mp = br_mdb_ip_get(br, &data->ip); if (!mp) goto out; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; @@ -344,7 +331,7 @@ static void br_mdb_switchdev_host_port(struct net_device *dev, mdb.obj.orig_dev = dev; switch (type) { case RTM_NEWMDB: - switchdev_port_obj_add(lower_dev, &mdb.obj); + switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); break; case RTM_DELMDB: switchdev_port_obj_del(lower_dev, &mdb.obj); @@ -394,7 +381,7 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, __mdb_entry_to_br_ip(entry, &complete_info->ip); mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_mdb_complete; - if (switchdev_port_obj_add(port_dev, &mdb.obj)) + if (switchdev_port_obj_add(port_dev, &mdb.obj, NULL)) kfree(complete_info); } } else if (p && port_dev && type == RTM_DELMDB) { @@ -588,14 +575,12 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct net_bridge_mdb_htable *mdb; unsigned long now = jiffies; int err; - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, group); + mp = br_mdb_ip_get(br, group); if (!mp) { - mp = br_multicast_new_group(br, port, group); + mp = br_multicast_new_group(br, group); err = PTR_ERR_OR_ZERO(mp); if (err) return err; @@ -696,7 +681,6 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; @@ -709,9 +693,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) __mdb_entry_to_br_ip(entry, &ip); spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - - mp = br_mdb_ip_get(mdb, &ip); + mp = br_mdb_ip_get(br, &ip); if (!mp) goto unlock; @@ -728,7 +710,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); err = 0; if (!mp->ports && !mp->host_joined && diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 6bac0d6b7b94..3aeff0895669 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -37,6 +37,14 @@ #include "br_private.h" +static const struct rhashtable_params br_mdb_rht_params = { + .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode), + .key_offset = offsetof(struct net_bridge_mdb_entry, addr), + .key_len = sizeof(struct br_ip), + .automatic_shrinking = true, + .locks_mul = 1, +}; + static void br_multicast_start_querier(struct net_bridge *br, struct bridge_mcast_own_query *query); static void br_multicast_add_router(struct net_bridge *br, @@ -54,7 +62,6 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif -unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) { @@ -73,89 +80,58 @@ static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) return 0; } -static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip, - __u16 vid) -{ - return jhash_2words((__force u32)ip, vid, mdb->secret) & (mdb->max - 1); -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb, - const struct in6_addr *ip, - __u16 vid) +static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br, + struct br_ip *dst) { - return jhash_2words(ipv6_addr_hash(ip), vid, - mdb->secret) & (mdb->max - 1); + return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); } -#endif -static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb, - struct br_ip *ip) -{ - switch (ip->proto) { - case htons(ETH_P_IP): - return __br_ip4_hash(mdb, ip->u.ip4, ip->vid); -#if IS_ENABLED(CONFIG_IPV6) - case htons(ETH_P_IPV6): - return __br_ip6_hash(mdb, &ip->u.ip6, ip->vid); -#endif - } - return 0; -} - -static struct net_bridge_mdb_entry *__br_mdb_ip_get( - struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash) +struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br, + struct br_ip *dst) { - struct net_bridge_mdb_entry *mp; - - hlist_for_each_entry_rcu(mp, &mdb->mhash[hash], hlist[mdb->ver]) { - if (br_ip_equal(&mp->addr, dst)) - return mp; - } + struct net_bridge_mdb_entry *ent; - return NULL; -} + lockdep_assert_held_once(&br->multicast_lock); -struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, - struct br_ip *dst) -{ - if (!mdb) - return NULL; + rcu_read_lock(); + ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); + rcu_read_unlock(); - return __br_mdb_ip_get(mdb, dst, br_ip_hash(mdb, dst)); + return ent; } -static struct net_bridge_mdb_entry *br_mdb_ip4_get( - struct net_bridge_mdb_htable *mdb, __be32 dst, __u16 vid) +static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br, + __be32 dst, __u16 vid) { struct br_ip br_dst; + memset(&br_dst, 0, sizeof(br_dst)); br_dst.u.ip4 = dst; br_dst.proto = htons(ETH_P_IP); br_dst.vid = vid; - return br_mdb_ip_get(mdb, &br_dst); + return br_mdb_ip_get(br, &br_dst); } #if IS_ENABLED(CONFIG_IPV6) -static struct net_bridge_mdb_entry *br_mdb_ip6_get( - struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst, - __u16 vid) +static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, + const struct in6_addr *dst, + __u16 vid) { struct br_ip br_dst; + memset(&br_dst, 0, sizeof(br_dst)); br_dst.u.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); br_dst.vid = vid; - return br_mdb_ip_get(mdb, &br_dst); + return br_mdb_ip_get(br, &br_dst); } #endif struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb); struct br_ip ip; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) @@ -164,6 +140,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, if (BR_INPUT_SKB_CB(skb)->igmp) return NULL; + memset(&ip, 0, sizeof(ip)); ip.proto = skb->protocol; ip.vid = vid; @@ -180,70 +157,13 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return NULL; } - return br_mdb_ip_get(mdb, &ip); -} - -static void br_mdb_free(struct rcu_head *head) -{ - struct net_bridge_mdb_htable *mdb = - container_of(head, struct net_bridge_mdb_htable, rcu); - struct net_bridge_mdb_htable *old = mdb->old; - - mdb->old = NULL; - kfree(old->mhash); - kfree(old); -} - -static int br_mdb_copy(struct net_bridge_mdb_htable *new, - struct net_bridge_mdb_htable *old, - int elasticity) -{ - struct net_bridge_mdb_entry *mp; - int maxlen; - int len; - int i; - - for (i = 0; i < old->max; i++) - hlist_for_each_entry(mp, &old->mhash[i], hlist[old->ver]) - hlist_add_head(&mp->hlist[new->ver], - &new->mhash[br_ip_hash(new, &mp->addr)]); - - if (!elasticity) - return 0; - - maxlen = 0; - for (i = 0; i < new->max; i++) { - len = 0; - hlist_for_each_entry(mp, &new->mhash[i], hlist[new->ver]) - len++; - if (len > maxlen) - maxlen = len; - } - - return maxlen > elasticity ? -EINVAL : 0; -} - -void br_multicast_free_pg(struct rcu_head *head) -{ - struct net_bridge_port_group *p = - container_of(head, struct net_bridge_port_group, rcu); - - kfree(p); -} - -static void br_multicast_free_group(struct rcu_head *head) -{ - struct net_bridge_mdb_entry *mp = - container_of(head, struct net_bridge_mdb_entry, rcu); - - kfree(mp); + return br_mdb_ip_get_rcu(br, &ip); } static void br_multicast_group_expired(struct timer_list *t) { struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer); struct net_bridge *br = mp->br; - struct net_bridge_mdb_htable *mdb; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&mp->timer)) @@ -255,12 +175,11 @@ static void br_multicast_group_expired(struct timer_list *t) if (mp->ports) goto out; - mdb = mlock_dereference(br->mdb, br); - - hlist_del_rcu(&mp->hlist[mdb->ver]); - mdb->size--; + rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + hlist_del_rcu(&mp->mdb_node); - call_rcu_bh(&mp->rcu, br_multicast_free_group); + kfree_rcu(mp, rcu); out: spin_unlock(&br->multicast_lock); @@ -269,14 +188,11 @@ out: static void br_multicast_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - mdb = mlock_dereference(br->mdb, br); - - mp = br_mdb_ip_get(mdb, &pg->addr); + mp = br_mdb_ip_get(br, &pg->addr); if (WARN_ON(!mp)) return; @@ -291,7 +207,7 @@ static void br_multicast_del_pg(struct net_bridge *br, del_timer(&p->timer); br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, p->flags); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) @@ -319,53 +235,6 @@ out: spin_unlock(&br->multicast_lock); } -static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max, - int elasticity) -{ - struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1); - struct net_bridge_mdb_htable *mdb; - int err; - - mdb = kmalloc(sizeof(*mdb), GFP_ATOMIC); - if (!mdb) - return -ENOMEM; - - mdb->max = max; - mdb->old = old; - - mdb->mhash = kcalloc(max, sizeof(*mdb->mhash), GFP_ATOMIC); - if (!mdb->mhash) { - kfree(mdb); - return -ENOMEM; - } - - mdb->size = old ? old->size : 0; - mdb->ver = old ? old->ver ^ 1 : 0; - - if (!old || elasticity) - get_random_bytes(&mdb->secret, sizeof(mdb->secret)); - else - mdb->secret = old->secret; - - if (!old) - goto out; - - err = br_mdb_copy(mdb, old, elasticity); - if (err) { - kfree(mdb->mhash); - kfree(mdb); - return err; - } - - br_mdb_rehash_seq++; - call_rcu_bh(&mdb->rcu, br_mdb_free); - -out: - rcu_assign_pointer(*mdbp, mdb); - - return 0; -} - static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, __be32 group, u8 *igmp_type) @@ -589,111 +458,19 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, return NULL; } -static struct net_bridge_mdb_entry *br_multicast_get_group( - struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, int hash) -{ - struct net_bridge_mdb_htable *mdb; - struct net_bridge_mdb_entry *mp; - unsigned int count = 0; - unsigned int max; - int elasticity; - int err; - - mdb = rcu_dereference_protected(br->mdb, 1); - hlist_for_each_entry(mp, &mdb->mhash[hash], hlist[mdb->ver]) { - count++; - if (unlikely(br_ip_equal(group, &mp->addr))) - return mp; - } - - elasticity = 0; - max = mdb->max; - - if (unlikely(count > br->hash_elasticity && count)) { - if (net_ratelimit()) - br_info(br, "Multicast hash table " - "chain limit reached: %s\n", - port ? port->dev->name : br->dev->name); - - elasticity = br->hash_elasticity; - } - - if (mdb->size >= max) { - max *= 2; - if (unlikely(max > br->hash_max)) { - br_warn(br, "Multicast hash table maximum of %d " - "reached, disabling snooping: %s\n", - br->hash_max, - port ? port->dev->name : br->dev->name); - err = -E2BIG; -disable: - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); - goto err; - } - } - - if (max > mdb->max || elasticity) { - if (mdb->old) { - if (net_ratelimit()) - br_info(br, "Multicast hash table " - "on fire: %s\n", - port ? port->dev->name : br->dev->name); - err = -EEXIST; - goto err; - } - - err = br_mdb_rehash(&br->mdb, max, elasticity); - if (err) { - br_warn(br, "Cannot rehash multicast " - "hash table, disabling snooping: %s, %d, %d\n", - port ? port->dev->name : br->dev->name, - mdb->size, err); - goto disable; - } - - err = -EAGAIN; - goto err; - } - - return NULL; - -err: - mp = ERR_PTR(err); - return mp; -} - struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, - struct net_bridge_port *p, struct br_ip *group) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; - int hash; int err; - mdb = rcu_dereference_protected(br->mdb, 1); - if (!mdb) { - err = br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0); - if (err) - return ERR_PTR(err); - goto rehash; - } - - hash = br_ip_hash(mdb, group); - mp = br_multicast_get_group(br, p, group, hash); - switch (PTR_ERR(mp)) { - case 0: - break; - - case -EAGAIN: -rehash: - mdb = rcu_dereference_protected(br->mdb, 1); - hash = br_ip_hash(mdb, group); - break; + mp = br_mdb_ip_get(br, group); + if (mp) + return mp; - default: - goto out; + if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); + return ERR_PTR(-E2BIG); } mp = kzalloc(sizeof(*mp), GFP_ATOMIC); @@ -703,11 +480,15 @@ rehash: mp->br = br; mp->addr = *group; timer_setup(&mp->timer, br_multicast_group_expired, 0); + err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + if (err) { + kfree(mp); + mp = ERR_PTR(err); + } else { + hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list); + } - hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]); - mdb->size++; - -out: return mp; } @@ -768,7 +549,7 @@ static int br_multicast_add_group(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - mp = br_multicast_new_group(br, port, group); + mp = br_multicast_new_group(br, group); err = PTR_ERR(mp); if (IS_ERR(mp)) goto err; @@ -837,6 +618,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, if (ipv6_addr_is_ll_all_nodes(group)) return 0; + memset(&br_group, 0, sizeof(br_group)); br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; @@ -1483,7 +1265,7 @@ static void br_ip4_multicast_query(struct net_bridge *br, goto out; } - mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid); + mp = br_mdb_ip4_get(br, group, vid); if (!mp) goto out; @@ -1567,7 +1349,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, goto out; } - mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid); + mp = br_mdb_ip6_get(br, group, vid); if (!mp) goto out; @@ -1601,7 +1383,6 @@ br_multicast_leave_group(struct net_bridge *br, struct bridge_mcast_own_query *own_query, const unsigned char *src) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; unsigned long now; @@ -1612,8 +1393,7 @@ br_multicast_leave_group(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - mdb = mlock_dereference(br->mdb, br); - mp = br_mdb_ip_get(mdb, group); + mp = br_mdb_ip_get(br, group); if (!mp) goto out; @@ -1629,7 +1409,7 @@ br_multicast_leave_group(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + kfree_rcu(p, rcu); br_mdb_notify(br->dev, port, group, RTM_DELMDB, p->flags); @@ -1961,8 +1741,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t) void br_multicast_init(struct net_bridge *br) { - br->hash_elasticity = 4; - br->hash_max = 512; + br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; br->multicast_last_member_count = 2; @@ -1999,6 +1778,7 @@ void br_multicast_init(struct net_bridge *br) timer_setup(&br->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif + INIT_HLIST_HEAD(&br->mdb_list); } static void __br_multicast_open(struct net_bridge *br, @@ -2033,40 +1813,20 @@ void br_multicast_stop(struct net_bridge *br) void br_multicast_dev_del(struct net_bridge *br) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; - struct hlist_node *n; - u32 ver; - int i; + struct hlist_node *tmp; spin_lock_bh(&br->multicast_lock); - mdb = mlock_dereference(br->mdb, br); - if (!mdb) - goto out; - - br->mdb = NULL; - - ver = mdb->ver; - for (i = 0; i < mdb->max; i++) { - hlist_for_each_entry_safe(mp, n, &mdb->mhash[i], - hlist[ver]) { - del_timer(&mp->timer); - call_rcu_bh(&mp->rcu, br_multicast_free_group); - } - } - - if (mdb->old) { - spin_unlock_bh(&br->multicast_lock); - rcu_barrier_bh(); - spin_lock_bh(&br->multicast_lock); - WARN_ON(mdb->old); + hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) { + del_timer(&mp->timer); + rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, + br_mdb_rht_params); + hlist_del_rcu(&mp->mdb_node); + kfree_rcu(mp, rcu); } - - mdb->old = mdb; - call_rcu_bh(&mdb->rcu, br_mdb_free); - -out: spin_unlock_bh(&br->multicast_lock); + + rcu_barrier(); } int br_multicast_set_router(struct net_bridge *br, unsigned long val) @@ -2176,9 +1936,7 @@ static void br_multicast_start_querier(struct net_bridge *br, int br_multicast_toggle(struct net_bridge *br, unsigned long val) { - struct net_bridge_mdb_htable *mdb; struct net_bridge_port *port; - int err = 0; spin_lock_bh(&br->multicast_lock); if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val) @@ -2192,21 +1950,6 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) if (!netif_running(br->dev)) goto unlock; - mdb = mlock_dereference(br->mdb, br); - if (mdb) { - if (mdb->old) { - err = -EEXIST; -rollback: - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); - goto unlock; - } - - err = br_mdb_rehash(&br->mdb, mdb->max, - br->hash_elasticity); - if (err) - goto rollback; - } - br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) __br_multicast_enable_port(port); @@ -2214,7 +1957,7 @@ rollback: unlock: spin_unlock_bh(&br->multicast_lock); - return err; + return 0; } bool br_multicast_enabled(const struct net_device *dev) @@ -2271,45 +2014,6 @@ unlock: return 0; } -int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val) -{ - int err = -EINVAL; - u32 old; - struct net_bridge_mdb_htable *mdb; - - spin_lock_bh(&br->multicast_lock); - if (!is_power_of_2(val)) - goto unlock; - - mdb = mlock_dereference(br->mdb, br); - if (mdb && val < mdb->size) - goto unlock; - - err = 0; - - old = br->hash_max; - br->hash_max = val; - - if (mdb) { - if (mdb->old) { - err = -EEXIST; -rollback: - br->hash_max = old; - goto unlock; - } - - err = br_mdb_rehash(&br->mdb, br->hash_max, - br->hash_elasticity); - if (err) - goto rollback; - } - -unlock: - spin_unlock_bh(&br->multicast_lock); - - return err; -} - int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) { /* Currently we support only version 2 and 3 */ @@ -2646,3 +2350,13 @@ void br_multicast_get_stats(const struct net_bridge *br, } memcpy(dest, &tdst, sizeof(*dest)); } + +int br_mdb_hash_init(struct net_bridge *br) +{ + return rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params); +} + +void br_mdb_hash_fini(struct net_bridge *br) +{ + rhashtable_destroy(&br->mdb_hash_tbl); +} diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index b1b5e8516724..d21a23698410 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -132,10 +132,7 @@ static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); static void nf_bridge_info_free(struct sk_buff *skb) { - if (skb->nf_bridge) { - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; - } + skb_ext_del(skb, SKB_EXT_BRIDGE_NF); } static inline struct net_device *bridge_parent(const struct net_device *dev) @@ -148,19 +145,7 @@ static inline struct net_device *bridge_parent(const struct net_device *dev) static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; - - if (refcount_read(&nf_bridge->use) > 1) { - struct nf_bridge_info *tmp = nf_bridge_alloc(skb); - - if (tmp) { - memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); - refcount_set(&tmp->use, 1); - } - nf_bridge_put(nf_bridge); - nf_bridge = tmp; - } - return nf_bridge; + return skb_ext_add(skb, SKB_EXT_BRIDGE_NF); } unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) @@ -247,7 +232,9 @@ drop: void nf_bridge_update_protocol(struct sk_buff *skb) { - switch (skb->nf_bridge->orig_proto) { + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + switch (nf_bridge->orig_proto) { case BRNF_PROTO_8021Q: skb->protocol = htons(ETH_P_8021Q); break; @@ -506,7 +493,6 @@ static unsigned int br_nf_pre_routing(void *priv, if (br_validate_ipv4(state->net, skb)) return NF_DROP; - nf_bridge_put(skb->nf_bridge); if (!nf_bridge_alloc(skb)) return NF_DROP; if (!setup_pre_routing(skb)) @@ -569,7 +555,8 @@ static unsigned int br_nf_forward_ip(void *priv, struct net_device *parent; u_int8_t pf; - if (!skb->nf_bridge) + nf_bridge = nf_bridge_info_get(skb); + if (!nf_bridge) return NF_ACCEPT; /* Need exclusive nf_bridge_info since we might have multiple @@ -671,10 +658,8 @@ static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff return 0; } - if (data->vlan_tci) { - skb->vlan_tci = data->vlan_tci; - skb->vlan_proto = data->vlan_proto; - } + if (data->vlan_proto) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); @@ -703,7 +688,9 @@ br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { - if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE) return PPPOE_SES_HLEN; return 0; } @@ -740,8 +727,13 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff data = this_cpu_ptr(&brnf_frag_data_storage); - data->vlan_tci = skb->vlan_tci; - data->vlan_proto = skb->vlan_proto; + if (skb_vlan_tag_present(skb)) { + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + } else { + data->vlan_proto = 0; + } + data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; @@ -836,7 +828,9 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge && !nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; @@ -874,7 +868,9 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) static int br_nf_dev_xmit(struct sk_buff *skb) { - if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) { + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge && nf_bridge->bridged_dnat) { br_nf_pre_routing_finish_bridge_slow(skb); return 1; } diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 96c072e71ea2..94039f588f1d 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -224,8 +224,8 @@ unsigned int br_nf_pre_routing_ipv6(void *priv, if (br_validate_ipv6(state->net, skb)) return NF_DROP; - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) + nf_bridge = nf_bridge_alloc(skb); + if (!nf_bridge) return NF_DROP; if (!setup_pre_routing(skb)) return NF_DROP; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3345f1984542..9c07591b0232 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -525,7 +525,8 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, } static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, - int cmd, struct bridge_vlan_info *vinfo, bool *changed) + int cmd, struct bridge_vlan_info *vinfo, bool *changed, + struct netlink_ext_ack *extack) { bool curr_change; int err = 0; @@ -537,11 +538,11 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, * per-VLAN entry as well */ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags, - &curr_change); + &curr_change, extack); } else { vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY; err = br_vlan_add(br, vinfo->vid, vinfo->flags, - &curr_change); + &curr_change, extack); } if (curr_change) *changed = true; @@ -568,7 +569,8 @@ static int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo_curr, struct bridge_vlan_info **vinfo_last, - bool *changed) + bool *changed, + struct netlink_ext_ack *extack) { if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK) return -EINVAL; @@ -598,7 +600,8 @@ static int br_process_vlan_info(struct net_bridge *br, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { tmp_vinfo.vid = v; - err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed); + err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed, + extack); if (err) break; } @@ -607,13 +610,14 @@ static int br_process_vlan_info(struct net_bridge *br, return err; } - return br_vlan_info(br, p, cmd, vinfo_curr, changed); + return br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); } static int br_afspec(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *af_spec, - int cmd, bool *changed) + int cmd, bool *changed, + struct netlink_ext_ack *extack) { struct bridge_vlan_info *vinfo_curr = NULL; struct bridge_vlan_info *vinfo_last = NULL; @@ -643,7 +647,8 @@ static int br_afspec(struct net_bridge *br, return -EINVAL; vinfo_curr = nla_data(attr); err = br_process_vlan_info(br, p, cmd, vinfo_curr, - &vinfo_last, changed); + &vinfo_last, changed, + extack); if (err) return err; break; @@ -850,7 +855,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) } /* Change state and parameters on port. */ -int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) +int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, + struct netlink_ext_ack *extack) { struct net_bridge *br = (struct net_bridge *)netdev_priv(dev); struct nlattr *tb[IFLA_BRPORT_MAX + 1]; @@ -897,7 +903,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) } if (afspec) - err = br_afspec(br, p, afspec, RTM_SETLINK, &changed); + err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack); if (changed) br_ifinfo_notify(RTM_NEWLINK, br, p); @@ -923,7 +929,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (!p && !(dev->priv_flags & IFF_EBRIDGE)) return -EINVAL; - err = br_afspec(br, p, afspec, RTM_DELLINK, &changed); + err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL); if (changed) /* Send RTM_NEWLINK because userspace * expects RTM_NEWLINK for vlan dels @@ -1035,6 +1041,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, + [IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_boolopt_multi) }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1103,7 +1111,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); - err = __br_vlan_set_default_pvid(br, defpvid); + err = __br_vlan_set_default_pvid(br, defpvid, extack); if (err) return err; } @@ -1167,9 +1175,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_SNOOPING]) { u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]); - err = br_multicast_toggle(br, mcast_snooping); - if (err) - return err; + br_multicast_toggle(br, mcast_snooping); } if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) { @@ -1187,19 +1193,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } - if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) { - u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]); - - br->hash_elasticity = val; - } + if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) + br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", + RHT_ELASTICITY); - if (data[IFLA_BR_MCAST_HASH_MAX]) { - u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); - - err = br_multicast_set_hash_max(br, hash_max); - if (err) - return err; - } + if (data[IFLA_BR_MCAST_HASH_MAX]) + br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]); if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); @@ -1296,6 +1295,15 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], } #endif + if (data[IFLA_BR_MULTI_BOOLOPT]) { + struct br_boolopt_multi *bm; + + bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]); + err = br_boolopt_multi_toggle(br, bm, extack); + if (err) + return err; + } + return 0; } @@ -1374,6 +1382,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */ nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */ #endif + nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */ 0; } @@ -1387,6 +1396,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; u8 vlan_enabled = br_vlan_enabled(br->dev); + struct br_boolopt_multi bm; u64 clockval; clockval = br_timer_value(&br->hello_timer); @@ -1403,6 +1413,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; + br_boolopt_multi_get(br, &bm); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || @@ -1420,7 +1431,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) || nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || - nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr)) + nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) || + nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING @@ -1442,8 +1454,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) br_opt_get(br, BROPT_MULTICAST_QUERIER)) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || - nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, - br->hash_elasticity) || + nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, br->multicast_last_member_count) || diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 04c19a37e500..d240b3e7919f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -31,6 +31,8 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) +#define BR_MULTICAST_DEFAULT_HASH_MAX 4096 + #define BR_VERSION "2.3" /* Control of forwarding link local multicast */ @@ -213,23 +215,14 @@ struct net_bridge_port_group { }; struct net_bridge_mdb_entry { - struct hlist_node hlist[2]; + struct rhash_head rhnode; struct net_bridge *br; struct net_bridge_port_group __rcu *ports; struct rcu_head rcu; struct timer_list timer; struct br_ip addr; bool host_joined; -}; - -struct net_bridge_mdb_htable { - struct hlist_head *mhash; - struct rcu_head rcu; - struct net_bridge_mdb_htable *old; - u32 size; - u32 max; - u32 secret; - u32 ver; + struct hlist_node mdb_node; }; struct net_bridge_port { @@ -328,6 +321,7 @@ enum net_bridge_opts { BROPT_NEIGH_SUPPRESS_ENABLED, BROPT_MTU_SET_BY_USER, BROPT_VLAN_STATS_PER_PORT, + BROPT_NO_LL_LEARN, }; struct net_bridge { @@ -380,7 +374,6 @@ struct net_bridge { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - u32 hash_elasticity; u32 hash_max; u32 multicast_last_member_count; @@ -399,7 +392,9 @@ struct net_bridge { unsigned long multicast_query_response_interval; unsigned long multicast_startup_query_interval; - struct net_bridge_mdb_htable __rcu *mdb; + struct rhashtable mdb_hash_tbl; + + struct hlist_head mdb_list; struct hlist_head router_list; struct timer_list multicast_router_timer; @@ -507,6 +502,14 @@ static inline int br_opt_get(const struct net_bridge *br, return test_bit(opt, &br->options); } +int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, + struct netlink_ext_ack *extack); +int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt); +int br_boolopt_multi_toggle(struct net_bridge *br, + struct br_boolopt_multi *bm, + struct netlink_ext_ack *extack); +void br_boolopt_multi_get(const struct net_bridge *br, + struct br_boolopt_multi *bm); void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on); /* br_device.c */ @@ -572,6 +575,9 @@ int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int *idx); +int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, + const unsigned char *addr, u16 vid, u32 portid, u32 seq, + struct netlink_ext_ack *extack); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, @@ -650,7 +656,6 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING -extern unsigned int br_mdb_rehash_seq; int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid); struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, @@ -675,17 +680,15 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val); #endif struct net_bridge_mdb_entry * -br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst); +br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst); struct net_bridge_mdb_entry * -br_multicast_new_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group); -void br_multicast_free_pg(struct rcu_head *head); +br_multicast_new_group(struct net_bridge *br, struct br_ip *group); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src); -void br_mdb_init(void); -void br_mdb_uninit(void); +int br_mdb_hash_init(struct net_bridge *br); +void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_ip *group, int type, u8 flags); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, @@ -697,6 +700,8 @@ void br_multicast_uninit_stats(struct net_bridge *br); void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest); +void br_mdb_init(void); +void br_mdb_uninit(void); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -822,6 +827,15 @@ static inline void br_mdb_uninit(void) { } +static inline int br_mdb_hash_init(struct net_bridge *br) +{ + return 0; +} + +static inline void br_mdb_hash_fini(struct net_bridge *br) +{ +} + static inline void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, @@ -857,7 +871,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb); int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, - bool *changed); + bool *changed, struct netlink_ext_ack *extack); int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); @@ -870,12 +884,13 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val); int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); -int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, + struct netlink_ext_ack *extack); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, - bool *changed); + bool *changed, struct netlink_ext_ack *extack); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); -int nbp_vlan_init(struct net_bridge_port *port); +int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); void br_vlan_get_stats(const struct net_bridge_vlan *v, struct br_vlan_stats *stats); @@ -912,7 +927,7 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) int err = 0; if (skb_vlan_tag_present(skb)) { - *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK; + *vid = skb_vlan_tag_get_id(skb); } else { *vid = 0; err = -EINVAL; @@ -960,7 +975,7 @@ static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, } static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, - bool *changed) + bool *changed, struct netlink_ext_ack *extack) { *changed = false; return -EOPNOTSUPP; @@ -985,7 +1000,7 @@ static inline int br_vlan_init(struct net_bridge *br) } static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, - bool *changed) + bool *changed, struct netlink_ext_ack *extack) { *changed = false; return -EOPNOTSUPP; @@ -1006,7 +1021,8 @@ static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group return NULL; } -static inline int nbp_vlan_init(struct net_bridge_port *port) +static inline int nbp_vlan_init(struct net_bridge_port *port, + struct netlink_ext_ack *extack) { return 0; } @@ -1127,7 +1143,8 @@ int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port); -int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); +int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, + struct netlink_ext_ack *extack); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); @@ -1162,7 +1179,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long mask); void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type); -int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags); +int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, + struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); static inline void br_switchdev_frame_unmark(struct sk_buff *skb) @@ -1194,7 +1212,8 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p, } static inline int br_switchdev_port_vlan_add(struct net_device *dev, - u16 vid, u16 flags) + u16 vid, u16 flags, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index b993df770675..035ff59d9cbd 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -140,7 +140,8 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) } } -int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags) +int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, + struct netlink_ext_ack *extack) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, @@ -150,7 +151,7 @@ int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags) .vid_end = vid, }; - return switchdev_port_obj_add(dev, &v.obj); + return switchdev_port_obj_add(dev, &v.obj, extack); } int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 60182bef6341..b05b94e9c595 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -328,6 +328,27 @@ static ssize_t flush_store(struct device *d, } static DEVICE_ATTR_WO(flush); +static ssize_t no_linklocal_learn_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); +} + +static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val) +{ + return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, NULL); +} + +static ssize_t no_linklocal_learn_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_no_linklocal_learn); +} +static DEVICE_ATTR_RW(no_linklocal_learn); + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) @@ -403,13 +424,13 @@ static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { - struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->hash_elasticity); + return sprintf(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val) { - br->hash_elasticity = val; + br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n", + RHT_ELASTICITY); return 0; } @@ -428,10 +449,16 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, return sprintf(buf, "%u\n", br->hash_max); } +static int set_hash_max(struct net_bridge *br, unsigned long val) +{ + br->hash_max = val; + return 0; +} + static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, br_multicast_set_hash_max); + return store_bridge_parm(d, buf, len, set_hash_max); } static DEVICE_ATTR_RW(hash_max); @@ -841,6 +868,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, &dev_attr_flush.attr, + &dev_attr_no_linklocal_learn.attr, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 7c87a2fe5248..88715edb119a 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -320,9 +320,6 @@ static ssize_t brport_store(struct kobject *kobj, if (!rtnl_trylock()) return restart_syscall(); - if (!p->dev || !p->br) - goto out_unlock; - if (brport_attr->store_raw) { char *buf_copy; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index e84be08b8285..4a2f31157ef5 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -80,14 +80,14 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, - u16 vid, u16 flags) + u16 vid, u16 flags, struct netlink_ext_ack *extack) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ - err = br_switchdev_port_vlan_add(dev, vid, flags); + err = br_switchdev_port_vlan_add(dev, vid, flags, extack); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, vid); return err; @@ -139,7 +139,9 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, /* Returns a master vlan, if it didn't exist it gets created. In all cases a * a reference is taken to the master vlan before returning. */ -static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid) +static struct net_bridge_vlan * +br_vlan_get_master(struct net_bridge *br, u16 vid, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *masterv; @@ -150,7 +152,7 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid bool changed; /* missing global ctx, create it now */ - if (br_vlan_add(br, vid, 0, &changed)) + if (br_vlan_add(br, vid, 0, &changed, extack)) return NULL; masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) @@ -214,7 +216,8 @@ static void nbp_vlan_rcu_free(struct rcu_head *rcu) * 4. same as 3 but with both master and brentry flags set so the entry * will be used for filtering in both the port and the bridge */ -static int __vlan_add(struct net_bridge_vlan *v, u16 flags) +static int __vlan_add(struct net_bridge_vlan *v, u16 flags, + struct netlink_ext_ack *extack) { struct net_bridge_vlan *masterv = NULL; struct net_bridge_port *p = NULL; @@ -239,7 +242,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ - err = __vlan_vid_add(dev, br, v->vid, flags); + err = __vlan_vid_add(dev, br, v->vid, flags, extack); if (err) goto out; @@ -249,12 +252,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) err = br_vlan_add(br, v->vid, flags | BRIDGE_VLAN_INFO_BRENTRY, - &changed); + &changed, extack); if (err) goto out_filt; } - masterv = br_vlan_get_master(br, v->vid); + masterv = br_vlan_get_master(br, v->vid, extack); if (!masterv) goto out_filt; v->brvlan = masterv; @@ -269,7 +272,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) v->stats = masterv->stats; } } else { - err = br_switchdev_port_vlan_add(dev, v->vid, flags); + err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); if (err && err != -EOPNOTSUPP) goto out; } @@ -421,7 +424,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, } if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && br_handle_egress_vlan_tunnel(skb, v)) { @@ -494,8 +497,8 @@ static bool __allowed_ingress(const struct net_bridge *br, __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. - * At this point, We know that skb->vlan_tci had - * VLAN_TAG_PRESENT bit and its VID field was 0x000. + * At this point, we know that skb->vlan_tci VID + * field was 0. * We update only VID field and preserve PCP field. */ skb->vlan_tci |= pvid; @@ -591,11 +594,12 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) static int br_vlan_add_existing(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, - u16 flags, bool *changed) + u16 flags, bool *changed, + struct netlink_ext_ack *extack) { int err; - err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags); + err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, extack); if (err && err != -EOPNOTSUPP) return err; @@ -634,7 +638,8 @@ err_flags: * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ -int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed) +int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; @@ -646,7 +651,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed) vg = br_vlan_group(br); vlan = br_vlan_find(vg, vid); if (vlan) - return br_vlan_add_existing(br, vg, vlan, flags, changed); + return br_vlan_add_existing(br, vg, vlan, flags, changed, + extack); vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) @@ -663,7 +669,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed) vlan->br = br; if (flags & BRIDGE_VLAN_INFO_BRENTRY) refcount_set(&vlan->refcnt, 1); - ret = __vlan_add(vlan, flags); + ret = __vlan_add(vlan, flags, extack); if (ret) { free_percpu(vlan->stats); kfree(vlan); @@ -914,7 +920,8 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) br->default_pvid = 0; } -int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) +int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, + struct netlink_ext_ack *extack) { const struct net_bridge_vlan *pvent; struct net_bridge_vlan_group *vg; @@ -946,7 +953,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, - &vlchange); + &vlchange, extack); if (err) goto out; br_vlan_delete(br, old_pvid); @@ -966,7 +973,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) err = nbp_vlan_add(p, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, - &vlchange); + &vlchange, extack); if (err) goto err_port; nbp_vlan_delete(p, old_pvid); @@ -988,7 +995,7 @@ err_port: nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, - &vlchange); + &vlchange, NULL); nbp_vlan_delete(p, pvid); } @@ -998,7 +1005,7 @@ err_port: BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, - &vlchange); + &vlchange, NULL); br_vlan_delete(br, pvid); } goto out; @@ -1021,7 +1028,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) err = -EPERM; goto out; } - err = __br_vlan_set_default_pvid(br, pvid); + err = __br_vlan_set_default_pvid(br, pvid, NULL); out: return err; } @@ -1047,7 +1054,7 @@ int br_vlan_init(struct net_bridge *br) rcu_assign_pointer(br->vlgrp, vg); ret = br_vlan_add(br, 1, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | - BRIDGE_VLAN_INFO_BRENTRY, &changed); + BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); if (ret) goto err_vlan_add; @@ -1064,7 +1071,7 @@ err_rhtbl: goto out; } -int nbp_vlan_init(struct net_bridge_port *p) +int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->br->dev, @@ -1097,7 +1104,7 @@ int nbp_vlan_init(struct net_bridge_port *p) ret = nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, - &changed); + &changed, extack); if (ret) goto err_vlan_add; } @@ -1122,7 +1129,7 @@ err_vlan_enabled: * changed must be true only if the vlan was created or updated */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, - bool *changed) + bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan *vlan; int ret; @@ -1133,7 +1140,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { /* Pass the flags to the hardware bridge */ - ret = br_switchdev_port_vlan_add(port->dev, vid, flags); + ret = br_switchdev_port_vlan_add(port->dev, vid, flags, extack); if (ret && ret != -EOPNOTSUPP) return ret; *changed = __vlan_add_flags(vlan, flags); @@ -1147,7 +1154,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, vlan->vid = vid; vlan->port = port; - ret = __vlan_add(vlan, flags); + ret = __vlan_add(vlan, flags, extack); if (ret) kfree(vlan); else @@ -1217,9 +1224,13 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; + struct net_bridge_port *p; ASSERT_RTNL(); - if (netif_is_bridge_master(dev)) + p = br_port_get_check_rtnl(dev); + if (p) + vg = nbp_vlan_group(p); + else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; diff --git a/net/can/raw.c b/net/can/raw.c index 3aab7664933f..c70207537488 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -771,7 +771,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err < 0) goto free_skb; - sock_tx_timestamp(sk, sk->sk_tsflags, &skb_shinfo(skb)->tx_flags); + skb_setup_tx_timestamp(skb, sk->sk_tsflags); skb->dev = dev; skb->sk = sk; diff --git a/net/compat.c b/net/compat.c index 47a614b370cd..f7084780a8f8 100644 --- a/net/compat.c +++ b/net/compat.c @@ -810,34 +810,23 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen); } -static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, - unsigned int vlen, unsigned int flags, - struct old_timespec32 __user *timeout) +COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct __kernel_timespec __user *, timeout) { - int datagrams; - struct timespec64 ktspec; - - if (timeout == NULL) - return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, NULL); - - if (compat_get_timespec64(&ktspec, timeout)) - return -EFAULT; - - datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, &ktspec); - if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout)) - datagrams = -EFAULT; - - return datagrams; + return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, timeout, NULL); } +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct old_timespec32 __user *, timeout) { - return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); + return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, NULL, timeout); } +#endif COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) { @@ -925,8 +914,9 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); break; case SYS_RECVMMSG: - ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3], - compat_ptr(a[4])); + ret = __sys_recvmmsg(a0, compat_ptr(a1), a[2], + a[3] | MSG_CMSG_COMPAT, NULL, + compat_ptr(a[4])); break; case SYS_ACCEPT4: ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]); diff --git a/net/core/datagram.c b/net/core/datagram.c index 57f3a6fcfc1e..b2651bb6d2a3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -408,27 +408,20 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } EXPORT_SYMBOL(skb_kill_datagram); -/** - * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: iovec iterator to copy to - * @len: amount of data to copy from buffer to iovec - */ -int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len) +int __skb_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, bool fault_short, + size_t (*cb)(const void *, size_t, void *, struct iov_iter *), + void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; - trace_skb_copy_datagram_iovec(skb, len); - /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; - n = copy_to_iter(skb->data + offset, copy, to); + n = cb(skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; @@ -445,11 +438,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { + struct page *page = skb_frag_page(frag); + u8 *vaddr = kmap(page); + if (copy > len) copy = len; - n = copy_page_to_iter(skb_frag_page(frag), - frag->page_offset + offset - - start, copy, to); + n = cb(vaddr + frag->page_offset + + offset - start, copy, data, to); + kunmap(page); offset += n; if (n != copy) goto short_copy; @@ -468,8 +464,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_iter(frag_iter, offset - start, - to, copy)) + if (__skb_datagram_iter(frag_iter, offset - start, + to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; @@ -490,11 +486,50 @@ fault: return -EFAULT; short_copy: - if (iov_iter_count(to)) + if (fault_short || iov_iter_count(to)) goto fault; return 0; } + +/** + * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator + * and update a hash. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @hash: hash request to update + */ +int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + struct ahash_request *hash) +{ + return __skb_datagram_iter(skb, offset, to, len, true, + hash_and_copy_to_iter, hash); +} +EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); + +static size_t simple_copy_to_iter(const void *addr, size_t bytes, + void *data __always_unused, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +/** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + trace_skb_copy_datagram_iovec(skb, len); + return __skb_datagram_iter(skb, offset, to, len, false, + simple_copy_to_iter, NULL); +} EXPORT_SYMBOL(skb_copy_datagram_iter); /** @@ -645,131 +680,22 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) } EXPORT_SYMBOL(zerocopy_sg_from_iter); +/** + * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * and update a checksum. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @csump: checksum pointer + */ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { - int start = skb_headlen(skb); - int i, copy = start - offset, start_off = offset; - struct sk_buff *frag_iter; - int pos = 0; - int n; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); - offset += n; - if (n != copy) - goto fault; - if ((len -= copy) == 0) - return 0; - pos = copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); - - if (copy > len) - copy = len; - n = csum_and_copy_to_iter(vaddr + frag->page_offset + - offset - start, copy, - &csum2, to); - kunmap(page); - offset += n; - if (n != copy) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if (!(len -= copy)) - return 0; - pos += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - if (copy > len) - copy = len; - if (skb_copy_and_csum_datagram(frag_iter, - offset - start, - to, copy, - &csum2)) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if ((len -= copy) == 0) - return 0; - offset += copy; - pos += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - iov_iter_revert(to, offset - start_off); - return -EFAULT; -} - -__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) -{ - __sum16 sum; - - sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - if (!skb_shared(skb)) - skb->csum_valid = !sum; - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete_head); - -__sum16 __skb_checksum_complete(struct sk_buff *skb) -{ - __wsum csum; - __sum16 sum; - - csum = skb_checksum(skb, 0, skb->len, 0); - - /* skb->csum holds pseudo checksum */ - sum = csum_fold(csum_add(skb->csum, csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - - if (!skb_shared(skb)) { - /* Save full packet checksum */ - skb->csum = csum; - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - skb->csum_valid = !sum; - } - - return sum; + return __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, csump); } -EXPORT_SYMBOL(__skb_checksum_complete); /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. @@ -810,7 +736,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(NULL); + netdev_rx_csum_fault(NULL, skb); } return 0; fault: diff --git a/net/core/dev.c b/net/core/dev.c index ddc551f24ba2..1b5a4410be0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> +#include <linux/indirect_call_wrapper.h> #include "net-sysfs.h" @@ -162,6 +163,9 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); static struct napi_struct *napi_by_id(unsigned int napi_id); /* @@ -1361,7 +1365,7 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1377,7 +1381,7 @@ static int __dev_open(struct net_device *dev) */ netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; @@ -1406,7 +1410,8 @@ static int __dev_open(struct net_device *dev) /** * dev_open - prepare an interface for use. - * @dev: device to open + * @dev: device to open + * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally @@ -1416,14 +1421,14 @@ static int __dev_open(struct net_device *dev) * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ -int dev_open(struct net_device *dev) +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; @@ -1585,6 +1590,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + N(PRE_CHANGEADDR) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1733,6 +1739,18 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1744,11 +1762,7 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info = { - .dev = dev, - }; - - return call_netdevice_notifiers_info(val, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -2175,6 +2189,20 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void reset_xps_maps(struct net_device *dev, + struct xps_dev_maps *dev_maps, + bool is_rxqs_map) +{ + if (is_rxqs_map) { + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + } + static_key_slow_dec_cpuslocked(&xps_needed); + kfree_rcu(dev_maps, rcu); +} + static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, struct xps_dev_maps *dev_maps, unsigned int nr_ids, u16 offset, u16 count, bool is_rxqs_map) @@ -2186,18 +2214,15 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); - if (!active) { - if (is_rxqs_map) { - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + if (!is_rxqs_map) { + for (i = offset + (count - 1); count--; i--) { + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); } - kfree_rcu(dev_maps, rcu); } } @@ -2234,10 +2259,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, false); out_no_maps: - if (static_key_enabled(&xps_rxqs_needed)) - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - - static_key_slow_dec_cpuslocked(&xps_needed); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2355,9 +2376,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!new_dev_maps) goto out_no_new_maps; - static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) - static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + if (!dev_maps) { + /* Increment static keys at most once per type */ + static_key_slow_inc_cpuslocked(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + } for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { @@ -2455,13 +2479,8 @@ out_no_new_maps: } /* free map if not active */ - if (!active) { - if (is_rxqs_map) - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - else - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, is_rxqs_map); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -3091,10 +3110,17 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + if (dev) + pr_err("dev features: %pNF\n", &dev->features); + pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", + skb->len, skb->data_len, skb->pkt_type, + skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, + skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level); dump_stack(); } } @@ -4520,9 +4546,14 @@ static int netif_rx_internal(struct sk_buff *skb) int netif_rx(struct sk_buff *skb) { + int ret; + trace_netif_rx_entry(skb); - return netif_rx_internal(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_rx); @@ -4537,6 +4568,7 @@ int netif_rx_ni(struct sk_buff *skb) if (local_softirq_pending()) do_softirq(); preempt_enable(); + trace_netif_rx_ni_exit(err); return err; } @@ -4889,7 +4921,7 @@ skip_classify: * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; @@ -5009,7 +5041,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; - list_del(&skb->list); + skb_list_del_init(skb); __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; @@ -5165,7 +5197,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(netdev_tstamp_prequeue, skb); - list_del(&skb->list); + skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } @@ -5176,7 +5208,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) rcu_read_lock(); list_for_each_entry_safe(skb, next, head, list) { xdp_prog = rcu_dereference(skb->dev->xdp_prog); - list_del(&skb->list); + skb_list_del_init(skb); if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) list_add_tail(&skb->list, &sublist); } @@ -5195,7 +5227,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) if (cpu >= 0) { /* Will be handled, remove from list */ - list_del(&skb->list); + skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } @@ -5222,9 +5254,14 @@ static void netif_receive_skb_list_internal(struct list_head *head) */ int netif_receive_skb(struct sk_buff *skb) { + int ret; + trace_netif_receive_skb_entry(skb); - return netif_receive_skb_internal(skb); + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -5244,9 +5281,12 @@ void netif_receive_skb_list(struct list_head *head) if (list_empty(head)) return; - list_for_each_entry(skb, head, list) - trace_netif_receive_skb_list_entry(skb); + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + } netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); @@ -5299,6 +5339,8 @@ static void flush_all_backlogs(void) put_online_cpus(); } +INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct sk_buff *skb) { struct packet_offload *ptype; @@ -5318,7 +5360,9 @@ static int napi_gro_complete(struct sk_buff *skb) if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb, 0); + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); break; } rcu_read_unlock(); @@ -5357,11 +5401,13 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - u32 i; + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - if (test_bit(i, &napi->gro_bitmask)) - __napi_gro_flush_chain(napi, i, flush_old); + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); @@ -5386,7 +5432,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5461,6 +5509,10 @@ static void gro_flush_oldest(struct list_head *head) napi_gro_complete(oldest); } +INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, + struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -5510,7 +5562,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(gro_head, skb); + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + gro_head, skb); break; } rcu_read_unlock(); @@ -5634,12 +5688,17 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + gro_result_t ret; + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + trace_napi_gro_receive_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_receive); @@ -5652,7 +5711,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; @@ -5757,6 +5816,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) gro_result_t napi_gro_frags(struct napi_struct *napi) { + gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); if (!skb) @@ -5764,7 +5824,10 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_frags); @@ -5780,10 +5843,11 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; @@ -6204,8 +6268,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) - pr_err_once("netif_napi_add() called with weight %d on device %s\n", - weight, dev->name); + netdev_err_once(dev, "%s() called with weight %d\n", __func__, + weight); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; @@ -7462,7 +7526,8 @@ unsigned int dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -7499,7 +7564,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if (old_flags & IFF_UP) __dev_close(dev); else - ret = __dev_open(dev); + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { @@ -7559,16 +7624,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, * dev_change_flags - change device settings * @dev: device * @flags: device state flags + * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; @@ -7701,13 +7768,36 @@ void dev_set_group(struct net_device *dev, int new_group) EXPORT_SYMBOL(dev_set_group); /** + * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack + */ +int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_pre_changeaddr_info info = { + .info.dev = dev, + .info.extack = extack, + .dev_addr = addr, + }; + int rc; + + rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); + return notifier_to_errno(rc); +} +EXPORT_SYMBOL(dev_pre_changeaddr_notify); + +/** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address + * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; @@ -7718,6 +7808,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; + err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + if (err) + return err; err = ops->ndo_set_mac_address(dev, sa); if (err) return err; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index d884d8f5f0e5..a6723b306717 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -278,6 +278,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, EXPORT_SYMBOL(__hw_addr_sync_dev); /** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. + **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. + **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + +/** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync @@ -401,6 +498,9 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); + err = dev_pre_changeaddr_notify(dev, addr, NULL); + if (err) + return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 90e8aa36881e..31380fd5a4e2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -234,7 +234,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ @@ -246,7 +246,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a4b29a13d31..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, + .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) diff --git a/net/core/filter.c b/net/core/filter.c index e521c5ebc7d1..447dd1bad31f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -296,22 +296,18 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_VLAN_TAG: - case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); - if (skb_field == SKF_AD_VLAN_TAG) { - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, - ~VLAN_TAG_PRESENT); - } else { - /* dst_reg >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); - /* dst_reg &= 1 */ + break; + case SKF_AD_VLAN_TAG_PRESENT: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); - } break; } @@ -467,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); - *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { @@ -2428,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { .arg4_type = ARG_ANYTHING, }; +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the rigth free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. + */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + sge->length = a; + pop -= (sge->length - a); + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3908,6 +4073,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +static const struct bpf_func_proto bpf_sockopt_event_output_proto = { + .func = bpf_sockopt_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -4825,47 +5010,40 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) + int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; - int dif = 0; - - if (skb->dev) - dif = skb->dev->ifindex; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; - int sdif = inet_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, skb); + dif, sdif, &udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - u16 hnum = ntohs(tuple->ipv6.dport); - int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, tuple->ipv6.dport, dif, sdif, - &udp_table, skb); + &udp_table, NULL); #endif } @@ -4882,31 +5060,34 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, * callers to satisfy BPF_CALL declarations. */ static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { - struct net *caller_net; struct sock *sk = NULL; u8 family = AF_UNSPEC; struct net *net; + int sdif; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; - if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + if (unlikely(family == AF_UNSPEC || flags || + !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; - if (skb->dev) - caller_net = dev_net(skb->dev); + if (family == AF_INET) + sdif = inet_sdif(skb); else - caller_net = sock_net(skb->sk); - if (netns_id) { + sdif = inet6_sdif(skb); + + if ((s32)netns_id < 0) { + net = caller_net; + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); + } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); - } else { - net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); } if (sk) @@ -4915,6 +5096,25 @@ out: return (unsigned long) sk; } +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, + proto, netns_id, flags); +} + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { @@ -4964,6 +5164,87 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -4986,6 +5267,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || + func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5070,6 +5352,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -5214,6 +5504,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5240,6 +5538,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_sockopt_event_output_proto; default: return bpf_base_func_proto(func_id); } @@ -5264,6 +5564,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; default: return bpf_base_func_proto(func_id); } @@ -5436,8 +5738,12 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): - if (size != sizeof(struct bpf_flow_keys *)) + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) return false; break; default: @@ -5465,8 +5771,10 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5490,7 +5798,8 @@ static bool cg_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -5505,6 +5814,10 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; default: return false; } @@ -5531,7 +5844,9 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5741,6 +6056,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): break; default: return false; @@ -5757,7 +6073,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5959,7 +6275,9 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5995,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size, if (type == BPF_WRITE) return false; + if (off % size != 0) + return false; + switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; @@ -6006,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - default: + case bpf_ctx_range(struct sk_msg_md, family): + case bpf_ctx_range(struct sk_msg_md, remote_ip4): + case bpf_ctx_range(struct sk_msg_md, local_ip4): + case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct sk_msg_md, remote_port): + case bpf_ctx_range(struct sk_msg_md, local_port): + case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; - } - - if (off < 0 || off >= sizeof(struct sk_msg_md)) - return false; - if (off % size != 0) + break; + default: return false; - + } return true; } @@ -6040,12 +6365,14 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6140,19 +6467,19 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - case offsetof(struct __sk_buff, vlan_tci): - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + break; + case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); - if (si->off == offsetof(struct __sk_buff, vlan_tci)) { - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, - ~VLAN_TAG_PRESENT); - } else { - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); - } break; case offsetof(struct __sk_buff, cb[0]) ... @@ -6355,6 +6682,33 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + break; + + case offsetof(struct __sk_buff, wire_len): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, wire_len); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, pkt_len); + *target_size = 4; + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); } return insn - insn_buf; @@ -7071,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, int off; #endif + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), @@ -7183,6 +7540,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct sk_msg_md, size): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_sg, size)); + break; } return insn - insn_buf; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 588f475019d4..9f2840510e63 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -783,6 +783,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, /* Pass parameters to the BPF program */ cb->qdisc_cb.flow_keys = &flow_keys; flow_keys.nhoff = nhoff; + flow_keys.thoff = nhoff; bpf_compute_data_pointers((struct sk_buff *)skb); result = BPF_PROG_RUN(attached, skb); @@ -790,9 +791,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb, /* Restore state */ memcpy(cb, &cb_saved, sizeof(cb_saved)); + flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len); + flow_keys.thoff = clamp_t(u16, flow_keys.thoff, + flow_keys.nhoff, skb->len); + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); - key_control->thoff = min_t(u16, key_control->thoff, skb->len); rcu_read_unlock(); return result == BPF_OK; } @@ -952,8 +956,7 @@ proto_again: if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); - key_vlan->vlan_priority = - (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 4b54e5f107c6..acf45ddbe924 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -84,6 +84,7 @@ void gro_cells_destroy(struct gro_cells *gcells) for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + napi_disable(&cell->napi); netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 41954e42a2de..763a7b08df67 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,21 +118,77 @@ unsigned long neigh_rand_reach_time(unsigned long base) } EXPORT_SYMBOL(neigh_rand_reach_time); +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } +} + +static void neigh_update_gc_list(struct neighbour *n) +{ + bool on_gc_list, exempt_from_gc; + + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + + /* remove from the gc list if new state is permanent or if neighbor + * is externally learned; otherwise entry should be on the gc list + */ + exempt_from_gc = n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_LEARNED; + on_gc_list = !list_empty(&n->gc_list); + + if (exempt_from_gc && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!exempt_from_gc && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } + + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); +} -static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, - struct neighbour __rcu **np, struct neigh_table *tbl) +static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, + int *notify) +{ + bool rc = false; + u8 ndm_flags; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return rc; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if (ndm_flags & NTF_EXT_LEARNED) + neigh->flags |= NTF_EXT_LEARNED; + else + neigh->flags &= ~NTF_EXT_LEARNED; + rc = true; + *notify = 1; + } + + return rc; +} + +static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, + struct neigh_table *tbl) { bool retval = false; write_lock(&n->lock); - if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && - !(n->flags & flags)) { + if (refcount_read(&n->refcnt) == 1) { struct neighbour *neigh; neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); - n->dead = 1; + neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); @@ -158,7 +214,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock)))) { if (n == ndel) - return neigh_del(n, 0, 0, np, tbl); + return neigh_del(n, np, tbl); np = &n->next; } return false; @@ -166,32 +222,29 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { + int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + unsigned long tref = jiffies - 5 * HZ; + struct neighbour *n, *tmp; int shrunk = 0; - int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - /* Neighbour record may be discarded if: - * - nobody refers to it. - * - it is not permanent - */ - if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, - tbl)) { - shrunk = 1; - continue; - } - np = &n->next; + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + + write_lock(&n->lock); + if ((n->nud_state == NUD_FAILED) || + time_after(tref, n->updated)) + remove = true; + write_unlock(&n->lock); + + if (remove && neigh_remove_one(n, tbl)) + shrunk++; + if (shrunk >= max_clean) + break; } } @@ -260,8 +313,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); - n->dead = 1; - + neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, @@ -321,13 +373,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - entries = atomic_inc_return(&tbl->entries) - 1; + if (exempt_from_gc) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -340,6 +397,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device } } +do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -358,11 +416,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; + INIT_LIST_HEAD(&n->gc_list); + + atomic_inc(&tbl->entries); out: return n; out_entries: - atomic_dec(&tbl->entries); + if (!exempt_from_gc) + atomic_dec(&tbl->gc_entries); goto out; } @@ -505,13 +567,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, bool want_ref) +static struct neighbour *___neigh_create(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev, + bool exempt_from_gc, bool want_ref) { + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); u32 hash_val; unsigned int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -574,6 +638,9 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, } n->dead = 0; + if (!exempt_from_gc) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, @@ -591,6 +658,12 @@ out_neigh_release: neigh_release(n); goto out; } + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + return ___neigh_create(tbl, pkey, dev, false, want_ref); +} EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) @@ -652,6 +725,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (!n) goto out; + n->protocol = 0; write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; @@ -854,7 +928,7 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; - n->dead = 1; + neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; @@ -1137,9 +1211,11 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. */ -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags, u32 nlmsg_pid) +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) { + bool ext_learn_change = false; u8 old; int err; int notify = 0; @@ -1155,10 +1231,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - if (neigh->dead) + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); goto out; + } - neigh_update_ext_learned(neigh, flags, ¬ify); + ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1193,8 +1271,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, use it, otherwise discard the request. */ err = -EINVAL; - if (!(old & NUD_VALID)) + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; + } lladdr = neigh->ha; } @@ -1302,11 +1382,20 @@ out: neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); + if (((new ^ old) & NUD_PERMANENT) || ext_learn_change) + neigh_update_gc_list(neigh); + if (notify) neigh_update_notify(neigh, nlmsg_pid); return err; } + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is @@ -1571,6 +1660,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1662,6 +1752,19 @@ static struct neigh_table *neigh_find_table(int family) return tbl; } +const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, + [NDA_PROTOCOL] = { .type = NLA_U8 }, +}; + static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1678,8 +1781,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); - if (dst_attr == NULL) + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1694,8 +1799,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < (int)tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); @@ -1711,10 +1818,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh, tbl); @@ -1736,16 +1842,19 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; + u8 protocol = 0; int err; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; err = -EINVAL; - if (tb[NDA_DST] == NULL) + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1755,19 +1864,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; + } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } + dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + if (tb[NDA_PROTOCOL]) + protocol = nla_get_u8(tb[NDA_PROTOCOL]); + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -1775,22 +1892,30 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm->ndm_flags; + if (protocol) + pn->protocol = protocol; err = 0; } goto out; } - if (dev == NULL) + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); goto out; + } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { + bool exempt_from_gc; + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } - neigh = __neigh_lookup_errno(tbl, dst, dev); + exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || + ndm->ndm_flags & NTF_EXT_LEARNED; + neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; @@ -1817,8 +1942,12 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); + + if (protocol) + neigh->protocol = protocol; + neigh_release(neigh); out: @@ -2312,6 +2441,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; + if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -2343,6 +2475,9 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; + if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -2494,16 +2629,21 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || - ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { + ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); return -EINVAL; } + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); + return -EINVAL; + } + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + nda_policy, extack); } else { err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + nda_policy, extack); } if (err < 0) return err; @@ -2515,17 +2655,9 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: - if (nla_len(tb[i]) != sizeof(u32)) { - NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request"); - return -EINVAL; - } filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: - if (nla_len(tb[i]) != sizeof(u32)) { - NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request"); - return -EINVAL; - } filter->master_idx = nla_get_u32(tb[i]); break; default: @@ -2585,6 +2717,186 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int neigh_valid_get_req(const struct nlmsghdr *nlh, + struct neigh_table **tbl, + void **dst, int *dev_idx, u8 *ndm_flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *dev_idx = ndm->ndm_ifindex; + *tbl = neigh_find_table(ndm->ndm_family); + if (*tbl == NULL) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + return -EAFNOSUPPORT; + } + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_DST: + if (nla_len(tb[i]) != (int)(*tbl)->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + return -EINVAL; + } + *dst = nla_data(tb[i]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); + return -EINVAL; + } + } + + return 0; +} + +static inline size_t neigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int neigh_get_reply(struct net *net, struct neighbour *neigh, + u32 pid, u32 seq) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static inline size_t pneigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, + u32 pid, u32 seq, struct neigh_table *tbl) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct net_device *dev = NULL; + struct neigh_table *tbl = NULL; + struct neighbour *neigh; + void *dst = NULL; + u8 ndm_flags = 0; + int dev_idx = 0; + int err; + + err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, + extack); + if (err < 0) + return err; + + if (dev_idx) { + dev = __dev_get_by_index(net, dev_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (!dst) { + NL_SET_ERR_MSG(extack, "Network address not specified"); + return -EINVAL; + } + + if (ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + pn = pneigh_lookup(tbl, net, dst, dev, 0); + if (!pn) { + NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); + return -ENOENT; + } + return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, tbl); + } + + if (!dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -EINVAL; + } + + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + return -ENOENT; + } + + err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq); + + neigh_release(neigh); + + return err; +} + void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; @@ -2631,7 +2943,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); - n->dead = 1; + neigh_mark_dead(n); } else np = &n->next; write_unlock(&n->lock); @@ -2992,15 +3304,6 @@ static const struct seq_operations neigh_stat_seq_ops = { }; #endif /* CONFIG_PROC_FS */ -static inline size_t neigh_nlmsg_size(void) -{ - return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ - + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(4); /* NDA_PROBES */ -} - static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { @@ -3384,7 +3687,7 @@ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bd67c4d0fcfd..ff9fd2bb4ce4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -337,7 +337,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(dev, (unsigned int)new_flags); + return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..b02fb19df2cc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -735,23 +736,38 @@ static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } -static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, int nsid) +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; + bool add_ref; + int ref_nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nla_put_s32(skb, NETNSA_NSID, nsid)) + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) + goto nla_put_failure; + + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -767,10 +783,15 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; + struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; - struct net *peer; - int err, id; + int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); @@ -782,6 +803,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; @@ -793,15 +819,29 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); + } + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } - id = peernet2id(net, peer); - err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, net, id); + fillargs.nsid = peernet2id(target, peer); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -811,14 +851,17 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: nlmsg_free(msg); out: + if (fillargs.add_ref) + put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { - struct net *net; + struct net *tgt_net; + struct net *ref_net; struct sk_buff *skb; - struct netlink_callback *cb; + struct net_fill_args fillargs; int idx; int s_idx; }; @@ -831,9 +874,10 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) if (net_cb->idx < net_cb->s_idx) goto cont; - ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, - net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, id); + net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -842,33 +886,96 @@ cont: return 0; } +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; + net_cb->tgt_net = net; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { - .net = net, + .tgt_net = sock_net(skb->sk), .skb = skb, - .cb = cb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, .idx = 0, .s_idx = cb->args[0], }; + int err = 0; - if (cb->strict_check && - nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { - NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); - return -EINVAL; + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; } - spin_lock_bh(&net->nsid_lock); - idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_bh(&net->nsid_lock); + spin_lock_bh(&net_cb.tgt_net->nsid_lock); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net) && + !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + err = -EAGAIN; + goto end; + } + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net)) + spin_unlock_bh(&net_cb.ref_net->nsid_lock); + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; - return skb->len; +end: + if (net_cb.fillargs.add_ref) + put_net(net_cb.tgt_net); + return err < 0 ? err : skb->len; } static void rtnl_net_notifyid(struct net *net, int cmd, int id) { + struct net_fill_args fillargs = { + .cmd = cmd, + .nsid = id, + }; struct sk_buff *msg; int err = -ENOMEM; @@ -876,7 +983,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -917,7 +1024,8 @@ static int __init net_ns_init(void) init_net_initialized = true; up_write(&pernet_ops_rwsem); - register_pernet_subsys(&net_ns_ops); + if (register_pernet_subsys(&net_ns_ops)) + panic("Could not register network namespace subsystems"); rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, RTNL_FLAG_DOIT_UNLOCKED); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2b9fdbc43205..361aabffb8c0 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -663,7 +663,7 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - err = dev_open(ndev); + err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); @@ -801,7 +801,7 @@ void __netpoll_cleanup(struct netpoll *np) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); - call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); } @@ -812,7 +812,7 @@ void __netpoll_free(struct netpoll *np) ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); + synchronize_rcu(); __netpoll_cleanup(np); kfree(np); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 33d9227a8b80..48f61885fd6f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> -#define RTNL_MAX_TYPE 49 +#define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { @@ -2444,7 +2444,7 @@ static int do_setlink(const struct sk_buff *skb, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = dev_set_mac_address(dev, sa); + err = dev_set_mac_address(dev, sa, extack); kfree(sa); if (err) goto errout; @@ -2489,7 +2489,8 @@ static int do_setlink(const struct sk_buff *skb, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } @@ -2870,7 +2871,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { - err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); if (err < 0) return err; } @@ -2885,9 +2887,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *net, - const char *ifname, unsigned char name_assign_type, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; @@ -2903,11 +2907,15 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - if (num_tx_queues < 1 || num_tx_queues > 4096) + if (num_tx_queues < 1 || num_tx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); + } - if (num_rx_queues < 1 || num_rx_queues > 4096) + if (num_rx_queues < 1 || num_rx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); + } dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); @@ -2965,20 +2973,24 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attr, struct netlink_ext_ack *extack) { + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; + unsigned char name_assign_type = NET_NAME_USER; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + const struct rtnl_link_ops *m_ops = NULL; + struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *tb[IFLA_MAX + 1]; + struct net *dest_net, *link_net; + struct nlattr **slave_data; + char kind[MODULE_NAME_LEN]; struct net_device *dev; - struct net_device *master_dev = NULL; struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; - unsigned char name_assign_type = NET_NAME_USER; + struct nlattr **data; int err; #ifdef CONFIG_MODULES @@ -3034,193 +3046,200 @@ replay: ops = NULL; } - if (1) { - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; - struct nlattr **data = NULL; - struct nlattr **slave_data = NULL; - struct net *dest_net, *link_net = NULL; - - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + data = NULL; + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, NULL); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data, extack); + if (err < 0) + return err; } + } - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; + slave_data = NULL; + if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested(slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - NULL); - if (err < 0) - return err; - slave_data = slave_attr; - } + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + slave_data = slave_attr; } + } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + if (dev) { + int status = 0; - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } - err = m_ops->slave_changelink(master_dev, dev, - tb, slave_data, - extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; - return do_setlink(skb, dev, ifm, extack, tb, ifname, - status); + err = m_ops->slave_changelink(master_dev, dev, tb, + slave_data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, + return do_setlink(skb, dev, ifm, extack, tb, ifname, status); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); - return -ENODEV; - } + return -ENODEV; + } - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; - if (!ops) { + if (!ops) { #ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - return -EOPNOTSUPP; + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; } +#endif + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } - if (!ops->setup) - return -EOPNOTSUPP; + if (!ops->setup) + return -EOPNOTSUPP; - if (!ifname[0]) { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } + if (!ifname[0]) { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } - - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } - dev->ifindex = ifm->ifi_index; + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } - if (ops->newlink) { - err = ops->newlink(link_net ? : net, dev, tb, data, - extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(dev); - goto out; - } - } else { - err = register_netdevice(dev); - if (err < 0) { + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) { + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. + */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) free_netdev(dev); - goto out; - } + goto out; + } + } else { + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; } - err = rtnl_configure_link(dev, ifm); + } + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), - extack); - if (err) - goto out_unregister; - } + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); + if (ops->newlink) { + LIST_HEAD(list_kill); - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); } + goto out; +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr **attr; + int ret; + + attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = __rtnl_newlink(skb, nlh, attr, extack); + kfree(attr); + return ret; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3800,6 +3819,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, { int err; + if (dev->type != ARPHRD_ETHER) + return -EINVAL; + netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) @@ -3999,6 +4021,160 @@ out: return skb->len; } +static int valid_fdb_get_strict(const struct nlmsghdr *nlh, + struct nlattr **tb, u8 *ndm_flags, + int *br_idx, int *brport_idx, u8 **addr, + u16 *vid, struct netlink_ext_ack *extack) +{ + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_MASTER: + *br_idx = nla_get_u32(tb[i]); + break; + case NDA_LLADDR: + if (nla_len(tb[i]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); + return -EINVAL; + } + *addr = nla_data(tb[i]); + break; + case NDA_VLAN: + err = fdb_vid_parse(tb[i], vid, extack); + if (err) + return err; + break; + case NDA_VNI: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL, *br_dev = NULL; + const struct net_device_ops *ops = NULL; + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct sk_buff *skb; + int brport_idx = 0; + u8 ndm_flags = 0; + int br_idx = 0; + u8 *addr = NULL; + u16 vid = 0; + int err; + + err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, + &brport_idx, &addr, &vid, extack); + if (err < 0) + return err; + + if (brport_idx) { + dev = __dev_get_by_index(net, brport_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (br_idx) { + if (dev) { + NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); + return -EINVAL; + } + + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Invalid master ifindex"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } + + if (dev) { + if (!ndm_flags || (ndm_flags & NTF_MASTER)) { + if (!(dev->priv_flags & IFF_BRIDGE_PORT)) { + NL_SET_ERR_MSG(extack, "Device is not a bridge port"); + return -EINVAL; + } + br_dev = netdev_master_upper_dev_get(dev); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Master of device not found"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } else { + if (!(ndm_flags & NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); + return -EINVAL; + } + ops = dev->netdev_ops; + } + } + + if (!br_dev && !dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -ENODEV; + } + + if (!ops || !ops->ndo_fdb_get) { + NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); + return -EOPNOTSUPP; + } + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (br_dev) + dev = br_dev; + err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); + if (err) + goto out; + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + kfree_skb(skb); + return err; +} + static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { @@ -4310,7 +4486,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, + extack); if (err) goto out; @@ -4322,7 +4499,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, - flags); + flags, + extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; @@ -5057,7 +5235,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0); + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a8217e221e19..37317ffec146 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,9 @@ struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; +#ifdef CONFIG_SKB_EXTENSIONS +static struct kmem_cache *skbuff_ext_cache __ro_after_init; +#endif int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -606,7 +609,6 @@ fastpath: void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -614,9 +616,7 @@ void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); -#endif + skb_ext_put(skb); } /* Free everything but the sk_buff shell. */ @@ -796,9 +796,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); -#ifdef CONFIG_XFRM - new->sp = secpath_get(old->sp); -#endif + __skb_ext_copy(new, old); __nf_copy(new, old, false); /* Note : this field could be in headers_start/headers_end section @@ -1089,7 +1087,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -void sock_zerocopy_put_abort(struct ubuf_info *uarg) +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1097,7 +1095,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - sock_zerocopy_put(uarg); + if (have_uref) + sock_zerocopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); @@ -1105,6 +1104,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) @@ -1131,7 +1136,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1151,7 +1156,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } - skb_zcopy_set(nskb, skb_uarg(orig)); + skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } @@ -1925,8 +1930,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - BUG_ON(!list); - if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; @@ -2366,19 +2369,6 @@ error: } EXPORT_SYMBOL_GPL(skb_send_sock_locked); -/* Send skb data on a socket. */ -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) -{ - int ret = 0; - - lock_sock(sk); - ret = skb_send_sock_locked(sk, skb, offset, len); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL_GPL(skb_send_sock); - /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer @@ -2645,6 +2635,65 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. The new checksum is stored back into skb->csum unless the skb is + * shared. + */ +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) { net_warn_ratelimited( @@ -2962,28 +3011,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head } EXPORT_SYMBOL(skb_append); -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} -EXPORT_SYMBOL(skb_insert); - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) @@ -3873,6 +3900,46 @@ done: } EXPORT_SYMBOL_GPL(skb_gro_receive); +#ifdef CONFIG_SKB_EXTENSIONS +#define SKB_EXT_ALIGN_VALUE 8 +#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) + +static const u8 skb_ext_type_len[] = { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), +#endif +#ifdef CONFIG_XFRM + [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), +#endif +}; + +static __always_inline unsigned int skb_ext_total_length(void) +{ + return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif +#ifdef CONFIG_XFRM + skb_ext_type_len[SKB_EXT_SEC_PATH] + +#endif + 0; +} + +static void skb_extensions_init(void) +{ + BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(skb_ext_total_length() > 255); + + skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", + SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} +#else +static void skb_extensions_init(void) {} +#endif + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", @@ -3887,6 +3954,7 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + skb_extensions_init(); } static int @@ -4856,7 +4924,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; - skb->offload_mr_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; #endif if (!xnet) @@ -5128,7 +5196,7 @@ int skb_vlan_pop(struct sk_buff *skb) int err; if (likely(skb_vlan_tag_present(skb))) { - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; @@ -5525,3 +5593,148 @@ void skb_condense(struct sk_buff *skb) */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } + +#ifdef CONFIG_SKB_EXTENSIONS +static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) +{ + return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); +} + +static struct skb_ext *skb_ext_alloc(void) +{ + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + + if (new) { + memset(new->offset, 0, sizeof(new->offset)); + refcount_set(&new->refcnt, 1); + } + + return new; +} + +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, + unsigned int old_active) +{ + struct skb_ext *new; + + if (refcount_read(&old->refcnt) == 1) + return old; + + new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); + refcount_set(&new->refcnt, 1); + +#ifdef CONFIG_XFRM + if (old_active & (1 << SKB_EXT_SEC_PATH)) { + struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } +#endif + __skb_ext_put(old); + return new; +} + +/** + * skb_ext_add - allocate space for given extension, COW if needed + * @skb: buffer + * @id: extension to allocate space for + * + * Allocates enough space for the given extension. + * If the extension is already present, a pointer to that extension + * is returned. + * + * If the skb was cloned, COW applies and the returned memory can be + * modified without changing the extension space of clones buffers. + * + * Returns pointer to the extension or NULL on allocation failure. + */ +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *new, *old = NULL; + unsigned int newlen, newoff; + + if (skb->active_extensions) { + old = skb->extensions; + + new = skb_ext_maybe_cow(old, skb->active_extensions); + if (!new) + return NULL; + + if (__skb_ext_exist(new, id)) + goto set_active; + + newoff = new->chunks; + } else { + newoff = SKB_EXT_CHUNKSIZEOF(*new); + + new = skb_ext_alloc(); + if (!new) + return NULL; + } + + newlen = newoff + skb_ext_type_len[id]; + new->chunks = newlen; + new->offset[id] = newoff; +set_active: + skb->extensions = new; + skb->active_extensions |= 1 << id; + return skb_ext_get_ptr(new, id); +} +EXPORT_SYMBOL(skb_ext_add); + +#ifdef CONFIG_XFRM +static void skb_ext_put_sp(struct sec_path *sp) +{ + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); +} +#endif + +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *ext = skb->extensions; + + skb->active_extensions &= ~(1 << id); + if (skb->active_extensions == 0) { + skb->extensions = NULL; + __skb_ext_put(ext); +#ifdef CONFIG_XFRM + } else if (id == SKB_EXT_SEC_PATH && + refcount_read(&ext->refcnt) == 1) { + struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); + + skb_ext_put_sp(sp); + sp->len = 0; +#endif + } +} +EXPORT_SYMBOL(__skb_ext_del); + +void __skb_ext_put(struct skb_ext *ext) +{ + /* If this is last clone, nothing can increment + * it after check passes. Avoids one atomic op. + */ + if (refcount_read(&ext->refcnt) == 1) + goto free_now; + + if (!refcount_dec_and_test(&ext->refcnt)) + return; +free_now: +#ifdef CONFIG_XFRM + if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) + skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); +#endif + + kmem_cache_free(skbuff_ext_cache, ext); +} +EXPORT_SYMBOL(__skb_ext_put); +#endif /* CONFIG_SKB_EXTENSIONS */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 56a99d0c9aa0..d6d5c20d7044 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -94,6 +94,9 @@ int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, } while (len) { + if (sk_msg_full(dst)) + return -ENOSPC; + sge_len = sge->length - off; sge_off = sge->offset + off; if (sge_len > len) @@ -403,7 +406,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) msg->skb = skb; sk_psock_queue_msg(psock, msg); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); return copied; } @@ -572,6 +575,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { rcu_assign_sk_user_data(sk, NULL); sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); sk_psock_restore_proto(sk, psock); write_lock_bh(&sk->sk_callback_lock); @@ -580,7 +584,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - call_rcu_sched(&psock->rcu, sk_psock_destroy); + call_rcu(&psock->rcu, sk_psock_destroy); } EXPORT_SYMBOL_GPL(sk_psock_drop); @@ -669,6 +673,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, bool ingress; switch (verdict) { + case __SK_PASS: + sk_other = psock->sk; + if (sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + goto out_free; + } + if (atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf) { + struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); + + tcp->bpf.flags |= BPF_F_INGRESS; + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + break; + } + goto out_free; case __SK_REDIRECT: sk_other = tcp_skb_bpf_redirect_fetch(skb); if (unlikely(!sk_other)) @@ -735,7 +755,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) } /* Called with socket lock held. */ -static void sk_psock_data_ready(struct sock *sk) +static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; @@ -783,7 +803,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; } diff --git a/net/core/sock.c b/net/core/sock.c index 080a880a1761..f00902c532cc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); @@ -698,6 +700,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); @@ -950,10 +953,12 @@ set_rcvbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; - else + } else if (val != sk->sk_mark) { sk->sk_mark = val; + sk_dst_reset(sk); + } break; case SO_RXQ_OVFL: @@ -1014,7 +1019,10 @@ set_rcvbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index ba5cba56f574..d8fe3e549373 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } +EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { diff --git a/net/core/stream.c b/net/core/stream.c index 7d329fb1f553..e94bb02a5629 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_is_writeable(sk) && sock) { + if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 37b4667128a3..d67ec17f2cc8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -28,6 +28,8 @@ static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static long long_one __maybe_unused = 1; +static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -289,6 +291,17 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } + +static int +proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} #endif static struct ctl_table net_core_table[] = { @@ -398,10 +411,11 @@ static struct ctl_table net_core_table[] = { { .procname = "bpf_jit_limit", .data = &bpf_jit_limit, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0600, - .proc_handler = proc_dointvec_minmax_bpf_restricted, - .extra1 = &one, + .proc_handler = proc_dolongvec_minmax_bpf_restricted, + .extra1 = &long_one, + .extra2 = &long_max, }, #endif { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8e08cea6f178..26a21d97b6b0 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -231,7 +231,7 @@ EXPORT_SYMBOL(dccp_req_err); * check at all. A more general error queue to queue errors for later handling * is probably better. */ -static void dccp_v4_err(struct sk_buff *skb, u32 info) +static int dccp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (struct iphdr *)skb->data; const u8 offset = iph->ihl << 2; @@ -259,16 +259,18 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == DCCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) - return dccp_req_err(sk, seq); + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + dccp_req_err(sk, seq); + return 0; + } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -357,6 +359,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) out: bh_unlock_sock(sk); sock_put(sk); + return 0; } static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6344f1b18a6a..d5740bad5b18 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -68,7 +68,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) } -static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; @@ -96,16 +96,18 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == DCCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) - return dccp_req_err(sk, seq); + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + dccp_req_err(sk, seq); + return 0; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -183,6 +185,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, out: bh_unlock_sock(sk); sock_put(sk); + return 0; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 43733accf58e..2cc5fbb1b29e 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -948,6 +948,7 @@ int inet_dccp_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; + sk->sk_max_ack_backlog = backlog; /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ @@ -960,7 +961,6 @@ int inet_dccp_listen(struct socket *sock, int backlog) if (err) goto out; } - sk->sk_max_ack_backlog = backlog; err = 0; out: @@ -1139,8 +1139,11 @@ static int __init dccp_init(void) rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); if (rc) goto out_fail; - rc = -ENOBUFS; inet_hashinfo_init(&dccp_hashinfo); + rc = inet_hashinfo2_init_mod(&dccp_hashinfo); + if (rc) + goto out_fail; + rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 7d6ff983ba2c..bdccc46a2921 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -192,7 +192,7 @@ static int check_port(__le16 port) static unsigned short port_alloc(struct sock *sk) { struct dn_scp *scp = DN_SK(sk); -static unsigned short port = 0x2000; + static unsigned short port = 0x2000; unsigned short i_port = port; while(check_port(cpu_to_le16(++port)) != 0) { @@ -2405,7 +2405,7 @@ static void __exit decnet_exit(void) proto_unregister(&dn_proto); - rcu_barrier_bh(); /* Wait for completion of call_rcu_bh()'s */ + rcu_barrier(); /* Wait for completion of call_rcu()'s */ } module_exit(decnet_exit); #endif diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 48c41918fb35..91e52973ee13 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -44,6 +44,10 @@ config NET_DSA_TAG_GSWIP config NET_DSA_TAG_KSZ bool +config NET_DSA_TAG_KSZ9477 + bool + select NET_DSA_TAG_KSZ + config NET_DSA_TAG_LAN9303 bool diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index a69c1790bbfc..aee909bcddc4 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -55,8 +55,8 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_GSWIP [DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops, #endif -#ifdef CONFIG_NET_DSA_TAG_KSZ - [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops, +#ifdef CONFIG_NET_DSA_TAG_KSZ9477 + [DSA_TAG_PROTO_KSZ9477] = &ksz9477_netdev_ops, #endif #ifdef CONFIG_NET_DSA_TAG_LAN9303 [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, @@ -91,8 +91,8 @@ const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) #ifdef CONFIG_NET_DSA_TAG_GSWIP [DSA_TAG_PROTO_GSWIP] = "gswip", #endif -#ifdef CONFIG_NET_DSA_TAG_KSZ - [DSA_TAG_PROTO_KSZ] = "ksz", +#ifdef CONFIG_NET_DSA_TAG_KSZ9477 + [DSA_TAG_PROTO_KSZ9477] = "ksz9477", #endif #ifdef CONFIG_NET_DSA_TAG_LAN9303 [DSA_TAG_PROTO_LAN9303] = "lan9303", diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9e4fd04ab53c..026a05774bf7 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -210,7 +210,7 @@ extern const struct dsa_device_ops edsa_netdev_ops; extern const struct dsa_device_ops gswip_netdev_ops; /* tag_ksz.c */ -extern const struct dsa_device_ops ksz_netdev_ops; +extern const struct dsa_device_ops ksz9477_netdev_ops; /* tag_lan9303.c */ extern const struct dsa_device_ops lan9303_netdev_ops; diff --git a/net/dsa/master.c b/net/dsa/master.c index c90ee3227dea..71bb15f491c8 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -158,8 +158,59 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) cpu_dp->orig_ethtool_ops = NULL; } +static ssize_t tagging_show(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct dsa_port *cpu_dp = dev->dsa_ptr; + + return sprintf(buf, "%s\n", + dsa_tag_protocol_to_str(cpu_dp->tag_ops)); +} +static DEVICE_ATTR_RO(tagging); + +static struct attribute *dsa_slave_attrs[] = { + &dev_attr_tagging.attr, + NULL +}; + +static const struct attribute_group dsa_group = { + .name = "dsa", + .attrs = dsa_slave_attrs, +}; + +static void dsa_master_set_mtu(struct net_device *dev, struct dsa_port *cpu_dp) +{ + unsigned int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + int err; + + rtnl_lock(); + if (mtu <= dev->max_mtu) { + err = dev_set_mtu(dev, mtu); + if (err) + netdev_dbg(dev, "Unable to set MTU to include for DSA overheads\n"); + } + rtnl_unlock(); +} + +static void dsa_master_reset_mtu(struct net_device *dev) +{ + int err; + + rtnl_lock(); + err = dev_set_mtu(dev, ETH_DATA_LEN); + if (err) + netdev_dbg(dev, + "Unable to reset MTU to exclude DSA overheads\n"); + rtnl_unlock(); +} + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { + int ret; + + dsa_master_set_mtu(dev, cpu_dp); + /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -168,12 +219,22 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) dev->dsa_ptr = cpu_dp; - return dsa_master_ethtool_setup(dev); + ret = dsa_master_ethtool_setup(dev); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->dev.kobj, &dsa_group); + if (ret) + dsa_master_ethtool_teardown(dev); + + return ret; } void dsa_master_teardown(struct net_device *dev) { + sysfs_remove_group(&dev->dev.kobj, &dsa_group); dsa_master_ethtool_teardown(dev); + dsa_master_reset_mtu(dev); dev->dsa_ptr = NULL; diff --git a/net/dsa/port.c b/net/dsa/port.c index ed0595459df1..2d7e01b23572 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -252,9 +252,6 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; - if (netif_is_bridge_master(vlan->obj.orig_dev)) - return -EOPNOTSUPP; - if (br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7d0c19e7edcf..a3fcc1d01615 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1050,35 +1050,12 @@ static const struct net_device_ops dsa_slave_netdev_ops = { static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_attr_set = dsa_slave_port_attr_set, - .switchdev_port_obj_add = dsa_slave_port_obj_add, - .switchdev_port_obj_del = dsa_slave_port_obj_del, }; static struct device_type dsa_type = { .name = "dsa", }; -static ssize_t tagging_show(struct device *d, struct device_attribute *attr, - char *buf) -{ - struct net_device *dev = to_net_dev(d); - struct dsa_port *dp = dsa_slave_to_port(dev); - - return sprintf(buf, "%s\n", - dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops)); -} -static DEVICE_ATTR_RO(tagging); - -static struct attribute *dsa_slave_attrs[] = { - &dev_attr_tagging.attr, - NULL -}; - -static const struct attribute_group dsa_group = { - .name = "dsa", - .attrs = dsa_slave_attrs, -}; - static void dsa_slave_phylink_validate(struct net_device *dev, unsigned long *supported, struct phylink_link_state *state) @@ -1374,14 +1351,8 @@ int dsa_slave_create(struct dsa_port *port) goto out_phy; } - ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group); - if (ret) - goto out_unreg; - return 0; -out_unreg: - unregister_netdev(slave_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); @@ -1405,7 +1376,6 @@ void dsa_slave_destroy(struct net_device *slave_dev) rtnl_unlock(); dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); - sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group); unregister_netdev(slave_dev); phylink_destroy(dp->pl); free_percpu(p->stats64); @@ -1557,6 +1527,44 @@ err_fdb_work_init: return NOTIFY_BAD; } +static int +dsa_slave_switchdev_port_obj_event(unsigned long event, + struct net_device *netdev, + struct switchdev_notifier_port_obj_info *port_obj_info) +{ + int err = -EOPNOTSUPP; + + switch (event) { + case SWITCHDEV_PORT_OBJ_ADD: + err = dsa_slave_port_obj_add(netdev, port_obj_info->obj, + port_obj_info->trans); + break; + case SWITCHDEV_PORT_OBJ_DEL: + err = dsa_slave_port_obj_del(netdev, port_obj_info->obj); + break; + } + + port_obj_info->handled = true; + return notifier_from_errno(err); +} + +static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + + if (!dsa_slave_dev_check(dev)) + return NOTIFY_DONE; + + switch (event) { + case SWITCHDEV_PORT_OBJ_ADD: /* fall through */ + case SWITCHDEV_PORT_OBJ_DEL: + return dsa_slave_switchdev_port_obj_event(event, dev, ptr); + } + + return NOTIFY_DONE; +} + static struct notifier_block dsa_slave_nb __read_mostly = { .notifier_call = dsa_slave_netdevice_event, }; @@ -1565,8 +1573,13 @@ static struct notifier_block dsa_slave_switchdev_notifier = { .notifier_call = dsa_slave_switchdev_event, }; +static struct notifier_block dsa_slave_switchdev_blocking_notifier = { + .notifier_call = dsa_slave_switchdev_blocking_event, +}; + int dsa_slave_register_notifier(void) { + struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_slave_nb); @@ -1577,8 +1590,15 @@ int dsa_slave_register_notifier(void) if (err) goto err_switchdev_nb; + nb = &dsa_slave_switchdev_blocking_notifier; + err = register_switchdev_blocking_notifier(nb); + if (err) + goto err_switchdev_blocking_nb; + return 0; +err_switchdev_blocking_nb: + unregister_switchdev_notifier(&dsa_slave_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_slave_nb); return err; @@ -1586,8 +1606,14 @@ err_switchdev_nb: void dsa_slave_unregister_notifier(void) { + struct notifier_block *nb; int err; + nb = &dsa_slave_switchdev_blocking_notifier; + err = unregister_switchdev_blocking_notifier(nb); + if (err) + pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); + err = unregister_switchdev_notifier(&dsa_slave_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 2b06bb91318b..4aa1d368a5ae 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -174,6 +174,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops brcm_netdev_ops = { .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, + .overhead = BRCM_TAG_LEN, }; #endif @@ -196,5 +197,6 @@ static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, const struct dsa_device_ops brcm_prepend_netdev_ops = { .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, + .overhead = BRCM_TAG_LEN, }; #endif diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index cd13cfc542ce..8b2f92e3f3a2 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -149,4 +149,5 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops dsa_netdev_ops = { .xmit = dsa_xmit, .rcv = dsa_rcv, + .overhead = DSA_HLEN, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 4083326b806e..f5b87ee5c94e 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -168,4 +168,5 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops edsa_netdev_ops = { .xmit = edsa_xmit, .rcv = edsa_rcv, + .overhead = EDSA_HLEN, }; diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 49e9b73f1be3..cb6f82ffe5eb 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -106,4 +106,5 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, const struct dsa_device_ops gswip_netdev_ops = { .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, + .overhead = GSWIP_RX_HEADER_LEN, }; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0f62effad88f..da71b9e2af52 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -14,34 +14,18 @@ #include <net/dsa.h> #include "dsa_priv.h" -/* For Ingress (Host -> KSZ), 2 bytes are added before FCS. - * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) - * --------------------------------------------------------------------------- - * tag0 : Prioritization (not used now) - * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) - * - * For Egress (KSZ -> Host), 1 byte is added before FCS. - * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) - * --------------------------------------------------------------------------- - * tag0 : zero-based value represents port - * (eg, 0x00=port1, 0x02=port3, 0x06=port7) - */ - -#define KSZ_INGRESS_TAG_LEN 2 -#define KSZ_EGRESS_TAG_LEN 1 +/* Typically only one byte is used for tail tag. */ +#define KSZ_EGRESS_TAG_LEN 1 -static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *ksz_common_xmit(struct sk_buff *skb, + struct net_device *dev, int len) { - struct dsa_port *dp = dsa_slave_to_port(dev); struct sk_buff *nskb; int padlen; - u8 *tag; padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len; - if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) { + if (skb_tailroom(skb) >= padlen + len) { /* Let dsa_slave_xmit() free skb */ if (__skb_put_padto(skb, skb->len + padlen, false)) return NULL; @@ -49,7 +33,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) nskb = skb; } else { nskb = alloc_skb(NET_IP_ALIGN + skb->len + - padlen + KSZ_INGRESS_TAG_LEN, GFP_ATOMIC); + padlen + len, GFP_ATOMIC); if (!nskb) return NULL; skb_reserve(nskb, NET_IP_ALIGN); @@ -70,33 +54,88 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); } - tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); - tag[0] = 0; - tag[1] = 1 << dp->index; /* destination port */ - return nskb; } -static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, + struct net_device *dev, + unsigned int port, unsigned int len) { - u8 *tag; - int source_port; + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) + return NULL; - tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + pskb_trim_rcsum(skb, skb->len - len); - source_port = tag[0] & 7; + return skb; +} - skb->dev = dsa_master_find_slave(dev, 0, source_port); - if (!skb->dev) +/* + * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : Prioritization (not used now) + * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) + * + * For Egress (KSZ9477 -> Host), 1 byte is added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : zero-based value represents port + * (eg, 0x00=port1, 0x02=port3, 0x06=port7) + */ + +#define KSZ9477_INGRESS_TAG_LEN 2 +#define KSZ9477_PTP_TAG_LEN 4 +#define KSZ9477_PTP_TAG_INDICATION 0x80 + +#define KSZ9477_TAIL_TAG_OVERRIDE BIT(9) +#define KSZ9477_TAIL_TAG_LOOKUP BIT(10) + +static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct sk_buff *nskb; + u16 *tag; + u8 *addr; + + nskb = ksz_common_xmit(skb, dev, KSZ9477_INGRESS_TAG_LEN); + if (!nskb) return NULL; - pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN); + /* Tag encoding */ + tag = skb_put(nskb, KSZ9477_INGRESS_TAG_LEN); + addr = skb_mac_header(nskb); - return skb; + *tag = BIT(dp->index); + + if (is_link_local_ether_addr(addr)) + *tag |= KSZ9477_TAIL_TAG_OVERRIDE; + + *tag = cpu_to_be16(*tag); + + return nskb; +} + +static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + /* Tag decoding */ + u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + unsigned int port = tag[0] & 7; + unsigned int len = KSZ_EGRESS_TAG_LEN; + + /* Extra 4-bytes PTP timestamp */ + if (tag[0] & KSZ9477_PTP_TAG_INDICATION) + len += KSZ9477_PTP_TAG_LEN; + + return ksz_common_rcv(skb, dev, port, len); } -const struct dsa_device_ops ksz_netdev_ops = { - .xmit = ksz_xmit, - .rcv = ksz_rcv, +const struct dsa_device_ops ksz9477_netdev_ops = { + .xmit = ksz9477_xmit, + .rcv = ksz9477_rcv, + .overhead = KSZ9477_INGRESS_TAG_LEN, }; diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 548c00254c07..f48889e46ff7 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -140,4 +140,5 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops lan9303_netdev_ops = { .xmit = lan9303_xmit, .rcv = lan9303_rcv, + .overhead = LAN9303_TAG_LEN, }; diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 11535bc70743..f39f4dfeda34 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -109,4 +109,5 @@ const struct dsa_device_ops mtk_netdev_ops = { .xmit = mtk_tag_xmit, .rcv = mtk_tag_rcv, .flow_dissect = mtk_tag_flow_dissect, + .overhead = MTK_HDR_LEN, }; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 613f4ee97771..ed4f6dc26365 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -101,4 +101,5 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops qca_netdev_ops = { .xmit = qca_tag_xmit, .rcv = qca_tag_rcv, + .overhead = QCA_HDR_LEN, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 56197f0d9608..b40756ed6e57 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -84,4 +84,5 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, const struct dsa_device_ops trailer_netdev_ops = { .xmit = trailer_xmit, .rcv = trailer_rcv, + .overhead = 4, }; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index fd8faa0dfa61..4c520110b04f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -47,6 +47,7 @@ #include <linux/inet.h> #include <linux/ip.h> #include <linux/netdevice.h> +#include <linux/nvmem-consumer.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> @@ -165,15 +166,17 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { - if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) - skb->pkt_type = PACKET_BROADCAST; - else - skb->pkt_type = PACKET_MULTICAST; + if (unlikely(!ether_addr_equal_64bits(eth->h_dest, + dev->dev_addr))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { + if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + skb->pkt_type = PACKET_OTHERHOST; + } } - else if (unlikely(!ether_addr_equal_64bits(eth->h_dest, - dev->dev_addr))) - skb->pkt_type = PACKET_OTHERHOST; /* * Some variants of DSA tagging don't have an ethertype field @@ -548,3 +551,40 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) return 0; } EXPORT_SYMBOL(eth_platform_get_mac_address); + +/** + * Obtain the MAC address from an nvmem cell named 'mac-address' associated + * with given device. + * + * @dev: Device with which the mac-address cell is associated. + * @addrbuf: Buffer to which the MAC address will be copied on success. + * + * Returns 0 on success or a negative error number on failure. + */ +int nvmem_get_mac_address(struct device *dev, void *addrbuf) +{ + struct nvmem_cell *cell; + const void *mac; + size_t len; + + cell = nvmem_cell_get(dev, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; + } + + ether_addr_copy(addrbuf, mac); + kfree(mac); + + return 0; +} +EXPORT_SYMBOL(nvmem_get_mac_address); diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index ca53efa17be1..8bec827081cd 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -48,6 +48,9 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, const struct ipv6hdr *hdr = ipv6_hdr(skb); struct neighbour *n; + if (!daddr) + return -EINVAL; + /* TODO: * if this package isn't ipv6 one, where should it be routed? */ diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index b231e40f006a..0c25c0bcc4da 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -242,7 +242,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) * dev_set_mac_address require RTNL_LOCK */ rtnl_lock(); - rc = dev_set_mac_address(dev, &addr); + rc = dev_set_mac_address(dev, &addr, NULL); rtnl_unlock(); if (rc) goto dev_unregister; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1fbe2f815474..0dfb72c46671 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,6 +208,7 @@ int inet_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; + sk->sk_max_ack_backlog = backlog; /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ @@ -231,7 +232,6 @@ int inet_listen(struct socket *sock, int backlog) goto out; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } - sk->sk_max_ack_backlog = backlog; err = 0; out: @@ -1385,6 +1385,10 @@ out: } EXPORT_SYMBOL(inet_gso_segment); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, + struct sk_buff *)); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; @@ -1494,7 +1498,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); - pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, + ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); @@ -1556,6 +1561,8 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) return -EINVAL; } +INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); @@ -1581,7 +1588,9 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) * because any hdr with option will have been flushed in * inet_gro_receive(). */ - err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); + err = INDIRECT_CALL_2(ops->callbacks.gro_complete, + tcp4_gro_complete, udp4_gro_complete, + skb, nhoff + sizeof(*iph)); out_unlock: rcu_read_unlock(); @@ -1964,6 +1973,8 @@ static int __init inet_init(void) /* Add UDP-Lite (RFC 3828) */ udplite4_register(); + raw_init(); + ping_init(); /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a34602ae27de..04ba321ae5ce 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -952,17 +952,18 @@ static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ - if (ipv4_is_zeronet(addr)) + if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) rc = 0; else { __u32 haddr = ntohl(addr); - if (IN_CLASSA(haddr)) rc = 8; else if (IN_CLASSB(haddr)) rc = 16; else if (IN_CLASSC(haddr)) rc = 24; + else if (IN_CLASSE(haddr)) + rc = 32; } return rc; @@ -1100,7 +1101,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) inet_del_ifa(in_dev, ifap, 1); break; } - ret = dev_change_flags(dev, ifr->ifr_flags); + ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9e1c840596c5..5459f41fc26f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -125,10 +125,13 @@ static void esp_output_done(struct crypto_async_request *base, int err) void *tmp; struct xfrm_state *x; - if (xo && (xo->flags & XFRM_DEV_RESUME)) - x = skb->sp->xvec[skb->sp->len - 1]; - else + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + struct sec_path *sp = skb_sec_path(skb); + + x = sp->xvec[sp->len - 1]; + } else { x = skb_dst(skb)->xfrm; + } tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 58834a10c0be..8756e0e790d2 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -46,11 +46,12 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { - err = secpath_set(skb); - if (err) + struct sec_path *sp = secpath_set(skb); + + if (!sp) goto out; - if (skb->sp->len == XFRM_MAX_DEPTH) + if (sp->len == XFRM_MAX_DEPTH) goto out; x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, @@ -59,8 +60,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (!x) goto out; - skb->sp->xvec[skb->sp->len++] = x; - skb->sp->olen++; + sp->xvec[sp->len++] = x; + sp->olen++; xo = xfrm_offload(skb); if (!xo) { @@ -114,6 +115,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); @@ -121,7 +123,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b5c3937ca6ec..5022bc63863a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (!fi) goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, - cfg->fc_mx_len); + cfg->fc_mx_len, extack); if (unlikely(IS_ERR(fi->fib_metrics))) { err = PTR_ERR(fi->fib_metrics); kfree(fi); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 500a59906b87..0c9f171fb085 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -3,6 +3,7 @@ #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/ip.h> +#include <linux/icmp.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> @@ -1003,15 +1004,89 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } +static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info) +{ + const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]); + + if (ipprot && ipprot->err_handler) { + if (!ipprot->err_handler(skb, info)) + return 0; + } + + return -ENOENT; +} + +static int gue_err(struct sk_buff *skb, u32 info) +{ + int transport_offset = skb_transport_offset(skb); + struct guehdr *guehdr; + size_t optlen; + int ret; + + if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + + switch (((struct iphdr *)guehdr)->version) { + case 4: + ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info); + goto out; +#if IS_ENABLED(CONFIG_IPV6) + case 6: + ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info); + goto out; +#endif + default: + ret = -EOPNOTSUPP; + goto out; + } + } + default: /* Undefined version */ + return -EOPNOTSUPP; + } + + if (guehdr->control) + return -ENOENT; + + optlen = guehdr->hlen << 2; + + if (validate_gue_flags(guehdr, optlen)) + return -EINVAL; + + /* Handling exceptions for direct UDP encapsulation in GUE would lead to + * recursion. Besides, this kind of encapsulation can't even be + * configured currently. Discard this. + */ + if (guehdr->proto_ctype == IPPROTO_UDP) + return -EOPNOTSUPP; + + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); + +out: + skb_set_transport_header(skb, transport_offset); + return ret; +} + static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, + .err_handler = gue_err, }; static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, + .err_handler = gue_err, }; static int ip_tunnel_encap_add_fou_ops(void) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7efe740c06eb..a4bf22ee3aed 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -151,20 +151,25 @@ drop: return NET_RX_DROP; } -static void gre_err(struct sk_buff *skb, u32 info) +static int gre_err(struct sk_buff *skb, u32 info) { const struct gre_protocol *proto; const struct iphdr *iph = (const struct iphdr *)skb->data; u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; + int err = 0; if (ver >= GREPROTO_MAX) - return; + return -EINVAL; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (proto && proto->err_handler) proto->err_handler(skb, info); + else + err = -EPROTONOSUPPORT; rcu_read_unlock(); + + return err; } static const struct net_protocol net_gre_protocol = { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d832beed6e3a..065997f414e6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1079,7 +1079,7 @@ error: goto drop; } -void icmp_err(struct sk_buff *skb, u32 info) +int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; @@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info) */ if (icmph->type != ICMP_ECHOREPLY) { ping_err(skb, offset, info); - return; + return 0; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP); else if (type == ICMP_REDIRECT) ipv4_redirect(skb, net, 0, IPPROTO_ICMP); + + return 0; } /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 15e7f7915a21..6ea523d71947 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -183,7 +183,9 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * int i, low, high, attempt_half; struct inet_bind_bucket *tb; u32 remaining, offset; + int l3mdev; + l3mdev = inet_sk_bound_l3mdev(sk); attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_get_local_port_range(net, &low, &high); @@ -219,7 +221,8 @@ other_parity_scan: hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (!inet_csk_bind_conflict(sk, tb, false, false)) goto success; goto next_port; @@ -293,6 +296,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct net *net = sock_net(sk); struct inet_bind_bucket *tb = NULL; kuid_t uid = sock_i_uid(sk); + int l3mdev; + + l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { head = inet_csk_find_open_port(sk, &tb, &port); @@ -306,11 +312,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) goto tb_found; tb_not_found: tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); + net, head, port, l3mdev); if (!tb) goto fail_unlock; tb_found: @@ -874,7 +881,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog) reqsk_queue_alloc(&icsk->icsk_accept_queue); - sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4e5bc4b2f14e..1a4e9ff02762 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -998,7 +998,9 @@ next_chunk: if (!inet_diag_bc_sk(bc, sk)) goto next_normal; - sock_hold(sk); + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto next_normal; + num_arr[accum] = num; sk_arr[accum] = sk; if (++accum == SKARR_SZ) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 411dd7a90046..942265d65eb3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk) struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum) + const unsigned short snum, + int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); + tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; @@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; + int l3mdev; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; @@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) return -ENOENT; } if (tb->port != port) { + l3mdev = inet_sk_bound_l3mdev(sk); + /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same @@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), sock_net(sk)) && - tb->port == port) + tb->l3mdev == l3mdev && tb->port == port) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, - sock_net(sk), head, port); + sock_net(sk), head, port, + l3mdev); if (!tb) { spin_unlock(&head->lock); return -ENOMEM; @@ -228,26 +234,16 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; - struct inet_sock *inet = inet_sk(sk); - if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && + if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && !ipv6_only_sock(sk)) { - __be32 rcv_saddr = inet->inet_rcv_saddr; + if (sk->sk_rcv_saddr != daddr) + return -1; + + if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) + return -1; + score = sk->sk_family == PF_INET ? 2 : 1; - if (rcv_saddr) { - if (rcv_saddr != daddr) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } @@ -303,26 +299,12 @@ struct sock *__inet_lookup_listener(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - unsigned int hash = inet_lhashfn(net, hnum); - struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - bool exact_dif = inet_exact_dif_match(net, skb); struct inet_listen_hashbucket *ilb2; - struct sock *sk, *result = NULL; - int score, hiscore = 0; + struct sock *result = NULL; unsigned int hash2; - u32 phash = 0; - - if (ilb->count <= 10 || !hashinfo->lhash2) - goto port_lookup; - - /* Too many sk in the ilb bucket (which is hashed by port alone). - * Try lhash2 (which is hashed by port and addr) instead. - */ hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, @@ -331,34 +313,12 @@ struct sock *__inet_lookup_listener(struct net *net, goto done; /* Lookup lhash2 with INADDR_ANY */ - hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, + saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); - goto done; - -port_lookup: - sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, - dif, sdif, exact_dif); - if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - goto done; - } - result = sk; - hiscore = score; - } - } done: if (unlikely(IS_ERR(result))) return NULL; @@ -675,6 +635,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 remaining, offset; int ret, i, low, high; static u32 hint; + int l3mdev; if (port) { head = &hinfo->bhash[inet_bhashfn(net, port, @@ -693,6 +654,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return ret; } + l3mdev = inet_sk_bound_l3mdev(sk); + inet_get_local_port_range(net, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; @@ -719,7 +682,8 @@ other_parity_scan: * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && tb->port == port) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; @@ -732,7 +696,7 @@ other_parity_scan: } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); + net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; @@ -798,13 +762,22 @@ void inet_hashinfo_init(struct inet_hashinfo *h) } EXPORT_SYMBOL_GPL(inet_hashinfo_init); +static void init_hashinfo_lhash2(struct inet_hashinfo *h) +{ + int i; + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_HEAD(&h->lhash2[i].head); + h->lhash2[i].count = 0; + } +} + void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { - unsigned int i; - h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, @@ -814,13 +787,23 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, &h->lhash2_mask, low_limit, high_limit); + init_hashinfo_lhash2(h); +} - for (i = 0; i <= h->lhash2_mask; i++) { - spin_lock_init(&h->lhash2[i].lock); - INIT_HLIST_HEAD(&h->lhash2[i].head); - h->lhash2[i].count = 0; - } +int inet_hashinfo2_init_mod(struct inet_hashinfo *h) +{ + h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); + if (!h->lhash2) + return -ENOMEM; + + h->lhash2_mask = INET_LHTABLE_SIZE - 1; + /* INET_LHTABLE_SIZE must be a power of 2 */ + BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); + + init_hashinfo_lhash2(h); + return 0; } +EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 32662e9e5d21..00ec819f949b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -69,9 +69,17 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); +#ifdef CONFIG_NET_SWITCHDEV + if (skb->offload_l3_fwd_mark) { + consume_skb(skb); + return 0; + } +#endif + if (unlikely(opt->optlen)) ip_forward_options(skb); + skb->tstamp = 0; return dst_output(net, sk, skb); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d6ee343fdb86..867be8f7f1fa 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -346,10 +346,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct rb_node **rbn, *parent; struct sk_buff *skb1, *prev_tail; + int ihl, end, skb1_run_end; struct net_device *dev; unsigned int fragsize; int flags, offset; - int ihl, end; int err = -ENOENT; u8 ecn; @@ -419,7 +419,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * overlapping fragment, the entire datagram (and any constituent * fragments) MUST be silently discarded. * - * We do the same here for IPv4 (and increment an snmp counter). + * We do the same here for IPv4 (and increment an snmp counter) but + * we do not want to drop the whole queue in response to a duplicate + * fragment. */ err = -EINVAL; @@ -444,13 +446,17 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) do { parent = *rbn; skb1 = rb_to_skb(parent); + skb1_run_end = skb1->ip_defrag_offset + + FRAG_CB(skb1)->frag_run_len; if (end <= skb1->ip_defrag_offset) rbn = &parent->rb_left; - else if (offset >= skb1->ip_defrag_offset + - FRAG_CB(skb1)->frag_run_len) + else if (offset >= skb1_run_end) rbn = &parent->rb_right; - else /* Found an overlap with skb1. */ - goto overlap; + else if (offset >= skb1->ip_defrag_offset && + end <= skb1_run_end) + goto err; /* No new data, potential duplicate */ + else + goto overlap; /* Found an overlap */ } while (*rbn); /* Here we have parent properly set, and rbn pointing to * one of its NULL left/right children. Insert skb. @@ -515,6 +521,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct rb_node *rbn; int len; int ihlen; + int delta; int err; u8 ecn; @@ -556,10 +563,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, if (len > 65535) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; + delta += head->truesize; + if (delta) + add_frag_mem_limit(qp->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 38befe829caf..c7a7bd58a23c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; static unsigned int erspan_net_id __read_mostly; -static void ipgre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) +static int ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only @@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info, unsigned int data_len = 0; struct ip_tunnel *t; + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else if (tpi->proto == htons(ETH_P_ERSPAN) || + tpi->proto == htons(ETH_P_ERSPAN2)) + itn = net_generic(net, erspan_net_id); + else + itn = net_generic(net, ipgre_net_id); + + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->daddr, iph->saddr, tpi->key); + + if (!t) + return -ENOENT; + switch (type) { default: case ICMP_PARAMETERPROB: - return; + return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return; + return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return; + return 0; data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; @@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info, break; } - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else if (tpi->proto == htons(ETH_P_ERSPAN) || - tpi->proto == htons(ETH_P_ERSPAN2)) - itn = net_generic(net, erspan_net_id); - else - itn = net_generic(net, ipgre_net_id); - - iph = (const struct iphdr *)(icmp_hdr(skb) + 1); - t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, - iph->daddr, iph->saddr, tpi->key); - - if (!t) - return; - #if IS_ENABLED(CONFIG_IPV6) if (tpi->proto == htons(ETH_P_IPV6) && !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type, data_len)) - return; + return 0; #endif if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return; + return 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return; + return 0; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; + + return 0; } static void gre_err(struct sk_buff *skb, u32 info) @@ -1339,12 +1341,6 @@ static void ipgre_tap_setup(struct net_device *dev) ip_tunnel_setup(dev, gre_tap_net_id); } -bool is_gretap_dev(const struct net_device *dev) -{ - return dev->netdev_ops == &gre_tap_netdev_ops; -} -EXPORT_SYMBOL_GPL(is_gretap_dev); - static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) @@ -1601,7 +1597,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, memset(&tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, - &ipgre_tap_ops, tb); + &ipgre_tap_ops, tb, NULL); if (IS_ERR(dev)) return dev; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 35a786c0aaa0..26921f6b3b92 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { - __skb_pull(skb, skb_network_header_len(skb)); - - rcu_read_lock(); - { - int protocol = ip_hdr(skb)->protocol; - const struct net_protocol *ipprot; - int raw; + const struct net_protocol *ipprot; + int raw, ret; - resubmit: - raw = raw_local_deliver(skb, protocol); +resubmit: + raw = raw_local_deliver(skb, protocol); - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot) { - int ret; - - if (!ipprot->no_policy) { - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - kfree_skb(skb); - goto out; - } - nf_reset(skb); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot) { + if (!ipprot->no_policy) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return; } - ret = ipprot->handler(skb); - if (ret < 0) { - protocol = -ret; - goto resubmit; + nf_reset(skb); + } + ret = ipprot->handler(skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + } else { + if (!raw) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); } - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); } else { - if (!raw) { - if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_PROT_UNREACH, 0); - } - kfree_skb(skb); - } else { - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); - consume_skb(skb); - } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + consume_skb(skb); } } - out: +} + +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + __skb_pull(skb, skb_network_header_len(skb)); + + rcu_read_lock(); + ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; @@ -547,7 +546,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -594,7 +593,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c09219e7f230..c80188875f39 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); + skb_ext_copy(to, from); #if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif @@ -867,6 +868,7 @@ static int __ip_append_data(struct sock *sk, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; @@ -880,8 +882,8 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; + bool paged, extra_uref; u32 tskey = 0; - bool paged; skb = skb_peek_tail(queue); @@ -916,6 +918,20 @@ static int __ip_append_data(struct sock *sk, (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + extra_uref = true; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + } + } + cork->length += length; /* So, what's going on in the loop below? @@ -939,7 +955,7 @@ static int __ip_append_data(struct sock *sk, unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; - unsigned int pagedlen = 0; + unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; @@ -956,6 +972,7 @@ alloc_new_skb: if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; + pagedlen = 0; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) @@ -1000,12 +1017,6 @@ alloc_new_skb: skb->csum = 0; skb_reserve(skb, hh_len); - /* only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - /* * Find where to start putting bytes. */ @@ -1038,6 +1049,13 @@ alloc_new_skb: exthdrlen = 0; csummode = CHECKSUM_NONE; + /* only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1067,7 +1085,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1097,6 +1115,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1109,6 +1131,8 @@ alloc_new_skb: error_efault: err = -EFAULT; error: + if (uarg) + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c248e0dccbe1..9a0e67b52a4e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, } skb_clear_hash_if_not_l4(skb); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, xnet); @@ -151,6 +151,7 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, sizeof(struct in6_addr)); else dst->key.u.ipv4.dst = src->key.u.ipv4.src; + dst->key.tun_flags = src->key.tun_flags; dst->mode = src->mode | IP_TUNNEL_INFO_TX; return res; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 88212615bf4c..b9a9873c25c6 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -220,7 +220,7 @@ static int __init ic_open_devs(void) for_each_netdev(&init_net, dev) { if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev)) continue; - if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0) pr_err("IP-Config: Failed to open %s\n", dev->name); } @@ -238,7 +238,7 @@ static int __init ic_open_devs(void) if (ic_proto_enabled && !able) continue; oflags = dev->flags; - if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) { pr_err("IP-Config: Failed to open %s\n", dev->name); continue; @@ -315,7 +315,7 @@ static void __init ic_close_devs(void) dev = d->dev; if (d != ic_dev && !netdev_uses_dsa(dev)) { pr_debug("IP-Config: Downing %s\n", dev->name); - dev_change_flags(dev, d->flags); + dev_change_flags(dev, d->flags, NULL); } kfree(d); } @@ -429,6 +429,8 @@ static int __init ic_defaults(void) ic_netmask = htonl(IN_CLASSB_NET); else if (IN_CLASSC(ntohl(ic_myaddr))) ic_netmask = htonl(IN_CLASSC_NET); + else if (IN_CLASSE(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSE_NET); else { pr_err("IP-Config: Unable to guess netmask for address %pI4\n", &ic_myaddr); @@ -1361,18 +1363,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v) } return 0; } - -static int ntp_servers_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, ntp_servers_seq_show, NULL); -} - -static const struct file_operations ntp_servers_seq_fops = { - .open = ntp_servers_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq); #endif /* CONFIG_PROC_FS */ /* diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e65287c27e3d..57c5dd283a2c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) struct ip_tunnel *t; int err = 0; + t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (!t) { + err = -ENOENT; + goto out; + } + switch (type) { case ICMP_DEST_UNREACH: switch (code) { @@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info) goto out; } - t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->daddr, iph->saddr, 0); - if (!t) { - err = -ENOENT; - goto out; - } - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a6defbec4f1b..ddbf8c9a1abb 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -69,6 +69,8 @@ #include <net/nexthop.h> #include <net/switchdev.h> +#include <linux/nospec.h> + struct ipmr_rule { struct fib_rule common; }; @@ -506,7 +508,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) dev->flags |= IFF_MULTICAST; if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); } @@ -589,7 +591,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); @@ -1612,6 +1614,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) return -EFAULT; if (vr.vifi >= mrt->maxvif) return -EINVAL; + vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); read_lock(&mrt_lock); vif = &mrt->vif_table[vr.vifi]; if (VIF_EXISTS(mrt, vr.vifi)) { @@ -1686,6 +1689,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) return -EFAULT; if (vr.vifi >= mrt->maxvif) return -EINVAL; + vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); read_lock(&mrt_lock); vif = &mrt->vif_table[vr.vifi]; if (VIF_EXISTS(mrt, vr.vifi)) { @@ -1802,7 +1806,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; - if (!skb->offload_mr_fwd_mark) + if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; @@ -1820,8 +1824,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - int in_vifi, struct sk_buff *skb, - struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -2027,7 +2030,7 @@ forward: if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, - skb2, c, psend); + skb2, psend); } psend = ct; } @@ -2039,9 +2042,9 @@ last_forward: if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, - c, psend); + psend); } else { - ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, psend); return; } } diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 6d218f5a2e71..ca9a5fefdefa 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -6,7 +6,8 @@ #include <net/tcp.h> static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, - int fc_mx_len, u32 *metrics) + int fc_mx_len, u32 *metrics, + struct netlink_ext_ack *extack) { bool ecn_ca = false; struct nlattr *nla; @@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, if (!type) continue; - if (type > RTAX_MAX) + if (type > RTAX_MAX) { + NL_SET_ERR_MSG(extack, "Invalid metric type"); return -EINVAL; + } if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) + if (val == TCP_CA_UNSPEC) { + NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; + } } else { - if (nla_len(nla) != sizeof(u32)) + if (nla_len(nla) != sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Invalid attribute in metrics"); return -EINVAL; + } val = nla_get_u32(nla); } if (type == RTAX_ADVMSS && val > 65535 - 40) @@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, val = 65535 - 15; if (type == RTAX_HOPLIMIT && val > 255) val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) { + NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute"); return -EINVAL; + } metrics[type - 1] = val; } @@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, } struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, - int fc_mx_len) + int fc_mx_len, + struct netlink_ext_ack *extack) { struct dst_metrics *fib_metrics; int err; @@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); - err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics); + err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics, + extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); } else { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 184bf2e0a1ed..80f72cc5ca8d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -156,15 +156,10 @@ config NF_NAT_SNMP_BASIC To compile it as a module, choose M here. If unsure, say N. -config NF_NAT_PROTO_GRE - tristate - depends on NF_CT_PROTO_GRE - config NF_NAT_PPTP tristate depends on NF_CONNTRACK default NF_CONNTRACK_PPTP - select NF_NAT_PROTO_GRE config NF_NAT_H323 tristate diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 367993adf4d3..fd7122e0e2c9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,7 +3,7 @@ # Makefile for the netfilter modules on top of IPv4. # -nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o +nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o @@ -28,9 +28,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -# NAT protocols (nf_nat) -obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o - obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2c8d313ae216..b61977db9b7f 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -56,18 +56,15 @@ struct clusterip_config { #endif enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ - struct rcu_head rcu; - + struct rcu_head rcu; /* for call_rcu_bh */ + struct net *net; /* netns for pernet list */ char ifname[IFNAMSIZ]; /* device ifname */ - struct notifier_block notifier; /* refresh c->ifindex in it */ }; #ifdef CONFIG_PROC_FS static const struct file_operations clusterip_proc_fops; #endif -static unsigned int clusterip_net_id __read_mostly; - struct clusterip_net { struct list_head configs; /* lock protects the configs list */ @@ -75,51 +72,66 @@ struct clusterip_net { #ifdef CONFIG_PROC_FS struct proc_dir_entry *procdir; + /* mutex protects the config->pde*/ + struct mutex mutex; #endif }; +static unsigned int clusterip_net_id __read_mostly; +static inline struct clusterip_net *clusterip_pernet(struct net *net) +{ + return net_generic(net, clusterip_net_id); +} + static inline void clusterip_config_get(struct clusterip_config *c) { refcount_inc(&c->refcount); } - static void clusterip_config_rcu_free(struct rcu_head *head) { - kfree(container_of(head, struct clusterip_config, rcu)); + struct clusterip_config *config; + struct net_device *dev; + + config = container_of(head, struct clusterip_config, rcu); + dev = dev_get_by_name(config->net, config->ifname); + if (dev) { + dev_mc_del(dev, config->clustermac); + dev_put(dev); + } + kfree(config); } static inline void clusterip_config_put(struct clusterip_config *c) { if (refcount_dec_and_test(&c->refcount)) - call_rcu_bh(&c->rcu, clusterip_config_rcu_free); + call_rcu(&c->rcu, clusterip_config_rcu_free); } /* decrease the count of entries using/referencing this config. If last * entry(rule) is removed, remove the config from lists, but don't free it * yet, since proc-files could still be holding references */ static inline void -clusterip_config_entry_put(struct net *net, struct clusterip_config *c) +clusterip_config_entry_put(struct clusterip_config *c) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(c->net); local_bh_disable(); if (refcount_dec_and_lock(&c->entries, &cn->lock)) { + list_del_rcu(&c->list); + spin_unlock(&cn->lock); + local_bh_enable(); /* In case anyone still accesses the file, the open/close * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. */ #ifdef CONFIG_PROC_FS + mutex_lock(&cn->mutex); if (cn->procdir) proc_remove(c->pde); + mutex_unlock(&cn->mutex); #endif - list_del_rcu(&c->list); - spin_unlock(&cn->lock); - local_bh_enable(); - - unregister_netdevice_notifier(&c->notifier); - return; } local_bh_enable(); @@ -129,7 +141,7 @@ static struct clusterip_config * __clusterip_config_find(struct net *net, __be32 clusterip) { struct clusterip_config *c; - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); list_for_each_entry_rcu(c, &cn->configs, list) { if (c->clusterip == clusterip) @@ -181,32 +193,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; - c = container_of(this, struct clusterip_config, notifier); - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } else if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; + spin_lock_bh(&cn->lock); + list_for_each_entry_rcu(c, &cn->configs, list) { + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, c->ifname)) { + c->ifindex = dev->ifindex; + dev_mc_add(dev, c->clustermac); + } + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == c->ifindex) { + dev_mc_del(dev, c->clustermac); + c->ifindex = -1; + } + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, c->ifname)) { + c->ifindex = dev->ifindex; + dev_mc_add(dev, c->clustermac); + } else if (dev->ifindex == c->ifindex) { + dev_mc_del(dev, c->clustermac); + c->ifindex = -1; + } + break; } - break; } + spin_unlock_bh(&cn->lock); return NOTIFY_DONE; } @@ -215,30 +232,44 @@ static struct clusterip_config * clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, __be32 ip, const char *iniface) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; + struct net_device *dev; int err; + if (iniface[0] == '\0') { + pr_info("Please specify an interface name\n"); + return ERR_PTR(-EINVAL); + } + c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return ERR_PTR(-ENOMEM); - strcpy(c->ifname, iniface); - c->ifindex = -1; - c->clusterip = ip; + dev = dev_get_by_name(net, iniface); + if (!dev) { + pr_info("no such interface %s\n", iniface); + kfree(c); + return ERR_PTR(-ENOENT); + } + c->ifindex = dev->ifindex; + strcpy(c->ifname, dev->name); memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); + dev_mc_add(dev, c->clustermac); + dev_put(dev); + + c->clusterip = ip; c->num_total_nodes = i->num_total_nodes; clusterip_config_init_nodelist(c, i); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; + c->net = net; refcount_set(&c->refcount, 1); spin_lock_bh(&cn->lock); if (__clusterip_config_find(net, ip)) { - spin_unlock_bh(&cn->lock); - kfree(c); - - return ERR_PTR(-EBUSY); + err = -EBUSY; + goto out_config_put; } list_add_rcu(&c->list, &cn->configs); @@ -250,9 +281,11 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); + mutex_lock(&cn->mutex); c->pde = proc_create_data(buffer, 0600, cn->procdir, &clusterip_proc_fops, c); + mutex_unlock(&cn->mutex); if (!c->pde) { err = -ENOMEM; goto err; @@ -260,22 +293,17 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, } #endif - c->notifier.notifier_call = clusterip_netdev_event; - err = register_netdevice_notifier(&c->notifier); - if (!err) { - refcount_set(&c->entries, 1); - return c; - } + refcount_set(&c->entries, 1); + return c; #ifdef CONFIG_PROC_FS - proc_remove(c->pde); err: #endif spin_lock_bh(&cn->lock); list_del_rcu(&c->list); +out_config_put: spin_unlock_bh(&cn->lock); clusterip_config_put(c); - return ERR_PTR(err); } @@ -475,34 +503,20 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) &e->ip.dst.s_addr); return -EINVAL; } else { - struct net_device *dev; - - if (e->ip.iniface[0] == '\0') { - pr_info("Please specify an interface name\n"); - return -EINVAL; - } - - dev = dev_get_by_name(par->net, e->ip.iniface); - if (!dev) { - pr_info("no such interface %s\n", - e->ip.iniface); - return -ENOENT; - } - dev_put(dev); - config = clusterip_config_init(par->net, cipinfo, e->ip.dst.s_addr, e->ip.iniface); if (IS_ERR(config)) return PTR_ERR(config); } - } + } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) + return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); - clusterip_config_entry_put(par->net, config); + clusterip_config_entry_put(config); clusterip_config_put(config); return ret; } @@ -524,7 +538,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ - clusterip_config_entry_put(par->net, cipinfo->config); + clusterip_config_entry_put(cipinfo->config); clusterip_config_put(cipinfo->config); @@ -806,7 +820,7 @@ static const struct file_operations clusterip_proc_fops = { static int clusterip_net_init(struct net *net) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); int ret; INIT_LIST_HEAD(&cn->configs); @@ -824,6 +838,7 @@ static int clusterip_net_init(struct net *net) pr_err("Unable to proc dir entry\n"); return -ENOMEM; } + mutex_init(&cn->mutex); #endif /* CONFIG_PROC_FS */ return 0; @@ -831,13 +846,15 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); + #ifdef CONFIG_PROC_FS + mutex_lock(&cn->mutex); proc_remove(cn->procdir); cn->procdir = NULL; + mutex_unlock(&cn->mutex); #endif nf_unregister_net_hook(net, &cip_arp_ops); - WARN_ON_ONCE(!list_empty(&cn->configs)); } static struct pernet_operations clusterip_net_ops = { @@ -847,6 +864,10 @@ static struct pernet_operations clusterip_net_ops = { .size = sizeof(struct clusterip_net), }; +struct notifier_block cip_netdev_notifier = { + .notifier_call = clusterip_netdev_event +}; + static int __init clusterip_tg_init(void) { int ret; @@ -859,11 +880,17 @@ static int __init clusterip_tg_init(void) if (ret < 0) goto cleanup_subsys; + ret = register_netdevice_notifier(&cip_netdev_notifier); + if (ret < 0) + goto unregister_target; + pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); return 0; +unregister_target: + xt_unregister_target(&clusterip_tg_reg); cleanup_subsys: unregister_pernet_subsys(&clusterip_net_ops); return ret; @@ -873,11 +900,12 @@ static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); + unregister_netdevice_notifier(&cip_netdev_notifier); xt_unregister_target(&clusterip_tg_reg); unregister_pernet_subsys(&clusterip_net_ops); - /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */ - rcu_barrier_bh(); + /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ + rcu_barrier(); } module_init(clusterip_tg_init); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index ce1512b02cb2..fd3f9e8a74da 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -81,9 +81,12 @@ static int __init masquerade_tg_init(void) int ret; ret = xt_register_target(&masquerade_tg_reg); + if (ret) + return ret; - if (ret == 0) - nf_nat_masquerade_ipv4_register_notifier(); + ret = nf_nat_masquerade_ipv4_register_notifier(); + if (ret) + xt_unregister_target(&masquerade_tg_reg); return ret; } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 78a67f961d86..2687db015b6f 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -62,22 +62,8 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, } #endif /* CONFIG_XFRM */ -static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range2 *range) -{ - return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && - ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); -} - -static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t, - __be16 dport) -{ - return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport); -} - static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, - const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { @@ -90,8 +76,8 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, iph = (void *)skb->data + iphdroff; hdroff = iphdroff + iph->ihl * 4; - if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff, - target, maniptype)) + if (!nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, + hdroff, target, maniptype)) return false; iph = (void *)skb->data + iphdroff; @@ -161,8 +147,6 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .l3proto = NFPROTO_IPV4, - .in_range = nf_nat_ipv4_in_range, - .secure_port = nf_nat_ipv4_secure_port, .manip_pkt = nf_nat_ipv4_manip_pkt, .csum_update = nf_nat_ipv4_csum_update, .csum_recalc = nf_nat_ipv4_csum_recalc, @@ -186,7 +170,6 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); - const struct nf_nat_l4proto *l4proto; struct nf_conntrack_tuple target; unsigned long statusbit; @@ -217,9 +200,8 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, if (!(ct->status & statusbit)) return 1; - l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol); if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), - l4proto, &ct->tuplehash[!dir].tuple, !manip)) + &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { @@ -233,8 +215,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0); - if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip)) + if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; return 1; @@ -391,26 +372,12 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn); static int __init nf_nat_l3proto_ipv4_init(void) { - int err; - - err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp); - if (err < 0) - goto err1; - err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); - if (err < 0) - goto err2; - return err; - -err2: - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); -err1: - return err; + return nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); } static void __exit nf_nat_l3proto_ipv4_exit(void) { nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4); - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); } MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index a9d5e013e555..41327bb99093 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -147,28 +147,50 @@ static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; -static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); +static int masq_refcnt; +static DEFINE_MUTEX(masq_mutex); -void nf_nat_masquerade_ipv4_register_notifier(void) +int nf_nat_masquerade_ipv4_register_notifier(void) { + int ret = 0; + + mutex_lock(&masq_mutex); /* check if the notifier was already set */ - if (atomic_inc_return(&masquerade_notifier_refcount) > 1) - return; + if (++masq_refcnt > 1) + goto out_unlock; /* Register for device down reports */ - register_netdevice_notifier(&masq_dev_notifier); + ret = register_netdevice_notifier(&masq_dev_notifier); + if (ret) + goto err_dec; /* Register IP address change reports */ - register_inetaddr_notifier(&masq_inet_notifier); + ret = register_inetaddr_notifier(&masq_inet_notifier); + if (ret) + goto err_unregister; + + mutex_unlock(&masq_mutex); + return ret; + +err_unregister: + unregister_netdevice_notifier(&masq_dev_notifier); +err_dec: + masq_refcnt--; +out_unlock: + mutex_unlock(&masq_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); void nf_nat_masquerade_ipv4_unregister_notifier(void) { + mutex_lock(&masq_mutex); /* check if the notifier still has clients */ - if (atomic_dec_return(&masquerade_notifier_refcount) > 0) - return; + if (--masq_refcnt > 0) + goto out_unlock; unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); +out_unlock: + mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 5d259a12e25f..68b4d450391b 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -299,8 +299,6 @@ pptp_inbound_pkt(struct sk_buff *skb, static int __init nf_nat_helper_pptp_init(void) { - nf_nat_need_gre(); - BUG_ON(nf_nat_pptp_hook_outbound != NULL); RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c deleted file mode 100644 index 00fda6331ce5..000000000000 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * nf_nat_proto_gre.c - * - * NAT protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. - * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * (C) 2006-2012 Patrick McHardy <kaber@trash.net> - * - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_l4proto.h> -#include <linux/netfilter/nf_conntrack_proto_gre.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); - -/* generate unique tuple ... */ -static void -gre_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - static u_int16_t key; - __be16 *keyptr; - unsigned int min, i, range_size; - - /* If there is no master conntrack we are not PPTP, - do not change tuples */ - if (!ct->master) - return; - - if (maniptype == NF_NAT_MANIP_SRC) - keyptr = &tuple->src.u.gre.key; - else - keyptr = &tuple->dst.u.gre.key; - - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { - pr_debug("%p: NATing GRE PPTP\n", ct); - min = 1; - range_size = 0xffff; - } else { - min = ntohs(range->min_proto.gre.key); - range_size = ntohs(range->max_proto.gre.key) - min + 1; - } - - pr_debug("min = %u, range_size = %u\n", min, range_size); - - for (i = 0; ; ++key) { - *keyptr = htons(min + key % range_size); - if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) - return; - } - - pr_debug("%p: no NAT mapping\n", ct); - return; -} - -/* manipulate a GRE packet according to maniptype */ -static bool -gre_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - const struct gre_base_hdr *greh; - struct pptp_gre_header *pgreh; - - /* pgreh includes two optional 32bit fields which are not required - * to be there. That's where the magic '8' comes from */ - if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) - return false; - - greh = (void *)skb->data + hdroff; - pgreh = (struct pptp_gre_header *)greh; - - /* we only have destination manip of a packet, since 'source key' - * is not present in the packet itself */ - if (maniptype != NF_NAT_MANIP_DST) - return true; - - switch (greh->flags & GRE_VERSION) { - case GRE_VERSION_0: - /* We do not currently NAT any GREv0 packets. - * Try to behave like "nf_nat_proto_unknown" */ - break; - case GRE_VERSION_1: - pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); - pgreh->call_id = tuple->dst.u.gre.key; - break; - default: - pr_debug("can't nat unknown GRE version\n"); - return false; - } - return true; -} - -static const struct nf_nat_l4proto gre = { - .l4proto = IPPROTO_GRE, - .manip_pkt = gre_manip_pkt, - .in_range = nf_nat_l4proto_in_range, - .unique_tuple = gre_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; - -static int __init nf_nat_proto_gre_init(void) -{ - return nf_nat_l4proto_register(NFPROTO_IPV4, &gre); -} - -static void __exit nf_nat_proto_gre_fini(void) -{ - nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre); -} - -module_init(nf_nat_proto_gre_init); -module_exit(nf_nat_proto_gre_fini); - -void nf_nat_need_gre(void) -{ - return; -} -EXPORT_SYMBOL_GPL(nf_nat_need_gre); diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c deleted file mode 100644 index 6d7cf1d79baf..000000000000 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ /dev/null @@ -1,83 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/ip.h> -#include <linux/icmp.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l4proto.h> - -static bool -icmp_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && - ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); -} - -static void -icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - static u_int16_t id; - unsigned int range_size; - unsigned int i; - - range_size = ntohs(range->max_proto.icmp.id) - - ntohs(range->min_proto.icmp.id) + 1; - /* If no range specified... */ - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) - range_size = 0xFFFF; - - for (i = 0; ; ++id) { - tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + - (id % range_size)); - if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) - return; - } - return; -} - -static bool -icmp_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct icmphdr *hdr; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - hdr = (struct icmphdr *)(skb->data + hdroff); - inet_proto_csum_replace2(&hdr->checksum, skb, - hdr->un.echo.id, tuple->src.u.icmp.id, false); - hdr->un.echo.id = tuple->src.u.icmp.id; - return true; -} - -const struct nf_nat_l4proto nf_nat_l4proto_icmp = { - .l4proto = IPPROTO_ICMP, - .manip_pkt = icmp_manip_pkt, - .in_range = icmp_in_range, - .unique_tuple = icmp_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 5cd06ba3535d..aa8304c618b8 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -102,6 +102,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); /* Send RST reply */ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { + struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; struct iphdr *niph; const struct tcphdr *oth; @@ -147,10 +148,11 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - if (oldskb->nf_bridge) { + br_indev = nf_bridge_get_physindev(oldskb); + if (br_indev) { struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = nf_bridge_get_physindev(oldskb); + nskb->dev = br_indev; niph->tot_len = htons(nskb->len); ip_send_check(niph); if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index f1193e1e928a..6847de1d1db8 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -69,7 +69,9 @@ static int __init nft_masq_ipv4_module_init(void) if (ret < 0) return ret; - nf_nat_masquerade_ipv4_register_notifier(); + ret = nf_nat_masquerade_ipv4_register_notifier(); + if (ret) + nft_unregister_expr(&nft_masq_ipv4_type); return ret; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 70289682a670..c3610b37bb4c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), + SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 32a691b7ce2c..92d249e053be 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -29,6 +29,7 @@ #include <net/protocol.h> struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet_protos); const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet_offloads); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8ca3eb06ba04..c55a5432cf37 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) goto found; /* gotcha */ } sk = NULL; @@ -391,7 +390,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; - sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); + skb_setup_tx_timestamp(skb, sockc->tsflags); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); @@ -805,7 +804,7 @@ out: return copied; } -static int raw_init(struct sock *sk) +static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); @@ -970,7 +969,7 @@ struct proto raw_prot = { .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .ioctl = raw_ioctl, - .init = raw_init, + .init = raw_sk_init, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, @@ -1134,3 +1133,27 @@ void __init raw_proc_exit(void) unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ + +static void raw_sysctl_init_net(struct net *net) +{ +#ifdef CONFIG_NET_L3_MASTER_DEV + net->ipv4.sysctl_raw_l3mdev_accept = 1; +#endif +} + +static int __net_init raw_sysctl_init(struct net *net) +{ + raw_sysctl_init_net(net); + return 0; +} + +static struct pernet_operations __net_initdata raw_sysctl_ops = { + .init = raw_sysctl_init, +}; + +void __init raw_init(void) +{ + raw_sysctl_init_net(&init_net); + if (register_pernet_subsys(&raw_sysctl_ops)) + panic("RAW: failed to init sysctl parameters.\n"); +} diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c0a9d26c06ce..ce92f73cf104 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1677,7 +1677,7 @@ static void ip_handle_martian_source(struct net_device *dev, print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), - dev->hard_header_len, true); + dev->hard_header_len, false); } } #endif @@ -2849,6 +2849,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = -rt->dst.error; } else { fl4.flowi4_iif = LOOPBACK_IFINDEX; + skb->dev = net->loopback_dev; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 891ed2f91467..ba0fc4b18465 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -602,6 +602,17 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_ping_group_range, }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "raw_l3mdev_accept", + .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9e6bc4d6daa7..27e2f6837062 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1423,7 +1423,7 @@ do_error: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && @@ -2088,7 +2088,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, } continue; - found_ok_skb: +found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) @@ -2147,7 +2147,7 @@ skip_copy: sk_eat_skb(sk, skb); continue; - found_fin_ok: +found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags & MSG_PEEK)) @@ -2241,10 +2241,6 @@ void tcp_set_state(struct sock *sk, int state) * socket sitting in hash tables. */ inet_sk_state_store(sk, state); - -#ifdef STATE_TRACE - SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); -#endif } EXPORT_SYMBOL_GPL(tcp_set_state); @@ -3246,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ 0; } @@ -3299,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); + nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); return stats; } @@ -3658,8 +3656,11 @@ bool tcp_alloc_md5sig_pool(void) if (unlikely(!tcp_md5sig_pool_populated)) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool_populated) + if (!tcp_md5sig_pool_populated) { __tcp_alloc_md5sig_pool(); + if (tcp_md5sig_pool_populated) + static_key_slow_inc(&tcp_md5_needed); + } mutex_unlock(&tcp_md5sig_mutex); } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 9277abdd822a..0f497fc49c3f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; -/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. + */ static const int bbr_pacing_margin_percent = 1; /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain @@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); } -/* Pace using current bw estimate and a gain factor. In order to help drive the - * network toward lower queues while maintaining high utilization and low - * latency, the average pacing rate aims to be slightly (~1%) lower than the - * estimated bandwidth. This is an important aspect of the design. In this - * implementation this slightly lower pacing rate is achieved implicitly by not - * including link-layer headers in the packet size used for the pacing rate. - */ +/* Pace using current bw estimate and a gain factor. */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3b45fe530f91..1bb7321a256d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -8,6 +8,7 @@ #include <linux/wait.h> #include <net/inet_common.h> +#include <net/tls.h> static bool tcp_bpf_stream_read(const struct sock *sk) { @@ -198,7 +199,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, msg->sg.start = i; msg->sg.size -= apply_bytes; sk_psock_queue_msg(psock, tmp); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); kfree(tmp); @@ -218,6 +219,8 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, u32 off; while (1) { + bool has_tx_ulp; + sge = sk_msg_elem(msg, msg->sg.start); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; @@ -226,7 +229,15 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, tcp_rate_check_app_limited(sk); retry: - ret = do_tcp_sendpages(sk, page, off, size, flags); + has_tx_ulp = tls_sw_has_ctx_tx(sk); + if (has_tx_ulp) { + flags |= MSG_SENDPAGE_NOPOLICY; + ret = kernel_sendpage_locked(sk, + page, off, size, flags); + } else { + ret = do_tcp_sendpages(sk, page, off, size, flags); + } + if (ret <= 0) return ret; if (apply) @@ -289,12 +300,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; - u32 tosend; + u32 tosend, delta = 0; int ret; more_data: - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + /* Track delta in msg size to add/subtract it on SK_DROP from + * returned to user copied size. This ensures user doesn't + * get a positive return code with msg_cut_data and SK_DROP + * verdict. + */ + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->sg.size < delta) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { @@ -350,7 +372,7 @@ more_data: default: sk_msg_free_partial(sk, msg, tosend); sk_msg_apply_bytes(psock, tosend); - *copied -= tosend; + *copied -= (tosend + delta); return -EACCES; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1e37c1388189..76858b14ebe9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -579,10 +579,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; - if (!delta) - delta = 1; - delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - tcp_rcv_rtt_update(tp, delta_us, 0); + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + tcp_rcv_rtt_update(tp, delta_us, 0); + } } } @@ -1863,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack) { - struct tcp_sock *tp = tcp_sk(sk); - u32 prior_sacked = tp->sacked_out; + if (num_dupack) { + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + s32 delivered; - tp->sacked_out++; - tcp_check_reno_reordering(sk, 0); - if (tp->sacked_out > prior_sacked) - tp->delivered++; /* Some out-of-order packet is delivered */ - tcp_verify_left_out(tp); + tp->sacked_out += num_dupack; + tcp_check_reno_reordering(sk, 0); + delivered = tp->sacked_out - prior_sacked; + if (delivered > 0) + tp->delivered += delivered; + tcp_verify_left_out(tp); + } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ @@ -2457,8 +2463,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else if ((flag & FLAG_RETRANS_DATA_ACKED) && - !(flag & FLAG_LOST_RETRANS)) { + } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == + FLAG_RETRANS_DATA_ACKED) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); @@ -2634,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); @@ -2653,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, return; if (after(tp->snd_nxt, tp->high_seq)) { - if (flag & FLAG_DATA_SACKED || is_dupack) + if (flag & FLAG_DATA_SACKED || num_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; @@ -2679,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)tranmissions. */ - if (after(tp->snd_nxt, tp->high_seq) && is_dupack) - tcp_add_reno_sack(sk); + if (after(tp->snd_nxt, tp->high_seq) && num_dupack) + tcp_add_reno_sack(sk, num_dupack); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2757,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, - bool is_dupack, int *ack_flag, int *rexmit) + int num_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; - bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - tcp_force_fast_retransmit(sk)); + bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && + tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; @@ -2810,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { - if (tcp_is_reno(tp) && is_dupack) - tcp_add_reno_sack(sk); + if (tcp_is_reno(tp)) + tcp_add_reno_sack(sk, num_dupack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2826,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tcp_identify_packet_loss(sk, ack_flag); break; case TCP_CA_Loss: - tcp_process_loss(sk, flag, is_dupack, rexmit); + tcp_process_loss(sk, flag, num_dupack, rexmit); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) @@ -2837,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - if (is_dupack) - tcp_add_reno_sack(sk); + tcp_add_reno_sack(sk, num_dupack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -2910,9 +2915,11 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - seq_rtt_us = ca_rtt_us = delta_us; + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + ca_rtt_us = seq_rtt_us; + } } rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) @@ -3558,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; - bool is_dupack = false; + int num_dupack = 0; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; @@ -3610,7 +3617,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == + FLAG_SND_UNA_ADVANCED) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. @@ -3668,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_set_xmit_timer(sk); if (tcp_ack_is_dubious(sk, flag)) { - is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { + num_dupack = 1; + /* Consider if pure acks were aggregated in tcp_add_backlog() */ + if (!(flag & FLAG_DATA)) + num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); + } + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); } @@ -3687,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); } @@ -3712,7 +3725,7 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); tcp_xmit_recovery(sk, rexmit); @@ -4602,13 +4615,12 @@ end: } } -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, - bool *fragstolen) +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, + bool *fragstolen) { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); - __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; @@ -4659,7 +4671,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; - if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { + if (tcp_queue_rcv(sk, skb, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } @@ -4719,7 +4731,7 @@ queue_and_out: goto drop; } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); + eaten = tcp_queue_rcv(sk, skb, &fragstolen); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -5595,8 +5607,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); + __skb_pull(skb, tcp_header_len); + eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index de47038afdf0..efc6fef692ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err); * */ -void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); @@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) inet_iif(icmp_skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = ntohl(th->seq); - if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq, - type == ICMP_PARAMETERPROB || - type == ICMP_TIME_EXCEEDED || - (type == ICMP_DEST_UNREACH && - (code == ICMP_NET_UNREACH || - code == ICMP_HOST_UNREACH))); + if (sk->sk_state == TCP_NEW_SYN_RECV) { + tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); + return 0; + } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -541,7 +542,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); skb = tcp_rtx_queue_head(sk); - BUG_ON(!skb); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); @@ -613,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) out: bh_unlock_sock(sk); sock_put(sk); + return 0; } void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) @@ -969,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) * We need to maintain these in the sk structure. */ +struct static_key tcp_md5_needed __read_mostly; +EXPORT_SYMBOL(tcp_md5_needed); + /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, - const union tcp_md5_addr *addr, - int family) +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, + const union tcp_md5_addr *addr, + int family) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1010,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, } return best_match; } -EXPORT_SYMBOL(tcp_md5_do_lookup); +EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, @@ -1618,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; - - /* Only socket owner can try to collapse/prune rx queues - * to reduce memory overhead, so add a little headroom here. - * Few sockets backlog are possibly concurrently non empty. - */ - limit += 64*1024; + struct skb_shared_info *shinfo; + const struct tcphdr *th; + struct tcphdr *thtail; + struct sk_buff *tail; + unsigned int hdrlen; + bool fragstolen; + u32 gso_segs; + int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. @@ -1633,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ skb_condense(skb); + skb_dst_drop(skb); + + if (unlikely(tcp_checksum_complete(skb))) { + bh_unlock_sock(sk); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + return true; + } + + /* Attempt coalescing to last skb in backlog, even if we are + * above the limits. + * This is okay because skb capacity is limited to MAX_SKB_FRAGS. + */ + th = (const struct tcphdr *)skb->data; + hdrlen = th->doff * 4; + shinfo = skb_shinfo(skb); + + if (!shinfo->gso_size) + shinfo->gso_size = skb->len - hdrlen; + + if (!shinfo->gso_segs) + shinfo->gso_segs = 1; + + tail = sk->sk_backlog.tail; + if (!tail) + goto no_coalesce; + thtail = (struct tcphdr *)tail->data; + + if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || + TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || + ((TCP_SKB_CB(tail)->tcp_flags | + TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) || + ((TCP_SKB_CB(tail)->tcp_flags ^ + TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || +#ifdef CONFIG_TLS_DEVICE + tail->decrypted != skb->decrypted || +#endif + thtail->doff != th->doff || + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + goto no_coalesce; + + __skb_pull(skb, hdrlen); + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { + thtail->window = th->window; + + TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; + + if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) + TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + + TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + TCP_SKB_CB(tail)->has_rxtstamp = true; + tail->tstamp = skb->tstamp; + skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; + } + + /* Not as strict as GRO. We only need to carry mss max value */ + skb_shinfo(tail)->gso_size = max(shinfo->gso_size, + skb_shinfo(tail)->gso_size); + + gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; + skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + + sk->sk_backlog.len += delta; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPBACKLOGCOALESCE); + kfree_skb_partial(skb, fragstolen); + return false; + } + __skb_push(skb, hdrlen); + +no_coalesce: + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. + */ + limit += 64*1024; + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); @@ -2573,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; - /* Default TSQ limit of four TSO segments */ - net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* Default TSQ limit of 16 TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 870b0a335061..0fbf7d4df9da 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -10,6 +10,7 @@ * TCPv4 GSO/GRO support */ +#include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> #include <net/tcp.h> #include <net/protocol.h> @@ -305,7 +306,8 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && @@ -318,7 +320,7 @@ static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff * return tcp_gro_receive(head, skb); } -static int tcp4_gro_complete(struct sk_buff *skb, int thoff) +INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3f510cad0b3e..730bc44dbad9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - (*rcv_wscale) = 0; + *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); - while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { - space >>= 1; - (*rcv_wscale)++; - } + *rcv_wscale = clamp_t(int, ilog2(space) - 15, + 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); @@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + if (static_key_false(&tcp_md5_needed) && + rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; @@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + if (static_key_false(&tcp_md5_needed) && + rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; @@ -1904,24 +1904,27 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - bool *is_cwnd_limited, u32 max_segs) + bool *is_cwnd_limited, + bool *is_rwnd_limited, + u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); - u32 age, send_win, cong_win, limit, in_flight; + u32 send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; - - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - goto send_now; + s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer - * only if the last write was recent. + * only if the last write was recent (1 ms). + * Note that tp->tcp_wstamp_ns can be in the future if we have + * packets waiting in a qdisc or device for EDT delivery. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) + delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; + if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1968,15 +1971,33 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head)); + delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ - if (age < (tp->srtt_us >> 4)) + if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; - /* Ok, it looks like it is advisable to defer. */ + /* Ok, it looks like it is advisable to defer. + * Three cases are tracked : + * 1) We are cwnd-limited + * 2) We are rwnd-limited + * 3) We are application limited. + */ + if (cong_win < send_win) { + if (cong_win <= skb->len) { + *is_cwnd_limited = true; + return true; + } + } else { + if (send_win <= skb->len) { + *is_rwnd_limited = true; + return true; + } + } - if (cong_win < send_win && cong_win <= skb->len) - *is_cwnd_limited = true; + /* If this packet won't get more data, do not wait. */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || + TCP_SKB_CB(skb)->eor) + goto send_now; return true; @@ -2212,8 +2233,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit = max_t(unsigned long, 2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); - limit = min_t(unsigned long, limit, - sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); + if (sk->sk_pacing_status == SK_PACING_NONE) + limit = min_t(unsigned long, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { @@ -2356,7 +2378,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, - max_segs)) + &is_rwnd_limited, max_segs)) break; } @@ -2494,15 +2516,18 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); + if (unlikely(!skb)) { + WARN_ONCE(tp->packets_out, + "invalid inflight: %u state %u cwnd %u mss %d\n", + tp->packets_out, sk->sk_state, tp->snd_cwnd, mss); + inet_csk(sk)->icsk_pending = 0; + return; + } /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) goto rearm_timer; - /* Retransmit last segment. */ - if (WARN_ON(!skb)) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; @@ -2920,7 +2945,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } return err; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5f8b6d3cd855..f87dbc78b6bc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -40,15 +40,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts; + s32 remaining; start_ts = tcp_retransmit_stamp(sk); if (!icsk->icsk_user_timeout || !start_ts) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; - if (elapsed >= icsk->icsk_user_timeout) + remaining = icsk->icsk_user_timeout - elapsed; + if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ - else - return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed)); + + return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining)); } /** @@ -209,7 +211,7 @@ static bool retransmits_timed_out(struct sock *sk, (boundary - linear_backoff_thresh) * TCP_RTO_MAX; timeout = jiffies_to_msecs(timeout); } - return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; + return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. */ @@ -376,7 +378,7 @@ static void tcp_probe_timer(struct sock *sk) return; } - if (icsk->icsk_probes_out > max_probes) { + if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ @@ -482,11 +484,12 @@ void tcp_retransmit_timer(struct sock *sk) goto out_reset_timer; } + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); if (tcp_write_timeout(sk)) goto out; if (icsk->icsk_retransmits == 0) { - int mib_idx; + int mib_idx = 0; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) @@ -501,10 +504,9 @@ void tcp_retransmit_timer(struct sock *sk) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; - } else { - mib_idx = LINUX_MIB_TCPTIMEOUTS; } - __NET_INC_STATS(sock_net(sk), mib_idx); + if (mib_idx) + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c0630013c1ae..33bf8e9c8663 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -149,34 +149,40 @@ drop: } #endif -static void tunnel4_err(struct sk_buff *skb, u32 info) +static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) -static void tunnel64_err(struct sk_buff *skb, u32 info) +static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) -static void tunnelmpls4_err(struct sk_buff *skb, u32 info) +static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1976fddb9e00..3fb0ed5e4789 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -105,6 +105,7 @@ #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> +#include <net/ip_tunnels.h> #include <net/route.h> #include <net/checksum.h> #include <net/xfrm.h> @@ -115,6 +116,7 @@ #include "udp_impl.h" #include <net/sock_reuseport.h> #include <net/addrconf.h> +#include <net/udp_tunnel.h> struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -371,21 +373,19 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || ipv6_only_sock(sk)) return -1; - score = (sk->sk_family == PF_INET) ? 2 : 1; - inet = inet_sk(sk); + if (sk->sk_rcv_saddr != daddr) + return -1; - if (inet->inet_rcv_saddr) { - if (inet->inet_rcv_saddr != daddr) - return -1; - score += 4; - } + score = (sk->sk_family == PF_INET) ? 2 : 1; + inet = inet_sk(sk); if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; @@ -398,15 +398,11 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if (!dev_match) + return -1; + score += 4; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -465,65 +461,30 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { - struct sock *sk, *result; + struct sock *result; unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); - struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + unsigned int hash2, slot2; + struct udp_hslot *hslot2; bool exact_dif = udp_lib_exact_dif_match(net, skb); - int score, badness; - u32 hash = 0; - if (hslot->count > 10) { - hash2 = ipv4_portaddr_hash(net, daddr, hnum); + hash2 = ipv4_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp4_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, sdif, + exact_dif, hslot2, skb); + if (!result) { + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; - if (hslot->count < hslot2->count) - goto begin; result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, + htonl(INADDR_ANY), hnum, dif, sdif, exact_dif, hslot2, skb); - if (!result) { - unsigned int old_slot2 = slot2; - hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); - slot2 = hash2 & udptable->mask; - /* avoid searching the same slot again. */ - if (unlikely(slot2 == old_slot2)) - return result; - - hslot2 = &udptable->hash2[slot2]; - if (hslot->count < hslot2->count) - goto begin; - - result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, - exact_dif, hslot2, skb); - } - if (unlikely(IS_ERR(result))) - return NULL; - return result; - } -begin: - result = NULL; - badness = 0; - sk_for_each_rcu(sk, &hslot->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); - if (score > badness) { - if (sk->sk_reuseport) { - hash = udp_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (unlikely(IS_ERR(result))) - return NULL; - if (result) - return result; - } - result = sk; - badness = score; - } } + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); @@ -585,6 +546,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, return true; } +DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +void udp_encap_enable(void) +{ + static_branch_inc(&udp_encap_needed_key); +} +EXPORT_SYMBOL(udp_encap_enable); + +/* Handler for tunnels with arbitrary destination ports: no socket lookup, go + * through error handlers in encapsulations looking for a match. + */ +static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) +{ + int i; + + for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { + int (*handler)(struct sk_buff *skb, u32 info); + + if (!iptun_encaps[i]) + continue; + handler = rcu_dereference(iptun_encaps[i]->err_handler); + if (handler && !handler(skb, info)) + return 0; + } + + return -ENOENT; +} + +/* Try to match ICMP errors to UDP tunnels by looking up a socket without + * reversing source and destination port: this will match tunnels that force the + * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that + * lwtunnels might actually break this assumption by being configured with + * different destination ports on endpoints, in this case we won't be able to + * trace ICMP messages back to them. + * + * If this doesn't match any socket, probe tunnels with arbitrary destination + * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port + * we've sent packets to won't necessarily match the local destination port. + * + * Then ask the tunnel implementation to match the error against a valid + * association. + * + * Return an error if we can't find a match, the socket if we need further + * processing, zero otherwise. + */ +static struct sock *__udp4_lib_err_encap(struct net *net, + const struct iphdr *iph, + struct udphdr *uh, + struct udp_table *udptable, + struct sk_buff *skb, u32 info) +{ + int network_offset, transport_offset; + struct sock *sk; + + network_offset = skb_network_offset(skb); + transport_offset = skb_transport_offset(skb); + + /* Network header needs to point to the outer IPv4 header inside ICMP */ + skb_reset_network_header(skb); + + /* Transport header needs to point to the UDP header */ + skb_set_transport_header(skb, iph->ihl << 2); + + sk = __udp4_lib_lookup(net, iph->daddr, uh->source, + iph->saddr, uh->dest, skb->dev->ifindex, 0, + udptable, NULL); + if (sk) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); + struct udp_sock *up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (!lookup || lookup(sk, skb)) + sk = NULL; + } + + if (!sk) + sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); + + skb_set_transport_header(skb, transport_offset); + skb_set_network_header(skb, network_offset); + + return sk; +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -596,13 +640,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, * to find the appropriate port. */ -void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) +int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + bool tunnel = false; struct sock *sk; int harderr; int err; @@ -612,8 +657,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); if (!sk) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; /* No socket for error */ + /* No socket for error: try tunnels before discarding */ + sk = ERR_PTR(-ENOENT); + if (static_branch_unlikely(&udp_encap_needed_key)) { + sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb, + info); + if (!sk) + return 0; + } + + if (IS_ERR(sk)) { + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); + return PTR_ERR(sk); + } + + tunnel = true; } err = 0; @@ -656,6 +714,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ + if (tunnel) { + /* ...not for tunnels though: we don't have a sending socket */ + goto out; + } if (!inet->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; @@ -665,12 +727,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - return; + return 0; } -void udp_err(struct sk_buff *skb, u32 info) +int udp_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, &udp_table); + return __udp4_lib_err(skb, info, &udp_table); } /* @@ -1713,6 +1775,10 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } + + if (udp_sk(sk)->gro_enabled) + udp_cmsg_recv(msg, sk, skb); + if (inet->cmsg_flags) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); @@ -1889,13 +1955,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); -void udp_encap_enable(void) -{ - static_branch_enable(&udp_encap_needed_key); -} -EXPORT_SYMBOL(udp_encap_enable); - /* returns: * -1: error * 0: success @@ -1904,7 +1963,7 @@ EXPORT_SYMBOL(udp_encap_enable); * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -2007,6 +2066,27 @@ drop: return -1; } +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *next, *segs; + int ret; + + if (likely(!udp_unexpected_gso(sk, skb))) + return udp_queue_rcv_one_skb(sk, skb); + + BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); + __skb_push(skb, -skb_mac_offset(skb)); + segs = udp_rcv_segment(sk, skb, true); + for (skb = segs; skb; skb = next) { + next = skb->next; + __skb_pull(skb, skb_transport_offset(skb)); + ret = udp_queue_rcv_one_skb(sk, skb); + if (ret > 0) + ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); + } + return 0; +} + /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ @@ -2398,11 +2478,15 @@ void udp_destroy_sock(struct sock *sk) bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); - if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { - void (*encap_destroy)(struct sock *sk); - encap_destroy = READ_ONCE(up->encap_destroy); - if (encap_destroy) - encap_destroy(sk); + if (static_branch_unlikely(&udp_encap_needed_key)) { + if (up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = READ_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + if (up->encap_enabled) + static_branch_dec(&udp_encap_needed_key); } } @@ -2447,7 +2531,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; - udp_encap_enable(); + lock_sock(sk); + udp_tunnel_encap_enable(sk->sk_socket); + release_sock(sk); break; default: err = -ENOPROTOOPT; @@ -2469,6 +2555,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, up->gso_size = val; break; + case UDP_GRO: + lock_sock(sk); + if (valbool) + udp_tunnel_encap_enable(sk->sk_socket); + up->gro_enabled = valbool; + release_sock(sk); + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index e7d18b140287..322672655419 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -7,7 +7,7 @@ #include <net/inet_common.h> int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 802f2bc00d69..64f9715173ac 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -13,6 +13,7 @@ #include <linux/skbuff.h> #include <net/udp.h> #include <net/protocol.h> +#include <net/inet_common.h> static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -343,6 +344,56 @@ out: return segs; } +#define UDP_GRO_CNT_MAX 64 +static struct sk_buff *udp_gro_receive_segment(struct list_head *head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + struct sk_buff *pp = NULL; + struct udphdr *uh2; + struct sk_buff *p; + + /* requires non zero csum, for symmetry with GSO */ + if (!uh->check) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + /* pull encapsulating udp header */ + skb_gro_pull(skb, sizeof(struct udphdr)); + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + + list_for_each_entry(p, head, list) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = udp_hdr(p); + + /* Match ports only, as csum is always non zero */ + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* Terminate the flow on len mismatch or if it grow "too much". + * Under small packet flood GRO count could elsewhere grow a lot + * leading to execessive truesize values + */ + if (!skb_gro_receive(p, skb) && + NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) + pp = p; + else if (uh->len != uh2->len) + pp = p; + + return pp; + } + + /* mismatch, but we never need to flush */ + return NULL; +} + +INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, udp_lookup_t lookup) { @@ -353,23 +404,28 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, int flush = 1; struct sock *sk; + rcu_read_lock(); + sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, + udp4_lib_lookup_skb, skb, uh->source, uh->dest); + if (!sk) + goto out_unlock; + + if (udp_sk(sk)->gro_enabled) { + pp = call_gro_receive(udp_gro_receive_segment, head, skb); + rcu_read_unlock(); + return pp; + } + if (NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid)) - goto out; + !NAPI_GRO_CB(skb)->csum_valid) || + !udp_sk(sk)->gro_receive) + goto out_unlock; /* mark that this skb passed once through the tunnel gro layer */ NAPI_GRO_CB(skb)->encap_mark = 1; - rcu_read_lock(); - sk = (*lookup)(skb, uh->source, uh->dest); - - if (sk && udp_sk(sk)->gro_receive) - goto unflush; - goto out_unlock; - -unflush: flush = 0; list_for_each_entry(p, head, list) { @@ -394,14 +450,13 @@ unflush: out_unlock: rcu_read_unlock(); -out: skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff *udp4_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); @@ -427,6 +482,19 @@ flush: return NULL; } +static int udp_gro_complete_segment(struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + + skb->csum_start = (unsigned char *)uh - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; + return 0; +} + int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup) { @@ -437,16 +505,22 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, uh->len = newlen; - /* Set encapsulation before calling into inner gro_complete() functions - * to make them set up the inner offsets. - */ - skb->encapsulation = 1; - rcu_read_lock(); - sk = (*lookup)(skb, uh->source, uh->dest); - if (sk && udp_sk(sk)->gro_complete) + sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, + udp4_lib_lookup_skb, skb, uh->source, uh->dest); + if (sk && udp_sk(sk)->gro_enabled) { + err = udp_gro_complete_segment(skb); + } else if (sk && udp_sk(sk)->gro_complete) { + skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM + : SKB_GSO_UDP_TUNNEL; + + /* Set encapsulation before calling into inner gro_complete() + * functions to make them set up the inner offsets. + */ + skb->encapsulation = 1; err = udp_sk(sk)->gro_complete(sk, skb, nhoff + sizeof(struct udphdr)); + } rcu_read_unlock(); if (skb->remcsum_offload) @@ -456,18 +530,14 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, } EXPORT_SYMBOL(udp_gro_complete); -static int udp4_gro_complete(struct sk_buff *skb, int nhoff) +INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) { const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (uh->check) { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + if (uh->check) uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); - } else { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; - } return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb); } diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 6539ff15e9a3..be8b5b2157d8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; + if (cfg->bind_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, cfg->bind_ifindex); + if (!dev) { + err = -ENODEV; + goto error; + } + + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, + dev->name, strlen(dev->name) + 1); + dev_put(dev); + + if (err < 0) + goto error; + } + udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -68,6 +85,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 8545457752fb..39c7f17d916f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb) return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } -static void udplite_err(struct sk_buff *skb, u32 info) +static int udplite_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, &udplite_table); + return __udp4_lib_err(skb, info, &udplite_table); } static const struct net_protocol udplite_protocol = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8dd0e6ab8606..35c54865dc42 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +static int xfrm4_esp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(esp4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static int xfrm4_ah_rcv(struct sk_buff *skb) @@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +static int xfrm4_ah_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ah4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static int xfrm4_ipcomp_rcv(struct sk_buff *skb) @@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ipcomp4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static const struct net_protocol esp4_protocol = { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 045597b9a7c0..521e471f1cf9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2820,7 +2820,7 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) dev = __dev_get_by_name(net, p.name); if (!dev) goto err_exit; - err = dev_open(dev); + err = dev_open(dev, NULL); } } #endif diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 94999058e110..cca3b3603c42 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -433,7 +433,6 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { - unsigned int hash = inet6_acaddr_hash(net, addr); struct net_device *nh_dev; struct ifacaddr6 *aca; bool found = false; @@ -441,7 +440,9 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); - else + else { + unsigned int hash = inet6_acaddr_hash(net, addr); + hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], aca_addr_lst) { nh_dev = fib6_info_nh_dev(aca->aca_rt); @@ -452,6 +453,7 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, break; } } + } rcu_read_unlock(); return found; } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1ede7a16a0be..bde08aa549f3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -772,6 +772,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, case IPV6_2292PKTINFO: { struct net_device *dev = NULL; + int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; @@ -779,12 +780,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + src_idx = src_info->ipi6_ifindex; - if (src_info->ipi6_ifindex) { + if (src_idx) { if (fl6->flowi6_oif && - src_info->ipi6_ifindex != fl6->flowi6_oif) + src_idx != fl6->flowi6_oif && + (sk->sk_bound_dev_if != fl6->flowi6_oif || + !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; - fl6->flowi6_oif = src_info->ipi6_ifindex; + fl6->flowi6_oif = src_idx; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 63b2b66f9dfa..5afe9f83374d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -145,10 +145,13 @@ static void esp_output_done(struct crypto_async_request *base, int err) void *tmp; struct xfrm_state *x; - if (xo && (xo->flags & XFRM_DEV_RESUME)) - x = skb->sp->xvec[skb->sp->len - 1]; - else + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + struct sec_path *sp = skb_sec_path(skb); + + x = sp->xvec[sp->len - 1]; + } else { x = skb_dst(skb)->xfrm; + } tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 6177e2171171..d46b4eb645c2 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -68,11 +68,12 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { - err = secpath_set(skb); - if (err) + struct sec_path *sp = secpath_set(skb); + + if (!sp) goto out; - if (skb->sp->len == XFRM_MAX_DEPTH) + if (sp->len == XFRM_MAX_DEPTH) goto out; x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, @@ -81,8 +82,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (!x) goto out; - skb->sp->xvec[skb->sp->len++] = x; - skb->sp->olen++; + sp->xvec[sp->len++] = x; + sp->olen++; xo = xfrm_offload(skb); if (!xo) { @@ -141,6 +142,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); @@ -148,7 +150,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index 6de3c04b0f30..bd675c61deb1 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -4,6 +4,7 @@ #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/udp.h> +#include <linux/icmpv6.h> #include <linux/types.h> #include <linux/kernel.h> #include <net/fou.h> @@ -69,14 +70,87 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } +static int gue6_err_proto_handler(int proto, struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info) +{ + const struct inet6_protocol *ipprot; + + ipprot = rcu_dereference(inet6_protos[proto]); + if (ipprot && ipprot->err_handler) { + if (!ipprot->err_handler(skb, opt, type, code, offset, info)) + return 0; + } + + return -ENOENT; +} + +static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + int transport_offset = skb_transport_offset(skb); + struct guehdr *guehdr; + size_t optlen; + int ret; + + if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); + + switch (((struct iphdr *)guehdr)->version) { + case 4: + ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt, + type, code, offset, info); + goto out; + case 6: + ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt, + type, code, offset, info); + goto out; + default: + ret = -EOPNOTSUPP; + goto out; + } + } + default: /* Undefined version */ + return -EOPNOTSUPP; + } + + if (guehdr->control) + return -ENOENT; + + optlen = guehdr->hlen << 2; + + if (validate_gue_flags(guehdr, optlen)) + return -EINVAL; + + skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); + ret = gue6_err_proto_handler(guehdr->proto_ctype, skb, + opt, type, code, offset, info); + +out: + skb_set_transport_header(skb, transport_offset); + return ret; +} + + static const struct ip6_tnl_encap_ops fou_ip6tun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou6_build_header, + .err_handler = gue6_err, }; static const struct ip6_tnl_encap_ops gue_ip6tun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue6_build_header, + .err_handler = gue6_err, }; static int ip6_tnl_encap_add_fou_ops(void) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index c9c53ade55c3..5d7aa2c2770c 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -84,7 +84,7 @@ static inline struct sock *icmpv6_sk(struct net *net) return net->ipv6.icmp_sk[smp_processor_id()]; } -static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ @@ -100,6 +100,8 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) ping_err(skb, offset, ntohl(info)); + + return 0; } static int icmpv6_rcv(struct sk_buff *skb); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3d7c7460a0c5..f3515ebe9b3a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -102,22 +102,13 @@ static inline int compute_score(struct sock *sk, struct net *net, if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + + if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) + return -1; score = 1; - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) - return -1; - score++; - } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score++; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } @@ -166,26 +157,12 @@ struct sock *inet6_lookup_listener(struct net *net, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { - unsigned int hash = inet_lhashfn(net, hnum); - struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - bool exact_dif = inet6_exact_dif_match(net, skb); struct inet_listen_hashbucket *ilb2; - struct sock *sk, *result = NULL; - int score, hiscore = 0; + struct sock *result = NULL; unsigned int hash2; - u32 phash = 0; - - if (ilb->count <= 10 || !hashinfo->lhash2) - goto port_lookup; - - /* Too many sk in the ilb bucket (which is hashed by port alone). - * Try lhash2 (which is hashed by port and addr) instead. - */ hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, @@ -194,33 +171,12 @@ struct sock *inet6_lookup_listener(struct net *net, goto done; /* Lookup lhash2 with in6addr_any */ - hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet6_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, + saddr, sport, &in6addr_any, hnum, dif, sdif); - goto done; - -port_lookup: - sk_for_each(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); - if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - goto done; - } - result = sk; - hiscore = score; - } - } done: if (unlikely(IS_ERR(result))) return NULL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 515adbdba1d2..229e55c99021 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -423,7 +423,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) } -static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); @@ -433,13 +433,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6), offset) < 0) - return; + return -EINVAL; ipv6h = (const struct ipv6hdr *)skb->data; t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, tpi.key, tpi.proto); if (!t) - return; + return -ENOENT; switch (type) { struct ipv6_tlv_tnl_enc_lim *tel; @@ -449,14 +449,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, t->parms.name); if (code != ICMPV6_PORT_UNREACH) break; - return; + return 0; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); break; } - return; + return 0; case ICMPV6_PARAMPROB: teli = 0; if (code == ICMPV6_HDR_FIELD) @@ -472,14 +472,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } - return; + return 0; case ICMPV6_PKT_TOOBIG: ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); - return; + return 0; case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); - return; + return 0; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) @@ -487,6 +487,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, else t->err_count = 1; t->err_time = jiffies; + + return 0; } static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) @@ -1883,12 +1885,6 @@ static void ip6gre_tap_setup(struct net_device *dev) netif_keep_dst(dev); } -bool is_ip6gretap_dev(const struct net_device *dev) -{ - return dev->netdev_ops == &ip6gre_tap_netdev_ops; -} -EXPORT_SYMBOL_GPL(is_ip6gretap_dev); - static bool ip6gre_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..c7ed2b6d5a1d 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -95,7 +95,7 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -296,7 +296,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) continue; @@ -319,28 +319,26 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, /* * Deliver the packet to the host */ - - -static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, + bool have_final) { const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; - int nexthdr; bool raw; - bool have_final = false; /* * Parse extension headers */ - rcu_read_lock(); resubmit: idev = ip6_dst_idev(skb_dst(skb)); - if (!pskb_pull(skb, skb_transport_offset(skb))) - goto discard; nhoff = IP6CB(skb)->nhoff; - nexthdr = skb_network_header(skb)[nhoff]; + if (!have_final) { + if (!pskb_pull(skb, skb_transport_offset(skb))) + goto discard; + nexthdr = skb_network_header(skb)[nhoff]; + } resubmit_final: raw = raw6_local_deliver(skb, nexthdr); @@ -359,6 +357,8 @@ resubmit_final: } } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; + int sdif = inet6_sdif(skb); + struct net_device *dev; /* Only do this once for first final protocol */ have_final = true; @@ -371,9 +371,19 @@ resubmit_final: skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); + + /* skb->dev passed may be master dev for vrfs. */ + if (sdif) { + dev = dev_get_by_index_rcu(net, sdif); + if (!dev) + goto discard; + } else { + dev = skb->dev; + } + if (ipv6_addr_is_multicast(&hdr->daddr) && - !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, - &hdr->saddr) && + !ipv6_chk_mcast_addr(dev, &hdr->daddr, + &hdr->saddr) && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } @@ -411,13 +421,19 @@ resubmit_final: consume_skb(skb); } } - rcu_read_unlock(); - return 0; + return; discard: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); - rcu_read_unlock(); kfree_skb(skb); +} + +static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + rcu_read_lock(); + ip6_protocol_deliver_rcu(net, skb, 0, false); + rcu_read_unlock(); + return 0; } @@ -432,15 +448,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; + struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); + /* skb->dev passed may be master dev for vrfs. */ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + hdr = ipv6_hdr(skb); - deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); + deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL); + if (sdif) + rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index c7e495f12011..5c045691c302 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -20,6 +20,23 @@ #include "ip6_offload.h" +/* All GRO functions are always builtin, except UDP over ipv6, which lays in + * ipv6 module, as it depends on UDPv6 lookup function, so we need special care + * when ipv6 is built as a module + */ +#if IS_BUILTIN(CONFIG_IPV6) +#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) +#endif + +#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \ +({ \ + unlikely(gro_recursion_inc_test(skb)) ? \ + NAPI_GRO_CB(skb)->flush |= 1, NULL : \ + INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ +}) + static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) { const struct net_offload *ops = NULL; @@ -164,8 +181,12 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -static struct sk_buff *ipv6_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, + struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; @@ -229,14 +250,21 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head, * XXX skbs on the gro_list have all been parsed and pulled * already so we don't need to compare nlen * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) - * memcmp() alone below is suffcient, right? + * memcmp() alone below is sufficient, right? */ if ((first_word & htonl(0xF00FFFFF)) || - memcmp(&iph->nexthdr, &iph2->nexthdr, - nlen - offsetof(struct ipv6hdr, nexthdr))) { + !ipv6_addr_equal(&iph->saddr, &iph2->saddr) || + !ipv6_addr_equal(&iph->daddr, &iph2->daddr) || + *(u16 *)&iph->nexthdr != *(u16 *)&iph2->nexthdr) { +not_same_flow: NAPI_GRO_CB(p)->same_flow = 0; continue; } + if (unlikely(nlen > sizeof(struct ipv6hdr))) { + if (memcmp(iph + 1, iph2 + 1, + nlen - sizeof(struct ipv6hdr))) + goto not_same_flow; + } /* flush if Traffic Class fields are different */ NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); NAPI_GRO_CB(p)->flush |= flush; @@ -253,7 +281,8 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head, skb_gro_postpull_rcsum(skb, iph, nlen); - pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive, + ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); @@ -294,7 +323,9 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, return inet_gro_receive(head, skb); } -static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) +INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); @@ -313,7 +344,8 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->callbacks.gro_complete(skb, nhoff); + err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete, + udp6_gro_complete, skb, nhoff); out_unlock: rcu_read_unlock(); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 89e0d5118afe..5f9fa0302b5a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -195,37 +195,37 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); + unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; - if (opt) { - unsigned int head_room; + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + if (opt) + head_room += opt->opt_nflen + opt->opt_flen; - /* First: exthdrs may take lots of space (~8K for now) - MAX_HEADER is not enough. - */ - head_room = opt->opt_nflen + opt->opt_flen; - seg_len += head_room; - head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); - - if (skb_headroom(skb) < head_room) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); - if (!skb2) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); - return -ENOBUFS; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; + if (unlikely(skb_headroom(skb) < head_room)) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + if (!skb2) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -ENOBUFS; } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + if (opt) { + seg_len += opt->opt_nflen + opt->opt_flen; + if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, &fl6->saddr); @@ -378,6 +378,14 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); +#ifdef CONFIG_NET_SWITCHDEV + if (skb->offload_l3_fwd_mark) { + consume_skb(skb); + return 0; + } +#endif + + skb->tstamp = 0; return dst_output(net, sk, skb); } @@ -574,6 +582,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); + skb_ext_copy(to, from); skb_copy_secmark(to, from); } @@ -1245,6 +1254,7 @@ static int __ip6_append_data(struct sock *sk, { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; @@ -1257,7 +1267,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged; + bool paged, extra_uref; skb = skb_peek_tail(queue); if (!skb) { @@ -1322,6 +1332,20 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + extra_uref = true; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + } + } + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1354,7 +1378,7 @@ emsgsize: unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; - unsigned int pagedlen = 0; + unsigned int pagedlen; alloc_new_skb: /* There's no room in the current skb */ if (skb) @@ -1378,6 +1402,7 @@ alloc_new_skb: if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; fraglen = datalen + fragheaderlen; + pagedlen = 0; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) @@ -1439,12 +1464,6 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - /* * Find where to start putting bytes */ @@ -1476,6 +1495,13 @@ alloc_new_skb: exthdrlen = 0; dst_exthdrlen = 0; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1505,7 +1531,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1535,6 +1561,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1547,6 +1577,8 @@ alloc_new_skb: error_efault: err = -EFAULT; error: + if (uarg) + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a9d06d4dd057..99179b9c8384 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -901,6 +901,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; + ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index b283f293ee4a..ad1a9ccd4b44 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -15,7 +15,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { - struct sockaddr_in6 udp6_addr; + struct sockaddr_in6 udp6_addr = {}; int err; struct socket *sock = NULL; @@ -31,6 +31,22 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; } + if (cfg->bind_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, cfg->bind_ifindex); + if (!dev) { + err = -ENODEV; + goto error; + } + + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, + dev->name, strlen(dev->name) + 1); + dev_put(dev); + + if (err < 0) + goto error; + } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, @@ -42,6 +58,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->peer_udp_port) { + memset(&udp6_addr, 0, sizeof(udp6_addr)); udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, sizeof(udp6_addr.sin6_addr)); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index eeaf7455d51e..706fe42e4928 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -318,6 +318,7 @@ static int vti6_rcv(struct sk_buff *skb) return 0; } + ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { t->dev->stats.rx_dropped++; rcu_read_unlock(); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e2ea691e42c6..8276f1224f16 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -52,6 +52,8 @@ #include <net/ip6_checksum.h> #include <linux/netconf.h> +#include <linux/nospec.h> + struct ip6mr_rule { struct fib_rule common; }; @@ -655,7 +657,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) return NULL; } - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); @@ -1841,6 +1843,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; + vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); read_lock(&mrt_lock); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { @@ -1915,6 +1918,7 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; + vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); read_lock(&mrt_lock); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { @@ -1968,7 +1972,7 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct */ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *c, int vifi) + struct sk_buff *skb, int vifi) { struct ipv6hdr *ipv6h; struct vif_device *vif = &mrt->vif_table[vifi]; @@ -2134,15 +2138,14 @@ forward: if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ip6mr_forward2(net, mrt, skb2, - c, psend); + ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { - ip6mr_forward2(net, mrt, skb, c, psend); + ip6mr_forward2(net, mrt, skb, psend); return; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 381ce38940ae..973e215c3114 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -486,7 +486,7 @@ sticky_done: retv = -EFAULT; break; } - if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 5ae8e1c51079..8b075f0bc351 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -24,7 +24,8 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) unsigned int hh_len; struct dst_entry *dst; struct flowi6 fl6 = { - .flowi6_oif = sk ? sk->sk_bound_dev_if : 0, + .flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if : + rt6_need_strict(&iph->daddr) ? skb_dst(skb)->dev->ifindex : 0, .flowi6_mark = skb->mark, .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 200c0c235565..9ea43d5256e0 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o -nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o +nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 491f808e356a..29c7f1915a96 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -58,8 +58,12 @@ static int __init masquerade_tg6_init(void) int err; err = xt_register_target(&masquerade_tg6_reg); - if (err == 0) - nf_nat_masquerade_ipv6_register_notifier(); + if (err) + return err; + + err = nf_nat_masquerade_ipv6_register_notifier(); + if (err) + xt_unregister_target(&masquerade_tg6_reg); return err; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d219979c3e52..181da2c40f9a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -341,7 +341,7 @@ static bool nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) { struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; u8 ecn; inet_frag_kill(&fq->q); @@ -363,10 +363,16 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic return false; } + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) return false; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index ca6d38698b1a..23022447eb49 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -61,22 +61,8 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, } #endif -static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range2 *range) -{ - return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && - ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; -} - -static u32 nf_nat_ipv6_secure_port(const struct nf_conntrack_tuple *t, - __be16 dport) -{ - return secure_ipv6_port_ephemeral(t->src.u3.ip6, t->dst.u3.ip6, dport); -} - static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, - const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { @@ -96,8 +82,8 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, goto manip_addr; if ((frag_off & htons(~0x7)) == 0 && - !l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff, - target, maniptype)) + !nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff, + target, maniptype)) return false; /* must reload, offset might have changed */ @@ -171,8 +157,6 @@ static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { .l3proto = NFPROTO_IPV6, - .secure_port = nf_nat_ipv6_secure_port, - .in_range = nf_nat_ipv6_in_range, .manip_pkt = nf_nat_ipv6_manip_pkt, .csum_update = nf_nat_ipv6_csum_update, .csum_recalc = nf_nat_ipv6_csum_recalc, @@ -196,7 +180,6 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); - const struct nf_nat_l4proto *l4proto; struct nf_conntrack_tuple target; unsigned long statusbit; @@ -227,9 +210,8 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, if (!(ct->status & statusbit)) return 1; - l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, inside->ip6.nexthdr); if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), - l4proto, &ct->tuplehash[!dir].tuple, !manip)) + &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { @@ -244,8 +226,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, } nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, IPPROTO_ICMPV6); - if (!nf_nat_ipv6_manip_pkt(skb, 0, l4proto, &target, manip)) + if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; return 1; @@ -415,26 +396,12 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_unregister_fn); static int __init nf_nat_l3proto_ipv6_init(void) { - int err; - - err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); - if (err < 0) - goto err1; - err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv6); - if (err < 0) - goto err2; - return err; - -err2: - nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); -err1: - return err; + return nf_nat_l3proto_register(&nf_nat_l3proto_ipv6); } static void __exit nf_nat_l3proto_ipv6_exit(void) { nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv6); - nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); } MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 3e4bf2286abe..0ad0da5a2600 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -132,8 +132,8 @@ static void iterate_cleanup_work(struct work_struct *work) * of ipv6 addresses being deleted), we also need to add an upper * limit to the number of queued work items. */ -static int masq_inet_event(struct notifier_block *this, - unsigned long event, void *ptr) +static int masq_inet6_event(struct notifier_block *this, + unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; @@ -171,30 +171,53 @@ static int masq_inet_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, +static struct notifier_block masq_inet6_notifier = { + .notifier_call = masq_inet6_event, }; -static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); +static int masq_refcnt; +static DEFINE_MUTEX(masq_mutex); -void nf_nat_masquerade_ipv6_register_notifier(void) +int nf_nat_masquerade_ipv6_register_notifier(void) { + int ret = 0; + + mutex_lock(&masq_mutex); /* check if the notifier is already set */ - if (atomic_inc_return(&masquerade_notifier_refcount) > 1) - return; + if (++masq_refcnt > 1) + goto out_unlock; + + ret = register_netdevice_notifier(&masq_dev_notifier); + if (ret) + goto err_dec; + + ret = register_inet6addr_notifier(&masq_inet6_notifier); + if (ret) + goto err_unregister; - register_netdevice_notifier(&masq_dev_notifier); - register_inet6addr_notifier(&masq_inet_notifier); + mutex_unlock(&masq_mutex); + return ret; + +err_unregister: + unregister_netdevice_notifier(&masq_dev_notifier); +err_dec: + masq_refcnt--; +out_unlock: + mutex_unlock(&masq_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier); void nf_nat_masquerade_ipv6_unregister_notifier(void) { + mutex_lock(&masq_mutex); /* check if the notifier still has clients */ - if (atomic_dec_return(&masquerade_notifier_refcount) > 0) - return; + if (--masq_refcnt > 0) + goto out_unlock; - unregister_inet6addr_notifier(&masq_inet_notifier); + unregister_inet6addr_notifier(&masq_inet6_notifier); unregister_netdevice_notifier(&masq_dev_notifier); +out_unlock: + mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c deleted file mode 100644 index d9bf42ba44fa..000000000000 --- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2011 Patrick Mchardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on Rusty Russell's IPv4 ICMP NAT code. Development of IPv6 - * NAT funded by Astaro. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/icmpv6.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/netfilter/nf_nat_l4proto.h> - -static bool -icmpv6_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && - ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); -} - -static void -icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - static u16 id; - unsigned int range_size; - unsigned int i; - - range_size = ntohs(range->max_proto.icmp.id) - - ntohs(range->min_proto.icmp.id) + 1; - - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) - range_size = 0xffff; - - for (i = 0; ; ++id) { - tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + - (id % range_size)); - if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) - return; - } -} - -static bool -icmpv6_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct icmp6hdr *hdr; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - hdr = (struct icmp6hdr *)(skb->data + hdroff); - l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum, - tuple, maniptype); - if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || - hdr->icmp6_type == ICMPV6_ECHO_REPLY) { - inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, - hdr->icmp6_identifier, - tuple->src.u.icmp.id, false); - hdr->icmp6_identifier = tuple->src.u.icmp.id; - } - return true; -} - -const struct nf_nat_l4proto nf_nat_l4proto_icmpv6 = { - .l4proto = IPPROTO_ICMPV6, - .manip_pkt = icmpv6_manip_pkt, - .in_range = icmpv6_in_range, - .unique_tuple = icmpv6_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 24858402e374..b9c8a763c863 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) { + struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; struct tcphdr _otcph; const struct tcphdr *otcph; @@ -197,15 +198,18 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - if (oldskb->nf_bridge) { + br_indev = nf_bridge_get_physindev(oldskb); + if (br_indev) { struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = nf_bridge_get_physindev(oldskb); + nskb->dev = br_indev; nskb->protocol = htons(ETH_P_IPV6); ip6h->payload_len = htons(sizeof(struct tcphdr)); if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), - oeth->h_source, oeth->h_dest, nskb->len) < 0) + oeth->h_source, oeth->h_dest, nskb->len) < 0) { + kfree_skb(nskb); return; + } dev_queue_xmit(nskb); } else #endif diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index dd0122f3cffe..e06c82e9dfcd 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -70,7 +70,9 @@ static int __init nft_masq_ipv6_module_init(void) if (ret < 0) return ret; - nf_nat_masquerade_ipv6_register_notifier(); + ret = nf_nat_masquerade_ipv6_register_notifier(); + if (ret) + nft_unregister_expr(&nft_masq_ipv6_type); return ret; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5e0efd3954e9..5a426226c762 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif) + if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif)) continue; if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { @@ -658,6 +657,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; + skb_setup_tx_timestamp(skb, sockc->tsflags); + if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5c3c92713096..a5bb59ee50ac 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -281,7 +281,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, { struct net *net = container_of(fq->q.net, struct net, ipv6.frags); struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; unsigned int nhoff; int sum_truesize; u8 ecn; @@ -322,10 +322,16 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, if (payload_len > IPV6_MAXPLEN) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ @@ -378,6 +384,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, if (skb_try_coalesce(head, fp, &headstolen, &delta)) { kfree_skb_partial(fp, headstolen); } else { + fp->sk = NULL; if (!skb_shinfo(head)->frag_list) skb_shinfo(head)->frag_list = fp; head->data_len += fp->len; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 059f0531f7c1..194bc162866d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2977,7 +2977,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!rt) goto out; - rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len); + rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, + extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); /* Do not leave garbage there. */ @@ -3710,7 +3711,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, if (!f6i) return ERR_PTR(-ENOMEM); - f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0); + f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL); f6i->dst_nocount = true; f6i->dst_host = true; f6i->fib6_protocol = RTPROT_KERNEL; diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index a8854dd3e9c5..8181ee7e1e27 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -347,6 +347,7 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 03e6b7a2bc53..b81eb7cb815e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -349,7 +349,7 @@ static void tcp_v6_mtu_reduced(struct sock *sk) } } -static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; @@ -371,17 +371,19 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); - if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq, fatal); + if (sk->sk_state == TCP_NEW_SYN_RECV) { + tcp_req_err(sk, seq, fatal); + return 0; + } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -467,6 +469,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, out: bh_unlock_sock(sk); sock_put(sk); + return 0; } @@ -734,6 +737,7 @@ static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { + bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = inet6_sk(sk_listener); @@ -741,7 +745,7 @@ static void tcp_v6_init_req(struct request_sock *req, ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ - if (!sk_listener->sk_bound_dev_if && + if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index e72947c99454..3179c425d7ff 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -9,14 +9,15 @@ * * TCPv6 GSO/GRO support */ +#include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/ip6_checksum.h> #include "ip6_offload.h" -static struct sk_buff *tcp6_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && @@ -29,7 +30,7 @@ static struct sk_buff *tcp6_gro_receive(struct list_head *head, return tcp_gro_receive(head, skb); } -static int tcp6_gro_complete(struct sk_buff *skb, int thoff) +INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index dae25cad05cd..1991dede7367 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -134,24 +134,28 @@ drop: return 0; } -static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } -static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static const struct inet6_protocol tunnel6_protocol = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d2d97d07ef27..9cbf363172bd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -45,6 +45,7 @@ #include <net/raw.h> #include <net/tcp_states.h> #include <net/ip6_checksum.h> +#include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> @@ -117,12 +118,16 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || sk->sk_family != PF_INET6) return -1; + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + score = 0; inet = inet_sk(sk); @@ -132,27 +137,16 @@ static int compute_score(struct sock *sk, struct net *net, score++; } - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) - return -1; - score++; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr)) { if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) return -1; score++; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score++; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); + if (!dev_match) + return -1; + score++; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -200,66 +194,32 @@ struct sock *__udp6_lib_lookup(struct net *net, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { - struct sock *sk, *result; unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); - struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + unsigned int hash2, slot2; + struct udp_hslot *hslot2; + struct sock *result; bool exact_dif = udp6_lib_exact_dif_match(net, skb); - int score, badness; - u32 hash = 0; - if (hslot->count > 10) { - hash2 = ipv6_portaddr_hash(net, daddr, hnum); + hash2 = ipv6_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp6_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, sdif, exact_dif, + hslot2, skb); + if (!result) { + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; - if (hslot->count < hslot2->count) - goto begin; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif, - hslot2, skb); - if (!result) { - unsigned int old_slot2 = slot2; - hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; - /* avoid searching the same slot again. */ - if (unlikely(slot2 == old_slot2)) - return result; - - hslot2 = &udptable->hash2[slot2]; - if (hslot->count < hslot2->count) - goto begin; - - result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, - exact_dif, hslot2, - skb); - } - if (unlikely(IS_ERR(result))) - return NULL; - return result; - } -begin: - result = NULL; - badness = -1; - sk_for_each_rcu(sk, &hslot->head) { - score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, - sdif, exact_dif); - if (score > badness) { - if (sk->sk_reuseport) { - hash = udp6_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (unlikely(IS_ERR(result))) - return NULL; - if (result) - return result; - } - result = sk; - badness = score; - } + &in6addr_any, hnum, dif, sdif, + exact_dif, hslot2, + skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__udp6_lib_lookup); @@ -329,6 +289,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int err; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; + struct udp_mib *mib; int is_udp4; if (flags & MSG_ERRQUEUE) @@ -352,6 +313,7 @@ try_again: msg->msg_flags |= MSG_TRUNC; is_udp4 = (skb->protocol == htons(ETH_P_IP)); + mib = __UDPX_MIB(sk, is_udp4); /* * If checksum is needed at all, try to do it while copying the @@ -380,24 +342,13 @@ try_again: if (unlikely(err)) { if (!peeked) { atomic_inc(&sk->sk_drops); - if (is_udp4) - UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - is_udplite); - else - UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - is_udplite); + SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); return err; } - if (!peeked) { - if (is_udp4) - UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, - is_udplite); - else - UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, - is_udplite); - } + if (!peeked) + SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS); sock_recv_ts_and_drops(msg, sk, skb); @@ -421,6 +372,9 @@ try_again: *addr_len = sizeof(*sin6); } + if (udp_sk(sk)->gro_enabled) + udp_cmsg_recv(msg, sk, skb); + if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); @@ -443,17 +397,8 @@ try_again: csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { - if (is_udp4) { - UDP_INC_STATS(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP_INC_STATS(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); - } else { - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); - } + SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS); + SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -463,15 +408,106 @@ csum_copy_err: goto try_again; } -void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info, - struct udp_table *udptable) +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +void udpv6_encap_enable(void) +{ + static_branch_inc(&udpv6_encap_needed_key); +} +EXPORT_SYMBOL(udpv6_encap_enable); + +/* Handler for tunnels with arbitrary destination ports: no socket lookup, go + * through error handlers in encapsulations looking for a match. + */ +static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info) +{ + int i; + + for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { + int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info); + + if (!ip6tun_encaps[i]) + continue; + handler = rcu_dereference(ip6tun_encaps[i]->err_handler); + if (handler && !handler(skb, opt, type, code, offset, info)) + return 0; + } + + return -ENOENT; +} + +/* Try to match ICMP errors to UDP tunnels by looking up a socket without + * reversing source and destination port: this will match tunnels that force the + * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that + * lwtunnels might actually break this assumption by being configured with + * different destination ports on endpoints, in this case we won't be able to + * trace ICMP messages back to them. + * + * If this doesn't match any socket, probe tunnels with arbitrary destination + * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port + * we've sent packets to won't necessarily match the local destination port. + * + * Then ask the tunnel implementation to match the error against a valid + * association. + * + * Return an error if we can't find a match, the socket if we need further + * processing, zero otherwise. + */ +static struct sock *__udp6_lib_err_encap(struct net *net, + const struct ipv6hdr *hdr, int offset, + struct udphdr *uh, + struct udp_table *udptable, + struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, __be32 info) +{ + int network_offset, transport_offset; + struct sock *sk; + + network_offset = skb_network_offset(skb); + transport_offset = skb_transport_offset(skb); + + /* Network header needs to point to the outer IPv6 header inside ICMP */ + skb_reset_network_header(skb); + + /* Transport header needs to point to the UDP header */ + skb_set_transport_header(skb, offset); + + sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source, + &hdr->saddr, uh->dest, + inet6_iif(skb), 0, udptable, skb); + if (sk) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); + struct udp_sock *up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (!lookup || lookup(sk, skb)) + sk = NULL; + } + + if (!sk) { + sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code, + offset, info)); + } + + skb_set_transport_header(skb, transport_offset); + skb_set_network_header(skb, network_offset); + + return sk; +} + +int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info, + struct udp_table *udptable) { struct ipv6_pinfo *np; const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct in6_addr *saddr = &hdr->saddr; const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr *)(skb->data+offset); + bool tunnel = false; struct sock *sk; int harderr; int err; @@ -480,9 +516,23 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, skb); if (!sk) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return; + /* No socket for error: try tunnels before discarding */ + sk = ERR_PTR(-ENOENT); + if (static_branch_unlikely(&udpv6_encap_needed_key)) { + sk = __udp6_lib_err_encap(net, hdr, offset, uh, + udptable, skb, + opt, type, code, info); + if (!sk) + return 0; + } + + if (IS_ERR(sk)) { + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); + return PTR_ERR(sk); + } + + tunnel = true; } harderr = icmpv6_err_convert(type, code, &err); @@ -496,10 +546,19 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, harderr = 1; } if (type == NDISC_REDIRECT) { - ip6_sk_redirect(skb, sk); + if (tunnel) { + ip6_redirect(skb, sock_net(sk), inet6_iif(skb), + sk->sk_mark, sk->sk_uid); + } else { + ip6_sk_redirect(skb, sk); + } goto out; } + /* Tunnels don't have an application socket: don't pass errors back */ + if (tunnel) + goto out; + if (!np->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; @@ -510,7 +569,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk->sk_err = err; sk->sk_error_report(sk); out: - return; + return 0; } static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -541,21 +600,14 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static __inline__ void udpv6_err(struct sk_buff *skb, - struct inet6_skb_parm *opt, u8 type, - u8 code, int offset, __be32 info) +static __inline__ int udpv6_err(struct sk_buff *skb, + struct inet6_skb_parm *opt, u8 type, + u8 code, int offset, __be32 info) { - __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); + return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); -void udpv6_encap_enable(void) -{ - static_branch_enable(&udpv6_encap_needed_key); -} -EXPORT_SYMBOL(udpv6_encap_enable); - -static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -638,10 +690,32 @@ drop: return -1; } +static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *next, *segs; + int ret; + + if (likely(!udp_unexpected_gso(sk, skb))) + return udpv6_queue_rcv_one_skb(sk, skb); + + __skb_push(skb, -skb_mac_offset(skb)); + segs = udp_rcv_segment(sk, skb, false); + for (skb = segs; skb; skb = next) { + next = skb->next; + __skb_pull(skb, skb_transport_offset(skb)); + + ret = udpv6_queue_rcv_one_skb(sk, skb); + if (ret > 0) + ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret, + true); + } + return 0; +} + static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -653,7 +727,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; @@ -687,6 +761,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); + int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -701,7 +776,8 @@ start_lookup: sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, + hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it. @@ -1458,11 +1534,15 @@ void udpv6_destroy_sock(struct sock *sk) udp_v6_flush_pending_frames(sk); release_sock(sk); - if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { - void (*encap_destroy)(struct sock *sk); - encap_destroy = READ_ONCE(up->encap_destroy); - if (encap_destroy) - encap_destroy(sk); + if (static_branch_unlikely(&udpv6_encap_needed_key)) { + if (up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = READ_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + if (up->encap_enabled) + static_branch_dec(&udpv6_encap_needed_key); } inet6_destroy_sock(sk); diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 7903e21c178b..5730e6503cb4 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -9,8 +9,8 @@ #include <net/transp_v6.h> int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, - __be32, struct udp_table *); +int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, + __be32, struct udp_table *); int udp_v6_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 1b8e161ac527..83b11d0ac091 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -11,6 +11,7 @@ */ #include <linux/skbuff.h> #include <linux/netdevice.h> +#include <linux/indirect_call_wrapper.h> #include <net/protocol.h> #include <net/ipv6.h> #include <net/udp.h> @@ -114,8 +115,8 @@ out: return segs; } -static struct sk_buff *udp6_gro_receive(struct list_head *head, - struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); @@ -142,18 +143,14 @@ flush: return NULL; } -static int udp6_gro_complete(struct sk_buff *skb, int nhoff) +INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (uh->check) { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + if (uh->check) uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, &ipv6h->daddr, 0); - } else { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; - } return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb); } diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 5000ad6878e6..a125aebc29e5 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -20,11 +20,12 @@ static int udplitev6_rcv(struct sk_buff *skb) return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } -static void udplitev6_err(struct sk_buff *skb, +static int udplitev6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table); + return __udp6_lib_err(skb, opt, type, code, offset, info, + &udplite_table); } static const struct inet6_protocol udplitev6_protocol = { diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 9ef490dddcea..a52cb3fc6df5 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -86,14 +86,16 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, { struct net *net = dev_net(skb->dev); struct xfrm_state *x = NULL; + struct sec_path *sp; int i = 0; - if (secpath_set(skb)) { + sp = secpath_set(skb); + if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } - if (1 + skb->sp->len == XFRM_MAX_DEPTH) { + if (1 + sp->len == XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } @@ -145,7 +147,7 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, goto drop; } - skb->sp->xvec[skb->sp->len++] = x; + sp->xvec[sp->len++] = x; spin_lock(&x->lock); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index d35bcf92969c..769f8f78d3b8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -262,7 +262,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(dev_net(dev)->loopback_dev); - BUG_ON(!loopback_idev); do { in6_dev_put(xdst->u.rt6.rt6i_idev); diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index b2dc8ce49378..cc979b702c89 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -80,14 +80,16 @@ static int xfrm6_esp_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(esp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static int xfrm6_ah_rcv(struct sk_buff *skb) @@ -107,14 +109,16 @@ static int xfrm6_ah_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ah6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static int xfrm6_ipcomp_rcv(struct sk_buff *skb) @@ -134,14 +138,16 @@ static int xfrm6_ipcomp_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ipcomp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static const struct inet6_protocol esp6_protocol = { diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 4a46df8441c9..f5b4febeaa25 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -144,6 +144,9 @@ static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; + + if (spi == XFRM6_TUNNEL_SPI_MAX) + break; } for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 0bed4cc20603..78ea5a739d10 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1873,30 +1873,26 @@ static void iucv_callback_txdone(struct iucv_path *path, struct sock *sk = path->private; struct sk_buff *this = NULL; struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q; - struct sk_buff *list_skb = list->next; + struct sk_buff *list_skb; unsigned long flags; bh_lock_sock(sk); - if (!skb_queue_empty(list)) { - spin_lock_irqsave(&list->lock, flags); - while (list_skb != (struct sk_buff *)list) { - if (msg->tag == IUCV_SKB_CB(list_skb)->tag) { - this = list_skb; - break; - } - list_skb = list_skb->next; + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk(list, list_skb) { + if (msg->tag == IUCV_SKB_CB(list_skb)->tag) { + this = list_skb; + break; } - if (this) - __skb_unlink(this, list); - - spin_unlock_irqrestore(&list->lock, flags); + } + if (this) + __skb_unlink(this, list); + spin_unlock_irqrestore(&list->lock, flags); - if (this) { - kfree_skb(this); - /* wake up any process waiting for sending */ - iucv_sock_wake_msglim(sk); - } + if (this) { + kfree_skb(this); + /* wake up any process waiting for sending */ + iucv_sock_wake_msglim(sk); } if (sk->sk_state == IUCV_CLOSING) { @@ -2284,11 +2280,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, list = &iucv->send_skb_q; spin_lock_irqsave(&list->lock, flags); - if (skb_queue_empty(list)) - goto out_unlock; - list_skb = list->next; - nskb = list_skb->next; - while (list_skb != (struct sk_buff *)list) { + skb_queue_walk_safe(list, list_skb, nskb) { if (skb_shinfo(list_skb) == skb_shinfo(skb)) { switch (n) { case TX_NOTIFY_OK: @@ -2321,10 +2313,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, } break; } - list_skb = nskb; - nskb = nskb->next; } -out_unlock: spin_unlock_irqrestore(&list->lock, flags); if (sk->sk_state == IUCV_CLOSING) { diff --git a/net/key/af_key.c b/net/key/af_key.c index 9d61266526e7..655c787f9d54 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2020,7 +2020,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) { - struct xfrm_sec_ctx *xfrm_ctx = xp->security; + struct xfrm_sec_ctx *xfrm_ctx = xp->security; if (xfrm_ctx) { int len = sizeof(struct sadb_x_sec_ctx); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 8da86ceca33d..309dee76724e 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -47,6 +47,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev) EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); /** + * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master + * device + * @net: network namespace for device index lookup + * @ifindex: targeted interface + */ +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, ifindex); + while (dev && !netif_is_l3_master(dev)) + dev = netdev_master_upper_dev_get(dev); + + return dev ? dev->ifindex : 0; +} +EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); + +/** * l3mdev_fib_table - get FIB table id associated with an L3 * master interface * @dev: targeted interface diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index f869e35d0974..be471fe95048 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -57,14 +57,13 @@ comment "Some wireless drivers require a rate control algorithm" depends on MAC80211 && MAC80211_HAS_RC=n config MAC80211_MESH - bool "Enable mac80211 mesh networking (pre-802.11s) support" + bool "Enable mac80211 mesh networking support" depends on MAC80211 ---help--- - This options enables support of Draft 802.11s mesh networking. - The implementation is based on Draft 2.08 of the Mesh Networking - amendment. However, no compliance with that draft is claimed or even - possible, as drafts leave a number of identifiers to be defined after - ratification. For more information visit http://o11s.org/. + Select this option to enable 802.11 mesh operation in mac80211 + drivers that support it. 802.11 mesh connects multiple stations + over (possibly multi-hop) wireless links to form a single logical + LAN. config MAC80211_LEDS bool "Enable LED triggers" diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 51622333d460..de65fe3ed9cc 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -800,8 +800,8 @@ static int ieee80211_set_ftm_responder_params( u8 *pos; int len; - if ((!lci || !lci_len) && (!civicloc || !civicloc_len)) - return 1; + if (!lci_len && !civicloc_len) + return 0; bss_conf = &sdata->vif.bss_conf; old = bss_conf->ftmr_params; @@ -2028,6 +2028,9 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, nconf->dot11MeshAwakeWindowDuration; if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask)) conf->plink_timeout = nconf->plink_timeout; + if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) + conf->dot11MeshConnectedToMeshGate = + nconf->dot11MeshConnectedToMeshGate; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } @@ -2891,7 +2894,7 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + beacon->proberesp_ies_len + beacon->assocresp_ies_len + - beacon->probe_resp_len; + beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len; new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); if (!new_beacon) @@ -2934,8 +2937,9 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } - if (beacon->ftm_responder) - new_beacon->ftm_responder = beacon->ftm_responder; + + /* might copy -1, meaning no changes requested */ + new_beacon->ftm_responder = beacon->ftm_responder; if (beacon->lci) { new_beacon->lci_len = beacon->lci_len; new_beacon->lci = pos; @@ -3849,6 +3853,26 @@ ieee80211_get_ftm_responder_stats(struct wiphy *wiphy, return drv_get_ftm_responder_stats(local, sdata, ftm_stats); } +static int +ieee80211_start_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, + struct cfg80211_pmsr_request *request) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); + + return drv_start_pmsr(local, sdata, request); +} + +static void +ieee80211_abort_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, + struct cfg80211_pmsr_request *request) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); + + return drv_abort_pmsr(local, sdata, request); +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -3944,4 +3968,6 @@ const struct cfg80211_ops mac80211_config_ops = { .tx_control_port = ieee80211_tx_control_port, .get_txq_stats = ieee80211_get_txq_stats, .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats, + .start_pmsr = ieee80211_start_pmsr, + .abort_pmsr = ieee80211_abort_pmsr, }; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index c813207bb123..cff0fb3578c9 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -641,6 +641,8 @@ IEEE80211_IF_FILE(dot11MeshHWMPconfirmationInterval, IEEE80211_IF_FILE(power_mode, u.mesh.mshcfg.power_mode, DEC); IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC); +IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, + u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -762,6 +764,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(dot11MeshHWMPconfirmationInterval); MESHPARAMS_ADD(power_mode); MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); + MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index af5185a836e5..b753194710ad 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -795,22 +795,22 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, #define PRINT_NSS_SUPP(f, n) \ do { \ - int i; \ + int _i; \ u16 v = le16_to_cpu(nss->f); \ p += scnprintf(p, buf_sz + buf - p, n ": %#.4x\n", v); \ - for (i = 0; i < 8; i += 2) { \ - switch ((v >> i) & 0x3) { \ + for (_i = 0; _i < 8; _i += 2) { \ + switch ((v >> _i) & 0x3) { \ case 0: \ - PRINT(n "-%d-SUPPORT-0-7", i / 2); \ + PRINT(n "-%d-SUPPORT-0-7", _i / 2); \ break; \ case 1: \ - PRINT(n "-%d-SUPPORT-0-9", i / 2); \ + PRINT(n "-%d-SUPPORT-0-9", _i / 2); \ break; \ case 2: \ - PRINT(n "-%d-SUPPORT-0-11", i / 2); \ + PRINT(n "-%d-SUPPORT-0-11", _i / 2); \ break; \ case 3: \ - PRINT(n "-%d-NOT-SUPPORTED", i / 2); \ + PRINT(n "-%d-NOT-SUPPORTED", _i / 2); \ break; \ } \ } \ diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 0b1747a2313d..3e0d5922a440 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1199,6 +1199,40 @@ drv_get_ftm_responder_stats(struct ieee80211_local *local, return ret; } +static inline int drv_start_pmsr(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_pmsr_request *request) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_start_pmsr(local, sdata); + + if (local->ops->start_pmsr) + ret = local->ops->start_pmsr(&local->hw, &sdata->vif, request); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline void drv_abort_pmsr(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_pmsr_request *request) +{ + trace_drv_abort_pmsr(local, sdata); + + might_sleep(); + if (!check_sdata_in_driver(sdata)) + return; + + if (local->ops->abort_pmsr) + local->ops->abort_pmsr(&local->hw, &sdata->vif, request); + trace_drv_return_void(local); +} + static inline int drv_start_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 10a05062e4a0..7dfb4e2f98b2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -500,6 +500,7 @@ struct ieee80211_if_managed { unsigned int uapsd_max_sp_len; int wmm_last_param_set; + int mu_edca_last_param_set; u8 use_4addr; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5836ddeac9e3..4a6ff1482a9f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -7,6 +7,7 @@ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (c) 2016 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1015,6 +1016,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (local->open_count == 0) ieee80211_clear_tx_pending(local); + sdata->vif.bss_conf.beacon_int = 0; + /* * If the interface goes down while suspended, presumably because * the device was unplugged and that happens before our resume, @@ -1799,7 +1802,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, } ieee80211_assign_perm_addr(local, ndev->perm_addr, type); - if (params && is_valid_ether_addr(params->macaddr)) + if (is_valid_ether_addr(params->macaddr)) memcpy(ndev->dev_addr, params->macaddr, ETH_ALEN); else memcpy(ndev->dev_addr, ndev->perm_addr, ETH_ALEN); @@ -1868,11 +1871,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ieee80211_setup_sdata(sdata, type); if (ndev) { - if (params) { - ndev->ieee80211_ptr->use_4addr = params->use_4addr; - if (type == NL80211_IFTYPE_STATION) - sdata->u.mgd.use_4addr = params->use_4addr; - } + ndev->ieee80211_ptr->use_4addr = params->use_4addr; + if (type == NL80211_IFTYPE_STATION) + sdata->u.mgd.use_4addr = params->use_4addr; ndev->features |= local->hw.netdev_features; @@ -1949,6 +1950,8 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) WARN(local->open_count, "%s: open count remains %d\n", wiphy_name(local->hw.wiphy), local->open_count); + ieee80211_txq_teardown_flows(local); + mutex_lock(&local->iflist_mtx); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { list_del(&sdata->list); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 83e71e6b2ebe..87a729926734 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1221,8 +1221,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) && !ieee80211_hw_check(hw, NO_AUTO_VIF)) { + struct vif_params params = {0}; + result = ieee80211_if_add(local, "wlan%d", NET_NAME_ENUM, NULL, - NL80211_IFTYPE_STATION, NULL); + NL80211_IFTYPE_STATION, ¶ms); if (result) wiphy_warn(local->hw.wiphy, "Failed to add default virtual iface\n"); @@ -1262,7 +1264,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); ieee80211_led_exit(local); ieee80211_wep_free(local); - ieee80211_txq_teardown_flows(local); fail_flows: destroy_workqueue(local->workqueue); fail_workqueue: @@ -1288,7 +1289,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&local->ifa6_notifier); #endif - ieee80211_txq_teardown_flows(local); rtnl_lock(); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 8bad414c52ad..c90452aa0c42 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -254,6 +254,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u8 *pos, neighbors; u8 meshconf_len = sizeof(struct ieee80211_meshconf_ie); + bool is_connected_to_gate = ifmsh->num_gates > 0 || + ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol || + ifmsh->mshcfg.dot11MeshConnectedToMeshGate; if (skb_tailroom(skb) < 2 + meshconf_len) return -ENOMEM; @@ -278,7 +281,7 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh Formation Info - number of neighbors */ neighbors = atomic_read(&ifmsh->estab_plinks); neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS); - *pos++ = neighbors << 1; + *pos++ = (neighbors << 1) | is_connected_to_gate; /* Mesh capability */ *pos = 0x00; *pos |= ifmsh->mshcfg.dot11MeshForwarding ? @@ -1191,7 +1194,8 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, if (!sdata->u.mesh.user_mpm || sdata->u.mesh.mshcfg.rssi_threshold == 0 || sdata->u.mesh.mshcfg.rssi_threshold < rx_status->signal) - mesh_neighbour_update(sdata, mgmt->sa, &elems); + mesh_neighbour_update(sdata, mgmt->sa, &elems, + rx_status); } if (ifmsh->sync_ops) diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 21526630bf65..cad6592c52a1 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -273,7 +273,8 @@ int mesh_gate_num(struct ieee80211_sub_if_data *sdata); /* Mesh plinks */ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, - u8 *hw_addr, struct ieee802_11_elems *ie); + u8 *hw_addr, struct ieee802_11_elems *ie, + struct ieee80211_rx_status *rx_status); bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie); u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); void mesh_plink_timer(struct timer_list *t); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 5b5b0f95ffd1..33055c8ed37e 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -513,7 +513,8 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) static struct sta_info * mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, - struct ieee802_11_elems *elems) + struct ieee802_11_elems *elems, + struct ieee80211_rx_status *rx_status) { struct sta_info *sta = NULL; @@ -521,11 +522,17 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, if (sdata->u.mesh.user_mpm || sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) { if (mesh_peer_accepts_plinks(elems) && - mesh_plink_availables(sdata)) + mesh_plink_availables(sdata)) { + int sig = 0; + + if (ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) + sig = rx_status->signal; + cfg80211_notify_new_peer_candidate(sdata->dev, addr, elems->ie_start, elems->total_len, - GFP_KERNEL); + sig, GFP_KERNEL); + } } else sta = __mesh_sta_info_alloc(sdata, addr); @@ -538,13 +545,15 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, * @sdata: local meshif * @addr: peer's address * @elems: IEs from beacon or mesh peering frame. + * @rx_status: rx status for the frame for signal reporting * * Return existing or newly allocated sta_info under RCU read lock. * (re)initialize with given IEs. */ static struct sta_info * mesh_sta_info_get(struct ieee80211_sub_if_data *sdata, - u8 *addr, struct ieee802_11_elems *elems) __acquires(RCU) + u8 *addr, struct ieee802_11_elems *elems, + struct ieee80211_rx_status *rx_status) __acquires(RCU) { struct sta_info *sta = NULL; @@ -555,7 +564,7 @@ mesh_sta_info_get(struct ieee80211_sub_if_data *sdata, } else { rcu_read_unlock(); /* can't run atomic */ - sta = mesh_sta_info_alloc(sdata, addr, elems); + sta = mesh_sta_info_alloc(sdata, addr, elems, rx_status); if (!sta) { rcu_read_lock(); return NULL; @@ -576,20 +585,25 @@ mesh_sta_info_get(struct ieee80211_sub_if_data *sdata, * @sdata: local meshif * @addr: peer's address * @elems: IEs from beacon or mesh peering frame + * @rx_status: rx status for the frame for signal reporting * * Initiates peering if appropriate. */ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, u8 *hw_addr, - struct ieee802_11_elems *elems) + struct ieee802_11_elems *elems, + struct ieee80211_rx_status *rx_status) { struct sta_info *sta; u32 changed = 0; - sta = mesh_sta_info_get(sdata, hw_addr, elems); + sta = mesh_sta_info_get(sdata, hw_addr, elems, rx_status); if (!sta) goto out; + sta->mesh->connected_to_gate = elems->mesh_config->meshconf_form & + IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE; + if (mesh_peer_accepts_plinks(elems) && sta->mesh->plink_state == NL80211_PLINK_LISTEN && sdata->u.mesh.accepting_plinks && @@ -1069,7 +1083,8 @@ out: static void mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, - struct ieee802_11_elems *elems) + struct ieee802_11_elems *elems, + struct ieee80211_rx_status *rx_status) { struct sta_info *sta; @@ -1134,7 +1149,7 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, if (event == OPN_ACPT) { rcu_read_unlock(); /* allocate sta entry if necessary and update info */ - sta = mesh_sta_info_get(sdata, mgmt->sa, elems); + sta = mesh_sta_info_get(sdata, mgmt->sa, elems, rx_status); if (!sta) { mpl_dbg(sdata, "Mesh plink: failed to init peer!\n"); goto unlock_rcu; @@ -1200,5 +1215,5 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, return; } ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems); - mesh_process_plink_frame(sdata, mgmt, &elems); + mesh_process_plink_frame(sdata, mgmt, &elems, rx_status); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d2bc8d57c87e..687821567287 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -916,6 +916,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); + /* + * If AP doesn't support HT, mark HE as disabled. + * If on the 5GHz band, make sure it supports VHT. + */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HT || + (sband->band == NL80211_BAND_5GHZ && + ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) ieee80211_add_he_ie(sdata, skb, sband); @@ -1869,7 +1878,7 @@ ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t left; - int count, ac; + int count, mu_edca_count, ac; const u8 *pos; u8 uapsd_queues = 0; @@ -1889,9 +1898,16 @@ ieee80211_sta_wmm_params(struct ieee80211_local *local, uapsd_queues = ifmgd->uapsd_queues; count = wmm_param[6] & 0x0f; - if (count == ifmgd->wmm_last_param_set) + /* -1 is the initial value of ifmgd->mu_edca_last_param_set. + * if mu_edca was preset before and now it disappeared tell + * the driver about it. + */ + mu_edca_count = mu_edca ? mu_edca->mu_qos_info & 0x0f : -1; + if (count == ifmgd->wmm_last_param_set && + mu_edca_count == ifmgd->mu_edca_last_param_set) return false; ifmgd->wmm_last_param_set = count; + ifmgd->mu_edca_last_param_set = mu_edca_count; pos = wmm_param + 8; left = wmm_param_len - 8; @@ -2766,6 +2782,7 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sta_info *sta; + bool result = true; sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; @@ -2778,15 +2795,18 @@ static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata, sta = sta_info_get(sdata, bssid); if (!sta) { WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid); - return false; + result = false; + goto out; } if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { sdata_info(sdata, "failed moving %pM to auth\n", bssid); - return false; + result = false; + goto out; } - mutex_unlock(&sdata->local->sta_mtx); - return true; +out: + mutex_unlock(&sdata->local->sta_mtx); + return result; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, @@ -3058,6 +3078,19 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, } } +static bool ieee80211_twt_req_supported(const struct sta_info *sta, + const struct ieee802_11_elems *elems) +{ + if (elems->ext_capab_len < 10) + return false; + + if (!(elems->ext_capab[9] & WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT)) + return false; + + return sta->sta.he_cap.he_cap_elem.mac_cap_info[0] & + IEEE80211_HE_MAC_CAP0_TWT_RES; +} + static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len) @@ -3211,16 +3244,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } - /* - * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark - * HE as disabled. If on the 5GHz band, make sure it supports VHT. - */ - if (ifmgd->flags & IEEE80211_STA_DISABLE_HT || - (sband->band == NL80211_BAND_5GHZ && - ifmgd->flags & IEEE80211_STA_DISABLE_VHT) || - (!elems.he_cap && !elems.he_operation)) - ifmgd->flags |= IEEE80211_STA_DISABLE_HE; - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && (!elems.he_cap || !elems.he_operation)) { mutex_unlock(&sdata->local->sta_mtx); @@ -3247,8 +3270,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta); bss_conf->he_support = sta->sta.he_cap.has_he; + bss_conf->twt_requester = + ieee80211_twt_req_supported(sta, &elems); } else { bss_conf->he_support = false; + bss_conf->twt_requester = false; } if (bss_conf->he_support) { @@ -3333,6 +3359,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 4-bit value. */ ifmgd->wmm_last_param_set = -1; + ifmgd->mu_edca_last_param_set = -1; if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); @@ -4656,8 +4683,10 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && - ieee80211_get_he_sta_cap(sband)) { + if (!ieee80211_get_he_sta_cap(sband)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) { const struct cfg80211_bss_ies *ies; const u8 *he_oper_ie; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3bd3b5769797..45aad3d3108c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -143,6 +143,9 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, /* allocate extra bitmaps */ if (status->chains) len += 4 * hweight8(status->chains); + /* vendor presence bitmap */ + if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) + len += 4; if (ieee80211_have_rx_timestamp(status)) { len = ALIGN(len, 8); @@ -207,8 +210,6 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) { struct ieee80211_vendor_radiotap *rtap = (void *)skb->data; - /* vendor presence bitmap */ - len += 4; /* alignment for fixed 6-byte vendor data header */ len = ALIGN(len, 2); /* vendor data header */ @@ -753,6 +754,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, struct ieee80211_sub_if_data *monitor_sdata = rcu_dereference(local->monitor_sdata); bool only_monitor = false; + unsigned int min_head_len; if (status->flag & RX_FLAG_RADIOTAP_HE) rtap_space += sizeof(struct ieee80211_radiotap_he); @@ -760,12 +762,18 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (status->flag & RX_FLAG_RADIOTAP_HE_MU) rtap_space += sizeof(struct ieee80211_radiotap_he_mu); + if (status->flag & RX_FLAG_RADIOTAP_LSIG) + rtap_space += sizeof(struct ieee80211_radiotap_lsig); + if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { - struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; + struct ieee80211_vendor_radiotap *rtap = + (void *)(origskb->data + rtap_space); rtap_space += sizeof(*rtap) + rtap->len + rtap->pad; } + min_head_len = rtap_space; + /* * First, we may need to make a copy of the skb because * (1) we need to modify it for radiotap (if not present), and @@ -775,18 +783,23 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, * the SKB because it has a bad FCS/PLCP checksum. */ - if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) { - if (unlikely(origskb->len <= FCS_LEN)) { - /* driver bug */ - WARN_ON(1); - dev_kfree_skb(origskb); - return NULL; + if (!(status->flag & RX_FLAG_NO_PSDU)) { + if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) { + if (unlikely(origskb->len <= FCS_LEN + rtap_space)) { + /* driver bug */ + WARN_ON(1); + dev_kfree_skb(origskb); + return NULL; + } + present_fcs_len = FCS_LEN; } - present_fcs_len = FCS_LEN; + + /* also consider the hdr->frame_control */ + min_head_len += 2; } - /* ensure hdr->frame_control and vendor radiotap data are in skb head */ - if (!pskb_may_pull(origskb, 2 + rtap_space)) { + /* ensure that the expected data elements are in skb head */ + if (!pskb_may_pull(origskb, min_head_len)) { dev_kfree_skb(origskb); return NULL; } @@ -1403,6 +1416,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (ieee80211_is_ctl(hdr->frame_control) || + ieee80211_is_nullfunc(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; @@ -3063,7 +3077,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, &sta_opmode, - GFP_KERNEL); + GFP_ATOMIC); goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { @@ -3100,7 +3114,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, &sta_opmode, - GFP_KERNEL); + GFP_ATOMIC); goto handled; } default: diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 5d2a11777718..95413413f98c 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -356,7 +356,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); - bool hw_scan = local->ops->hw_scan; + bool hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; @@ -606,6 +606,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; + bool hw_scan = local->ops->hw_scan; int rc; lockdep_assert_held(&local->mtx); @@ -620,7 +621,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, return 0; } - if (local->ops->hw_scan) { + again: + if (hw_scan) { u8 *ies; local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; @@ -679,7 +681,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, else memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN); - if (local->ops->hw_scan) { + if (hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && (req->channels[0] == local->_oper_chandef.chan)) { @@ -722,7 +724,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_idle(local); - if (local->ops->hw_scan) { + if (hw_scan) { WARN_ON(!ieee80211_prep_hw_scan(local)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { @@ -740,6 +742,18 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, RCU_INIT_POINTER(local->scan_sdata, NULL); } + if (hw_scan && rc == 1) { + /* + * we can't fall back to software for P2P-GO + * as it must update NoA etc. + */ + if (ieee80211_vif_type_p2p(&sdata->vif) == + NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + hw_scan = false; + goto again; + } + return rc; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index fb8c2252ac0e..c4a8f115ed33 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2253,11 +2253,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { - for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { - struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i]; - - sta_set_tidstats(sta, tidstats, i); - } + for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) + sta_set_tidstats(sta, &sinfo->pertid[i], i); } if (ieee80211_vif_is_mesh(&sdata->vif)) { @@ -2267,7 +2264,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | - BIT_ULL(NL80211_STA_INFO_NONPEER_PM); + BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; @@ -2279,6 +2277,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->local_pm = sta->mesh->local_pm; sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; + sinfo->connected_to_gate = sta->mesh->connected_to_gate; #endif } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 9a04327d71d1..8eb29041be54 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -364,6 +364,7 @@ DECLARE_EWMA(mesh_fail_avg, 20, 8) * @nonpeer_pm: STA power save mode towards non-peer neighbors * @processed_beacon: set to true after peer rates and capabilities are * processed + * @connected_to_gate: true if mesh STA has a path to a mesh gate * @fail_avg: moving percentage of failed MSDUs */ struct mesh_sta { @@ -381,6 +382,7 @@ struct mesh_sta { u8 plink_retries; bool processed_beacon; + bool connected_to_gate; enum nl80211_plink_state plink_state; u32 plink_timeout; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index aa4afbf0abaf..3f0b96e1e02f 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -556,6 +556,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, } ieee80211_led_tx(local); + + if (skb_has_frag_list(skb)) { + kfree_skb_list(skb_shinfo(skb)->frag_list); + skb_shinfo(skb)->frag_list = NULL; + } } /* @@ -964,6 +969,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, /* Track when last TDLS packet was ACKed */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) sta->status_stats.last_tdls_pkt_time = jiffies; + } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { + return; } else { ieee80211_lost_packet(sta, info); } diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 588c51a67c89..35ea0dcb55e6 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1052,10 +1052,10 @@ TRACE_EVENT(drv_ampdu_action, ); TRACE_EVENT(drv_get_survey, - TP_PROTO(struct ieee80211_local *local, int idx, + TP_PROTO(struct ieee80211_local *local, int _idx, struct survey_info *survey), - TP_ARGS(local, idx, survey), + TP_ARGS(local, _idx, survey), TP_STRUCT__entry( LOCAL_ENTRY @@ -1064,7 +1064,7 @@ TRACE_EVENT(drv_get_survey, TP_fast_assign( LOCAL_ASSIGN; - __entry->idx = idx; + __entry->idx = _idx; ), TP_printk( @@ -1882,6 +1882,18 @@ TRACE_EVENT(drv_del_nan_func, ) ); +DEFINE_EVENT(local_sdata_evt, drv_start_pmsr, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + +DEFINE_EVENT(local_sdata_evt, drv_abort_pmsr, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + /* * Tracing for API calls that drivers call. */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e0ccee23fbcd..f170d6c6629a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -439,8 +439,8 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; - /* no stations in PS mode */ - if (!atomic_read(&ps->num_sta_ps)) + /* no stations in PS mode and no buffered packets */ + if (!atomic_read(&ps->num_sta_ps) && skb_queue_empty(&ps->bc_buf)) return TX_CONTINUE; info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; @@ -3218,6 +3218,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) return false; + if (skb_is_gso(skb)) + return false; + if (!txq) return false; @@ -3242,7 +3245,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, tin = &txqi->tin; flow = fq_flow_classify(fq, tin, skb, fq_flow_get_default_func); head = skb_peek_tail(&flow->queue); - if (!head) + if (!head || skb_is_gso(head)) goto out; orig_len = head->len; @@ -3583,7 +3586,7 @@ begin: skb_queue_splice_tail(&tx.skbs, &txqi->frags); } - if (skb && skb_has_frag_list(skb) && + if (skb_has_frag_list(skb) && !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { if (skb_linearize(skb)) { ieee80211_free_txskb(&local->hw, skb); @@ -4579,7 +4582,7 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS); if (qos) { - __le16 qos = cpu_to_le16(7); + __le16 qoshdr = cpu_to_le16(7); BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_STYPE_NULLFUNC) != @@ -4588,7 +4591,7 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC); skb->priority = 7; skb_set_queue_mapping(skb, IEEE80211_AC_VO); - skb_put_data(skb, &qos, sizeof(qos)); + skb_put_data(skb, &qoshdr, sizeof(qoshdr)); } memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bec424316ea4..d0eb38b890aa 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -299,16 +299,16 @@ out: spin_unlock_bh(&fq->lock); } -void ieee80211_wake_txqs(unsigned long data) +static void +__releases(&local->queue_stop_reason_lock) +__acquires(&local->queue_stop_reason_lock) +_ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags) { - struct ieee80211_local *local = (struct ieee80211_local *)data; struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; - unsigned long flags; int i; rcu_read_lock(); - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; @@ -317,7 +317,7 @@ void ieee80211_wake_txqs(unsigned long data) if (local->queue_stop_reasons[i]) continue; - spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, *flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int ac; @@ -329,13 +329,22 @@ void ieee80211_wake_txqs(unsigned long data) __ieee80211_wake_txqs(sdata, ac); } } - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + spin_lock_irqsave(&local->queue_stop_reason_lock, *flags); } - spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); rcu_read_unlock(); } +void ieee80211_wake_txqs(unsigned long data) +{ + struct ieee80211_local *local = (struct ieee80211_local *)data; + unsigned long flags; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + _ieee80211_wake_txqs(local, &flags); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) { struct ieee80211_sub_if_data *sdata; @@ -371,7 +380,8 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, - bool refcounted) + bool refcounted, + unsigned long *flags) { struct ieee80211_local *local = hw_to_local(hw); @@ -405,8 +415,19 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, } else tasklet_schedule(&local->tx_pending_tasklet); - if (local->ops->wake_tx_queue) - tasklet_schedule(&local->wake_txqs_tasklet); + /* + * Calling _ieee80211_wake_txqs here can be a problem because it may + * release queue_stop_reason_lock which has been taken by + * __ieee80211_wake_queue's caller. It is certainly not very nice to + * release someone's lock, but it is fine because all the callers of + * __ieee80211_wake_queue call it right before releasing the lock. + */ + if (local->ops->wake_tx_queue) { + if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) + tasklet_schedule(&local->wake_txqs_tasklet); + else + _ieee80211_wake_txqs(local, flags); + } } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, @@ -417,7 +438,7 @@ void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - __ieee80211_wake_queue(hw, queue, reason, refcounted); + __ieee80211_wake_queue(hw, queue, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } @@ -514,7 +535,7 @@ void ieee80211_add_pending_skb(struct ieee80211_local *local, false); __skb_queue_tail(&local->pending[queue], skb); __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, - false); + false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } @@ -547,7 +568,7 @@ void ieee80211_add_pending_skbs(struct ieee80211_local *local, for (i = 0; i < hw->queues; i++) __ieee80211_wake_queue(hw, i, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, - false); + false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } @@ -605,7 +626,7 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) - __ieee80211_wake_queue(hw, i, reason, refcounted); + __ieee80211_wake_queue(hw, i, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } @@ -1202,6 +1223,8 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA && elen >= (sizeof(*elems->mu_edca_param_set) + 1)) { elems->mu_edca_param_set = (void *)&pos[1]; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) { elems->he_cap = (void *)&pos[1]; elems->he_cap_len = elen - 1; diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index 73e8f347802e..bfe9ed9f4c48 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -30,13 +30,13 @@ int ieee80211_wep_init(struct ieee80211_local *local) /* start WEP IV from a random value */ get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN); - local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(local->wep_tx_tfm)) { local->wep_rx_tfm = ERR_PTR(-EINVAL); return PTR_ERR(local->wep_tx_tfm); } - local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(local->wep_rx_tfm)) { crypto_free_cipher(local->wep_tx_tfm); local->wep_tx_tfm = ERR_PTR(-EINVAL); diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 1dae77c54009..87505600dbb2 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -73,10 +73,15 @@ enum { #define NCSI_OEM_MFR_BCM_ID 0x113d /* Broadcom specific OEM Command */ #define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */ +/* Mellanox specific OEM Command */ +#define NCSI_OEM_MLX_CMD_GMA 0x00 /* CMD ID for Get MAC */ +#define NCSI_OEM_MLX_CMD_GMA_PARAM 0x1b /* Parameter for GMA */ /* OEM Command payload lengths*/ #define NCSI_OEM_BCM_CMD_GMA_LEN 12 +#define NCSI_OEM_MLX_CMD_GMA_LEN 8 /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 +#define MLX_MAC_ADDR_OFFSET 8 struct ncsi_channel_version { @@ -222,6 +227,10 @@ struct ncsi_package { unsigned int channel_num; /* Number of channels */ struct list_head channels; /* List of chanels */ struct list_head node; /* Form list of packages */ + + bool multi_channel; /* Enable multiple channels */ + u32 channel_whitelist; /* Channels to configure */ + struct ncsi_channel *preferred_channel; /* Primary channel */ }; struct ncsi_request { @@ -287,16 +296,16 @@ struct ncsi_dev_priv { #define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */ #define NCSI_DEV_HWA 2 /* Enabled HW arbitration */ #define NCSI_DEV_RESHUFFLE 4 +#define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ spinlock_t lock; /* Protect the NCSI device */ #if IS_ENABLED(CONFIG_IPV6) unsigned int inet6_addr_num; /* Number of IPv6 addresses */ #endif + unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ struct ncsi_channel *hot_channel; /* Channel was ever active */ - struct ncsi_package *force_package; /* Force a specific package */ - struct ncsi_channel *force_channel; /* Force a specific channel */ struct ncsi_request requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ #define NCSI_REQ_START_IDX 1 @@ -309,6 +318,9 @@ struct ncsi_dev_priv { struct list_head node; /* Form NCSI device list */ #define NCSI_MAX_VLAN_VIDS 15 struct list_head vlan_vids; /* List of active VLAN IDs */ + + bool multi_package; /* Enable multiple packages */ + u32 package_whitelist; /* Packages to configure */ }; struct ncsi_cmd_arg { @@ -341,6 +353,7 @@ extern spinlock_t ncsi_dev_lock; list_for_each_entry_rcu(nc, &np->channels, node) /* Resources */ +int ncsi_reset_dev(struct ncsi_dev *nd); void ncsi_start_channel_monitor(struct ncsi_channel *nc); void ncsi_stop_channel_monitor(struct ncsi_channel *nc); struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, @@ -361,6 +374,13 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, void ncsi_free_request(struct ncsi_request *nr); struct ncsi_dev *ncsi_find_dev(struct net_device *dev); int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); +bool ncsi_channel_has_link(struct ncsi_channel *channel); +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp, + struct ncsi_channel *channel); +int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp, + struct ncsi_package *np, + struct ncsi_channel *disable, + struct ncsi_channel *enable); /* Packet handlers */ u32 ncsi_calculate_checksum(unsigned char *data, int len); diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 25e483e8278b..26d67e27551f 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -50,13 +50,15 @@ static int ncsi_validate_aen_pkt(struct ncsi_aen_pkt_hdr *h, static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, struct ncsi_aen_pkt_hdr *h) { - struct ncsi_aen_lsc_pkt *lsc; - struct ncsi_channel *nc; + struct ncsi_channel *nc, *tmp; struct ncsi_channel_mode *ncm; - bool chained; - int state; unsigned long old_data, data; + struct ncsi_aen_lsc_pkt *lsc; + struct ncsi_package *np; + bool had_link, has_link; unsigned long flags; + bool chained; + int state; /* Find the NCSI channel */ ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); @@ -73,6 +75,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, ncm->data[2] = data; ncm->data[4] = ntohl(lsc->oem_status); + had_link = !!(old_data & 0x1); + has_link = !!(data & 0x1); + netdev_dbg(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n", nc->id, data & 0x1 ? "up" : "down"); @@ -80,22 +85,60 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, state = nc->state; spin_unlock_irqrestore(&nc->lock, flags); - if (!((old_data ^ data) & 0x1) || chained) - return 0; - if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) && - !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1))) + if (state == NCSI_CHANNEL_INACTIVE) + netdev_warn(ndp->ndev.dev, + "NCSI: Inactive channel %u received AEN!\n", + nc->id); + + if ((had_link == has_link) || chained) return 0; - if (!(ndp->flags & NCSI_DEV_HWA) && - state == NCSI_CHANNEL_ACTIVE) - ndp->flags |= NCSI_DEV_RESHUFFLE; + if (!ndp->multi_package && !nc->package->multi_channel) { + if (had_link) { + ndp->flags |= NCSI_DEV_RESHUFFLE; + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + return ncsi_process_next_channel(ndp); + } + /* Configured channel came up */ + return 0; + } - ncsi_stop_channel_monitor(nc); - spin_lock_irqsave(&ndp->lock, flags); - list_add_tail_rcu(&nc->link, &ndp->channel_queue); - spin_unlock_irqrestore(&ndp->lock, flags); + if (had_link) { + ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; + if (ncsi_channel_is_last(ndp, nc)) { + /* No channels left, reconfigure */ + return ncsi_reset_dev(&ndp->ndev); + } else if (ncm->enable) { + /* Need to failover Tx channel */ + ncsi_update_tx_channel(ndp, nc->package, nc, NULL); + } + } else if (has_link && nc->package->preferred_channel == nc) { + /* Return Tx to preferred channel */ + ncsi_update_tx_channel(ndp, nc->package, NULL, nc); + } else if (has_link) { + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, tmp) { + /* Enable Tx on this channel if the current Tx + * channel is down. + */ + ncm = &tmp->modes[NCSI_MODE_TX_ENABLE]; + if (ncm->enable && + !ncsi_channel_has_link(tmp)) { + ncsi_update_tx_channel(ndp, nc->package, + tmp, nc); + break; + } + } + } + } - return ncsi_process_next_channel(ndp); + /* Leave configured channels active in a multi-channel scenario so + * AEN events are still received. + */ + return 0; } static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp, diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index bfc43b28c7a6..31359d5e14ad 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -28,6 +28,29 @@ LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); +bool ncsi_channel_has_link(struct ncsi_channel *channel) +{ + return !!(channel->modes[NCSI_MODE_LINK].data[2] & 0x1); +} + +bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp, + struct ncsi_channel *channel) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + + NCSI_FOR_EACH_PACKAGE(ndp, np) + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (nc == channel) + continue; + if (nc->state == NCSI_CHANNEL_ACTIVE && + ncsi_channel_has_link(nc)) + return false; + } + + return true; +} + static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) { struct ncsi_dev *nd = &ndp->ndev; @@ -52,7 +75,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) continue; } - if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { + if (ncsi_channel_has_link(nc)) { spin_unlock_irqrestore(&nc->lock, flags); nd->link_up = 1; goto report; @@ -113,10 +136,8 @@ static void ncsi_channel_monitor(struct timer_list *t) default: netdev_err(ndp->ndev.dev, "NCSI Channel %d timed out!\n", nc->id); - if (!(ndp->flags & NCSI_DEV_HWA)) { - ncsi_report_link(ndp, true); - ndp->flags |= NCSI_DEV_RESHUFFLE; - } + ncsi_report_link(ndp, true); + ndp->flags |= NCSI_DEV_RESHUFFLE; ncsi_stop_channel_monitor(nc); @@ -269,6 +290,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, np->ndp = ndp; spin_lock_init(&np->lock); INIT_LIST_HEAD(&np->channels); + np->channel_whitelist = UINT_MAX; spin_lock_irqsave(&ndp->lock, flags); tmp = ncsi_find_package(ndp, id); @@ -442,12 +464,14 @@ static void ncsi_request_timeout(struct timer_list *t) static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; - struct ncsi_package *np = ndp->active_package; - struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_package *np; + struct ncsi_channel *nc, *tmp; struct ncsi_cmd_arg nca; unsigned long flags; int ret; + np = ndp->active_package; + nc = ndp->active_channel; nca.ndp = ndp; nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { @@ -523,6 +547,15 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) if (ret) goto error; + NCSI_FOR_EACH_CHANNEL(np, tmp) { + /* If there is another channel active on this package + * do not deselect the package. + */ + if (tmp != nc && tmp->state == NCSI_CHANNEL_ACTIVE) { + nd->state = ncsi_dev_state_suspend_done; + break; + } + } break; case ncsi_dev_state_suspend_deselect: ndp->pending_req_num = 1; @@ -541,8 +574,10 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); - ncsi_process_next_channel(ndp); - + if (ndp->flags & NCSI_DEV_RESET) + ncsi_reset_dev(nd); + else + ncsi_process_next_channel(ndp); break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n", @@ -675,12 +710,38 @@ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca) +{ + union { + u8 data_u8[NCSI_OEM_MLX_CMD_GMA_LEN]; + u32 data_u32[NCSI_OEM_MLX_CMD_GMA_LEN / sizeof(u32)]; + } u; + int ret = 0; + + nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN; + + memset(&u, 0, sizeof(u)); + u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID); + u.data_u8[5] = NCSI_OEM_MLX_CMD_GMA; + u.data_u8[6] = NCSI_OEM_MLX_CMD_GMA_PARAM; + + nca->data = u.data_u8; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during configure\n", + nca->type); + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; int (*handler)(struct ncsi_cmd_arg *nca); } ncsi_oem_gma_handlers[] = { - { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm } + { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }, + { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx } }; static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) @@ -717,13 +778,144 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) #endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ +/* Determine if a given channel from the channel_queue should be used for Tx */ +static bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp, + struct ncsi_channel *nc) +{ + struct ncsi_channel_mode *ncm; + struct ncsi_channel *channel; + struct ncsi_package *np; + + /* Check if any other channel has Tx enabled; a channel may have already + * been configured and removed from the channel queue. + */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (!ndp->multi_package && np != nc->package) + continue; + NCSI_FOR_EACH_CHANNEL(np, channel) { + ncm = &channel->modes[NCSI_MODE_TX_ENABLE]; + if (ncm->enable) + return false; + } + } + + /* This channel is the preferred channel and has link */ + list_for_each_entry_rcu(channel, &ndp->channel_queue, link) { + np = channel->package; + if (np->preferred_channel && + ncsi_channel_has_link(np->preferred_channel)) { + return np->preferred_channel == nc; + } + } + + /* This channel has link */ + if (ncsi_channel_has_link(nc)) + return true; + + list_for_each_entry_rcu(channel, &ndp->channel_queue, link) + if (ncsi_channel_has_link(channel)) + return false; + + /* No other channel has link; default to this one */ + return true; +} + +/* Change the active Tx channel in a multi-channel setup */ +int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp, + struct ncsi_package *package, + struct ncsi_channel *disable, + struct ncsi_channel *enable) +{ + struct ncsi_cmd_arg nca; + struct ncsi_channel *nc; + struct ncsi_package *np; + int ret = 0; + + if (!package->multi_channel && !ndp->multi_package) + netdev_warn(ndp->ndev.dev, + "NCSI: Trying to update Tx channel in single-channel mode\n"); + nca.ndp = ndp; + nca.req_flags = 0; + + /* Find current channel with Tx enabled */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (disable) + break; + if (!ndp->multi_package && np != package) + continue; + + NCSI_FOR_EACH_CHANNEL(np, nc) + if (nc->modes[NCSI_MODE_TX_ENABLE].enable) { + disable = nc; + break; + } + } + + /* Find a suitable channel for Tx */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (enable) + break; + if (!ndp->multi_package && np != package) + continue; + if (!(ndp->package_whitelist & (0x1 << np->id))) + continue; + + if (np->preferred_channel && + ncsi_channel_has_link(np->preferred_channel)) { + enable = np->preferred_channel; + break; + } + + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!(np->channel_whitelist & 0x1 << nc->id)) + continue; + if (nc->state != NCSI_CHANNEL_ACTIVE) + continue; + if (ncsi_channel_has_link(nc)) { + enable = nc; + break; + } + } + } + + if (disable == enable) + return -1; + + if (!enable) + return -1; + + if (disable) { + nca.channel = disable->id; + nca.package = disable->package->id; + nca.type = NCSI_PKT_CMD_DCNT; + ret = ncsi_xmit_cmd(&nca); + if (ret) + netdev_err(ndp->ndev.dev, + "Error %d sending DCNT\n", + ret); + } + + netdev_info(ndp->ndev.dev, "NCSI: channel %u enables Tx\n", enable->id); + + nca.channel = enable->id; + nca.package = enable->package->id; + nca.type = NCSI_PKT_CMD_ECNT; + ret = ncsi_xmit_cmd(&nca); + if (ret) + netdev_err(ndp->ndev.dev, + "Error %d sending ECNT\n", + ret); + + return ret; +} + static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_dev *nd = &ndp->ndev; - struct net_device *dev = nd->dev; struct ncsi_package *np = ndp->active_package; struct ncsi_channel *nc = ndp->active_channel; struct ncsi_channel *hot_nc = NULL; + struct ncsi_dev *nd = &ndp->ndev; + struct net_device *dev = nd->dev; struct ncsi_cmd_arg nca; unsigned char index; unsigned long flags; @@ -845,20 +1037,29 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else if (nd->state == ncsi_dev_state_config_ebf) { nca.type = NCSI_PKT_CMD_EBF; nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; - nd->state = ncsi_dev_state_config_ecnt; + if (ncsi_channel_is_tx(ndp, nc)) + nd->state = ncsi_dev_state_config_ecnt; + else + nd->state = ncsi_dev_state_config_ec; #if IS_ENABLED(CONFIG_IPV6) if (ndp->inet6_addr_num > 0 && (nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) nd->state = ncsi_dev_state_config_egmf; - else - nd->state = ncsi_dev_state_config_ecnt; } else if (nd->state == ncsi_dev_state_config_egmf) { nca.type = NCSI_PKT_CMD_EGMF; nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; - nd->state = ncsi_dev_state_config_ecnt; + if (ncsi_channel_is_tx(ndp, nc)) + nd->state = ncsi_dev_state_config_ecnt; + else + nd->state = ncsi_dev_state_config_ec; #endif /* CONFIG_IPV6 */ } else if (nd->state == ncsi_dev_state_config_ecnt) { + if (np->preferred_channel && + nc != np->preferred_channel) + netdev_info(ndp->ndev.dev, + "NCSI: Tx failed over to channel %u\n", + nc->id); nca.type = NCSI_PKT_CMD_ECNT; nd->state = ncsi_dev_state_config_ec; } else if (nd->state == ncsi_dev_state_config_ec) { @@ -889,6 +1090,16 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) netdev_dbg(ndp->ndev.dev, "NCSI: channel %u config done\n", nc->id); spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_ACTIVE; + + if (ndp->flags & NCSI_DEV_RESET) { + /* A reset event happened during config, start it now */ + nc->reconfigure_needed = false; + spin_unlock_irqrestore(&nc->lock, flags); + ncsi_reset_dev(nd); + break; + } + if (nc->reconfigure_needed) { /* This channel's configuration has been updated * part-way during the config state - start the @@ -909,10 +1120,8 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { hot_nc = nc; - nc->state = NCSI_CHANNEL_ACTIVE; } else { hot_nc = NULL; - nc->state = NCSI_CHANNEL_INACTIVE; netdev_dbg(ndp->ndev.dev, "NCSI: channel %u link down after config\n", nc->id); @@ -940,43 +1149,35 @@ error: static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_package *np, *force_package; - struct ncsi_channel *nc, *found, *hot_nc, *force_channel; + struct ncsi_channel *nc, *found, *hot_nc; struct ncsi_channel_mode *ncm; - unsigned long flags; + unsigned long flags, cflags; + struct ncsi_package *np; + bool with_link; spin_lock_irqsave(&ndp->lock, flags); hot_nc = ndp->hot_channel; - force_channel = ndp->force_channel; - force_package = ndp->force_package; spin_unlock_irqrestore(&ndp->lock, flags); - /* Force a specific channel whether or not it has link if we have been - * configured to do so - */ - if (force_package && force_channel) { - found = force_channel; - ncm = &found->modes[NCSI_MODE_LINK]; - if (!(ncm->data[2] & 0x1)) - netdev_info(ndp->ndev.dev, - "NCSI: Channel %u forced, but it is link down\n", - found->id); - goto out; - } - - /* The search is done once an inactive channel with up - * link is found. + /* By default the search is done once an inactive channel with up + * link is found, unless a preferred channel is set. + * If multi_package or multi_channel are configured all channels in the + * whitelist are added to the channel queue. */ found = NULL; + with_link = false; NCSI_FOR_EACH_PACKAGE(ndp, np) { - if (ndp->force_package && np != ndp->force_package) + if (!(ndp->package_whitelist & (0x1 << np->id))) continue; NCSI_FOR_EACH_CHANNEL(np, nc) { - spin_lock_irqsave(&nc->lock, flags); + if (!(np->channel_whitelist & (0x1 << nc->id))) + continue; + + spin_lock_irqsave(&nc->lock, cflags); if (!list_empty(&nc->link) || nc->state != NCSI_CHANNEL_INACTIVE) { - spin_unlock_irqrestore(&nc->lock, flags); + spin_unlock_irqrestore(&nc->lock, cflags); continue; } @@ -988,32 +1189,49 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) ncm = &nc->modes[NCSI_MODE_LINK]; if (ncm->data[2] & 0x1) { - spin_unlock_irqrestore(&nc->lock, flags); found = nc; - goto out; + with_link = true; } - spin_unlock_irqrestore(&nc->lock, flags); + /* If multi_channel is enabled configure all valid + * channels whether or not they currently have link + * so they will have AENs enabled. + */ + if (with_link || np->multi_channel) { + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, + &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + netdev_dbg(ndp->ndev.dev, + "NCSI: Channel %u added to queue (link %s)\n", + nc->id, + ncm->data[2] & 0x1 ? "up" : "down"); + } + + spin_unlock_irqrestore(&nc->lock, cflags); + + if (with_link && !np->multi_channel) + break; } + if (with_link && !ndp->multi_package) + break; } - if (!found) { + if (list_empty(&ndp->channel_queue) && found) { + netdev_info(ndp->ndev.dev, + "NCSI: No channel with link found, configuring channel %u\n", + found->id); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&found->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + } else if (!found) { netdev_warn(ndp->ndev.dev, - "NCSI: No channel found with link\n"); + "NCSI: No channel found to configure!\n"); ncsi_report_link(ndp, true); return -ENODEV; } - ncm = &found->modes[NCSI_MODE_LINK]; - netdev_dbg(ndp->ndev.dev, - "NCSI: Channel %u added to queue (link %s)\n", - found->id, ncm->data[2] & 0x1 ? "up" : "down"); - -out: - spin_lock_irqsave(&ndp->lock, flags); - list_add_tail_rcu(&found->link, &ndp->channel_queue); - spin_unlock_irqrestore(&ndp->lock, flags); - return ncsi_process_next_channel(ndp); } @@ -1050,35 +1268,6 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) return false; } -static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) -{ - struct ncsi_package *np; - struct ncsi_channel *nc; - unsigned long flags; - - /* Move all available channels to processing queue */ - spin_lock_irqsave(&ndp->lock, flags); - NCSI_FOR_EACH_PACKAGE(ndp, np) { - NCSI_FOR_EACH_CHANNEL(np, nc) { - WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE || - !list_empty(&nc->link)); - ncsi_stop_channel_monitor(nc); - list_add_tail_rcu(&nc->link, &ndp->channel_queue); - } - } - spin_unlock_irqrestore(&ndp->lock, flags); - - /* We can have no channels in extremely case */ - if (list_empty(&ndp->channel_queue)) { - netdev_err(ndp->ndev.dev, - "NCSI: No available channels for HWA\n"); - ncsi_report_link(ndp, false); - return -ENOENT; - } - - return ncsi_process_next_channel(ndp); -} - static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -1110,70 +1299,28 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_package; break; case ncsi_dev_state_probe_package: - ndp->pending_req_num = 16; + ndp->pending_req_num = 1; - /* Select all possible packages */ nca.type = NCSI_PKT_CMD_SP; nca.bytes[0] = 1; + nca.package = ndp->package_probe_id; nca.channel = NCSI_RESERVED_CHANNEL; - for (index = 0; index < 8; index++) { - nca.package = index; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } - - /* Disable all possible packages */ - nca.type = NCSI_PKT_CMD_DP; - for (index = 0; index < 8; index++) { - nca.package = index; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } - + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; nd->state = ncsi_dev_state_probe_channel; break; case ncsi_dev_state_probe_channel: - if (!ndp->active_package) - ndp->active_package = list_first_or_null_rcu( - &ndp->packages, struct ncsi_package, node); - else if (list_is_last(&ndp->active_package->node, - &ndp->packages)) - ndp->active_package = NULL; - else - ndp->active_package = list_next_entry( - ndp->active_package, node); - - /* All available packages and channels are enumerated. The - * enumeration happens for once when the NCSI interface is - * started. So we need continue to start the interface after - * the enumeration. - * - * We have to choose an active channel before configuring it. - * Note that we possibly don't have active channel in extreme - * situation. - */ + ndp->active_package = ncsi_find_package(ndp, + ndp->package_probe_id); if (!ndp->active_package) { - ndp->flags |= NCSI_DEV_PROBED; - if (ncsi_check_hwa(ndp)) - ncsi_enable_hwa(ndp); - else - ncsi_choose_active_channel(ndp); - return; + /* No response */ + nd->state = ncsi_dev_state_probe_dp; + schedule_work(&ndp->work); + break; } - - /* Select the active package */ - ndp->pending_req_num = 1; - nca.type = NCSI_PKT_CMD_SP; - nca.bytes[0] = 1; - nca.package = ndp->active_package->id; - nca.channel = NCSI_RESERVED_CHANNEL; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - nd->state = ncsi_dev_state_probe_cis; + schedule_work(&ndp->work); break; case ncsi_dev_state_probe_cis: ndp->pending_req_num = NCSI_RESERVED_CHANNEL; @@ -1222,22 +1369,35 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) case ncsi_dev_state_probe_dp: ndp->pending_req_num = 1; - /* Deselect the active package */ + /* Deselect the current package */ nca.type = NCSI_PKT_CMD_DP; - nca.package = ndp->active_package->id; + nca.package = ndp->package_probe_id; nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; - /* Scan channels in next package */ - nd->state = ncsi_dev_state_probe_channel; + /* Probe next package */ + ndp->package_probe_id++; + if (ndp->package_probe_id >= 8) { + /* Probe finished */ + ndp->flags |= NCSI_DEV_PROBED; + break; + } + nd->state = ncsi_dev_state_probe_package; + ndp->active_package = NULL; break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n", nd->state); } + if (ndp->flags & NCSI_DEV_PROBED) { + /* Check if all packages have HWA support */ + ncsi_check_hwa(ndp); + ncsi_choose_active_channel(ndp); + } + return; error: netdev_err(ndp->ndev.dev, @@ -1556,6 +1716,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, INIT_LIST_HEAD(&ndp->channel_queue); INIT_LIST_HEAD(&ndp->vlan_vids); INIT_WORK(&ndp->work, ncsi_dev_work); + ndp->package_whitelist = UINT_MAX; /* Initialize private NCSI device */ spin_lock_init(&ndp->lock); @@ -1592,26 +1753,19 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev); int ncsi_start_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); - int ret; if (nd->state != ncsi_dev_state_registered && nd->state != ncsi_dev_state_functional) return -ENOTTY; if (!(ndp->flags & NCSI_DEV_PROBED)) { + ndp->package_probe_id = 0; nd->state = ncsi_dev_state_probe; schedule_work(&ndp->work); return 0; } - if (ndp->flags & NCSI_DEV_HWA) { - netdev_info(ndp->ndev.dev, "NCSI: Enabling HWA mode\n"); - ret = ncsi_enable_hwa(ndp); - } else { - ret = ncsi_choose_active_channel(ndp); - } - - return ret; + return ncsi_reset_dev(nd); } EXPORT_SYMBOL_GPL(ncsi_start_dev); @@ -1624,7 +1778,10 @@ void ncsi_stop_dev(struct ncsi_dev *nd) int old_state; unsigned long flags; - /* Stop the channel monitor and reset channel's state */ + /* Stop the channel monitor on any active channels. Don't reset the + * channel state so we know which were active when ncsi_start_dev() + * is next called. + */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { ncsi_stop_channel_monitor(nc); @@ -1632,7 +1789,6 @@ void ncsi_stop_dev(struct ncsi_dev *nd) spin_lock_irqsave(&nc->lock, flags); chained = !list_empty(&nc->link); old_state = nc->state; - nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); WARN_ON_ONCE(chained || @@ -1645,6 +1801,92 @@ void ncsi_stop_dev(struct ncsi_dev *nd) } EXPORT_SYMBOL_GPL(ncsi_stop_dev); +int ncsi_reset_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_channel *nc, *active, *tmp; + struct ncsi_package *np; + unsigned long flags; + + spin_lock_irqsave(&ndp->lock, flags); + + if (!(ndp->flags & NCSI_DEV_RESET)) { + /* Haven't been called yet, check states */ + switch (nd->state & ncsi_dev_state_major) { + case ncsi_dev_state_registered: + case ncsi_dev_state_probe: + /* Not even probed yet - do nothing */ + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + case ncsi_dev_state_suspend: + case ncsi_dev_state_config: + /* Wait for the channel to finish its suspend/config + * operation; once it finishes it will check for + * NCSI_DEV_RESET and reset the state. + */ + ndp->flags |= NCSI_DEV_RESET; + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + } + } else { + switch (nd->state) { + case ncsi_dev_state_suspend_done: + case ncsi_dev_state_config_done: + case ncsi_dev_state_functional: + /* Ok */ + break; + default: + /* Current reset operation happening */ + spin_unlock_irqrestore(&ndp->lock, flags); + return 0; + } + } + + if (!list_empty(&ndp->channel_queue)) { + /* Clear any channel queue we may have interrupted */ + list_for_each_entry_safe(nc, tmp, &ndp->channel_queue, link) + list_del_init(&nc->link); + } + spin_unlock_irqrestore(&ndp->lock, flags); + + active = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + + if (nc->state == NCSI_CHANNEL_ACTIVE) { + active = nc; + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); + ncsi_stop_channel_monitor(nc); + break; + } + + spin_unlock_irqrestore(&nc->lock, flags); + } + if (active) + break; + } + + if (!active) { + /* Done */ + spin_lock_irqsave(&ndp->lock, flags); + ndp->flags &= ~NCSI_DEV_RESET; + spin_unlock_irqrestore(&ndp->lock, flags); + return ncsi_choose_active_channel(ndp); + } + + spin_lock_irqsave(&ndp->lock, flags); + ndp->flags |= NCSI_DEV_RESET; + ndp->active_channel = active; + ndp->active_package = active->package; + spin_unlock_irqrestore(&ndp->lock, flags); + + nd->state = ncsi_dev_state_suspend; + schedule_work(&ndp->work); + return 0; +} + void ncsi_unregister_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index 33314381b4f5..5d782445d2fc 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -30,6 +30,9 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 }, + [NCSI_ATTR_MULTI_FLAG] = { .type = NLA_FLAG }, + [NCSI_ATTR_PACKAGE_MASK] = { .type = NLA_U32 }, + [NCSI_ATTR_CHANNEL_MASK] = { .type = NLA_U32 }, }; static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) @@ -69,7 +72,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb, nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]); if (nc->state == NCSI_CHANNEL_ACTIVE) nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE); - if (ndp->force_channel == nc) + if (nc == nc->package->preferred_channel) nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version); @@ -114,7 +117,7 @@ static int ncsi_write_package_info(struct sk_buff *skb, if (!pnest) return -ENOMEM; nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); - if (ndp->force_package == np) + if ((0x1 << np->id) == ndp->package_whitelist) nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST); if (!cnest) { @@ -290,49 +293,58 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); package = NULL; - spin_lock_irqsave(&ndp->lock, flags); - NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == package_id) package = np; if (!package) { /* The user has set a package that does not exist */ - spin_unlock_irqrestore(&ndp->lock, flags); return -ERANGE; } channel = NULL; - if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { - /* Allow any channel */ - channel_id = NCSI_RESERVED_CHANNEL; - } else { + if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); NCSI_FOR_EACH_CHANNEL(package, nc) - if (nc->id == channel_id) + if (nc->id == channel_id) { channel = nc; + break; + } + if (!channel) { + netdev_info(ndp->ndev.dev, + "NCSI: Channel %u does not exist!\n", + channel_id); + return -ERANGE; + } } - if (channel_id != NCSI_RESERVED_CHANNEL && !channel) { - /* The user has set a channel that does not exist on this - * package - */ - spin_unlock_irqrestore(&ndp->lock, flags); - netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", - channel_id); - return -ERANGE; - } - - ndp->force_package = package; - ndp->force_channel = channel; + spin_lock_irqsave(&ndp->lock, flags); + ndp->package_whitelist = 0x1 << package->id; + ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); - netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n", - package_id, channel_id, - channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : ""); + spin_lock_irqsave(&package->lock, flags); + package->multi_channel = false; + if (channel) { + package->channel_whitelist = 0x1 << channel->id; + package->preferred_channel = channel; + } else { + /* Allow any channel */ + package->channel_whitelist = UINT_MAX; + package->preferred_channel = NULL; + } + spin_unlock_irqrestore(&package->lock, flags); + + if (channel) + netdev_info(ndp->ndev.dev, + "Set package 0x%x, channel 0x%x as preferred\n", + package_id, channel_id); + else + netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n", + package_id); - /* Bounce the NCSI channel to set changes */ - ncsi_stop_dev(&ndp->ndev); - ncsi_start_dev(&ndp->ndev); + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); return 0; } @@ -340,6 +352,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; + struct ncsi_package *np; unsigned long flags; if (!info || !info->attrs) @@ -353,16 +366,24 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) if (!ndp) return -ENODEV; - /* Clear any override */ + /* Reset any whitelists and disable multi mode */ spin_lock_irqsave(&ndp->lock, flags); - ndp->force_package = NULL; - ndp->force_channel = NULL; + ndp->package_whitelist = UINT_MAX; + ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); + + NCSI_FOR_EACH_PACKAGE(ndp, np) { + spin_lock_irqsave(&np->lock, flags); + np->multi_channel = false; + np->channel_whitelist = UINT_MAX; + np->preferred_channel = NULL; + spin_unlock_irqrestore(&np->lock, flags); + } netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n"); - /* Bounce the NCSI channel to set changes */ - ncsi_stop_dev(&ndp->ndev); - ncsi_start_dev(&ndp->ndev); + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); return 0; } @@ -563,6 +584,138 @@ int ncsi_send_netlink_err(struct net_device *dev, return nlmsg_unicast(net->genl_sock, skb, snd_portid); } +static int ncsi_set_package_mask_nl(struct sk_buff *msg, + struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned long flags; + int rc; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_MASK]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + spin_lock_irqsave(&ndp->lock, flags); + if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { + if (ndp->flags & NCSI_DEV_HWA) { + ndp->multi_package = true; + rc = 0; + } else { + netdev_err(ndp->ndev.dev, + "NCSI: Can't use multiple packages without HWA\n"); + rc = -EPERM; + } + } else { + ndp->multi_package = false; + rc = 0; + } + + if (!rc) + ndp->package_whitelist = + nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]); + spin_unlock_irqrestore(&ndp->lock, flags); + + if (!rc) { + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); + } + + return rc; +} + +static int ncsi_set_channel_mask_nl(struct sk_buff *msg, + struct genl_info *info) +{ + struct ncsi_package *np, *package; + struct ncsi_channel *nc, *channel; + u32 package_id, channel_id; + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_CHANNEL_MASK]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + package = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) { + package = np; + break; + } + if (!package) + return -ERANGE; + + spin_lock_irqsave(&package->lock, flags); + + channel = NULL; + if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { + channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); + NCSI_FOR_EACH_CHANNEL(np, nc) + if (nc->id == channel_id) { + channel = nc; + break; + } + if (!channel) { + spin_unlock_irqrestore(&package->lock, flags); + return -ERANGE; + } + netdev_dbg(ndp->ndev.dev, + "NCSI: Channel %u set as preferred channel\n", + channel->id); + } + + package->channel_whitelist = + nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]); + if (package->channel_whitelist == 0) + netdev_dbg(ndp->ndev.dev, + "NCSI: Package %u set to all channels disabled\n", + package->id); + + package->preferred_channel = channel; + + if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { + package->multi_channel = true; + netdev_info(ndp->ndev.dev, + "NCSI: Multi-channel enabled on package %u\n", + package_id); + } else { + package->multi_channel = false; + } + + spin_unlock_irqrestore(&package->lock, flags); + + /* Update channel configuration */ + if (!(ndp->flags & NCSI_DEV_RESET)) + ncsi_reset_dev(&ndp->ndev); + + return 0; +} + static const struct genl_ops ncsi_ops[] = { { .cmd = NCSI_CMD_PKG_INFO, @@ -589,6 +742,18 @@ static const struct genl_ops ncsi_ops[] = { .doit = ncsi_send_cmd_nl, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NCSI_CMD_SET_PACKAGE_MASK, + .policy = ncsi_genl_policy, + .doit = ncsi_set_package_mask_nl, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NCSI_CMD_SET_CHANNEL_MASK, + .policy = ncsi_genl_policy, + .doit = ncsi_set_channel_mask_nl, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family ncsi_genl_family __ro_after_init = { diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 4d3f06be38bd..2a6d83a596c9 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -165,6 +165,15 @@ struct ncsi_rsp_oem_pkt { unsigned char data[]; /* Payload data */ }; +/* Mellanox Response Data */ +struct ncsi_rsp_oem_mlx_pkt { + unsigned char cmd_rev; /* Command Revision */ + unsigned char cmd; /* Command ID */ + unsigned char param; /* Parameter */ + unsigned char optional; /* Optional data */ + unsigned char data[]; /* Data */ +}; + /* Broadcom Response Data */ struct ncsi_rsp_oem_bcm_pkt { unsigned char ver; /* Payload Version */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 77e07ba3f493..dc07fcc7938e 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -256,7 +256,7 @@ static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr) if (!ncm->enable) return 0; - ncm->enable = 1; + ncm->enable = 0; return 0; } @@ -611,6 +611,45 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) return 0; } +/* Response handler for Mellanox command Get Mac Address */ +static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct net_device *ndev = ndp->ndev.dev; + const struct net_device_ops *ops = ndev->netdev_ops; + struct ncsi_rsp_oem_pkt *rsp; + struct sockaddr saddr; + int ret = 0; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + + saddr.sa_family = ndev->type; + ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN); + ret = ops->ndo_set_mac_address(ndev, &saddr); + if (ret < 0) + netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); + + return ret; +} + +/* Response handler for Mellanox card */ +static int ncsi_rsp_handler_oem_mlx(struct ncsi_request *nr) +{ + struct ncsi_rsp_oem_mlx_pkt *mlx; + struct ncsi_rsp_oem_pkt *rsp; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + mlx = (struct ncsi_rsp_oem_mlx_pkt *)(rsp->data); + + if (mlx->cmd == NCSI_OEM_MLX_CMD_GMA && + mlx->param == NCSI_OEM_MLX_CMD_GMA_PARAM) + return ncsi_rsp_handler_oem_mlx_gma(nr); + return 0; +} + /* Response handler for Broadcom command Get Mac Address */ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) { @@ -655,7 +694,7 @@ static struct ncsi_rsp_oem_handler { unsigned int mfr_id; int (*handler)(struct ncsi_request *nr); } ncsi_rsp_oem_handlers[] = { - { NCSI_OEM_MFR_MLX_ID, NULL }, + { NCSI_OEM_MFR_MLX_ID, ncsi_rsp_handler_oem_mlx }, { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm } }; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 2ab870ef233a..beb3a69ce1d4 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -403,21 +403,6 @@ config NF_NAT_NEEDED depends on NF_NAT default y -config NF_NAT_PROTO_DCCP - bool - depends on NF_NAT && NF_CT_PROTO_DCCP - default NF_NAT && NF_CT_PROTO_DCCP - -config NF_NAT_PROTO_UDPLITE - bool - depends on NF_NAT && NF_CT_PROTO_UDPLITE - default NF_NAT && NF_CT_PROTO_UDPLITE - -config NF_NAT_PROTO_SCTP - bool - default NF_NAT && NF_CT_PROTO_SCTP - depends on NF_NAT && NF_CT_PROTO_SCTP - config NF_NAT_AMANDA tristate depends on NF_CONNTRACK && NF_NAT diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 4ddf3ef51ece..1ae65a314d7a 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -47,12 +47,7 @@ obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o -nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ - nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o - -# NAT protocols (nf_nat) -nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o -nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o +nf_nat-y := nf_nat_core.o nf_nat_proto.o nf_nat_helper.o # generic transport layer logging obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index c00b6a2e8e3c..980000fc3b50 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -219,10 +219,6 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; - /* MAC can be src only */ - if (!(opt->flags & IPSET_DIM_TWO_SRC)) - return 0; - ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; @@ -233,7 +229,14 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, return -EINVAL; e.id = ip_to_id(map, ip); - memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + + if (opt->flags & IPSET_DIM_ONE_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); + + if (is_zero_ether_addr(e.ether)) + return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 1577f2f76060..45a257695bef 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -771,11 +771,21 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_put); * The commands are serialized by the nfnl mutex. */ +static inline u8 protocol(const struct nlattr * const tb[]) +{ + return nla_get_u8(tb[IPSET_ATTR_PROTOCOL]); +} + static inline bool protocol_failed(const struct nlattr * const tb[]) { - return !tb[IPSET_ATTR_PROTOCOL] || - nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL; + return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) != IPSET_PROTOCOL; +} + +static inline bool +protocol_min_failed(const struct nlattr * const tb[]) +{ + return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) < IPSET_PROTOCOL_MIN; } static inline u32 @@ -889,7 +899,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, u32 flags = flag_exist(nlh); int ret = 0; - if (unlikely(protocol_failed(attr) || + if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_TYPENAME] || !attr[IPSET_ATTR_REVISION] || @@ -1027,7 +1037,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, ip_set_id_t i; int ret = 0; - if (unlikely(protocol_failed(attr))) + if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; /* Must wait for flush to be really finished in list:set */ @@ -1105,7 +1115,7 @@ static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct ip_set *s; ip_set_id_t i; - if (unlikely(protocol_failed(attr))) + if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; if (!attr[IPSET_ATTR_SETNAME]) { @@ -1147,7 +1157,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, ip_set_id_t i; int ret = 0; - if (unlikely(protocol_failed(attr) || + if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; @@ -1196,7 +1206,7 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; - if (unlikely(protocol_failed(attr) || + if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; @@ -1291,6 +1301,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_setname_policy, NULL); + cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { struct ip_set *set; @@ -1392,7 +1403,8 @@ dump_last: ret = -EMSGSIZE; goto release_refcount; } - if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, + cb->args[IPSET_CB_PROTO]) || nla_put_string(skb, IPSET_ATTR_SETNAME, set->name)) goto nla_put_failure; if (dump_flags & IPSET_FLAG_LIST_SETNAME) @@ -1407,6 +1419,9 @@ dump_last: nla_put_u8(skb, IPSET_ATTR_REVISION, set->revision)) goto nla_put_failure; + if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN && + nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index))) + goto nla_put_failure; ret = set->variant->head(set, skb); if (ret < 0) goto release_refcount; @@ -1466,7 +1481,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlattr * const attr[], struct netlink_ext_ack *extack) { - if (unlikely(protocol_failed(attr))) + if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; { @@ -1560,7 +1575,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, bool use_lineno; int ret = 0; - if (unlikely(protocol_failed(attr) || + if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || @@ -1615,7 +1630,7 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, bool use_lineno; int ret = 0; - if (unlikely(protocol_failed(attr) || + if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || @@ -1667,7 +1682,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; - if (unlikely(protocol_failed(attr) || + if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_DATA] || !flag_nested(attr[IPSET_ATTR_DATA]))) @@ -1704,7 +1719,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl, struct nlmsghdr *nlh2; int ret = 0; - if (unlikely(protocol_failed(attr) || + if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; @@ -1720,7 +1735,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl, IPSET_CMD_HEADER); if (!nlh2) goto nlmsg_failure; - if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) || nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || @@ -1761,7 +1776,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, const char *typename; int ret = 0; - if (unlikely(protocol_failed(attr) || + if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_TYPENAME] || !attr[IPSET_ATTR_FAMILY])) return -IPSET_ERR_PROTOCOL; @@ -1780,7 +1795,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, IPSET_CMD_TYPE); if (!nlh2) goto nlmsg_failure; - if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) || + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) || nla_put_u8(skb2, IPSET_ATTR_REVISION, max) || @@ -1831,6 +1846,111 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL)) goto nla_put_failure; + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL_MIN, IPSET_PROTOCOL_MIN)) + goto nla_put_failure; + nlmsg_end(skb2, nlh2); + + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +/* Get set by name or index, from userspace */ + +static int ip_set_byname(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) +{ + struct ip_set_net *inst = ip_set_pernet(net); + struct sk_buff *skb2; + struct nlmsghdr *nlh2; + ip_set_id_t id = IPSET_INVALID_ID; + const struct ip_set *set; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + !attr[IPSET_ATTR_SETNAME])) + return -IPSET_ERR_PROTOCOL; + + set = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &id); + if (id == IPSET_INVALID_ID) + return -ENOENT; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + IPSET_CMD_GET_BYNAME); + if (!nlh2) + goto nlmsg_failure; + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || + nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || + nla_put_net16(skb2, IPSET_ATTR_INDEX, htons(id))) + goto nla_put_failure; + nlmsg_end(skb2, nlh2); + + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_INDEX] = { .type = NLA_U16 }, +}; + +static int ip_set_byindex(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) +{ + struct ip_set_net *inst = ip_set_pernet(net); + struct sk_buff *skb2; + struct nlmsghdr *nlh2; + ip_set_id_t id = IPSET_INVALID_ID; + const struct ip_set *set; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + !attr[IPSET_ATTR_INDEX])) + return -IPSET_ERR_PROTOCOL; + + id = ip_set_get_h16(attr[IPSET_ATTR_INDEX]); + if (id >= inst->ip_set_max) + return -ENOENT; + set = ip_set(inst, id); + if (set == NULL) + return -ENOENT; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + IPSET_CMD_GET_BYINDEX); + if (!nlh2) + goto nlmsg_failure; + if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || + nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name)) + goto nla_put_failure; nlmsg_end(skb2, nlh2); ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); @@ -1916,6 +2036,16 @@ static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_protocol_policy, }, + [IPSET_CMD_GET_BYNAME] = { + .call = ip_set_byname, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_GET_BYINDEX] = { + .call = ip_set_byindex, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_index_policy, + }, }; static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { @@ -1961,7 +2091,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) goto done; } - if (req_version->version != IPSET_PROTOCOL) { + if (req_version->version < IPSET_PROTOCOL_MIN) { ret = -EPROTO; goto done; } @@ -2024,9 +2154,11 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } nfnl_lock(NFNL_SUBSYS_IPSET); set = ip_set(inst, req_get->set.index); - strncpy(req_get->set.name, set ? set->name : "", - IPSET_MAXNAMELEN); + ret = strscpy(req_get->set.name, set ? set->name : "", + IPSET_MAXNAMELEN); nfnl_unlock(NFNL_SUBSYS_IPSET); + if (ret < 0) + goto done; goto copy; } default: diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index e287da68d5fa..2c9609929c71 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -67,7 +67,7 @@ tune_ahash_max(u8 curr, u32 multi) /* A hash bucket */ struct hbucket { - struct rcu_head rcu; /* for call_rcu_bh */ + struct rcu_head rcu; /* for call_rcu */ /* Which positions are used in the array */ DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ @@ -664,7 +664,7 @@ retry: spin_unlock_bh(&set->lock); /* Give time to other readers of the set */ - synchronize_rcu_bh(); + synchronize_rcu(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 1ab5ed2f6839..c830c68142ff 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -36,9 +36,6 @@ MODULE_ALIAS("ip_set_hash:ip,mac"); /* Type specific function prefix */ #define HTYPE hash_ipmac -/* Zero valued element is not supported */ -static const unsigned char invalid_ether[ETH_ALEN] = { 0 }; - /* IPv4 variant */ /* Member elements */ @@ -103,8 +100,12 @@ hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); - if (ether_addr_equal(e.ether, invalid_ether)) + if (opt->flags & IPSET_DIM_ONE_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); + + if (is_zero_ether_addr(e.ether)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); @@ -140,7 +141,7 @@ hash_ipmac4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); - if (ether_addr_equal(e.ether, invalid_ether)) + if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); @@ -211,16 +212,16 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - /* MAC can be src only */ - if (!(opt->flags & IPSET_DIM_TWO_SRC)) - return 0; - if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); - if (ether_addr_equal(e.ether, invalid_ether)) + if (opt->flags & IPSET_DIM_ONE_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); + + if (is_zero_ether_addr(e.ether)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); @@ -260,7 +261,7 @@ hash_ipmac6_uadt(struct ip_set *set, struct nlattr *tb[], return ret; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); - if (ether_addr_equal(e.ether, invalid_ether)) + if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index f9d5a2a1e3d0..4fe5f243d0a3 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -81,15 +81,15 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - /* MAC can be src only */ - if (!(opt->flags & IPSET_DIM_ONE_SRC)) - return 0; - if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + if (opt->flags & IPSET_DIM_ONE_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); + if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 4eef55da0878..8da228da53ae 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -531,8 +531,8 @@ nla_put_failure: ret = -EMSGSIZE; } else { cb->args[IPSET_CB_ARG0] = i; + ipset_nest_end(skb, atd); } - ipset_nest_end(skb, atd); out: rcu_read_unlock(); return ret; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 83395bf6dc35..432141f04af3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3980,6 +3980,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, +#ifdef CONFIG_IP_VS_IPV6 + .priority = ADDRCONF_NOTIFY_PRIORITY + 5, +#endif }; int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 02ca7df793f5..9cd180bda092 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -49,6 +49,7 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; + bool dead; struct rcu_head rcu_head; }; @@ -106,15 +107,16 @@ nf_conncount_add(struct nf_conncount_list *list, conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; - spin_lock(&list->list_lock); + conn->dead = false; + spin_lock_bh(&list->list_lock); if (list->dead == true) { kmem_cache_free(conncount_conn_cachep, conn); - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); return NF_CONNCOUNT_SKIP; } list_add_tail(&conn->node, &list->head); list->count++; - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); return NF_CONNCOUNT_ADDED; } EXPORT_SYMBOL_GPL(nf_conncount_add); @@ -132,19 +134,22 @@ static bool conn_free(struct nf_conncount_list *list, { bool free_entry = false; - spin_lock(&list->list_lock); + spin_lock_bh(&list->list_lock); - if (list->count == 0) { - spin_unlock(&list->list_lock); - return free_entry; + if (conn->dead) { + spin_unlock_bh(&list->list_lock); + return free_entry; } list->count--; + conn->dead = true; list_del_rcu(&conn->node); - if (list->count == 0) + if (list->count == 0) { + list->dead = true; free_entry = true; + } - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); call_rcu(&conn->rcu_head, __conn_free); return free_entry; } @@ -245,7 +250,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); - list->count = 1; + list->count = 0; list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); @@ -259,6 +264,7 @@ bool nf_conncount_gc_list(struct net *net, struct nf_conn *found_ct; unsigned int collected = 0; bool free_entry = false; + bool ret = false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn, &free_entry); @@ -288,7 +294,15 @@ bool nf_conncount_gc_list(struct net *net, if (collected > CONNCOUNT_GC_MAX_NODES) return false; } - return false; + + spin_lock_bh(&list->list_lock); + if (!list->count) { + list->dead = true; + ret = true; + } + spin_unlock_bh(&list->list_lock); + + return ret; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); @@ -309,11 +323,8 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); - if (rbconn->list.count == 0 && rbconn->list.dead == false) { - rbconn->list.dead = true; - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); - } + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); spin_unlock(&rbconn->list.list_lock); } } @@ -414,8 +425,9 @@ insert_tree(struct net *net, nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); count = 1; + rbconn->list.count = count; - rb_link_node(&rbconn->node, parent, rbnode); + rb_link_node_rcu(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); out_unlock: spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 1d66de5151b2..49e523cc49d0 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -25,102 +25,15 @@ static bool nf_ct_acct __read_mostly; module_param_named(acct, nf_ct_acct, bool, 0644); MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); -#ifdef CONFIG_SYSCTL -static struct ctl_table acct_sysctl_table[] = { - { - .procname = "nf_conntrack_acct", - .data = &init_net.ct.sysctl_acct, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - {} -}; -#endif /* CONFIG_SYSCTL */ - -unsigned int -seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) -{ - struct nf_conn_acct *acct; - struct nf_conn_counter *counter; - - acct = nf_conn_acct_find(ct); - if (!acct) - return 0; - - counter = acct->counter; - seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)atomic64_read(&counter[dir].packets), - (unsigned long long)atomic64_read(&counter[dir].bytes)); - - return 0; -}; -EXPORT_SYMBOL_GPL(seq_print_acct); - static const struct nf_ct_ext_type acct_extend = { .len = sizeof(struct nf_conn_acct), .align = __alignof__(struct nf_conn_acct), .id = NF_CT_EXT_ACCT, }; -#ifdef CONFIG_SYSCTL -static int nf_conntrack_acct_init_sysctl(struct net *net) -{ - struct ctl_table *table; - - table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table), - GFP_KERNEL); - if (!table) - goto out; - - table[0].data = &net->ct.sysctl_acct; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; - - net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter", - table); - if (!net->ct.acct_sysctl_header) { - pr_err("can't register to sysctl\n"); - goto out_register; - } - return 0; - -out_register: - kfree(table); -out: - return -ENOMEM; -} - -static void nf_conntrack_acct_fini_sysctl(struct net *net) -{ - struct ctl_table *table; - - table = net->ct.acct_sysctl_header->ctl_table_arg; - unregister_net_sysctl_table(net->ct.acct_sysctl_header); - kfree(table); -} -#else -static int nf_conntrack_acct_init_sysctl(struct net *net) -{ - return 0; -} - -static void nf_conntrack_acct_fini_sysctl(struct net *net) -{ -} -#endif - -int nf_conntrack_acct_pernet_init(struct net *net) +void nf_conntrack_acct_pernet_init(struct net *net) { net->ct.sysctl_acct = nf_ct_acct; - return nf_conntrack_acct_init_sysctl(net); -} - -void nf_conntrack_acct_pernet_fini(struct net *net) -{ - nf_conntrack_acct_fini_sysctl(net); } int nf_conntrack_acct_init(void) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e92e749aff53..e87c21e47efe 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2110,10 +2110,7 @@ i_see_dead_people: list_for_each_entry(net, net_exit_list, exit_list) { nf_conntrack_proto_pernet_fini(net); - nf_conntrack_helper_pernet_fini(net); nf_conntrack_ecache_pernet_fini(net); - nf_conntrack_tstamp_pernet_fini(net); - nf_conntrack_acct_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); free_percpu(net->ct.pcpu_lists); @@ -2410,32 +2407,19 @@ int nf_conntrack_init_net(struct net *net) ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; - ret = nf_conntrack_acct_pernet_init(net); - if (ret < 0) - goto err_acct; - ret = nf_conntrack_tstamp_pernet_init(net); - if (ret < 0) - goto err_tstamp; - ret = nf_conntrack_ecache_pernet_init(net); - if (ret < 0) - goto err_ecache; - ret = nf_conntrack_helper_pernet_init(net); - if (ret < 0) - goto err_helper; + + nf_conntrack_acct_pernet_init(net); + nf_conntrack_tstamp_pernet_init(net); + nf_conntrack_ecache_pernet_init(net); + nf_conntrack_helper_pernet_init(net); + ret = nf_conntrack_proto_pernet_init(net); if (ret < 0) goto err_proto; return 0; err_proto: - nf_conntrack_helper_pernet_fini(net); -err_helper: nf_conntrack_ecache_pernet_fini(net); -err_ecache: - nf_conntrack_tstamp_pernet_fini(net); -err_tstamp: - nf_conntrack_acct_pernet_fini(net); -err_acct: nf_conntrack_expect_pernet_fini(net); err_expect: free_percpu(net->ct.stat); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index c11822a7d2bf..3d042f8ff183 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -336,85 +336,21 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); #define NF_CT_EVENTS_DEFAULT 1 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; -#ifdef CONFIG_SYSCTL -static struct ctl_table event_sysctl_table[] = { - { - .procname = "nf_conntrack_events", - .data = &init_net.ct.sysctl_events, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - {} -}; -#endif /* CONFIG_SYSCTL */ - static const struct nf_ct_ext_type event_extend = { .len = sizeof(struct nf_conntrack_ecache), .align = __alignof__(struct nf_conntrack_ecache), .id = NF_CT_EXT_ECACHE, }; -#ifdef CONFIG_SYSCTL -static int nf_conntrack_event_init_sysctl(struct net *net) -{ - struct ctl_table *table; - - table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table), - GFP_KERNEL); - if (!table) - goto out; - - table[0].data = &net->ct.sysctl_events; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; - - net->ct.event_sysctl_header = - register_net_sysctl(net, "net/netfilter", table); - if (!net->ct.event_sysctl_header) { - pr_err("can't register to sysctl\n"); - goto out_register; - } - return 0; - -out_register: - kfree(table); -out: - return -ENOMEM; -} - -static void nf_conntrack_event_fini_sysctl(struct net *net) -{ - struct ctl_table *table; - - table = net->ct.event_sysctl_header->ctl_table_arg; - unregister_net_sysctl_table(net->ct.event_sysctl_header); - kfree(table); -} -#else -static int nf_conntrack_event_init_sysctl(struct net *net) -{ - return 0; -} - -static void nf_conntrack_event_fini_sysctl(struct net *net) -{ -} -#endif /* CONFIG_SYSCTL */ - -int nf_conntrack_ecache_pernet_init(struct net *net) +void nf_conntrack_ecache_pernet_init(struct net *net) { net->ct.sysctl_events = nf_ct_events; INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work); - return nf_conntrack_event_init_sysctl(net); } void nf_conntrack_ecache_pernet_fini(struct net *net) { cancel_delayed_work_sync(&net->ct.ecache_dwork); - nf_conntrack_event_fini_sysctl(net); } int nf_conntrack_ecache_init(void) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index e24b762ffa1d..274baf1dab87 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -42,67 +42,6 @@ module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); MODULE_PARM_DESC(nf_conntrack_helper, "Enable automatic conntrack helper assignment (default 0)"); -#ifdef CONFIG_SYSCTL -static struct ctl_table helper_sysctl_table[] = { - { - .procname = "nf_conntrack_helper", - .data = &init_net.ct.sysctl_auto_assign_helper, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - {} -}; - -static int nf_conntrack_helper_init_sysctl(struct net *net) -{ - struct ctl_table *table; - - table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table), - GFP_KERNEL); - if (!table) - goto out; - - table[0].data = &net->ct.sysctl_auto_assign_helper; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; - - net->ct.helper_sysctl_header = - register_net_sysctl(net, "net/netfilter", table); - - if (!net->ct.helper_sysctl_header) { - pr_err("nf_conntrack_helper: can't register to sysctl.\n"); - goto out_register; - } - return 0; - -out_register: - kfree(table); -out: - return -ENOMEM; -} - -static void nf_conntrack_helper_fini_sysctl(struct net *net) -{ - struct ctl_table *table; - - table = net->ct.helper_sysctl_header->ctl_table_arg; - unregister_net_sysctl_table(net->ct.helper_sysctl_header); - kfree(table); -} -#else -static int nf_conntrack_helper_init_sysctl(struct net *net) -{ - return 0; -} - -static void nf_conntrack_helper_fini_sysctl(struct net *net) -{ -} -#endif /* CONFIG_SYSCTL */ - /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) @@ -533,16 +472,10 @@ static const struct nf_ct_ext_type helper_extend = { .id = NF_CT_EXT_HELPER, }; -int nf_conntrack_helper_pernet_init(struct net *net) +void nf_conntrack_helper_pernet_init(struct net *net) { net->ct.auto_assign_helper_warned = false; net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; - return nf_conntrack_helper_init_sysctl(net); -} - -void nf_conntrack_helper_pernet_fini(struct net *net) -{ - nf_conntrack_helper_fini_sysctl(net); } int nf_conntrack_helper_init(void) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4ae8e528943a..1213beb5a714 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -47,7 +47,6 @@ #include <net/netfilter/nf_conntrack_synproxy.h> #ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l4proto.h> #include <net/netfilter/nf_nat_helper.h> #endif @@ -1688,6 +1687,22 @@ static int ctnetlink_change_timeout(struct nf_conn *ct, return 0; } +#if defined(CONFIG_NF_CONNTRACK_MARK) +static void ctnetlink_change_mark(struct nf_conn *ct, + const struct nlattr * const cda[]) +{ + u32 mark, newmark, mask = 0; + + if (cda[CTA_MARK_MASK]) + mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + + mark = ntohl(nla_get_be32(cda[CTA_MARK])); + newmark = (ct->mark & mask) ^ mark; + if (newmark != ct->mark) + ct->mark = newmark; +} +#endif + static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, @@ -1883,7 +1898,7 @@ ctnetlink_change_conntrack(struct nf_conn *ct, #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) - ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); + ctnetlink_change_mark(ct, cda); #endif if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { @@ -2027,7 +2042,7 @@ ctnetlink_create_conntrack(struct net *net, #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) - ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); + ctnetlink_change_mark(ct, cda); #endif /* setup master conntrack: this is a confirmed expectation */ @@ -2524,14 +2539,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) } #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) { - u32 mask = 0, mark, newmark; - if (cda[CTA_MARK_MASK]) - mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); - - mark = ntohl(nla_get_be32(cda[CTA_MARK])); - newmark = (ct->mark & mask) ^ mark; - if (newmark != ct->mark) - ct->mark = newmark; + ctnetlink_change_mark(ct, cda); } #endif return 0; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 40643af7137e..859f5d07a915 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -175,8 +175,7 @@ static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, static int nf_ct_l4proto_register_sysctl(struct net *net, - struct nf_proto_net *pn, - const struct nf_conntrack_l4proto *l4proto) + struct nf_proto_net *pn) { int err = 0; @@ -198,9 +197,7 @@ int nf_ct_l4proto_register_sysctl(struct net *net, } static -void nf_ct_l4proto_unregister_sysctl(struct net *net, - struct nf_proto_net *pn, - const struct nf_conntrack_l4proto *l4proto) +void nf_ct_l4proto_unregister_sysctl(struct nf_proto_net *pn) { #ifdef CONFIG_SYSCTL if (pn->ctl_table_header != NULL) @@ -252,7 +249,7 @@ int nf_ct_l4proto_pernet_register_one(struct net *net, if (pn == NULL) goto out; - ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto); + ret = nf_ct_l4proto_register_sysctl(net, pn); if (ret < 0) goto out; @@ -296,7 +293,7 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net, return; pn->users--; - nf_ct_l4proto_unregister_sysctl(net, pn, l4proto); + nf_ct_l4proto_unregister_sysctl(pn); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); @@ -946,16 +943,14 @@ int nf_conntrack_proto_pernet_init(struct net *net) if (err < 0) return err; err = nf_ct_l4proto_register_sysctl(net, - pn, - &nf_conntrack_l4proto_generic); + pn); if (err < 0) return err; err = nf_ct_l4proto_pernet_register(net, builtin_l4proto, ARRAY_SIZE(builtin_l4proto)); if (err < 0) { - nf_ct_l4proto_unregister_sysctl(net, pn, - &nf_conntrack_l4proto_generic); + nf_ct_l4proto_unregister_sysctl(pn); return err; } @@ -971,9 +966,7 @@ void nf_conntrack_proto_pernet_fini(struct net *net) nf_ct_l4proto_pernet_unregister(net, builtin_l4proto, ARRAY_SIZE(builtin_l4proto)); pn->users--; - nf_ct_l4proto_unregister_sysctl(net, - pn, - &nf_conntrack_l4proto_generic); + nf_ct_l4proto_unregister_sysctl(pn); } diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9b48dc8b4b88..8899b51aad44 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -43,24 +43,12 @@ #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> -enum grep_conntrack { - GRE_CT_UNREPLIED, - GRE_CT_REPLIED, - GRE_CT_MAX -}; - static const unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_UNREPLIED] = 30*HZ, [GRE_CT_REPLIED] = 180*HZ, }; static unsigned int proto_gre_net_id __read_mostly; -struct netns_proto_gre { - struct nf_proto_net nf; - rwlock_t keymap_lock; - struct list_head keymap_list; - unsigned int gre_timeouts[GRE_CT_MAX]; -}; static inline struct netns_proto_gre *gre_pernet(struct net *net) { @@ -332,9 +320,49 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ +#ifdef CONFIG_SYSCTL +static struct ctl_table gre_sysctl_table[] = { + { + .procname = "nf_conntrack_gre_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_gre_timeout_stream", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif + +static int gre_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *nf, + struct netns_proto_gre *net_gre) +{ +#ifdef CONFIG_SYSCTL + int i; + + if (nf->ctl_table) + return 0; + + nf->ctl_table = kmemdup(gre_sysctl_table, + sizeof(gre_sysctl_table), + GFP_KERNEL); + if (!nf->ctl_table) + return -ENOMEM; + + for (i = 0; i < GRE_CT_MAX; i++) + nf->ctl_table[i].data = &net_gre->gre_timeouts[i]; +#endif + return 0; +} + static int gre_init_net(struct net *net) { struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_proto_net *nf = &net_gre->nf; int i; rwlock_init(&net_gre->keymap_lock); @@ -342,7 +370,7 @@ static int gre_init_net(struct net *net) for (i = 0; i < GRE_CT_MAX; i++) net_gre->gre_timeouts[i] = gre_timeouts[i]; - return 0; + return gre_kmemdup_sysctl_table(net, nf, net_gre); } /* protocol helper struct */ @@ -402,6 +430,8 @@ static int __init nf_ct_proto_gre_init(void) { int ret; + BUILD_BUG_ON(offsetof(struct netns_proto_gre, nf) != 0); + ret = register_pernet_subsys(&proto_gre_net_ops); if (ret < 0) goto out_pernet; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index c879d8d78cfd..b4f5d5e82031 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -29,7 +29,7 @@ static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_UNREPLIED] = 30*HZ, - [UDP_CT_REPLIED] = 180*HZ, + [UDP_CT_REPLIED] = 120*HZ, }; static unsigned int *udp_get_timeouts(struct net *net) @@ -100,11 +100,21 @@ static int udp_packet(struct nf_conn *ct, if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); + if (!nf_ct_is_confirmed(ct)) + ct->proto.udp.stream_ts = 2 * HZ + jiffies; + /* If we've seen traffic both ways, this is some kind of UDP - stream. Extend timeout. */ + * stream. Set Assured. + */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_REPLIED]); + unsigned long extra = timeouts[UDP_CT_UNREPLIED]; + + /* Still active after two seconds? Extend timeout. */ + if (time_after(jiffies, ct->proto.udp.stream_ts)) + extra = timeouts[UDP_CT_REPLIED]; + + nf_ct_refresh_acct(ct, ctinfo, skb, extra); + /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index a975efd6b8c3..9da303461069 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -115,12 +115,12 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb, /* TCP SACK sequence number adjustment */ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, unsigned int protoff, - struct tcphdr *tcph, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - unsigned int dir, optoff, optend; + struct tcphdr *tcph = (void *)skb->data + protoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + unsigned int dir, optoff, optend; optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; @@ -128,6 +128,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, if (!skb_make_writable(skb, optend)) return 0; + tcph = (void *)skb->data + protoff; dir = CTINFO2DIR(ctinfo); while (optoff < optend) { @@ -207,7 +208,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb, ntohl(newack)); tcph->ack_seq = newack; - res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); + res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo); out: spin_unlock_bh(&ct->lock); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 463d17d349c1..b6177fd73304 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -267,6 +267,24 @@ static const char* l4proto_name(u16 proto) return "unknown"; } +static unsigned int +seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) +{ + struct nf_conn_acct *acct; + struct nf_conn_counter *counter; + + acct = nf_conn_acct_find(ct); + if (!acct) + return 0; + + counter = acct->counter; + seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)atomic64_read(&counter[dir].packets), + (unsigned long long)atomic64_read(&counter[dir].bytes)); + + return 0; +} + /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { @@ -514,36 +532,53 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write, static struct ctl_table_header *nf_ct_netfilter_header; +enum nf_ct_sysctl_index { + NF_SYSCTL_CT_MAX, + NF_SYSCTL_CT_COUNT, + NF_SYSCTL_CT_BUCKETS, + NF_SYSCTL_CT_CHECKSUM, + NF_SYSCTL_CT_LOG_INVALID, + NF_SYSCTL_CT_EXPECT_MAX, + NF_SYSCTL_CT_ACCT, + NF_SYSCTL_CT_HELPER, +#ifdef CONFIG_NF_CONNTRACK_EVENTS + NF_SYSCTL_CT_EVENTS, +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + NF_SYSCTL_CT_TIMESTAMP, +#endif +}; + static struct ctl_table nf_ct_sysctl_table[] = { - { + [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, - { + [NF_SYSCTL_CT_BUCKETS] = { .procname = "nf_conntrack_buckets", .data = &nf_conntrack_htable_size_user, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = nf_conntrack_hash_sysctl, }, - { + [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), @@ -552,13 +587,45 @@ static struct ctl_table nf_ct_sysctl_table[] = { .extra1 = &log_invalid_proto_min, .extra2 = &log_invalid_proto_max, }, - { + [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, + [NF_SYSCTL_CT_ACCT] = { + .procname = "nf_conntrack_acct", + .data = &init_net.ct.sysctl_acct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NF_SYSCTL_CT_HELPER] = { + .procname = "nf_conntrack_helper", + .data = &init_net.ct.sysctl_auto_assign_helper, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_NF_CONNTRACK_EVENTS + [NF_SYSCTL_CT_EVENTS] = { + .procname = "nf_conntrack_events", + .data = &init_net.ct.sysctl_events, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + [NF_SYSCTL_CT_TIMESTAMP] = { + .procname = "nf_conntrack_timestamp", + .data = &init_net.ct.sysctl_tstamp, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { } }; @@ -582,16 +649,28 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) if (!table) goto out_kmemdup; - table[1].data = &net->ct.count; - table[3].data = &net->ct.sysctl_checksum; - table[4].data = &net->ct.sysctl_log_invalid; + table[NF_SYSCTL_CT_COUNT].data = &net->ct.count; + table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; + table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; +#ifdef CONFIG_NF_CONNTRACK_EVENTS + table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; +#endif /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; + if (net->user_ns != &init_user_ns) { + table[NF_SYSCTL_CT_MAX].procname = NULL; + table[NF_SYSCTL_CT_ACCT].procname = NULL; + table[NF_SYSCTL_CT_HELPER].procname = NULL; +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + table[NF_SYSCTL_CT_TIMESTAMP].procname = NULL; +#endif +#ifdef CONFIG_NF_CONNTRACK_EVENTS + table[NF_SYSCTL_CT_EVENTS].procname = NULL; +#endif + } if (!net_eq(&init_net, net)) - table[2].mode = 0444; + table[NF_SYSCTL_CT_BUCKETS].mode = 0444; net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.sysctl_header) diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index 56766cb26e40..705b912bd91f 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -22,83 +22,15 @@ static bool nf_ct_tstamp __read_mostly; module_param_named(tstamp, nf_ct_tstamp, bool, 0644); MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); -#ifdef CONFIG_SYSCTL -static struct ctl_table tstamp_sysctl_table[] = { - { - .procname = "nf_conntrack_timestamp", - .data = &init_net.ct.sysctl_tstamp, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - {} -}; -#endif /* CONFIG_SYSCTL */ - static const struct nf_ct_ext_type tstamp_extend = { .len = sizeof(struct nf_conn_tstamp), .align = __alignof__(struct nf_conn_tstamp), .id = NF_CT_EXT_TSTAMP, }; -#ifdef CONFIG_SYSCTL -static int nf_conntrack_tstamp_init_sysctl(struct net *net) -{ - struct ctl_table *table; - - table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table), - GFP_KERNEL); - if (!table) - goto out; - - table[0].data = &net->ct.sysctl_tstamp; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; - - net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter", - table); - if (!net->ct.tstamp_sysctl_header) { - pr_err("can't register to sysctl\n"); - goto out_register; - } - return 0; - -out_register: - kfree(table); -out: - return -ENOMEM; -} - -static void nf_conntrack_tstamp_fini_sysctl(struct net *net) -{ - struct ctl_table *table; - - table = net->ct.tstamp_sysctl_header->ctl_table_arg; - unregister_net_sysctl_table(net->ct.tstamp_sysctl_header); - kfree(table); -} -#else -static int nf_conntrack_tstamp_init_sysctl(struct net *net) -{ - return 0; -} - -static void nf_conntrack_tstamp_fini_sysctl(struct net *net) -{ -} -#endif - -int nf_conntrack_tstamp_pernet_init(struct net *net) +void nf_conntrack_tstamp_pernet_init(struct net *net) { net->ct.sysctl_tstamp = nf_ct_tstamp; - return nf_conntrack_tstamp_init_sysctl(net); -} - -void nf_conntrack_tstamp_pernet_fini(struct net *net) -{ - nf_conntrack_tstamp_fini_sysctl(net); } int nf_conntrack_tstamp_init(void) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index b7a4816add76..fa0844e2a68d 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -247,9 +247,10 @@ flow_offload_lookup(struct nf_flowtable *flow_table, } EXPORT_SYMBOL_GPL(flow_offload_lookup); -int nf_flow_table_iterate(struct nf_flowtable *flow_table, - void (*iter)(struct flow_offload *flow, void *data), - void *data) +static int +nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct flow_offload *flow, void *data), + void *data) { struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; @@ -279,40 +280,19 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } -EXPORT_SYMBOL_GPL(nf_flow_table_iterate); static inline bool nf_flow_has_expired(const struct flow_offload *flow) { return (__s32)(flow->timeout - (u32)jiffies) <= 0; } -static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table) +static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { - struct flow_offload_tuple_rhash *tuplehash; - struct rhashtable_iter hti; - struct flow_offload *flow; - - rhashtable_walk_enter(&flow_table->rhashtable, &hti); - rhashtable_walk_start(&hti); + struct nf_flowtable *flow_table = data; - while ((tuplehash = rhashtable_walk_next(&hti))) { - if (IS_ERR(tuplehash)) { - if (PTR_ERR(tuplehash) != -EAGAIN) - break; - continue; - } - if (tuplehash->tuple.dir) - continue; - - flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); - - if (nf_flow_has_expired(flow) || - (flow->flags & (FLOW_OFFLOAD_DYING | - FLOW_OFFLOAD_TEARDOWN))) - flow_offload_del(flow_table, flow); - } - rhashtable_walk_stop(&hti); - rhashtable_walk_exit(&hti); + if (nf_flow_has_expired(flow) || + (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) + flow_offload_del(flow_table, flow); } static void nf_flow_offload_work_gc(struct work_struct *work) @@ -320,7 +300,7 @@ static void nf_flow_offload_work_gc(struct work_struct *work) struct nf_flowtable *flow_table; flow_table = container_of(work, struct nf_flowtable, gc_work.work); - nf_flow_offload_gc_step(flow_table); + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } @@ -504,7 +484,7 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); - nf_flow_offload_gc_step(flow_table); + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index a8c5c846aec1..3a0d6880b7c9 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -156,22 +156,20 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { + const struct net_device *physoutdev __maybe_unused; + const struct net_device *physindev __maybe_unused; + nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge) { - const struct net_device *physindev; - const struct net_device *physoutdev; - - physindev = nf_bridge_get_physindev(skb); - if (physindev && in != physindev) - nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); - physoutdev = nf_bridge_get_physoutdev(skb); - if (physoutdev && out != physoutdev) - nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); - } + physindev = nf_bridge_get_physindev(skb); + if (physindev && in != physindev) + nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = nf_bridge_get_physoutdev(skb); + if (physoutdev && out != physoutdev) + nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); #endif } EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index e2b196054dfc..d159e9e7835b 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -23,7 +23,6 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_l3proto.h> -#include <net/netfilter/nf_nat_l4proto.h> #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/nf_conntrack_helper.h> @@ -38,8 +37,6 @@ static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; -static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] - __read_mostly; static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; @@ -67,13 +64,6 @@ __nf_nat_l3proto_find(u8 family) return rcu_dereference(nf_nat_l3protos[family]); } -inline const struct nf_nat_l4proto * -__nf_nat_l4proto_find(u8 family, u8 protonum) -{ - return rcu_dereference(nf_nat_l4protos[family][protonum]); -} -EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find); - #ifdef CONFIG_XFRM static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { @@ -117,7 +107,8 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; - dst_hold(dst); + if (!dst_hold_safe(dst)) + return -EHOSTUNREACH; if (sk && !net_eq(net, sock_net(sk))) sk = NULL; @@ -172,27 +163,66 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, } EXPORT_SYMBOL(nf_nat_used_tuple); +static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, + const struct nf_nat_range2 *range) +{ + if (t->src.l3num == NFPROTO_IPV4) + return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && + ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); + + return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && + ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; +} + +/* Is the manipable part of the tuple between min and max incl? */ +static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + __be16 port; + + switch (tuple->dst.protonum) { + case IPPROTO_ICMP: /* fallthrough */ + case IPPROTO_ICMPV6: + return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && + ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); + case IPPROTO_GRE: /* all fall though */ + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_DCCP: + case IPPROTO_SCTP: + if (maniptype == NF_NAT_MANIP_SRC) + port = tuple->src.u.all; + else + port = tuple->dst.u.all; + + return ntohs(port) >= ntohs(min->all) && + ntohs(port) <= ntohs(max->all); + default: + return true; + } +} + /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. */ -static int in_range(const struct nf_nat_l3proto *l3proto, - const struct nf_nat_l4proto *l4proto, - const struct nf_conntrack_tuple *tuple, +static int in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. */ if (range->flags & NF_NAT_RANGE_MAP_IPS && - !l3proto->in_range(tuple, range)) + !nf_nat_inet_in_range(tuple, range)) return 0; - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) || - l4proto->in_range(tuple, NF_NAT_MANIP_SRC, - &range->min_proto, &range->max_proto)) + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) return 1; - return 0; + return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, + &range->min_proto, &range->max_proto); } static inline int @@ -211,8 +241,6 @@ same_src(const struct nf_conn *ct, static int find_appropriate_src(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_nat_l3proto *l3proto, - const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) @@ -229,7 +257,7 @@ find_appropriate_src(struct net *net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; - if (in_range(l3proto, l4proto, result, range)) + if (in_range(result, range)) return 1; } } @@ -310,6 +338,123 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone, } } +/* Alter the per-proto part of the tuple (depending on maniptype), to + * give a unique tuple in the given range if possible. + * + * Per-protocol part of tuple is initialized to the incoming packet. + */ +static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range2 *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + unsigned int range_size, min, max, i, attempts; + __be16 *keyptr; + u16 off; + static const unsigned int max_attempts = 128; + + switch (tuple->dst.protonum) { + case IPPROTO_ICMP: /* fallthrough */ + case IPPROTO_ICMPV6: + /* id is same for either direction... */ + keyptr = &tuple->src.u.icmp.id; + min = range->min_proto.icmp.id; + range_size = ntohs(range->max_proto.icmp.id) - + ntohs(range->min_proto.icmp.id) + 1; + goto find_free_id; +#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) + case IPPROTO_GRE: + /* If there is no master conntrack we are not PPTP, + do not change tuples */ + if (!ct->master) + return; + + if (maniptype == NF_NAT_MANIP_SRC) + keyptr = &tuple->src.u.gre.key; + else + keyptr = &tuple->dst.u.gre.key; + + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + min = 1; + range_size = 65535; + } else { + min = ntohs(range->min_proto.gre.key); + range_size = ntohs(range->max_proto.gre.key) - min + 1; + } + goto find_free_id; +#endif + case IPPROTO_UDP: /* fallthrough */ + case IPPROTO_UDPLITE: /* fallthrough */ + case IPPROTO_TCP: /* fallthrough */ + case IPPROTO_SCTP: /* fallthrough */ + case IPPROTO_DCCP: /* fallthrough */ + if (maniptype == NF_NAT_MANIP_SRC) + keyptr = &tuple->src.u.all; + else + keyptr = &tuple->dst.u.all; + + break; + default: + return; + } + + /* If no range specified... */ + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == NF_NAT_MANIP_DST) + return; + + if (ntohs(*keyptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*keyptr) < 512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min_proto.all); + max = ntohs(range->max_proto.all); + if (unlikely(max < min)) + swap(max, min); + range_size = max - min + 1; + } + +find_free_id: + if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) + off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); + else + off = prandom_u32(); + + attempts = range_size; + if (attempts > max_attempts) + attempts = max_attempts; + + /* We are in softirq; doing a search of the entire range risks + * soft lockup when all tuples are already used. + * + * If we can't find any free port from first offset, pick a new + * one and try again, with ever smaller search window. + */ +another_round: + for (i = 0; i < attempts; i++, off++) { + *keyptr = htons(min + off % range_size); + if (!nf_nat_used_tuple(tuple, ct)) + return; + } + + if (attempts >= range_size || attempts < 16) + return; + attempts /= 2; + off = prandom_u32(); + goto another_round; +} + /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the @@ -324,17 +469,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; - const struct nf_nat_l3proto *l3proto; - const struct nf_nat_l4proto *l4proto; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); - rcu_read_lock(); - l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); - l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num, - orig_tuple->dst.protonum); - /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. @@ -346,16 +484,16 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ - if (in_range(l3proto, l4proto, orig_tuple, range)) { + if (in_range(orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { *tuple = *orig_tuple; - goto out; + return; } - } else if (find_appropriate_src(net, zone, l3proto, l4proto, + } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) - goto out; + return; } } @@ -371,21 +509,19 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && - l4proto->in_range(tuple, maniptype, + l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) - goto out; + return; } else if (!nf_nat_used_tuple(tuple, ct)) { - goto out; + return; } } /* Last chance: get protocol to try to obtain unique tuple. */ - l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); -out: - rcu_read_unlock(); + nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) @@ -501,16 +637,13 @@ static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_dir dir) { const struct nf_nat_l3proto *l3proto; - const struct nf_nat_l4proto *l4proto; struct nf_conntrack_tuple target; /* We are aiming to look like inverse of other direction. */ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); l3proto = __nf_nat_l3proto_find(target.src.l3num); - l4proto = __nf_nat_l4proto_find(target.src.l3num, - target.dst.protonum); - if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)) + if (!l3proto->manip_pkt(skb, 0, &target, mtype)) return NF_DROP; return NF_ACCEPT; @@ -666,16 +799,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) return 0; } -static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) -{ - struct nf_nat_proto_clean clean = { - .l3proto = l3proto, - .l4proto = l4proto, - }; - - nf_ct_iterate_destroy(nf_nat_proto_remove, &clean); -} - static void nf_nat_l3proto_clean(u8 l3proto) { struct nf_nat_proto_clean clean = { @@ -685,82 +808,8 @@ static void nf_nat_l3proto_clean(u8 l3proto) nf_ct_iterate_destroy(nf_nat_proto_remove, &clean); } -/* Protocol registration. */ -int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto) -{ - const struct nf_nat_l4proto **l4protos; - unsigned int i; - int ret = 0; - - mutex_lock(&nf_nat_proto_mutex); - if (nf_nat_l4protos[l3proto] == NULL) { - l4protos = kmalloc_array(IPPROTO_MAX, - sizeof(struct nf_nat_l4proto *), - GFP_KERNEL); - if (l4protos == NULL) { - ret = -ENOMEM; - goto out; - } - - for (i = 0; i < IPPROTO_MAX; i++) - RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown); - - /* Before making proto_array visible to lockless readers, - * we must make sure its content is committed to memory. - */ - smp_wmb(); - - nf_nat_l4protos[l3proto] = l4protos; - } - - if (rcu_dereference_protected( - nf_nat_l4protos[l3proto][l4proto->l4proto], - lockdep_is_held(&nf_nat_proto_mutex) - ) != &nf_nat_l4proto_unknown) { - ret = -EBUSY; - goto out; - } - RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto); - out: - mutex_unlock(&nf_nat_proto_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(nf_nat_l4proto_register); - -/* No one stores the protocol anywhere; simply delete it. */ -void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto) -{ - mutex_lock(&nf_nat_proto_mutex); - RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], - &nf_nat_l4proto_unknown); - mutex_unlock(&nf_nat_proto_mutex); - synchronize_rcu(); - - nf_nat_l4proto_clean(l3proto, l4proto->l4proto); -} -EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister); - int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto) { - mutex_lock(&nf_nat_proto_mutex); - RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP], - &nf_nat_l4proto_tcp); - RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP], - &nf_nat_l4proto_udp); -#ifdef CONFIG_NF_NAT_PROTO_DCCP - RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP], - &nf_nat_l4proto_dccp); -#endif -#ifdef CONFIG_NF_NAT_PROTO_SCTP - RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP], - &nf_nat_l4proto_sctp); -#endif -#ifdef CONFIG_NF_NAT_PROTO_UDPLITE - RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE], - &nf_nat_l4proto_udplite); -#endif - mutex_unlock(&nf_nat_proto_mutex); - RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto); return 0; } @@ -801,12 +850,26 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; +static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], + struct nf_nat_range2 *range) +{ + if (tb[CTA_PROTONAT_PORT_MIN]) { + range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); + range->max_proto.all = range->min_proto.all; + range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + if (tb[CTA_PROTONAT_PORT_MAX]) { + range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); + range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + return 0; +} + static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; - const struct nf_nat_l4proto *l4proto; int err; err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, @@ -814,11 +877,7 @@ static int nfnetlink_parse_nat_proto(struct nlattr *attr, if (err < 0) return err; - l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - if (l4proto->nlattr_to_range) - err = l4proto->nlattr_to_range(tb, range); - - return err; + return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { @@ -1081,7 +1140,6 @@ static int __init nf_nat_init(void) static void __exit nf_nat_cleanup(void) { struct nf_nat_proto_clean clean = {}; - unsigned int i; nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); @@ -1089,10 +1147,6 @@ static void __exit nf_nat_cleanup(void) nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); - synchronize_rcu(); - - for (i = 0; i < NFPROTO_NUMPROTO; i++) - kfree(nf_nat_l4protos[i]); synchronize_net(); kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c new file mode 100644 index 000000000000..f83bf9d8c9f5 --- /dev/null +++ b/net/netfilter/nf_nat_proto.c @@ -0,0 +1,343 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> + +#include <linux/dccp.h> +#include <linux/sctp.h> +#include <net/sctp/checksum.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static void +__udp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, struct udphdr *hdr, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, bool do_csum) +{ + __be16 *portptr, newport; + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst port */ + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + if (do_csum) { + l3proto->csum_update(skb, iphdroff, &hdr->check, + tuple, maniptype); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, + false); + if (!hdr->check) + hdr->check = CSUM_MANGLED_0; + } + *portptr = newport; +} + +static bool udp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct udphdr *hdr; + bool do_csum; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct udphdr *)(skb->data + hdroff); + do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL; + + __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum); + return true; +} + +static bool udplite_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + struct udphdr *hdr; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct udphdr *)(skb->data + hdroff); + __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true); +#endif + return true; +} + +static bool +sctp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ +#ifdef CONFIG_NF_CT_PROTO_SCTP + struct sctphdr *hdr; + int hdrsize = 8; + + /* This could be an inner header returned in imcp packet; in such + * cases we cannot update the checksum field since it is outside + * of the 8 bytes of transport layer headers we are guaranteed. + */ + if (skb->len >= hdroff + sizeof(*hdr)) + hdrsize = sizeof(*hdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + hdr = (struct sctphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + hdr->source = tuple->src.u.sctp.port; + } else { + /* Get rid of dst port */ + hdr->dest = tuple->dst.u.sctp.port; + } + + if (hdrsize < sizeof(*hdr)) + return true; + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + hdr->checksum = sctp_compute_cksum(skb, hdroff); + skb->ip_summed = CHECKSUM_NONE; + } + +#endif + return true; +} + +static bool +tcp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct tcphdr *hdr; + __be16 *portptr, newport, oldport; + int hdrsize = 8; /* TCP connection tracking guarantees this much */ + + /* this could be a inner header returned in icmp packet; in such + cases we cannot update the checksum field since it is outside of + the 8 bytes of transport layer headers we are guaranteed */ + if (skb->len >= hdroff + sizeof(struct tcphdr)) + hdrsize = sizeof(struct tcphdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + hdr = (struct tcphdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + /* Get rid of src port */ + newport = tuple->src.u.tcp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst port */ + newport = tuple->dst.u.tcp.port; + portptr = &hdr->dest; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); + return true; +} + +static bool +dccp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ +#ifdef CONFIG_NF_CT_PROTO_DCCP + struct dccp_hdr *hdr; + __be16 *portptr, oldport, newport; + int hdrsize = 8; /* DCCP connection tracking guarantees this much */ + + if (skb->len >= hdroff + sizeof(struct dccp_hdr)) + hdrsize = sizeof(struct dccp_hdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + hdr = (struct dccp_hdr *)(skb->data + hdroff); + + if (maniptype == NF_NAT_MANIP_SRC) { + newport = tuple->src.u.dccp.port; + portptr = &hdr->dccph_sport; + } else { + newport = tuple->dst.u.dccp.port; + portptr = &hdr->dccph_dport; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum, + tuple, maniptype); + inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, + false); +#endif + return true; +} + +static bool +icmp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct icmphdr *hdr; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct icmphdr *)(skb->data + hdroff); + inet_proto_csum_replace2(&hdr->checksum, skb, + hdr->un.echo.id, tuple->src.u.icmp.id, false); + hdr->un.echo.id = tuple->src.u.icmp.id; + return true; +} + +static bool +icmpv6_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct icmp6hdr *hdr; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct icmp6hdr *)(skb->data + hdroff); + l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum, + tuple, maniptype); + if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || + hdr->icmp6_type == ICMPV6_ECHO_REPLY) { + inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, + hdr->icmp6_identifier, + tuple->src.u.icmp.id, false); + hdr->icmp6_identifier = tuple->src.u.icmp.id; + } + return true; +} + +/* manipulate a GRE packet according to maniptype */ +static bool +gre_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ +#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) + const struct gre_base_hdr *greh; + struct pptp_gre_header *pgreh; + + /* pgreh includes two optional 32bit fields which are not required + * to be there. That's where the magic '8' comes from */ + if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) + return false; + + greh = (void *)skb->data + hdroff; + pgreh = (struct pptp_gre_header *)greh; + + /* we only have destination manip of a packet, since 'source key' + * is not present in the packet itself */ + if (maniptype != NF_NAT_MANIP_DST) + return true; + + switch (greh->flags & GRE_VERSION) { + case GRE_VERSION_0: + /* We do not currently NAT any GREv0 packets. + * Try to behave like "nf_nat_proto_unknown" */ + break; + case GRE_VERSION_1: + pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); + pgreh->call_id = tuple->dst.u.gre.key; + break; + default: + pr_debug("can't nat unknown GRE version\n"); + return false; + } +#endif + return true; +} + +bool nf_nat_l4proto_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + switch (tuple->dst.protonum) { + case IPPROTO_TCP: + return tcp_manip_pkt(skb, l3proto, iphdroff, hdroff, + tuple, maniptype); + case IPPROTO_UDP: + return udp_manip_pkt(skb, l3proto, iphdroff, hdroff, + tuple, maniptype); + case IPPROTO_UDPLITE: + return udplite_manip_pkt(skb, l3proto, iphdroff, hdroff, + tuple, maniptype); + case IPPROTO_SCTP: + return sctp_manip_pkt(skb, l3proto, iphdroff, hdroff, + tuple, maniptype); + case IPPROTO_ICMP: + return icmp_manip_pkt(skb, l3proto, iphdroff, hdroff, + tuple, maniptype); + case IPPROTO_ICMPV6: + return icmpv6_manip_pkt(skb, l3proto, iphdroff, hdroff, + tuple, maniptype); + case IPPROTO_DCCP: + return dccp_manip_pkt(skb, l3proto, iphdroff, hdroff, + tuple, maniptype); + case IPPROTO_GRE: + return gre_manip_pkt(skb, l3proto, iphdroff, hdroff, + tuple, maniptype); + } + + /* If we don't know protocol -- no error, pass it unmodified. */ + return true; +} +EXPORT_SYMBOL_GPL(nf_nat_l4proto_manip_pkt); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c deleted file mode 100644 index 5d849d835561..000000000000 --- a/net/netfilter/nf_nat_proto_common.c +++ /dev/null @@ -1,120 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/random.h> -#include <linux/netfilter.h> -#include <linux/export.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/netfilter/nf_nat_l4proto.h> - -bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - __be16 port; - - if (maniptype == NF_NAT_MANIP_SRC) - port = tuple->src.u.all; - else - port = tuple->dst.u.all; - - return ntohs(port) >= ntohs(min->all) && - ntohs(port) <= ntohs(max->all); -} -EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range); - -void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct, - u16 *rover) -{ - unsigned int range_size, min, max, i; - __be16 *portptr; - u_int16_t off; - - if (maniptype == NF_NAT_MANIP_SRC) - portptr = &tuple->src.u.all; - else - portptr = &tuple->dst.u.all; - - /* If no range specified... */ - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { - /* If it's dst rewrite, can't change port */ - if (maniptype == NF_NAT_MANIP_DST) - return; - - if (ntohs(*portptr) < 1024) { - /* Loose convention: >> 512 is credential passing */ - if (ntohs(*portptr) < 512) { - min = 1; - range_size = 511 - min + 1; - } else { - min = 600; - range_size = 1023 - min + 1; - } - } else { - min = 1024; - range_size = 65535 - 1024 + 1; - } - } else { - min = ntohs(range->min_proto.all); - max = ntohs(range->max_proto.all); - if (unlikely(max < min)) - swap(max, min); - range_size = max - min + 1; - } - - if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) { - off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC - ? tuple->dst.u.all - : tuple->src.u.all); - } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { - off = prandom_u32(); - } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) { - off = (ntohs(*portptr) - ntohs(range->base_proto.all)); - } else { - off = *rover; - } - - for (i = 0; ; ++off) { - *portptr = htons(min + off % range_size); - if (++i != range_size && nf_nat_used_tuple(tuple, ct)) - continue; - if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL| - NF_NAT_RANGE_PROTO_OFFSET))) - *rover = off; - return; - } -} -EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) -int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range2 *range) -{ - if (tb[CTA_PROTONAT_PORT_MIN]) { - range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); - range->max_proto.all = range->min_proto.all; - range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } - if (tb[CTA_PROTONAT_PORT_MAX]) { - range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); - range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } - return 0; -} -EXPORT_SYMBOL_GPL(nf_nat_l4proto_nlattr_to_range); -#endif diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c deleted file mode 100644 index 67ea0d83aa5a..000000000000 --- a/net/netfilter/nf_nat_proto_dccp.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * DCCP NAT protocol helper - * - * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/dccp.h> - -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/netfilter/nf_nat_l4proto.h> - -static u_int16_t dccp_port_rover; - -static void -dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, - &dccp_port_rover); -} - -static bool -dccp_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct dccp_hdr *hdr; - __be16 *portptr, oldport, newport; - int hdrsize = 8; /* DCCP connection tracking guarantees this much */ - - if (skb->len >= hdroff + sizeof(struct dccp_hdr)) - hdrsize = sizeof(struct dccp_hdr); - - if (!skb_make_writable(skb, hdroff + hdrsize)) - return false; - - hdr = (struct dccp_hdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - newport = tuple->src.u.dccp.port; - portptr = &hdr->dccph_sport; - } else { - newport = tuple->dst.u.dccp.port; - portptr = &hdr->dccph_dport; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return true; - - l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum, - tuple, maniptype); - inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - false); - return true; -} - -const struct nf_nat_l4proto nf_nat_l4proto_dccp = { - .l4proto = IPPROTO_DCCP, - .manip_pkt = dccp_manip_pkt, - .in_range = nf_nat_l4proto_in_range, - .unique_tuple = dccp_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c deleted file mode 100644 index 1c5d9b65fbba..000000000000 --- a/net/netfilter/nf_nat_proto_sctp.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/sctp.h> -#include <net/sctp/checksum.h> - -#include <net/netfilter/nf_nat_l4proto.h> - -static u_int16_t nf_sctp_port_rover; - -static void -sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, - &nf_sctp_port_rover); -} - -static bool -sctp_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct sctphdr *hdr; - int hdrsize = 8; - - /* This could be an inner header returned in imcp packet; in such - * cases we cannot update the checksum field since it is outside - * of the 8 bytes of transport layer headers we are guaranteed. - */ - if (skb->len >= hdroff + sizeof(*hdr)) - hdrsize = sizeof(*hdr); - - if (!skb_make_writable(skb, hdroff + hdrsize)) - return false; - - hdr = (struct sctphdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - /* Get rid of src port */ - hdr->source = tuple->src.u.sctp.port; - } else { - /* Get rid of dst port */ - hdr->dest = tuple->dst.u.sctp.port; - } - - if (hdrsize < sizeof(*hdr)) - return true; - - if (skb->ip_summed != CHECKSUM_PARTIAL) { - hdr->checksum = sctp_compute_cksum(skb, hdroff); - skb->ip_summed = CHECKSUM_NONE; - } - - return true; -} - -const struct nf_nat_l4proto nf_nat_l4proto_sctp = { - .l4proto = IPPROTO_SCTP, - .manip_pkt = sctp_manip_pkt, - .in_range = nf_nat_l4proto_in_range, - .unique_tuple = sctp_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c deleted file mode 100644 index f15fcd475f98..000000000000 --- a/net/netfilter/nf_nat_proto_tcp.c +++ /dev/null @@ -1,85 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/tcp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter/nfnetlink_conntrack.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/netfilter/nf_nat_l4proto.h> -#include <net/netfilter/nf_nat_core.h> - -static u16 tcp_port_rover; - -static void -tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, - &tcp_port_rover); -} - -static bool -tcp_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct tcphdr *hdr; - __be16 *portptr, newport, oldport; - int hdrsize = 8; /* TCP connection tracking guarantees this much */ - - /* this could be a inner header returned in icmp packet; in such - cases we cannot update the checksum field since it is outside of - the 8 bytes of transport layer headers we are guaranteed */ - if (skb->len >= hdroff + sizeof(struct tcphdr)) - hdrsize = sizeof(struct tcphdr); - - if (!skb_make_writable(skb, hdroff + hdrsize)) - return false; - - hdr = (struct tcphdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - /* Get rid of src port */ - newport = tuple->src.u.tcp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst port */ - newport = tuple->dst.u.tcp.port; - portptr = &hdr->dest; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return true; - - l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); - return true; -} - -const struct nf_nat_l4proto nf_nat_l4proto_tcp = { - .l4proto = IPPROTO_TCP, - .manip_pkt = tcp_manip_pkt, - .in_range = nf_nat_l4proto_in_range, - .unique_tuple = tcp_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c deleted file mode 100644 index 5790f70a83b2..000000000000 --- a/net/netfilter/nf_nat_proto_udp.c +++ /dev/null @@ -1,130 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/export.h> -#include <linux/init.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/netfilter/nf_nat_l4proto.h> - -static u16 udp_port_rover; - -static void -udp_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, - &udp_port_rover); -} - -static void -__udp_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, struct udphdr *hdr, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, bool do_csum) -{ - __be16 *portptr, newport; - - if (maniptype == NF_NAT_MANIP_SRC) { - /* Get rid of src port */ - newport = tuple->src.u.udp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst port */ - newport = tuple->dst.u.udp.port; - portptr = &hdr->dest; - } - if (do_csum) { - l3proto->csum_update(skb, iphdroff, &hdr->check, - tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, - false); - if (!hdr->check) - hdr->check = CSUM_MANGLED_0; - } - *portptr = newport; -} - -static bool udp_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct udphdr *hdr; - bool do_csum; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - hdr = (struct udphdr *)(skb->data + hdroff); - do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL; - - __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum); - return true; -} - -#ifdef CONFIG_NF_NAT_PROTO_UDPLITE -static u16 udplite_port_rover; - -static bool udplite_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct udphdr *hdr; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - hdr = (struct udphdr *)(skb->data + hdroff); - __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true); - return true; -} - -static void -udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, - &udplite_port_rover); -} - -const struct nf_nat_l4proto nf_nat_l4proto_udplite = { - .l4proto = IPPROTO_UDPLITE, - .manip_pkt = udplite_manip_pkt, - .in_range = nf_nat_l4proto_in_range, - .unique_tuple = udplite_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; -#endif /* CONFIG_NF_NAT_PROTO_UDPLITE */ - -const struct nf_nat_l4proto nf_nat_l4proto_udp = { - .l4proto = IPPROTO_UDP, - .manip_pkt = udp_manip_pkt, - .in_range = nf_nat_l4proto_in_range, - .unique_tuple = udp_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c deleted file mode 100644 index c5db3e251232..000000000000 --- a/net/netfilter/nf_nat_proto_unknown.c +++ /dev/null @@ -1,54 +0,0 @@ -/* The "unknown" protocol. This is what is used for protocols we - * don't understand. It's returned by ip_ct_find_proto(). - */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_l4proto.h> - -static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type manip_type, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - return true; -} - -static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - /* Sorry: we can't help you; if it's not unique, we can't frob - * anything. - */ - return; -} - -static bool -unknown_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - return true; -} - -const struct nf_nat_l4proto nf_nat_l4proto_unknown = { - .manip_pkt = unknown_manip_pkt, - .in_range = unknown_in_range, - .unique_tuple = unknown_unique_tuple, -}; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 1f3086074981..aa1be643d7a0 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -18,6 +18,7 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_seqadj.h> @@ -316,6 +317,9 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, static void nf_nat_sip_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) { + struct nf_conn_help *help = nfct_help(ct->master); + struct nf_conntrack_expect *pair_exp; + int range_set_for_snat = 0; struct nf_nat_range2 range; /* This must be a fresh one. */ @@ -327,15 +331,42 @@ static void nf_nat_sip_expected(struct nf_conn *ct, range.min_addr = range.max_addr = exp->saved_addr; nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); - /* Change src to where master sends to, but only if the connection - * actually came from the same source. */ - if (nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, + /* Do media streams SRC manip according with the parameters + * found in the paired expectation. + */ + if (exp->class != SIP_EXPECT_SIGNALLING) { + spin_lock_bh(&nf_conntrack_expect_lock); + hlist_for_each_entry(pair_exp, &help->expectations, lnode) { + if (pair_exp->tuple.src.l3num == nf_ct_l3num(ct) && + pair_exp->tuple.dst.protonum == ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum && + nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, &pair_exp->saved_addr) && + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all == pair_exp->saved_proto.all) { + range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); + range.min_proto.all = range.max_proto.all = pair_exp->tuple.dst.u.all; + range.min_addr = range.max_addr = pair_exp->tuple.dst.u3; + range_set_for_snat = 1; + break; + } + } + spin_unlock_bh(&nf_conntrack_expect_lock); + } + + /* When no paired expectation has been found, change src to + * where master sends to, but only if the connection actually came + * from the same source. + */ + if (!range_set_for_snat && + nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, &ct->master->tuplehash[exp->dir].tuple.src.u3)) { range.flags = NF_NAT_RANGE_MAP_IPS; range.min_addr = range.max_addr = ct->master->tuplehash[!exp->dir].tuple.dst.u3; - nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); + range_set_for_snat = 1; } + + /* Perform SRC manip. */ + if (range_set_for_snat) + nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index d67a96a25a68..a36a77bae1d6 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,6 +46,24 @@ void nf_unregister_queue_handler(struct net *net) } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge) { + struct net_device *physdev; + + physdev = nf_bridge_get_physindev(skb); + if (physdev) + dev_put(physdev); + physdev = nf_bridge_get_physoutdev(skb); + if (physdev) + dev_put(physdev); + } +#endif +} + void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -57,20 +75,28 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(state->out); if (state->sk) sock_put(state->sk); + + nf_queue_entry_release_br_nf_refs(entry->skb); +} +EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + +static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb) +{ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge) { struct net_device *physdev; - physdev = nf_bridge_get_physindev(entry->skb); + physdev = nf_bridge_get_physindev(skb); if (physdev) - dev_put(physdev); - physdev = nf_bridge_get_physoutdev(entry->skb); + dev_hold(physdev); + physdev = nf_bridge_get_physoutdev(skb); if (physdev) - dev_put(physdev); + dev_hold(physdev); } #endif } -EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); /* Bump dev refs so they don't vanish while packet is out */ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) @@ -83,18 +109,8 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) dev_hold(state->out); if (state->sk) sock_hold(state->sk); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->skb->nf_bridge) { - struct net_device *physdev; - physdev = nf_bridge_get_physindev(entry->skb); - if (physdev) - dev_hold(physdev); - physdev = nf_bridge_get_physoutdev(entry->skb); - if (physdev) - dev_hold(physdev); - } -#endif + nf_queue_entry_get_br_nf_refs(entry->skb); } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 42487d01a3ed..fec814dace5a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1216,7 +1216,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure; - if (basechain->stats && nft_dump_stats(skb, basechain->stats)) + if (rcu_access_pointer(basechain->stats) && + nft_dump_stats(skb, rcu_dereference(basechain->stats))) goto nla_put_failure; } @@ -1392,7 +1393,8 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) return newstats; } -static void nft_chain_stats_replace(struct nft_base_chain *chain, +static void nft_chain_stats_replace(struct net *net, + struct nft_base_chain *chain, struct nft_stats __percpu *newstats) { struct nft_stats __percpu *oldstats; @@ -1400,8 +1402,9 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain, if (newstats == NULL) return; - if (chain->stats) { - oldstats = nfnl_dereference(chain->stats, NFNL_SUBSYS_NFTABLES); + if (rcu_access_pointer(chain->stats)) { + oldstats = rcu_dereference_protected(chain->stats, + lockdep_commit_lock_is_held(net)); rcu_assign_pointer(chain->stats, newstats); synchronize_rcu(); free_percpu(oldstats); @@ -1439,9 +1442,10 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) struct nft_base_chain *basechain = nft_base_chain(chain); module_put(basechain->type->owner); - free_percpu(basechain->stats); - if (basechain->stats) + if (rcu_access_pointer(basechain->stats)) { static_branch_dec(&nft_counters_enabled); + free_percpu(rcu_dereference_raw(basechain->stats)); + } kfree(chain->name); kfree(basechain); } else { @@ -1590,7 +1594,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, kfree(basechain); return PTR_ERR(stats); } - basechain->stats = stats; + rcu_assign_pointer(basechain->stats, stats); static_branch_inc(&nft_counters_enabled); } @@ -2291,15 +2295,52 @@ struct nft_rule_dump_ctx { char *chain; }; +static int __nf_tables_dump_rules(struct sk_buff *skb, + unsigned int *idx, + struct netlink_callback *cb, + const struct nft_table *table, + const struct nft_chain *chain) +{ + struct net *net = sock_net(skb->sk); + unsigned int s_idx = cb->args[0]; + const struct nft_rule *rule; + int rc = 1; + + list_for_each_entry_rcu(rule, &chain->rules, list) { + if (!nft_is_active(net, rule)) + goto cont; + if (*idx < s_idx) + goto cont; + if (*idx > s_idx) { + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + } + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWRULE, + NLM_F_MULTI | NLM_F_APPEND, + table->family, + table, chain, rule) < 0) + goto out_unfinished; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + (*idx)++; + } + rc = 0; +out_unfinished: + cb->args[0] = *idx; + return rc; +} + static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); const struct nft_rule_dump_ctx *ctx = cb->data; - const struct nft_table *table; + struct nft_table *table; const struct nft_chain *chain; - const struct nft_rule *rule; - unsigned int idx = 0, s_idx = cb->args[0]; + unsigned int idx = 0; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; @@ -2313,37 +2354,34 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) continue; - list_for_each_entry_rcu(chain, &table->chains, list) { - if (ctx && ctx->chain && - strcmp(ctx->chain, chain->name) != 0) - continue; + if (ctx && ctx->chain) { + struct rhlist_head *list, *tmp; - list_for_each_entry_rcu(rule, &chain->rules, list) { - if (!nft_is_active(net, rule)) - goto cont; - if (idx < s_idx) - goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NFT_MSG_NEWRULE, - NLM_F_MULTI | NLM_F_APPEND, - table->family, - table, chain, rule) < 0) - goto done; - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; + list = rhltable_lookup(&table->chains_ht, ctx->chain, + nft_chain_ht_params); + if (!list) + goto done; + + rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) { + if (!nft_is_active(net, chain)) + continue; + __nf_tables_dump_rules(skb, &idx, + cb, table, chain); + break; } + goto done; } + + list_for_each_entry_rcu(chain, &table->chains, list) { + if (__nf_tables_dump_rules(skb, &idx, cb, table, chain)) + goto done; + } + + if (ctx && ctx->table) + break; } done: rcu_read_unlock(); - - cb->args[0] = idx; return skb->len; } @@ -2457,7 +2495,7 @@ err: static void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { - struct nft_expr *expr; + struct nft_expr *expr, *next; /* * Careful: some expressions might not be initialized in case this @@ -2465,8 +2503,9 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, */ expr = nft_expr_first(rule); while (expr != nft_expr_last(rule) && expr->ops) { + next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); - expr = nft_expr_next(expr); + expr = next; } kfree(rule); } @@ -2589,17 +2628,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (chain->use == UINT_MAX) return -EOVERFLOW; - } - - if (nla[NFTA_RULE_POSITION]) { - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) - return -EOPNOTSUPP; - pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nft_rule_lookup(chain, pos_handle); - if (IS_ERR(old_rule)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); - return PTR_ERR(old_rule); + if (nla[NFTA_RULE_POSITION]) { + pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); + old_rule = __nft_rule_lookup(chain, pos_handle); + if (IS_ERR(old_rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); + return PTR_ERR(old_rule); + } } } @@ -2669,21 +2705,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } if (nlh->nlmsg_flags & NLM_F_REPLACE) { - if (!nft_is_active_next(net, old_rule)) { - err = -ENOENT; - goto err2; - } - trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, - old_rule); + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err2; } - nft_deactivate_next(net, old_rule); - chain->use--; - - if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { - err = -ENOMEM; + err = nft_delrule(&ctx, old_rule); + if (err < 0) { + nft_trans_destroy(trans); goto err2; } @@ -6189,7 +6218,8 @@ static void nft_chain_commit_update(struct nft_trans *trans) return; basechain = nft_base_chain(trans->ctx.chain); - nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans)); + nft_chain_stats_replace(trans->ctx.net, basechain, + nft_trans_chain_stats(trans)); switch (nft_trans_chain_policy(trans)) { case NF_DROP: @@ -6324,7 +6354,7 @@ static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules) call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old); } -static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain) +static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) { struct nft_rule **g0, **g1; bool next_genbit; @@ -6441,11 +6471,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) /* step 2. Make rules_gen_X visible to packet path */ list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(chain, &table->chains, list) { - if (!nft_is_active_next(net, chain)) - continue; - nf_tables_commit_chain_active(net, chain); - } + list_for_each_entry(chain, &table->chains, list) + nf_tables_commit_chain(net, chain); } /* diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 3fbce3b9c5ec..a50500232b0a 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -101,7 +101,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, struct nft_stats *stats; base_chain = nft_base_chain(chain); - if (!base_chain->stats) + if (!rcu_access_pointer(base_chain->stats)) return; local_bh_disable(); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index a518eb162344..109b0d27345a 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -455,7 +455,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, case IPPROTO_TCP: timeouts = nf_tcp_pernet(net)->timeouts; break; - case IPPROTO_UDP: + case IPPROTO_UDP: /* fallthrough */ + case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(net)->timeouts; break; case IPPROTO_DCCP: @@ -471,11 +472,21 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, timeouts = nf_sctp_pernet(net)->timeouts; #endif break; + case IPPROTO_GRE: +#ifdef CONFIG_NF_CT_PROTO_GRE + if (l4proto->net_id) { + struct netns_proto_gre *net_gre; + + net_gre = net_generic(net, *l4proto->net_id); + timeouts = net_gre->gre_timeouts; + } +#endif + break; case 255: timeouts = &nf_generic_pernet(net)->timeout; break; default: - WARN_ON_ONCE(1); + WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); break; } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 332c69d27b47..b1f9c5303f02 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -148,7 +148,7 @@ static void instance_put(struct nfulnl_instance *inst) { if (inst && refcount_dec_and_test(&inst->use)) - call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu); + call_rcu(&inst->rcu, nfulnl_instance_free_rcu); } static void nfulnl_timer(struct timer_list *t); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 43041f087eb3..0dcc3592d053 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -727,13 +727,13 @@ nf_queue_entry_dup(struct nf_queue_entry *e) */ static void nf_bridge_adjust_skb_data(struct sk_buff *skb) { - if (skb->nf_bridge) + if (nf_bridge_info_get(skb)) __skb_push(skb, skb->network_header - skb->mac_header); } static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) { - if (skb->nf_bridge) + if (nf_bridge_info_get(skb)) __skb_pull(skb, skb->network_header - skb->mac_header); } #else @@ -904,23 +904,22 @@ nfqnl_set_mode(struct nfqnl_instance *queue, static int dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + int physinif, physoutif; + + physinif = nf_bridge_get_physinif(entry->skb); + physoutif = nf_bridge_get_physoutif(entry->skb); + + if (physinif == ifindex || physoutif == ifindex) + return 1; +#endif if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; if (entry->state.out) if (entry->state.out->ifindex == ifindex) return 1; -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->skb->nf_bridge) { - int physinif, physoutif; - physinif = nf_bridge_get_physinif(entry->skb); - physoutif = nf_bridge_get_physoutif(entry->skb); - - if (physinif == ifindex || physoutif == ifindex) - return 1; - } -#endif return 0; } @@ -1148,8 +1147,9 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) return -EINVAL; - entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])); - entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]); + __vlan_hwaccel_put_tag(entry->skb, + nla_get_be16(tb[NFQA_VLAN_PROTO]), + ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]))); } if (nfqa[NFQA_L2HDR]) { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 9d0ede474224..7334e0b80a5e 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -520,6 +520,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, void *info) { struct xt_match *match = expr->ops->data; + struct module *me = match->me; struct xt_mtdtor_param par; par.net = ctx->net; @@ -530,7 +531,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, par.match->destroy(&par); if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops))) - module_put(match->me); + module_put(me); } static void diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index e82d9a966c45..974525eb92df 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -214,7 +214,9 @@ static int __init nft_flow_offload_module_init(void) { int err; - register_netdevice_notifier(&flow_offload_netdev_notifier); + err = register_netdevice_notifier(&flow_offload_netdev_notifier); + if (err) + goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) @@ -224,6 +226,7 @@ static int __init nft_flow_offload_module_init(void) register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); +err: return err; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 6180626c3f80..6df486c5ebd3 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -229,7 +229,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, } #ifdef CONFIG_XFRM case NFT_META_SECPATH: - nft_reg_store8(dest, !!skb->sp); + nft_reg_store8(dest, secpath_exists(skb)); break; #endif #ifdef CONFIG_NF_TABLES_BRIDGE diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 5322609f7662..b08865ec5ed3 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -161,7 +161,7 @@ static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - const struct sec_path *sp = pkt->skb->sp; + const struct sec_path *sp = skb_sec_path(pkt->skb); const struct xfrm_state *state; if (sp == NULL || sp->len <= priv->spnum) { diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index dec843cadf46..9e05c86ba5c4 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -201,18 +201,8 @@ static __net_init int xt_rateest_net_init(struct net *net) return 0; } -static void __net_exit xt_rateest_net_exit(struct net *net) -{ - struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); - int i; - - for (i = 0; i < ARRAY_SIZE(xn->hash); i++) - WARN_ON_ONCE(!hlist_empty(&xn->hash[i])); -} - static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, - .exit = xt_rateest_net_exit, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 3e7d259e5d8d..28e27a32d9b9 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -260,7 +260,7 @@ static inline void dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) { hlist_del_rcu(&ent->node); - call_rcu_bh(&ent->rcu, dsthash_free_rcu); + call_rcu(&ent->rcu, dsthash_free_rcu); ht->count--; } static void htable_gc(struct work_struct *work); @@ -295,9 +295,10 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, /* copy match config into hashtable config */ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); - - if (ret) + if (ret) { + vfree(hinfo); return ret; + } hinfo->cfg.size = size; if (hinfo->cfg.max == 0) @@ -814,7 +815,6 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); - if (ret) return ret; @@ -830,7 +830,6 @@ hashlimit_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); - if (ret) return ret; @@ -921,7 +920,6 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); - if (ret) return ret; @@ -940,7 +938,6 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); - if (ret) return ret; @@ -1329,7 +1326,7 @@ static void __exit hashlimit_mt_exit(void) xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); unregister_pernet_subsys(&hashlimit_net_ops); - rcu_barrier_bh(); + rcu_barrier(); kmem_cache_destroy(hashlimit_cachep); } diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 9d6d67b953ac..4034d70bff39 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -33,7 +33,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) /* Not a bridged IP packet or no info available yet: * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if * the destination device will be a bridge. */ - if (!skb->nf_bridge) { + if (!nf_bridge_info_exists(skb)) { /* Return MATCH if the invert flags of the used options are on */ if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && !(info->invert & XT_PHYSDEV_OP_BRIDGED)) diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 13f8ccf946d6..aa84e8121c93 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -56,7 +56,7 @@ match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; - const struct sec_path *sp = skb->sp; + const struct sec_path *sp = skb_sec_path(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6bb9f3cde0b0..3c023d6120f6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1706,7 +1706,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_EXT_ACK; err = 0; break; - case NETLINK_DUMP_STRICT_CHK: + case NETLINK_GET_STRICT_CHK: if (val) nlk->flags |= NETLINK_F_STRICT_CHK; else @@ -1806,7 +1806,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; - case NETLINK_DUMP_STRICT_CHK: + case NETLINK_GET_STRICT_CHK: if (len < sizeof(int)) return -EINVAL; len = sizeof(int); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 85ae53d8fd09..e47ebbbe71b8 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -301,7 +301,7 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, key->eth.vlan.tpid = vlan->vlan_tpid; } return skb_vlan_push(skb, vlan->vlan_tpid, - ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); + ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK); } /* 'src' is already properly masked. */ @@ -822,8 +822,10 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk __skb_dst_copy(skb, data->dst); *OVS_CB(skb) = data->cb; skb->inner_protocol = data->inner_protocol; - skb->vlan_tci = data->vlan_tci; - skb->vlan_proto = data->vlan_proto; + if (data->vlan_tci & VLAN_CFI_MASK) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci & ~VLAN_CFI_MASK); + else + __vlan_hwaccel_clear_tag(skb); /* Reconstruct the MAC header. */ skb_push(skb, data->l2_len); @@ -867,7 +869,10 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb, data->cb = *OVS_CB(skb); data->inner_protocol = skb->inner_protocol; data->network_offset = orig_network_offset; - data->vlan_tci = skb->vlan_tci; + if (skb_vlan_tag_present(skb)) + data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK; + else + data->vlan_tci = 0; data->vlan_proto = skb->vlan_proto; data->mac_proto = mac_proto; data->l2_len = hlen; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a4660c48ff01..cd94f925495a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1166,7 +1166,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); if (err) { net_warn_ratelimited("openvswitch: zone: %u " - "execeeds conntrack limit\n", + "exceeds conntrack limit\n", info->zone.id); return err; } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 35966da84769..57e07768c9d1 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -325,7 +325,7 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, return -ENOMEM; vh = (struct vlan_head *)skb->data; - key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); + key_vh->tci = vh->tci | htons(VLAN_CFI_MASK); key_vh->tpid = vh->tpid; if (unlikely(untag_vlan)) { @@ -358,7 +358,7 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) int res; if (skb_vlan_tag_present(skb)) { - key->eth.vlan.tci = htons(skb->vlan_tci); + key->eth.vlan.tci = htons(skb->vlan_tci) | htons(VLAN_CFI_MASK); key->eth.vlan.tpid = skb->vlan_proto; } else { /* Parse outer vlan tag in the non-accelerated case. */ @@ -597,7 +597,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * skb_vlan_pop(), which will later shift the ethertype into * skb->protocol. */ - if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT)) + if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK)) skb->protocol = key->eth.cvlan.tpid; else skb->protocol = key->eth.type; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index c670dd24b8b7..ba01fc4270bd 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -60,7 +60,7 @@ struct ovs_tunnel_info { struct vlan_head { __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/ - __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + __be16 tci; /* 0 if no VLAN, VLAN_CFI_MASK set otherwise. */ }; #define OVS_SW_FLOW_KEY_METADATA_SIZE \ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 865ecef68196..435a4bdf8f89 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -990,9 +990,9 @@ static int validate_vlan_from_nlattrs(const struct sw_flow_match *match, if (a[OVS_KEY_ATTR_VLAN]) tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (!(tci & htons(VLAN_CFI_MASK))) { if (tci) { - OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.", + OVS_NLERR(log, "%s TCI does not have VLAN_CFI_MASK bit set.", (inner) ? "C-VLAN" : "VLAN"); return -EINVAL; } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) { @@ -1013,9 +1013,9 @@ static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, __be16 tci = 0; __be16 tpid = 0; bool encap_valid = !!(match->key->eth.vlan.tci & - htons(VLAN_TAG_PRESENT)); + htons(VLAN_CFI_MASK)); bool i_encap_valid = !!(match->key->eth.cvlan.tci & - htons(VLAN_TAG_PRESENT)); + htons(VLAN_CFI_MASK)); if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) { /* Not a VLAN. */ @@ -1039,8 +1039,8 @@ static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, (inner) ? "C-VLAN" : "VLAN", ntohs(tpid)); return -EINVAL; } - if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.", + if (!(tci & htons(VLAN_CFI_MASK))) { + OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_CFI_MASK bit.", (inner) ? "C-VLAN" : "VLAN"); return -EINVAL; } @@ -1095,7 +1095,7 @@ static int parse_vlan_from_nlattrs(struct sw_flow_match *match, if (err) return err; - encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT)); + encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_CFI_MASK)); if (encap_valid) { err = __parse_vlan_from_nlattrs(match, key_attrs, true, a, is_mask, log); @@ -2943,7 +2943,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, vlan = nla_data(a); if (!eth_type_vlan(vlan->vlan_tpid)) return -EINVAL; - if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) + if (!(vlan->vlan_tci & htons(VLAN_CFI_MASK))) return -EINVAL; vlan_tci = vlan->vlan_tci; break; @@ -2959,7 +2959,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, /* Prohibit push MPLS other than to a white list * for packets that have a known tag order. */ - if (vlan_tci & htons(VLAN_TAG_PRESENT) || + if (vlan_tci & htons(VLAN_CFI_MASK) || (eth_type != htons(ETH_P_IP) && eth_type != htons(ETH_P_IPV6) && eth_type != htons(ETH_P_ARP) && @@ -2971,7 +2971,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, } case OVS_ACTION_ATTR_POP_MPLS: - if (vlan_tci & htons(VLAN_TAG_PRESENT) || + if (vlan_tci & htons(VLAN_CFI_MASK) || !eth_p_mpls(eth_type)) return -EINVAL; @@ -3036,7 +3036,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_POP_ETH: if (mac_proto != MAC_PROTO_ETHERNET) return -EINVAL; - if (vlan_tci & htons(VLAN_TAG_PRESENT)) + if (vlan_tci & htons(VLAN_CFI_MASK)) return -EINVAL; mac_proto = MAC_PROTO_NONE; break; diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 5aaf3babfc3f..acb6077b7478 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -93,7 +93,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 0e72d95b0e8f..c38a62464b85 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -68,7 +68,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 2e5e7a41d8ef..9bec22e3e9e8 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -84,7 +84,6 @@ static struct net_device *get_dpdev(const struct datapath *dp) struct vport *local; local = ovs_vport_ovsl(dp, OVSP_LOCAL); - BUG_ON(!local); return local->dev; } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 7e6301b2ec4d..8f16f11f7ad3 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -131,7 +131,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) return ERR_CAST(dev); } - err = dev_change_flags(dev, dev->flags | IFF_UP); + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { rtnl_delete_link(dev); rtnl_unlock(); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a74650e98f42..eedacdebcd4c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1965,7 +1965,7 @@ retry: skb->mark = sk->sk_mark; skb->tstamp = sockc.transmit_time; - sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); + skb_setup_tx_timestamp(skb, sockc.tsflags); if (unlikely(extra_len == 4)) skb->no_fcs = 1; @@ -2460,7 +2460,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; skb->tstamp = sockc->transmit_time; - sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); + skb_setup_tx_timestamp(skb, sockc->tsflags); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); @@ -2625,8 +2625,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) sll_addr))) goto out; proto = saddr->sll_protocol; - addr = saddr->sll_addr; + addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); + if (addr && dev && saddr->sll_halen < dev->addr_len) + goto out; } err = -ENXIO; @@ -2823,8 +2825,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; - addr = saddr->sll_addr; + addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); + if (addr && dev && saddr->sll_halen < dev->addr_len) + goto out; } err = -ENXIO; @@ -2898,7 +2902,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_free; } - sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); + skb_setup_tx_timestamp(skb, sockc.tsflags); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { diff --git a/net/rds/message.c b/net/rds/message.c index 4b00b1152a5f..f139420ba1f6 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -308,16 +308,27 @@ out: /* * RDS ops use this to grab SG entries from the rm's sg pool. */ -struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents, + int *ret) { struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; struct scatterlist *sg_ret; - WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); - WARN_ON(!nents); + if (WARN_ON(!ret)) + return NULL; - if (rm->m_used_sgs + nents > rm->m_total_sgs) + if (nents <= 0) { + pr_warn("rds: alloc sgs failed! nents <= 0\n"); + *ret = -EINVAL; return NULL; + } + + if (rm->m_used_sgs + nents > rm->m_total_sgs) { + pr_warn("rds: alloc sgs failed! total %d used %d nents %d\n", + rm->m_total_sgs, rm->m_used_sgs, nents); + *ret = -ENOMEM; + return NULL; + } sg_ret = &sg_first[rm->m_used_sgs]; sg_init_table(sg_ret, nents); @@ -332,6 +343,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in unsigned int i; int num_sgs = ceil(total_len, PAGE_SIZE); int extra_bytes = num_sgs * sizeof(struct scatterlist); + int ret; rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); if (!rm) @@ -340,10 +352,10 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); rm->data.op_nents = ceil(total_len, PAGE_SIZE); - rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret); if (!rm->data.op_sg) { rds_message_put(rm); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } for (i = 0; i < rm->data.op_nents; ++i) { diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 98237feb607a..182ab8430594 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -517,9 +517,10 @@ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) return tot_pages; } -int rds_rdma_extra_size(struct rds_rdma_args *args) +int rds_rdma_extra_size(struct rds_rdma_args *args, + struct rds_iov_vector *iov) { - struct rds_iovec vec; + struct rds_iovec *vec; struct rds_iovec __user *local_vec; int tot_pages = 0; unsigned int nr_pages; @@ -530,13 +531,23 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) if (args->nr_local == 0) return -EINVAL; + iov->iov = kcalloc(args->nr_local, + sizeof(struct rds_iovec), + GFP_KERNEL); + if (!iov->iov) + return -ENOMEM; + + vec = &iov->iov[0]; + + if (copy_from_user(vec, local_vec, args->nr_local * + sizeof(struct rds_iovec))) + return -EFAULT; + iov->len = args->nr_local; + /* figure out the number of pages in the vector */ - for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) - return -EFAULT; + for (i = 0; i < args->nr_local; i++, vec++) { - nr_pages = rds_pages_in_vec(&vec); + nr_pages = rds_pages_in_vec(vec); if (nr_pages == 0) return -EINVAL; @@ -558,15 +569,15 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) * Extract all arguments and set up the rdma_op */ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg) + struct cmsghdr *cmsg, + struct rds_iov_vector *vec) { struct rds_rdma_args *args; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; - struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; - int iov_size; + struct rds_iovec *iovs; unsigned int i, j; int ret = 0; @@ -586,31 +597,23 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out_ret; } - /* Check whether to allocate the iovec area */ - iov_size = args->nr_local * sizeof(struct rds_iovec); - if (args->nr_local > UIO_FASTIOV) { - iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); - if (!iovs) { - ret = -ENOMEM; - goto out_ret; - } + if (vec->len != args->nr_local) { + ret = -EINVAL; + goto out_ret; } - if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { - ret = -EFAULT; - goto out; - } + iovs = vec->iov; nr_pages = rds_rdma_pages(iovs, args->nr_local); if (nr_pages < 0) { ret = -EINVAL; - goto out; + goto out_ret; } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; - goto out; + goto out_ret; } op->op_write = !!(args->flags & RDS_RDMA_READWRITE); @@ -620,11 +623,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_active = 1; op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); - op->op_sg = rds_message_alloc_sgs(rm, nr_pages); - if (!op->op_sg) { - ret = -ENOMEM; - goto out; - } + op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret); + if (!op->op_sg) + goto out_pages; if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because @@ -635,7 +636,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); if (!op->op_notifier) { ret = -ENOMEM; - goto out; + goto out_pages; } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; @@ -681,7 +682,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, */ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) - goto out; + goto out_pages; else ret = 0; @@ -714,13 +715,11 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, nr_bytes, (unsigned int) args->remote_vec.bytes); ret = -EINVAL; - goto out; + goto out_pages; } op->op_bytes = nr_bytes; -out: - if (iovs != iovstack) - sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); +out_pages: kfree(pages); out_ret: if (ret) @@ -838,11 +837,9 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; - rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); - if (!rm->atomic.op_sg) { - ret = -ENOMEM; + rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1, &ret); + if (!rm->atomic.op_sg) goto err; - } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { diff --git a/net/rds/rds.h b/net/rds/rds.h index 6bfaf05b63b2..02ec4a3b2799 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -386,6 +386,18 @@ static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) INIT_LIST_HEAD(&q->zcookie_head); } +struct rds_iov_vector { + struct rds_iovec *iov; + int len; +}; + +struct rds_iov_vector_arr { + struct rds_iov_vector *vec; + int len; + int indx; + int incr; +}; + struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; @@ -827,7 +839,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); -struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents, + int *ret); int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); @@ -904,13 +917,13 @@ int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); void rds_rdma_drop_keys(struct rds_sock *rs); -int rds_rdma_extra_size(struct rds_rdma_args *args); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); +int rds_rdma_extra_size(struct rds_rdma_args *args, + struct rds_iov_vector *iov); int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); + struct cmsghdr *cmsg, + struct rds_iov_vector *vec); int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void rds_rdma_free_op(struct rm_rdma_op *ro); diff --git a/net/rds/send.c b/net/rds/send.c index fe785ee819dd..3d822bad7de9 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -876,13 +876,18 @@ out: * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ -static int rds_rm_size(struct msghdr *msg, int num_sgs) +static int rds_rm_size(struct msghdr *msg, int num_sgs, + struct rds_iov_vector_arr *vct) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; bool zcopy_cookie = false; + struct rds_iov_vector *iov, *tmp_iov; + + if (num_sgs < 0) + return -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -893,8 +898,24 @@ static int rds_rm_size(struct msghdr *msg, int num_sgs) switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: + if (vct->indx >= vct->len) { + vct->len += vct->incr; + tmp_iov = + krealloc(vct->vec, + vct->len * + sizeof(struct rds_iov_vector), + GFP_KERNEL); + if (!tmp_iov) { + vct->len -= vct->incr; + return -ENOMEM; + } + vct->vec = tmp_iov; + } + iov = &vct->vec[vct->indx]; + memset(iov, 0, sizeof(struct rds_iov_vector)); + vct->indx++; cmsg_groups |= 1; - retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov); if (retval < 0) return retval; size += retval; @@ -951,10 +972,11 @@ static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, } static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, - struct msghdr *msg, int *allocated_mr) + struct msghdr *msg, int *allocated_mr, + struct rds_iov_vector_arr *vct) { struct cmsghdr *cmsg; - int ret = 0; + int ret = 0, ind = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -968,7 +990,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, */ switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: - ret = rds_cmsg_rdma_args(rs, rm, cmsg); + if (ind >= vct->indx) + return -ENOMEM; + ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]); + ind++; break; case RDS_CMSG_RDMA_DEST: @@ -1084,6 +1109,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); int num_sgs = ceil(payload_len, PAGE_SIZE); int namelen; + struct rds_iov_vector_arr vct; + int ind; + + memset(&vct, 0, sizeof(vct)); + + /* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. */ + vct.incr = 1; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1220,7 +1252,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); } /* size of rm including all sgs */ - ret = rds_rm_size(msg, num_sgs); + ret = rds_rm_size(msg, num_sgs, &vct); if (ret < 0) goto out; @@ -1232,11 +1264,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Attach data to the rm */ if (payload_len) { - rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); - if (!rm->data.op_sg) { - ret = -ENOMEM; + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret); + if (!rm->data.op_sg) goto out; - } ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; @@ -1270,7 +1300,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) rm->m_conn_path = cpath; /* Parse any control messages the user may have included. */ - ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct); if (ret) { /* Trigger connection so that its ready for the next retry */ if (ret == -EAGAIN) @@ -1348,9 +1378,18 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (ret) goto out; rds_message_put(rm); + + for (ind = 0; ind < vct.indx; ind++) + kfree(vct.vec[ind].iov); + kfree(vct.vec); + return payload_len; out: + for (ind = 0; ind < vct.indx; ind++) + kfree(vct.vec[ind].iov); + kfree(vct.vec); + /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN * or in any other way, we need to destroy the MR again */ diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 0f8465852254..41a5cd4b5c0e 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -16,7 +16,6 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ -#include <linux/gpio.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9c1b0729aebf..d4b8355737d8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -21,8 +21,6 @@ #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> -#include <linux/rhashtable.h> -#include <linux/list.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> @@ -1522,227 +1520,8 @@ out_module_put: return skb->len; } -struct tcf_action_net { - struct rhashtable egdev_ht; -}; - -static unsigned int tcf_action_net_id; - -struct tcf_action_egdev_cb { - struct list_head list; - tc_setup_cb_t *cb; - void *cb_priv; -}; - -struct tcf_action_egdev { - struct rhash_head ht_node; - const struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; -}; - -static const struct rhashtable_params tcf_action_egdev_ht_params = { - .key_offset = offsetof(struct tcf_action_egdev, dev), - .head_offset = offsetof(struct tcf_action_egdev, ht_node), - .key_len = sizeof(const struct net_device *), -}; - -static struct tcf_action_egdev * -tcf_action_egdev_lookup(const struct net_device *dev) -{ - struct net *net = dev_net(dev); - struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); - - return rhashtable_lookup_fast(&tan->egdev_ht, &dev, - tcf_action_egdev_ht_params); -} - -static struct tcf_action_egdev * -tcf_action_egdev_get(const struct net_device *dev) -{ - struct tcf_action_egdev *egdev; - struct tcf_action_net *tan; - - egdev = tcf_action_egdev_lookup(dev); - if (egdev) - goto inc_ref; - - egdev = kzalloc(sizeof(*egdev), GFP_KERNEL); - if (!egdev) - return NULL; - INIT_LIST_HEAD(&egdev->cb_list); - egdev->dev = dev; - tan = net_generic(dev_net(dev), tcf_action_net_id); - rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node, - tcf_action_egdev_ht_params); - -inc_ref: - egdev->refcnt++; - return egdev; -} - -static void tcf_action_egdev_put(struct tcf_action_egdev *egdev) -{ - struct tcf_action_net *tan; - - if (--egdev->refcnt) - return; - tan = net_generic(dev_net(egdev->dev), tcf_action_net_id); - rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node, - tcf_action_egdev_ht_params); - kfree(egdev); -} - -static struct tcf_action_egdev_cb * -tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev_cb *egdev_cb; - - list_for_each_entry(egdev_cb, &egdev->cb_list, list) - if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv) - return egdev_cb; - return NULL; -} - -static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev, - enum tc_setup_type type, - void *type_data, bool err_stop) -{ - struct tcf_action_egdev_cb *egdev_cb; - int ok_count = 0; - int err; - - list_for_each_entry(egdev_cb, &egdev->cb_list, list) { - err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv); - if (err) { - if (err_stop) - return err; - } else { - ok_count++; - } - } - return ok_count; -} - -static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev_cb *egdev_cb; - - egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); - if (WARN_ON(egdev_cb)) - return -EEXIST; - egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL); - if (!egdev_cb) - return -ENOMEM; - egdev_cb->cb = cb; - egdev_cb->cb_priv = cb_priv; - list_add(&egdev_cb->list, &egdev->cb_list); - return 0; -} - -static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev_cb *egdev_cb; - - egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); - if (WARN_ON(!egdev_cb)) - return; - list_del(&egdev_cb->list); - kfree(egdev_cb); -} - -static int __tc_setup_cb_egdev_register(const struct net_device *dev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev); - int err; - - if (!egdev) - return -ENOMEM; - err = tcf_action_egdev_cb_add(egdev, cb, cb_priv); - if (err) - goto err_cb_add; - return 0; - -err_cb_add: - tcf_action_egdev_put(egdev); - return err; -} -int tc_setup_cb_egdev_register(const struct net_device *dev, - tc_setup_cb_t *cb, void *cb_priv) -{ - int err; - - rtnl_lock(); - err = __tc_setup_cb_egdev_register(dev, cb, cb_priv); - rtnl_unlock(); - return err; -} -EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register); - -static void __tc_setup_cb_egdev_unregister(const struct net_device *dev, - tc_setup_cb_t *cb, void *cb_priv) -{ - struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); - - if (WARN_ON(!egdev)) - return; - tcf_action_egdev_cb_del(egdev, cb, cb_priv); - tcf_action_egdev_put(egdev); -} -void tc_setup_cb_egdev_unregister(const struct net_device *dev, - tc_setup_cb_t *cb, void *cb_priv) -{ - rtnl_lock(); - __tc_setup_cb_egdev_unregister(dev, cb, cb_priv); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister); - -int tc_setup_cb_egdev_call(const struct net_device *dev, - enum tc_setup_type type, void *type_data, - bool err_stop) -{ - struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); - - if (!egdev) - return 0; - return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop); -} -EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call); - -static __net_init int tcf_action_net_init(struct net *net) -{ - struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); - - return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params); -} - -static void __net_exit tcf_action_net_exit(struct net *net) -{ - struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); - - rhashtable_destroy(&tan->egdev_ht); -} - -static struct pernet_operations tcf_action_net_ops = { - .init = tcf_action_net_init, - .exit = tcf_action_net_exit, - .id = &tcf_action_net_id, - .size = sizeof(struct tcf_action_net), -}; - static int __init tc_action_init(void) { - int err; - - err = register_pernet_subsys(&tcf_action_net_ops); - if (err) - return err; - rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 37c9b8f0e10f..ec8ec55e0fe8 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -85,7 +85,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { - int ret = 0, err; + int ret = 0, tcfp_result = TC_ACT_OK, err, size; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; struct tcf_police *police; @@ -93,7 +93,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, police_net_id); struct tcf_police_params *new; bool exists = false; - int size; if (nla == NULL) return -EINVAL; @@ -160,6 +159,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, goto failure; } + if (tb[TCA_POLICE_RESULT]) { + tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + if (TC_ACT_EXT_CMP(tcfp_result, TC_ACT_GOTO_CHAIN)) { + NL_SET_ERR_MSG(extack, + "goto chain not allowed on fallback"); + err = -EINVAL; + goto failure; + } + } + new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; @@ -167,6 +176,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } /* No failure allowed after this point */ + new->tcfp_result = tcfp_result; new->tcfp_mtu = parm->mtu; if (!new->tcfp_mtu) { new->tcfp_mtu = ~0; @@ -196,16 +206,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); - if (tb[TCA_POLICE_RESULT]) { - new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - if (TC_ACT_EXT_CMP(new->tcfp_result, TC_ACT_GOTO_CHAIN)) { - NL_SET_ERR_MSG(extack, - "goto chain not allowed on fallback"); - err = -EINVAL; - goto failure; - } - } - spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 4cca8f274662..c3b90fadaff6 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -210,9 +210,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; + __be64 key_id = 0; int opts_len = 0; - __be64 key_id; - __be16 flags; + __be16 flags = 0; u8 tos, ttl; int ret = 0; int err; @@ -246,15 +246,15 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, case TCA_TUNNEL_KEY_ACT_RELEASE: break; case TCA_TUNNEL_KEY_ACT_SET: - if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { - NL_SET_ERR_MSG(extack, "Missing tunnel key id"); - ret = -EINVAL; - goto err_out; - } + if (tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + __be32 key32; - key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); + key_id = key32_to_tunnel_id(key32); + flags = TUNNEL_KEY; + } - flags = TUNNEL_KEY | TUNNEL_CSUM; + flags |= TUNNEL_CSUM; if (tb[TCA_TUNNEL_KEY_NO_CSUM] && nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) flags &= ~TUNNEL_CSUM; @@ -508,10 +508,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); - if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || + if (((key->tun_flags & TUNNEL_KEY) && + nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || - nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || + (key->tp_dst && + nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, + key->tp_dst)) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, !(key->tun_flags & TUNNEL_CSUM)) || tunnel_key_opts_dump(skb, info)) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ba677d54a7af..93fdaf707313 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -63,7 +63,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* extract existing tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { /* in-payload vlan tag, pop it */ err = __skb_vlan_pop(skb, &tci); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f427a1e00e7e..8ce2a0507970 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -25,6 +25,7 @@ #include <linux/kmod.h> #include <linux/slab.h> #include <linux/idr.h> +#include <linux/rhashtable.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -365,6 +366,245 @@ static void tcf_chain_flush(struct tcf_chain *chain) } } +static struct tcf_block *tc_dev_ingress_block(struct net_device *dev) +{ + const struct Qdisc_class_ops *cops; + struct Qdisc *qdisc; + + if (!dev_ingress_queue(dev)) + return NULL; + + qdisc = dev_ingress_queue(dev)->qdisc_sleeping; + if (!qdisc) + return NULL; + + cops = qdisc->ops->cl_ops; + if (!cops) + return NULL; + + if (!cops->tcf_block) + return NULL; + + return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL); +} + +static struct rhashtable indr_setup_block_ht; + +struct tc_indr_block_dev { + struct rhash_head ht_node; + struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; + struct tcf_block *block; +}; + +struct tc_indr_block_cb { + struct list_head list; + void *cb_priv; + tc_indr_block_bind_cb_t *cb; + void *cb_ident; +}; + +static const struct rhashtable_params tc_indr_setup_block_ht_params = { + .key_offset = offsetof(struct tc_indr_block_dev, dev), + .head_offset = offsetof(struct tc_indr_block_dev, ht_node), + .key_len = sizeof(struct net_device *), +}; + +static struct tc_indr_block_dev * +tc_indr_block_dev_lookup(struct net_device *dev) +{ + return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, + tc_indr_setup_block_ht_params); +} + +static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev) +{ + struct tc_indr_block_dev *indr_dev; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (indr_dev) + goto inc_ref; + + indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + INIT_LIST_HEAD(&indr_dev->cb_list); + indr_dev->dev = dev; + indr_dev->block = tc_dev_ingress_block(dev); + if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, + tc_indr_setup_block_ht_params)) { + kfree(indr_dev); + return NULL; + } + +inc_ref: + indr_dev->refcnt++; + return indr_dev; +} + +static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev) +{ + if (--indr_dev->refcnt) + return; + + rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, + tc_indr_setup_block_ht_params); + kfree(indr_dev); +} + +static struct tc_indr_block_cb * +tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + + list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) + if (indr_block_cb->cb == cb && + indr_block_cb->cb_ident == cb_ident) + return indr_block_cb; + return NULL; +} + +static struct tc_indr_block_cb * +tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + + indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident); + if (indr_block_cb) + return ERR_PTR(-EEXIST); + + indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); + if (!indr_block_cb) + return ERR_PTR(-ENOMEM); + + indr_block_cb->cb_priv = cb_priv; + indr_block_cb->cb = cb; + indr_block_cb->cb_ident = cb_ident; + list_add(&indr_block_cb->list, &indr_dev->cb_list); + + return indr_block_cb; +} + +static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb) +{ + list_del(&indr_block_cb->list); + kfree(indr_block_cb); +} + +static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev, + struct tc_indr_block_cb *indr_block_cb, + enum tc_block_command command) +{ + struct tc_block_offload bo = { + .command = command, + .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS, + .block = indr_dev->block, + }; + + if (!indr_dev->block) + return; + + indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, + &bo); +} + +int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + int err; + + indr_dev = tc_indr_block_dev_get(dev); + if (!indr_dev) + return -ENOMEM; + + indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); + err = PTR_ERR_OR_ZERO(indr_block_cb); + if (err) + goto err_dev_put; + + tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND); + return 0; + +err_dev_put: + tc_indr_block_dev_put(indr_dev); + return err; +} +EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register); + +int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + int err; + + rtnl_lock(); + err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident); + rtnl_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(tc_indr_block_cb_register); + +void __tc_indr_block_cb_unregister(struct net_device *dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (!indr_dev) + return; + + indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident); + if (!indr_block_cb) + return; + + /* Send unbind message if required to free any block cbs. */ + tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND); + tc_indr_block_cb_del(indr_block_cb); + tc_indr_block_dev_put(indr_dev); +} +EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister); + +void tc_indr_block_cb_unregister(struct net_device *dev, + tc_indr_block_bind_cb_t *cb, void *cb_ident) +{ + rtnl_lock(); + __tc_indr_block_cb_unregister(dev, cb, cb_ident); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister); + +static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, + struct tcf_block_ext_info *ei, + enum tc_block_command command, + struct netlink_ext_ack *extack) +{ + struct tc_indr_block_cb *indr_block_cb; + struct tc_indr_block_dev *indr_dev; + struct tc_block_offload bo = { + .command = command, + .binder_type = ei->binder_type, + .block = block, + .extack = extack, + }; + + indr_dev = tc_indr_block_dev_lookup(dev); + if (!indr_dev) + return; + + indr_dev->block = command == TC_BLOCK_BIND ? block : NULL; + + list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) + indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, + &bo); +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; @@ -406,12 +646,17 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; - return err; + if (err) + return err; + + tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack); + return 0; no_offload_dev_inc: if (tcf_block_offload_in_use(block)) return -EOPNOTSUPP; block->nooffloaddevcnt++; + tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack); return 0; } @@ -421,6 +666,8 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, struct net_device *dev = q->dev_queue->dev; int err; + tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL); + if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); @@ -1023,29 +1270,6 @@ void tcf_block_cb_unregister(struct tcf_block *block, } EXPORT_SYMBOL(tcf_block_cb_unregister); -static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type, - void *type_data, bool err_stop) -{ - struct tcf_block_cb *block_cb; - int ok_count = 0; - int err; - - /* Make sure all netdevs sharing this block are offload-capable. */ - if (block->nooffloaddevcnt && err_stop) - return -EOPNOTSUPP; - - list_for_each_entry(block_cb, &block->cb_list, list) { - err = block_cb->cb(type, type_data, block_cb->cb_priv); - if (err) { - if (err_stop) - return err; - } else { - ok_count++; - } - } - return ok_count; -} - /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. @@ -2268,54 +2492,26 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); -static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, - enum tc_setup_type type, - void *type_data, bool err_stop) +int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, + void *type_data, bool err_stop) { + struct tcf_block_cb *block_cb; int ok_count = 0; -#ifdef CONFIG_NET_CLS_ACT - const struct tc_action *a; - struct net_device *dev; - int i, ret; + int err; - if (!tcf_exts_has_actions(exts)) - return 0; + /* Make sure all netdevs sharing this block are offload-capable. */ + if (block->nooffloaddevcnt && err_stop) + return -EOPNOTSUPP; - for (i = 0; i < exts->nr_actions; i++) { - a = exts->actions[i]; - if (!a->ops->get_dev) - continue; - dev = a->ops->get_dev(a); - if (!dev) - continue; - ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); - a->ops->put_dev(dev); - if (ret < 0) - return ret; - ok_count += ret; + list_for_each_entry(block_cb, &block->cb_list, list) { + err = block_cb->cb(type, type_data, block_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } } -#endif - return ok_count; -} - -int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, - enum tc_setup_type type, void *type_data, bool err_stop) -{ - int ok_count; - int ret; - - ret = tcf_block_cb_call(block, type, type_data, err_stop); - if (ret < 0) - return ret; - ok_count = ret; - - if (!exts || ok_count) - return ok_count; - ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); - if (ret < 0) - return ret; - ok_count += ret; - return ok_count; } EXPORT_SYMBOL(tc_setup_cb_call); @@ -2355,6 +2551,11 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; + err = rhashtable_init(&indr_setup_block_ht, + &tc_indr_setup_block_ht_params); + if (err) + goto err_rhash_setup_block_ht; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, @@ -2366,6 +2567,8 @@ static int __init tc_filter_init(void) return 0; +err_rhash_setup_block_ht: + unregister_pernet_subsys(&tcf_net_ops); err_register_pernet_subsys: destroy_workqueue(tc_filter_wq); return err; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index fa6fe2fe0f32..a95cb240a606 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -169,7 +169,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, if (oldprog) tcf_block_offload_dec(block, &oldprog->gen_flags); - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); if (prog) { if (err < 0) { cls_bpf_offload_cmd(tp, oldprog, prog, extack); @@ -234,7 +234,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp, cls_bpf.name = prog->bpf_name; cls_bpf.exts_integrated = prog->exts_integrated; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false); + tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false); } static int cls_bpf_init(struct tcf_proto *tp) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c6c327874abc..dad04e710493 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -55,6 +55,8 @@ struct fl_flow_key { struct flow_dissector_key_ip ip; struct flow_dissector_key_ip enc_ip; struct flow_dissector_key_enc_opts enc_opts; + struct flow_dissector_key_ports tp_min; + struct flow_dissector_key_ports tp_max; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -65,6 +67,7 @@ struct fl_flow_mask_range { struct fl_flow_mask { struct fl_flow_key key; struct fl_flow_mask_range range; + u32 flags; struct rhash_head ht_node; struct rhashtable ht; struct rhashtable_params filter_ht_params; @@ -179,13 +182,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key, memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey) +static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter, + struct fl_flow_key *key, + struct fl_flow_key *mkey) +{ + __be16 min_mask, max_mask, min_val, max_val; + + min_mask = htons(filter->mask->key.tp_min.dst); + max_mask = htons(filter->mask->key.tp_max.dst); + min_val = htons(filter->key.tp_min.dst); + max_val = htons(filter->key.tp_max.dst); + + if (min_mask && max_mask) { + if (htons(key->tp.dst) < min_val || + htons(key->tp.dst) > max_val) + return false; + + /* skb does not have min and max values */ + mkey->tp_min.dst = filter->mkey.tp_min.dst; + mkey->tp_max.dst = filter->mkey.tp_max.dst; + } + return true; +} + +static bool fl_range_port_src_cmp(struct cls_fl_filter *filter, + struct fl_flow_key *key, + struct fl_flow_key *mkey) +{ + __be16 min_mask, max_mask, min_val, max_val; + + min_mask = htons(filter->mask->key.tp_min.src); + max_mask = htons(filter->mask->key.tp_max.src); + min_val = htons(filter->key.tp_min.src); + max_val = htons(filter->key.tp_max.src); + + if (min_mask && max_mask) { + if (htons(key->tp.src) < min_val || + htons(key->tp.src) > max_val) + return false; + + /* skb does not have min and max values */ + mkey->tp_min.src = filter->mkey.tp_min.src; + mkey->tp_max.src = filter->mkey.tp_max.src; + } + return true; +} + +static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask, + struct fl_flow_key *mkey) { return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask), mask->filter_ht_params); } +static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, + struct fl_flow_key *mkey, + struct fl_flow_key *key) +{ + struct cls_fl_filter *filter, *f; + + list_for_each_entry_rcu(filter, &mask->filters, list) { + if (!fl_range_port_dst_cmp(filter, key, mkey)) + continue; + + if (!fl_range_port_src_cmp(filter, key, mkey)) + continue; + + f = __fl_lookup(mask, mkey); + if (f) + return f; + } + return NULL; +} + +static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, + struct fl_flow_key *mkey, + struct fl_flow_key *key) +{ + if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) + return fl_lookup_range(mask, mkey, key); + + return __fl_lookup(mask, mkey); +} + static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -208,7 +287,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, fl_set_masked_key(&skb_mkey, &skb_key, mask); - f = fl_lookup(mask, &skb_mkey); + f = fl_lookup(mask, &skb_mkey, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); @@ -289,8 +368,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; - tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, - &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); tcf_block_offload_dec(block, &f->flags); } @@ -312,8 +390,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.exts = &f->exts; cls_flower.classid = f->res.classid; - err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, - &cls_flower, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); if (err < 0) { fl_hw_destroy_filter(tp, f, NULL); return err; @@ -339,8 +416,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) cls_flower.exts = &f->exts; cls_flower.classid = f->res.classid; - tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, - &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); } static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, @@ -514,6 +590,31 @@ static void fl_set_key_val(struct nlattr **tb, memcpy(mask, nla_data(tb[mask_type]), len); } +static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask) +{ + fl_set_key_val(tb, &key->tp_min.dst, + TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst)); + fl_set_key_val(tb, &key->tp_max.dst, + TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst, + TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst)); + fl_set_key_val(tb, &key->tp_min.src, + TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src)); + fl_set_key_val(tb, &key->tp_max.src, + TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src, + TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src)); + + if ((mask->tp_min.dst && mask->tp_max.dst && + htons(key->tp_max.dst) <= htons(key->tp_min.dst)) || + (mask->tp_min.src && mask->tp_max.src && + htons(key->tp_max.src) <= htons(key->tp_min.src))) + return -EINVAL; + + return 0; +} + static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask) @@ -921,6 +1022,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, sizeof(key->arp.tha)); } + if (key->basic.ip_proto == IPPROTO_TCP || + key->basic.ip_proto == IPPROTO_UDP || + key->basic.ip_proto == IPPROTO_SCTP) { + ret = fl_set_key_port_range(tb, key, mask); + if (ret) + return ret; + } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -1038,8 +1147,9 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_MASKED(mask, keys, cnt, - FLOW_DISSECTOR_KEY_PORTS, tp); + if (FL_KEY_IS_MASKED(mask, tp) || + FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max)) + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -1086,6 +1196,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, fl_mask_copy(newmask, mask); + if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) || + (newmask->key.tp_min.src && newmask->key.tp_max.src)) + newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE; + err = fl_init_mask_hashtable(newmask); if (err) goto errout_free; @@ -1238,18 +1352,16 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout_idr; - if (!tc_skip_sw(fnew->flags)) { - if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) { - err = -EEXIST; - goto errout_mask; - } - - err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, - fnew->mask->filter_ht_params); - if (err) - goto errout_mask; + if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) { + err = -EEXIST; + goto errout_mask; } + err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, + fnew->mask->filter_ht_params); + if (err) + goto errout_mask; + if (!tc_skip_hw(fnew->flags)) { err = fl_hw_replace_filter(tp, fnew, extack); if (err) @@ -1260,10 +1372,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; if (fold) { - if (!tc_skip_sw(fold->flags)) - rhashtable_remove_fast(&fold->mask->ht, - &fold->ht_node, - fold->mask->filter_ht_params); + rhashtable_remove_fast(&fold->mask->ht, + &fold->ht_node, + fold->mask->filter_ht_params); if (!tc_skip_hw(fold->flags)) fl_hw_destroy_filter(tp, fold, NULL); } @@ -1303,9 +1414,8 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = arg; - if (!tc_skip_sw(f->flags)) - rhashtable_remove_fast(&f->mask->ht, &f->ht_node, - f->mask->filter_ht_params); + rhashtable_remove_fast(&f->mask->ht, &f->ht_node, + f->mask->filter_ht_params); __fl_delete(tp, f, extack); *last = list_empty(&head->masks); return 0; @@ -1388,8 +1498,7 @@ static void fl_hw_create_tmplt(struct tcf_chain *chain, /* We don't care if driver (any of them) fails to handle this * call. It serves just as a hint for it. */ - tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, - &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); } static void fl_hw_destroy_tmplt(struct tcf_chain *chain, @@ -1402,8 +1511,7 @@ static void fl_hw_destroy_tmplt(struct tcf_chain *chain, cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY; cls_flower.cookie = (unsigned long) tmplt; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, - &cls_flower, false); + tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); } static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, @@ -1476,6 +1584,26 @@ static int fl_dump_key_val(struct sk_buff *skb, return 0; } +static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, + struct fl_flow_key *mask) +{ + if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN, + &mask->tp_min.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_min.dst)) || + fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX, + &mask->tp_max.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp_max.dst)) || + fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN, + &mask->tp_min.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_min.src)) || + fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX, + &mask->tp_max.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp_max.src))) + return -1; + + return 0; +} + static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) @@ -1812,6 +1940,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->arp.tha)))) goto nla_put_failure; + if ((key->basic.ip_proto == IPPROTO_TCP || + key->basic.ip_proto == IPPROTO_UDP || + key->basic.ip_proto == IPPROTO_SCTP) && + fl_dump_key_port_range(skb, key, mask)) + goto nla_put_failure; + if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 856fa79d4ffd..0e408ee9dcec 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -71,7 +71,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, cls_mall.command = TC_CLSMATCHALL_DESTROY; cls_mall.cookie = cookie; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false); + tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false); tcf_block_offload_dec(block, &head->flags); } @@ -90,8 +90,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, cls_mall.exts = &head->exts; cls_mall.cookie = cookie; - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, - &cls_mall, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw); if (err < 0) { mall_destroy_hw_filter(tp, head, cookie, NULL); return err; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4b28fd44576d..dcea21004604 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -491,7 +491,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); + tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, @@ -509,7 +509,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw); if (err < 0) { u32_clear_hw_hnode(tp, h, NULL); return err; @@ -533,7 +533,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.command = TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = n->handle; - tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); + tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false); tcf_block_offload_dec(block, &n->flags); } @@ -558,11 +558,12 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; + cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; - err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); + err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw); if (err < 0) { u32_remove_hw_knode(tp, n, NULL); return err; @@ -1206,6 +1207,7 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; + cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ca3b0f46de53..7e4d1ccf4c87 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -335,7 +335,6 @@ out: static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; - struct Qdisc *leaf; const struct Qdisc_class_ops *cops = p->ops->cl_ops; if (cops == NULL) @@ -344,8 +343,7 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) if (cl == 0) return NULL; - leaf = cops->leaf(p, cl); - return leaf; + return cops->leaf(p, cl); } /* Find queueing discipline by name */ @@ -540,7 +538,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab) if (--tab->refcnt == 0) { list_del(&tab->list); - call_rcu_bh(&tab->rcu, stab_kfree_rcu); + call_rcu(&tab->rcu, stab_kfree_rcu); } } EXPORT_SYMBOL(qdisc_put_stab); @@ -810,6 +808,71 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, } EXPORT_SYMBOL(qdisc_tree_reduce_backlog); +int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type, + void *type_data) +{ + struct net_device *dev = qdisc_dev(sch); + int err; + + sch->flags &= ~TCQ_F_OFFLOADED; + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return 0; + + err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); + if (err == -EOPNOTSUPP) + return 0; + + if (!err) + sch->flags |= TCQ_F_OFFLOADED; + + return err; +} +EXPORT_SYMBOL(qdisc_offload_dump_helper); + +void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, + struct Qdisc *new, struct Qdisc *old, + enum tc_setup_type type, void *type_data, + struct netlink_ext_ack *extack) +{ + bool any_qdisc_is_offloaded; + int err; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); + + /* Don't report error if the graft is part of destroy operation. */ + if (!err || !new || new == &noop_qdisc) + return; + + /* Don't report error if the parent, the old child and the new + * one are not offloaded. + */ + any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED; + any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED; + any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED; + + if (any_qdisc_is_offloaded) + NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); +} +EXPORT_SYMBOL(qdisc_offload_graft_helper); + +static void qdisc_offload_graft_root(struct net_device *dev, + struct Qdisc *new, struct Qdisc *old, + struct netlink_ext_ack *extack) +{ + struct tc_root_qopt_offload graft_offload = { + .command = TC_ROOT_GRAFT, + .handle = new ? new->handle : 0, + .ingress = (new && new->flags & TCQ_F_INGRESS) || + (old && old->flags & TCQ_F_INGRESS), + }; + + qdisc_offload_graft_helper(dev, NULL, new, old, + TC_SETUP_ROOT_QDISC, &graft_offload, extack); +} + static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { @@ -957,7 +1020,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, { struct Qdisc *q = old; struct net *net = dev_net(dev); - int err = 0; if (parent == NULL) { unsigned int i, num_q, ingress; @@ -977,6 +1039,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, if (dev->flags & IFF_UP) dev_deactivate(dev); + qdisc_offload_graft_root(dev, new, old, extack); + if (new && new->ops->attach) goto skip; @@ -1012,28 +1076,29 @@ skip: dev_activate(dev); } else { const struct Qdisc_class_ops *cops = parent->ops->cl_ops; + unsigned long cl; + int err; /* Only support running class lockless if parent is lockless */ if (new && (new->flags & TCQ_F_NOLOCK) && parent && !(parent->flags & TCQ_F_NOLOCK)) new->flags &= ~TCQ_F_NOLOCK; - err = -EOPNOTSUPP; - if (cops && cops->graft) { - unsigned long cl = cops->find(parent, classid); + if (!cops || !cops->graft) + return -EOPNOTSUPP; - if (cl) { - err = cops->graft(parent, cl, new, &old, - extack); - } else { - NL_SET_ERR_MSG(extack, "Specified class not found"); - err = -ENOENT; - } + cl = cops->find(parent, classid); + if (!cl) { + NL_SET_ERR_MSG(extack, "Specified class not found"); + return -ENOENT; } - if (!err) - notify_and_destroy(net, skb, n, classid, old, new); + + err = cops->graft(parent, cl, new, &old, extack); + if (err) + return err; + notify_and_destroy(net, skb, n, classid, old, new); } - return err; + return 0; } static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 1538d6fa8165..1150f22983df 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -30,7 +30,7 @@ struct etf_sched_data { int queue; s32 delta; /* in ns */ ktime_t last; /* The txtime of the last skb sent to the netdevice. */ - struct rb_root head; + struct rb_root_cached head; struct qdisc_watchdog watchdog; ktime_t (*get_time)(void); }; @@ -104,7 +104,7 @@ static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p; - p = rb_first(&q->head); + p = rb_first_cached(&q->head); if (!p) return NULL; @@ -117,8 +117,10 @@ static void reset_watchdog(struct Qdisc *sch) struct sk_buff *skb = etf_peek_timesortedlist(sch); ktime_t next; - if (!skb) + if (!skb) { + qdisc_watchdog_cancel(&q->watchdog); return; + } next = ktime_sub_ns(skb->tstamp, q->delta); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); @@ -154,8 +156,9 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { struct etf_sched_data *q = qdisc_priv(sch); - struct rb_node **p = &q->head.rb_node, *parent = NULL; + struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; + bool leftmost = true; if (!is_packet_valid(sch, nskb)) { report_sock_error(nskb, EINVAL, @@ -168,13 +171,15 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, parent = *p; skb = rb_to_skb(parent); - if (ktime_after(txtime, skb->tstamp)) + if (ktime_after(txtime, skb->tstamp)) { p = &parent->rb_right; - else + leftmost = false; + } else { p = &parent->rb_left; + } } rb_link_node(&nskb->rbnode, parent, p); - rb_insert_color(&nskb->rbnode, &q->head); + rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost); qdisc_qstats_backlog_inc(sch, nskb); sch->q.qlen++; @@ -185,12 +190,42 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } -static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, - bool drop) +static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb, + ktime_t now) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *to_free = NULL; + struct sk_buff *tmp = NULL; + + skb_rbtree_walk_from_safe(skb, tmp) { + if (ktime_after(skb->tstamp, now)) + break; + + rb_erase_cached(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + + qdisc_qstats_backlog_dec(sch, skb); + qdisc_drop(skb, sch, &to_free); + qdisc_qstats_overlimit(sch); + sch->q.qlen--; + } + + kfree_skb_list(to_free); +} + +static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); - rb_erase(&skb->rbnode, &q->head); + rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. @@ -201,19 +236,9 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, qdisc_qstats_backlog_dec(sch, skb); - if (drop) { - struct sk_buff *to_free = NULL; + qdisc_bstats_update(sch, skb); - report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); - - qdisc_drop(skb, sch, &to_free); - kfree_skb_list(to_free); - qdisc_qstats_overlimit(sch); - } else { - qdisc_bstats_update(sch, skb); - - q->last = skb->tstamp; - } + q->last = skb->tstamp; sch->q.qlen--; } @@ -232,7 +257,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) /* Drop if packet has expired while in queue. */ if (ktime_before(skb->tstamp, now)) { - timesortedlist_erase(sch, skb, true); + timesortedlist_drop(sch, skb, now); skb = NULL; goto out; } @@ -241,7 +266,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) * txtime from deadline to (now + delta). */ if (q->deadline_mode) { - timesortedlist_erase(sch, skb, false); + timesortedlist_remove(sch, skb); skb->tstamp = now; goto out; } @@ -250,7 +275,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) /* Dequeue only if now is within the [txtime - delta, txtime] range. */ if (ktime_after(now, next)) - timesortedlist_erase(sch, skb, false); + timesortedlist_remove(sch, skb); else skb = NULL; @@ -386,14 +411,14 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, static void timesortedlist_clear(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); - struct rb_node *p = rb_first(&q->head); + struct rb_node *p = rb_first_cached(&q->head); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); - rb_erase(&skb->rbnode, &q->head); + rb_erase_cached(&skb->rbnode, &q->head); rtnl_kfree_skbs(skb, skb); sch->q.qlen--; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 25a7cf6d380f..1a662f2bb7bb 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -94,6 +94,7 @@ struct fq_sched_data { u32 flow_refill_delay; u32 flow_plimit; /* max packets per flow */ unsigned long flow_max_rate; /* optional max rate per flow */ + u64 ce_threshold; u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; @@ -107,6 +108,7 @@ struct fq_sched_data { u64 stat_gc_flows; u64 stat_internal_packets; u64 stat_throttled; + u64 stat_ce_mark; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; @@ -412,16 +414,21 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_get_ns(); struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; unsigned long rate; u32 plen; + u64 now; + + if (!sch->q.qlen) + return NULL; skb = fq_dequeue_head(sch, &q->internal); if (skb) goto out; + + now = ktime_get_ns(); fq_check_throttled(q, now); begin: head = &q->new_flows; @@ -454,6 +461,11 @@ begin: fq_flow_set_throttled(q, f); goto begin; } + if (time_next_packet && + (s64)(now - time_next_packet - q->ce_threshold) > 0) { + INET_ECN_set_ce(skb); + q->stat_ce_mark++; + } } skb = fq_dequeue_head(sch, f); @@ -657,6 +669,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, + [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -736,6 +749,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_ORPHAN_MASK]) q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); + if (tb[TCA_FQ_CE_THRESHOLD]) + q->ce_threshold = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + if (!err) { sch_tree_unlock(sch); err = fq_resize(sch, fq_log); @@ -786,6 +803,10 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; q->low_rate_threshold = 550000 / 8; + + /* Default ce_threshold of 4294 seconds */ + q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; + qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); if (opt) @@ -799,6 +820,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); + u64 ce_threshold = q->ce_threshold; struct nlattr *opts; opts = nla_nest_start(skb, TCA_OPTIONS); @@ -807,6 +829,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + do_div(ce_threshold, NSEC_PER_USEC); + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || @@ -819,6 +843,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, q->low_rate_threshold) || + nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; @@ -848,6 +873,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.throttled_flows = q->throttled_flows; st.unthrottle_latency_ns = min_t(unsigned long, q->unthrottle_latency_ns, ~0U); + st.ce_mark = q->stat_ce_mark; sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index de1663f7d3ad..66ba2ce2320f 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1372,7 +1372,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); /* Wait for flying RCU callback before it is freed. */ - rcu_barrier_bh(); + rcu_barrier(); return; } @@ -1380,10 +1380,10 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, &miniqp->miniq1 : &miniqp->miniq2; /* We need to make sure that readers won't see the miniq - * we are about to modify. So wait until previous call_rcu_bh callback + * we are about to modify. So wait until previous call_rcu callback * is done. */ - rcu_barrier_bh(); + rcu_barrier(); miniq->filter_list = tp_head; rcu_assign_pointer(*miniqp->p_miniq, miniq); @@ -1392,7 +1392,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, * block potential new user of miniq_old until all readers * are not seeing it. */ - call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func); + call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func); } EXPORT_SYMBOL(mini_qdisc_pair_swap); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 4a042abf844c..234afbf9115b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -23,19 +23,23 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/red.h> #define GRED_DEF_PRIO (MAX_DPs / 2) #define GRED_VQ_MASK (MAX_DPs - 1) +#define GRED_VQ_RED_FLAGS (TC_RED_ECN | TC_RED_HARDDROP) + struct gred_sched_data; struct gred_sched; struct gred_sched_data { u32 limit; /* HARD maximal queue length */ u32 DP; /* the drop parameters */ - u32 bytesin; /* bytes seen on virtualQ so far*/ + u32 red_flags; /* virtualQ version of red_flags */ + u64 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ u8 prio; /* the prio of this vq */ @@ -139,14 +143,27 @@ static inline void gred_store_wred_set(struct gred_sched *table, table->wred_set.qidlestart = q->vars.qidlestart; } -static inline int gred_use_ecn(struct gred_sched *t) +static int gred_use_ecn(struct gred_sched_data *q) +{ + return q->red_flags & TC_RED_ECN; +} + +static int gred_use_harddrop(struct gred_sched_data *q) { - return t->red_flags & TC_RED_ECN; + return q->red_flags & TC_RED_HARDDROP; } -static inline int gred_use_harddrop(struct gred_sched *t) +static bool gred_per_vq_red_flags_used(struct gred_sched *table) { - return t->red_flags & TC_RED_HARDDROP; + unsigned int i; + + /* Local per-vq flags couldn't have been set unless global are 0 */ + if (table->red_flags) + return false; + for (i = 0; i < MAX_DPs; i++) + if (table->tab[i] && table->tab[i]->red_flags) + return true; + return false; } static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -212,7 +229,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_PROB_MARK: qdisc_qstats_overlimit(sch); - if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + if (!gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; } @@ -222,7 +239,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_HARD_MARK: qdisc_qstats_overlimit(sch); - if (gred_use_harddrop(t) || !gred_use_ecn(t) || + if (gred_use_harddrop(q) || !gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; goto congestion_drop; @@ -295,15 +312,103 @@ static void gred_reset(struct Qdisc *sch) } } +static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) +{ + struct gred_sched *table = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_gred_qopt_offload opt = { + .command = command, + .handle = sch->handle, + .parent = sch->parent, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + if (command == TC_GRED_REPLACE) { + unsigned int i; + + opt.set.grio_on = gred_rio_mode(table); + opt.set.wred_on = gred_wred_mode(table); + opt.set.dp_cnt = table->DPs; + opt.set.dp_def = table->def; + + for (i = 0; i < table->DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + + if (!q) + continue; + opt.set.tab[i].present = true; + opt.set.tab[i].limit = q->limit; + opt.set.tab[i].prio = q->prio; + opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; + opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; + opt.set.tab[i].is_ecn = gred_use_ecn(q); + opt.set.tab[i].is_harddrop = gred_use_harddrop(q); + opt.set.tab[i].probability = q->parms.max_P; + opt.set.tab[i].backlog = &q->backlog; + } + opt.set.qstats = &sch->qstats; + } + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); +} + +static int gred_offload_dump_stats(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + struct tc_gred_qopt_offload *hw_stats; + unsigned int i; + int ret; + + hw_stats = kzalloc(sizeof(*hw_stats), GFP_KERNEL); + if (!hw_stats) + return -ENOMEM; + + hw_stats->command = TC_GRED_STATS; + hw_stats->handle = sch->handle; + hw_stats->parent = sch->parent; + + for (i = 0; i < MAX_DPs; i++) + if (table->tab[i]) + hw_stats->stats.xstats[i] = &table->tab[i]->stats; + + ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); + /* Even if driver returns failure adjust the stats - in case offload + * ended but driver still wants to adjust the values. + */ + for (i = 0; i < MAX_DPs; i++) { + if (!table->tab[i]) + continue; + table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; + table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; + table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; + + _bstats_update(&sch->bstats, + hw_stats->stats.bstats[i].bytes, + hw_stats->stats.bstats[i].packets); + sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; + sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; + sch->qstats.drops += hw_stats->stats.qstats[i].drops; + sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; + sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; + } + + kfree(hw_stats); + return ret; +} + static inline void gred_destroy_vq(struct gred_sched_data *q) { kfree(q); } -static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) +static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps, + struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_sopt *sopt; + bool red_flags_changed; int i; if (!dps) @@ -311,13 +416,28 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) sopt = nla_data(dps); - if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || - sopt->def_DP >= sopt->DPs) + if (sopt->DPs > MAX_DPs) { + NL_SET_ERR_MSG_MOD(extack, "number of virtual queues too high"); + return -EINVAL; + } + if (sopt->DPs == 0) { + NL_SET_ERR_MSG_MOD(extack, + "number of virtual queues can't be 0"); + return -EINVAL; + } + if (sopt->def_DP >= sopt->DPs) { + NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count"); return -EINVAL; + } + if (sopt->flags && gred_per_vq_red_flags_used(table)) { + NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used"); + return -EINVAL; + } sch_tree_lock(sch); table->DPs = sopt->DPs; table->def = sopt->def_DP; + red_flags_changed = table->red_flags != sopt->flags; table->red_flags = sopt->flags; /* @@ -337,6 +457,12 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) gred_disable_wred_mode(table); } + if (red_flags_changed) + for (i = 0; i < table->DPs; i++) + if (table->tab[i]) + table->tab[i]->red_flags = + table->red_flags & GRED_VQ_RED_FLAGS; + for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", @@ -346,25 +472,30 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) } } + gred_offload(sch, TC_GRED_REPLACE); return 0; } static inline int gred_change_vq(struct Qdisc *sch, int dp, struct tc_gred_qopt *ctl, int prio, u8 *stab, u32 max_P, - struct gred_sched_data **prealloc) + struct gred_sched_data **prealloc, + struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) { + NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; + } if (!q) { table->tab[dp] = q = *prealloc; *prealloc = NULL; if (!q) return -ENOMEM; + q->red_flags = table->red_flags & GRED_VQ_RED_FLAGS; } q->DP = dp; @@ -384,14 +515,127 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, return 0; } +static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = { + [TCA_GRED_VQ_DP] = { .type = NLA_U32 }, + [TCA_GRED_VQ_FLAGS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = { + [TCA_GRED_VQ_ENTRY] = { .type = NLA_NESTED }, +}; + static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, [TCA_GRED_LIMIT] = { .type = NLA_U32 }, + [TCA_GRED_VQ_LIST] = { .type = NLA_NESTED }, }; +static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry) +{ + struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; + u32 dp; + + nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL); + + dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); + + if (tb[TCA_GRED_VQ_FLAGS]) + table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); +} + +static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs) +{ + const struct nlattr *attr; + int rem; + + nla_for_each_nested(attr, vqs, rem) { + switch (nla_type(attr)) { + case TCA_GRED_VQ_ENTRY: + gred_vq_apply(table, attr); + break; + } + } +} + +static int gred_vq_validate(struct gred_sched *table, u32 cdp, + const struct nlattr *entry, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; + int err; + u32 dp; + + err = nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, + extack); + if (err < 0) + return err; + + if (!tb[TCA_GRED_VQ_DP]) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified"); + return -EINVAL; + } + dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); + if (dp >= table->DPs) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds"); + return -EINVAL; + } + if (dp != cdp && !table->tab[dp]) { + NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated"); + return -EINVAL; + } + + if (tb[TCA_GRED_VQ_FLAGS]) { + u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); + + if (table->red_flags && table->red_flags != red_flags) { + NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used"); + return -EINVAL; + } + if (red_flags & ~GRED_VQ_RED_FLAGS) { + NL_SET_ERR_MSG_MOD(extack, + "invalid RED flags specified"); + return -EINVAL; + } + } + + return 0; +} + +static int gred_vqs_validate(struct gred_sched *table, u32 cdp, + struct nlattr *vqs, struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int rem, err; + + err = nla_validate_nested(vqs, TCA_GRED_VQ_ENTRY_MAX, + gred_vqe_policy, extack); + if (err < 0) + return err; + + nla_for_each_nested(attr, vqs, rem) { + switch (nla_type(attr)) { + case TCA_GRED_VQ_ENTRY: + err = gred_vq_validate(table, cdp, attr, extack); + if (err) + return err; + break; + default: + NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes"); + return -EINVAL; + } + } + + if (rem > 0) { + NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list"); + return -EINVAL; + } + + return 0; +} + static int gred_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -406,29 +650,39 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { if (tb[TCA_GRED_LIMIT] != NULL) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); - return gred_change_table_def(sch, tb[TCA_GRED_DPS]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } if (tb[TCA_GRED_PARMS] == NULL || tb[TCA_GRED_STAB] == NULL || - tb[TCA_GRED_LIMIT] != NULL) + tb[TCA_GRED_LIMIT] != NULL) { + NL_SET_ERR_MSG_MOD(extack, "can't configure Qdisc and virtual queue at the same time"); return -EINVAL; + } max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; - err = -EINVAL; ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); - if (ctl->DP >= table->DPs) - goto errout; + if (ctl->DP >= table->DPs) { + NL_SET_ERR_MSG_MOD(extack, "virtual queue index above virtual queue count"); + return -EINVAL; + } + + if (tb[TCA_GRED_VQ_LIST]) { + err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST], + extack); + if (err) + return err; + } if (gred_rio_mode(table)) { if (ctl->prio == 0) { @@ -448,9 +702,13 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); sch_tree_lock(sch); - err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc); + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc, + extack); if (err < 0) - goto errout_locked; + goto err_unlock_free; + + if (tb[TCA_GRED_VQ_LIST]) + gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]); if (gred_rio_mode(table)) { gred_disable_wred_mode(table); @@ -458,12 +716,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, gred_enable_wred_mode(table); } - err = 0; + sch_tree_unlock(sch); + kfree(prealloc); + + gred_offload(sch, TC_GRED_REPLACE); + return 0; -errout_locked: +err_unlock_free: sch_tree_unlock(sch); kfree(prealloc); -errout: return err; } @@ -476,12 +737,15 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; - if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) + if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) { + NL_SET_ERR_MSG_MOD(extack, + "virtual queue configuration can't be specified at initialization time"); return -EINVAL; + } if (tb[TCA_GRED_LIMIT]) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); @@ -489,13 +753,13 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, sch->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); - return gred_change_table_def(sch, tb[TCA_GRED_DPS]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) { struct gred_sched *table = qdisc_priv(sch); - struct nlattr *parms, *opts = NULL; + struct nlattr *parms, *vqs, *opts = NULL; int i; u32 max_p[MAX_DPs]; struct tc_gred_sopt sopt = { @@ -505,6 +769,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) .flags = table->red_flags, }; + if (gred_offload_dump_stats(sch)) + goto nla_put_failure; + opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; @@ -522,6 +789,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) goto nla_put_failure; + /* Old style all-in-one dump of VQs */ parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; @@ -572,6 +840,58 @@ append_opt: nla_nest_end(skb, parms); + /* Dump the VQs again, in more structured way */ + vqs = nla_nest_start(skb, TCA_GRED_VQ_LIST); + if (!vqs) + goto nla_put_failure; + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + struct nlattr *vq; + + if (!q) + continue; + + vq = nla_nest_start(skb, TCA_GRED_VQ_ENTRY); + if (!vq) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags)) + goto nla_put_failure; + + /* Stats */ + if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin, + TCA_GRED_VQ_PAD)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG, + gred_backlog(table, q, sch))) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP, + q->stats.prob_drop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK, + q->stats.prob_mark)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP, + q->stats.forced_drop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK, + q->stats.forced_mark)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop)) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_VQ_STAT_OTHER, q->stats.other)) + goto nla_put_failure; + + nla_nest_end(skb, vq); + } + nla_nest_end(skb, vqs); + return nla_nest_end(skb, opts); nla_put_failure: @@ -588,6 +908,7 @@ static void gred_destroy(struct Qdisc *sch) if (table->tab[i]) gred_destroy_vq(table->tab[i]); } + gred_offload(sch, TC_GRED_DESTROY); } static struct Qdisc_ops gred_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index f20f3a0f8424..203659bc3906 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -38,9 +38,8 @@ static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd) return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt); } -static void mq_offload_stats(struct Qdisc *sch) +static int mq_offload_stats(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); struct tc_mq_qopt_offload opt = { .command = TC_MQ_STATS, .handle = sch->handle, @@ -50,8 +49,7 @@ static void mq_offload_stats(struct Qdisc *sch) }, }; - if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt); + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt); } static void mq_destroy(struct Qdisc *sch) @@ -171,9 +169,8 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) spin_unlock_bh(qdisc_lock(qdisc)); } - mq_offload_stats(sch); - return 0; + return mq_offload_stats(sch); } static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) @@ -196,6 +193,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + struct tc_mq_qopt_offload graft_offload; struct net_device *dev = qdisc_dev(sch); if (dev->flags & IFF_UP) @@ -206,6 +204,14 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); + + graft_offload.handle = sch->handle; + graft_offload.graft_params.queue = cl - 1; + graft_offload.graft_params.child_handle = new ? new->handle : 0; + graft_offload.command = TC_MQ_GRAFT; + + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, + TC_SETUP_QDISC_MQ, &graft_offload, extack); return 0; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 2c38e3d07924..75046ec72144 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -77,6 +77,10 @@ struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; + /* a linear queue; reduces rbtree rebalancing when jitter is low */ + struct sk_buff *t_head; + struct sk_buff *t_tail; + /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; @@ -369,26 +373,39 @@ static void tfifo_reset(struct Qdisc *sch) rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } + + rtnl_kfree_skbs(q->t_head, q->t_tail); + q->t_head = NULL; + q->t_tail = NULL; } static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); u64 tnext = netem_skb_cb(nskb)->time_to_send; - struct rb_node **p = &q->t_root.rb_node, *parent = NULL; - while (*p) { - struct sk_buff *skb; - - parent = *p; - skb = rb_to_skb(parent); - if (tnext >= netem_skb_cb(skb)->time_to_send) - p = &parent->rb_right; + if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) { + if (q->t_tail) + q->t_tail->next = nskb; else - p = &parent->rb_left; + q->t_head = nskb; + q->t_tail = nskb; + } else { + struct rb_node **p = &q->t_root.rb_node, *parent = NULL; + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (tnext >= netem_skb_cb(skb)->time_to_send) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->t_root); } - rb_link_node(&nskb->rbnode, parent, p); - rb_insert_color(&nskb->rbnode, &q->t_root); sch->q.qlen++; } @@ -431,6 +448,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, int count = 1; int rc = NET_XMIT_SUCCESS; + /* Do not fool qdisc_drop_all() */ + skb->prev = NULL; + /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) ++count; @@ -530,9 +550,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || - t_last->time_to_send > last->time_to_send) { + t_last->time_to_send > last->time_to_send) + last = t_last; + } + if (q->t_tail) { + struct netem_skb_cb *t_last = + netem_skb_cb(q->t_tail); + + if (!last || + t_last->time_to_send > last->time_to_send) last = t_last; - } } if (last) { @@ -611,11 +638,38 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) q->slot.bytes_left = q->slot_config.max_bytes; } +static struct sk_buff *netem_peek(struct netem_sched_data *q) +{ + struct sk_buff *skb = skb_rb_first(&q->t_root); + u64 t1, t2; + + if (!skb) + return q->t_head; + if (!q->t_head) + return skb; + + t1 = netem_skb_cb(skb)->time_to_send; + t2 = netem_skb_cb(q->t_head)->time_to_send; + if (t1 < t2) + return skb; + return q->t_head; +} + +static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb) +{ + if (skb == q->t_head) { + q->t_head = skb->next; + if (!q->t_head) + q->t_tail = NULL; + } else { + rb_erase(&skb->rbnode, &q->t_root); + } +} + static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - struct rb_node *p; tfifo_dequeue: skb = __qdisc_dequeue_head(&sch->q); @@ -625,20 +679,18 @@ deliver: qdisc_bstats_update(sch, skb); return skb; } - p = rb_first(&q->t_root); - if (p) { + skb = netem_peek(q); + if (skb) { u64 time_to_send; u64 now = ktime_get_ns(); - skb = rb_to_skb(p); - /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; if (q->slot.slot_next && q->slot.slot_next < time_to_send) get_slot_next(q, now); - if (time_to_send <= now && q->slot.slot_next <= now) { - rb_erase(p, &q->t_root); + if (time_to_send <= now && q->slot.slot_next <= now) { + netem_erase_head(q, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index f8af98621179..cdf68706e40f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -220,7 +220,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); - qdisc_put(child); } for (i = oldbands; i < q->bands; i++) { @@ -230,6 +229,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, } sch_tree_unlock(sch); + + for (i = q->bands; i < oldbands; i++) + qdisc_put(q->queues[i]); return 0; } @@ -251,7 +253,6 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt, static int prio_dump_offload(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); struct tc_prio_qopt_offload hw_stats = { .command = TC_PRIO_STATS, .handle = sch->handle, @@ -263,21 +264,8 @@ static int prio_dump_offload(struct Qdisc *sch) }, }, }; - int err; - - sch->flags &= ~TCQ_F_OFFLOADED; - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) - return 0; - - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, - &hw_stats); - if (err == -EOPNOTSUPP) - return 0; - if (!err) - sch->flags |= TCQ_F_OFFLOADED; - - return err; + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats); } static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -309,43 +297,22 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, { struct prio_sched_data *q = qdisc_priv(sch); struct tc_prio_qopt_offload graft_offload; - struct net_device *dev = qdisc_dev(sch); unsigned long band = arg - 1; - bool any_qdisc_is_offloaded; - int err; if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->queues[band]); - if (!tc_can_offload(dev)) - return 0; - graft_offload.handle = sch->handle; graft_offload.parent = sch->parent; graft_offload.graft_params.band = band; graft_offload.graft_params.child_handle = new->handle; graft_offload.command = TC_PRIO_GRAFT; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, - &graft_offload); - - /* Don't report error if the graft is part of destroy operation. */ - if (err && new != &noop_qdisc) { - /* Don't report error if the parent, the old child and the new - * one are not offloaded. - */ - any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; - any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED; - if (*old) - any_qdisc_is_offloaded |= (*old)->flags & - TCQ_F_OFFLOADED; - - if (any_qdisc_is_offloaded) - NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); - } - + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, + TC_SETUP_QDISC_PRIO, &graft_offload, + extack); return 0; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 3ce6c0a2c493..9df9942340ea 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -166,7 +166,9 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.set.min = q->parms.qth_min >> q->parms.Wlog; opt.set.max = q->parms.qth_max >> q->parms.Wlog; opt.set.probability = q->parms.max_P; + opt.set.limit = q->limit; opt.set.is_ecn = red_use_ecn(q); + opt.set.is_harddrop = red_use_harddrop(q); opt.set.qstats = &sch->qstats; } else { opt.command = TC_RED_DESTROY; @@ -193,10 +195,10 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { static int red_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct Qdisc *old_child = NULL, *child = NULL; struct red_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_RED_MAX + 1]; struct tc_red_qopt *ctl; - struct Qdisc *child = NULL; int err; u32 max_P; @@ -233,7 +235,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (child) { qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, q->qdisc->qstats.backlog); - qdisc_put(q->qdisc); + old_child = q->qdisc; q->qdisc = child; } @@ -252,7 +254,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); + red_offload(sch, true); + + if (old_child) + qdisc_put(old_child); return 0; } @@ -279,9 +285,8 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, return red_change(sch, opt, extack); } -static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) +static int red_dump_offload_stats(struct Qdisc *sch) { - struct net_device *dev = qdisc_dev(sch); struct tc_red_qopt_offload hw_stats = { .command = TC_RED_STATS, .handle = sch->handle, @@ -291,22 +296,8 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt) .stats.qstats = &sch->qstats, }, }; - int err; - - sch->flags &= ~TCQ_F_OFFLOADED; - - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) - return 0; - - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, - &hw_stats); - if (err == -EOPNOTSUPP) - return 0; - if (!err) - sch->flags |= TCQ_F_OFFLOADED; - - return err; + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_RED, &hw_stats); } static int red_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -324,7 +315,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) }; int err; - err = red_dump_offload_stats(sch, &opt); + err = red_dump_offload_stats(sch); if (err) goto nla_put_failure; @@ -377,6 +368,21 @@ static int red_dump_class(struct Qdisc *sch, unsigned long cl, return 0; } +static void red_graft_offload(struct Qdisc *sch, + struct Qdisc *new, struct Qdisc *old, + struct netlink_ext_ack *extack) +{ + struct tc_red_qopt_offload graft_offload = { + .handle = sch->handle, + .parent = sch->parent, + .child_handle = new->handle, + .command = TC_RED_GRAFT, + }; + + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old, + TC_SETUP_QDISC_RED, &graft_offload, extack); +} + static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { @@ -386,6 +392,8 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); + + red_graft_offload(sch, new, *old, extack); return 0; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 6a28b96e779e..201c888604e4 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -118,9 +118,6 @@ static struct sctp_association *sctp_association_init( asoc->flowlabel = sp->flowlabel; asoc->dscp = sp->dscp; - /* Initialize default path MTU. */ - asoc->pathmtu = sp->pathmtu; - /* Set association default SACK delay */ asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); asoc->sackfreq = sp->sackfreq; @@ -135,6 +132,8 @@ static struct sctp_association *sctp_association_init( */ asoc->max_burst = sp->max_burst; + asoc->subscribe = sp->subscribe; + /* initialize association timers */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; @@ -252,6 +251,10 @@ static struct sctp_association *sctp_association_init( 0, gfp)) goto fail_init; + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + sctp_assoc_update_frag_point(asoc); + /* Assume that peer would support both address types unless we are * told otherwise. */ @@ -434,7 +437,7 @@ static void sctp_association_destroy(struct sctp_association *asoc) WARN_ON(atomic_read(&asoc->rmem_alloc)); - kfree(asoc); + kfree_rcu(asoc, rcu); SCTP_DBG_OBJCNT_DEC(assoc); } diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 7df3704982f5..ebf28adba789 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp, return match; } +int sctp_bind_addrs_check(struct sctp_sock *sp, + struct sctp_sock *sp2, int cnt2) +{ + struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; + struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; + struct sctp_sockaddr_entry *laddr, *laddr2; + bool exist = false; + int cnt = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { + if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && + laddr->valid && laddr2->valid) { + exist = true; + goto next; + } + } + cnt = 0; + break; +next: + cnt++; + } + rcu_read_unlock(); + + return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); +} + /* Does the address 'addr' conflict with any addresses in * the bp. */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index ce8087846f05..64bef313d436 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -86,11 +86,10 @@ void sctp_datamsg_free(struct sctp_datamsg *msg) /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { + struct sctp_association *asoc = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; - struct sctp_sock *sp; struct sctp_ulpevent *ev; - struct sctp_association *asoc = NULL; int error = 0, notify; /* If we failed, we may need to notify. */ @@ -108,9 +107,8 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) else error = asoc->outqueue.error; - sp = sctp_sk(asoc->base.sk); - notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED, - &sp->subscribe); + notify = sctp_ulpevent_type_enabled(asoc->subscribe, + SCTP_SEND_FAILED); } /* Generate a SEND FAILED event only if enabled. */ @@ -191,6 +189,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * the packet */ max_data = asoc->frag_point; + if (unlikely(!max_data)) { + max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), + sctp_datachk_len(&asoc->stream)); + pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%Zu)", + __func__, asoc, max_data); + } /* If the the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with diff --git a/net/sctp/input.c b/net/sctp/input.c index 5c36a99882ed..d7a649d240e5 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -57,6 +57,7 @@ #include <net/sctp/checksum.h> #include <net/net_namespace.h> #include <linux/rhashtable.h> +#include <net/sock_reuseport.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); @@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp); -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, - const union sctp_addr *laddr); +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( + struct net *net, struct sk_buff *skb, + const union sctp_addr *laddr, + const union sctp_addr *daddr); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, @@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb) asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport); if (!asoc) - ep = __sctp_rcv_lookup_endpoint(net, &dest); + ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; @@ -574,7 +577,7 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) * is probably better. * */ -void sctp_v4_err(struct sk_buff *skb, __u32 info) +int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int ihlen = iph->ihl * 4; @@ -599,7 +602,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) skb->transport_header = savesctp; if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } /* Warning: The sock lock is held. Remember to call * sctp_err_finish! @@ -653,6 +656,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) out_unlock: sctp_err_finish(sk, transport); + return 0; } /* @@ -720,43 +724,87 @@ discard: } /* Insert endpoint into the hash table. */ -static void __sctp_hash_endpoint(struct sctp_endpoint *ep) +static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { - struct net *net = sock_net(ep->base.sk); - struct sctp_ep_common *epb; + struct sock *sk = ep->base.sk; + struct net *net = sock_net(sk); struct sctp_hashbucket *head; + struct sctp_ep_common *epb; epb = &ep->base; - epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; + if (sk->sk_reuseport) { + bool any = sctp_is_ep_boundall(sk); + struct sctp_ep_common *epb2; + struct list_head *list; + int cnt = 0, err = 1; + + list_for_each(list, &ep->base.bind_addr.address_list) + cnt++; + + sctp_for_each_hentry(epb2, &head->chain) { + struct sock *sk2 = epb2->sk; + + if (!net_eq(sock_net(sk2), net) || sk2 == sk || + !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) || + !sk2->sk_reuseport) + continue; + + err = sctp_bind_addrs_check(sctp_sk(sk2), + sctp_sk(sk), cnt); + if (!err) { + err = reuseport_add_sock(sk, sk2, any); + if (err) + return err; + break; + } else if (err < 0) { + return err; + } + } + + if (err) { + err = reuseport_alloc(sk, any); + if (err) + return err; + } + } + write_lock(&head->lock); hlist_add_head(&epb->node, &head->chain); write_unlock(&head->lock); + return 0; } /* Add an endpoint to the hash. Local BH-safe. */ -void sctp_hash_endpoint(struct sctp_endpoint *ep) +int sctp_hash_endpoint(struct sctp_endpoint *ep) { + int err; + local_bh_disable(); - __sctp_hash_endpoint(ep); + err = __sctp_hash_endpoint(ep); local_bh_enable(); + + return err; } /* Remove endpoint from the hash table. */ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { - struct net *net = sock_net(ep->base.sk); + struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; struct sctp_ep_common *epb; epb = &ep->base; - epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port); + epb->hashent = sctp_ep_hashfn(sock_net(sk), epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + write_lock(&head->lock); hlist_del_init(&epb->node); write_unlock(&head->lock); @@ -770,16 +818,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep) local_bh_enable(); } +static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, + const union sctp_addr *paddr, __u32 seed) +{ + __u32 addr; + + if (paddr->sa.sa_family == AF_INET6) + addr = jhash(&paddr->v6.sin6_addr, 16, seed); + else + addr = (__force __u32)paddr->v4.sin_addr.s_addr; + + return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | + (__force __u32)lport, net_hash_mix(net), seed); +} + /* Look up an endpoint. */ -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, - const union sctp_addr *laddr) +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( + struct net *net, struct sk_buff *skb, + const union sctp_addr *laddr, + const union sctp_addr *paddr) { struct sctp_hashbucket *head; struct sctp_ep_common *epb; struct sctp_endpoint *ep; + struct sock *sk; + __be16 lport; int hash; - hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port)); + lport = laddr->v4.sin_port; + hash = sctp_ep_hashfn(net, ntohs(lport)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(epb, &head->chain) { @@ -791,6 +858,15 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net, ep = sctp_sk(net->sctp.ctl_sock)->ep; hit: + sk = ep->base.sk; + if (sk->sk_reuseport) { + __u32 phash = sctp_hashfn(net, lport, paddr, 0); + + sk = reuseport_select_sock(sk, phash, skb, + sizeof(struct sctphdr)); + if (sk) + ep = sctp_sk(sk)->ep; + } sctp_endpoint_hold(ep); read_unlock(&head->lock); return ep; @@ -829,35 +905,17 @@ out: static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; - const union sctp_addr *paddr = &t->ipaddr; - const struct net *net = sock_net(t->asoc->base.sk); - __be16 lport = htons(t->asoc->base.bind_addr.port); - __u32 addr; - - if (paddr->sa.sa_family == AF_INET6) - addr = jhash(&paddr->v6.sin6_addr, 16, seed); - else - addr = (__force __u32)paddr->v4.sin_addr.s_addr; - return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | - (__force __u32)lport, net_hash_mix(net), seed); + return sctp_hashfn(sock_net(t->asoc->base.sk), + htons(t->asoc->base.bind_addr.port), + &t->ipaddr, seed); } static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; - const union sctp_addr *paddr = x->paddr; - const struct net *net = x->net; - __be16 lport = x->lport; - __u32 addr; - if (paddr->sa.sa_family == AF_INET6) - addr = jhash(&paddr->v6.sin6_addr, 16, seed); - else - addr = (__force __u32)paddr->v4.sin_addr.s_addr; - - return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | - (__force __u32)lport, net_hash_mix(net), seed); + return sctp_hashfn(x->net, x->lport, x->paddr, seed); } static const struct rhashtable_params sctp_hash_params = { diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index fc6c5e4bffa5..b9ed271b7ef7 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -101,6 +101,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_port = 0; + addr->a.v6.sin6_flowinfo = 0; addr->a.v6.sin6_addr = ifa->addr; addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; addr->valid = 1; @@ -138,7 +139,7 @@ static struct notifier_block sctp_inet6addr_notifier = { }; /* ICMP error handler. */ -static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct inet6_dev *idev; @@ -147,7 +148,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct sctp_transport *transport; struct ipv6_pinfo *np; __u16 saveip, savesctp; - int err; + int err, ret = 0; struct net *net = dev_net(skb->dev); idev = in6_dev_get(skb->dev); @@ -163,6 +164,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb->transport_header = savesctp; if (!sk) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS); + ret = -ENOENT; goto out; } @@ -202,6 +204,8 @@ out_unlock: out: if (likely(idev != NULL)) in6_dev_put(idev); + + return ret; } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) diff --git a/net/sctp/output.c b/net/sctp/output.c index b0e74a3e77ec..025f48e14a91 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -410,6 +410,7 @@ static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) head->truesize += skb->truesize; head->data_len += skb->len; head->len += skb->len; + refcount_add(skb->truesize, &head->sk->sk_wmem_alloc); __skb_header_release(skb); } diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index c0817f7a8964..a8c4c33377bc 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -53,7 +53,7 @@ int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \ void *arg) { \ int error = 0; \ - enum sctp_event event_type; union sctp_subtype subtype; \ + enum sctp_event_type event_type; union sctp_subtype subtype; \ enum sctp_state state; \ struct sctp_endpoint *ep; \ \ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 4a4fd1971255..f4ac6c592e13 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2462,6 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, asoc->c.sinit_max_instreams, gfp)) goto clean_up; + /* Update frag_point when stream_interleave may get changed. */ + sctp_assoc_update_frag_point(asoc); + if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 85d393090238..1d143bc3f73d 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -52,7 +52,7 @@ #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> -static int sctp_cmd_interpreter(enum sctp_event event_type, +static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); -static int sctp_side_effects(enum sctp_event event_type, +static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -623,7 +623,7 @@ static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands, /* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, - enum sctp_event event_type, + enum sctp_event_type event_type, union sctp_subtype subtype, struct sctp_chunk *chunk, unsigned int error) @@ -1162,7 +1162,7 @@ static void sctp_cmd_send_asconf(struct sctp_association *asoc) * If you want to understand all of lksctp, this is a * good place to start. */ -int sctp_do_sm(struct net *net, enum sctp_event event_type, +int sctp_do_sm(struct net *net, enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) @@ -1199,7 +1199,7 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, /***************************************************************** * This the master state function side effect processing function. *****************************************************************/ -static int sctp_side_effects(enum sctp_event event_type, +static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -1285,7 +1285,7 @@ bail: ********************************************************************/ /* This is the side-effect interpreter. */ -static int sctp_cmd_interpreter(enum sctp_event event_type, +static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 691d9dc620e3..d239b94aa48c 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -79,7 +79,7 @@ static const struct sctp_sm_table_entry bug = { const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, - enum sctp_event event_type, + enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bf618d1b41fd..f93c3cf9e567 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2230,7 +2230,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (sp->recvrcvinfo) sctp_ulpevent_read_rcvinfo(event, msg); /* Check if we allow SCTP_SNDRCVINFO. */ - if (sp->subscribe.sctp_data_io_event) + if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_DATA_IO_EVENT)) sctp_ulpevent_read_sndrcvinfo(event, msg); err = copied; @@ -2304,22 +2304,33 @@ static int sctp_setsockopt_disable_fragments(struct sock *sk, static int sctp_setsockopt_events(struct sock *sk, char __user *optval, unsigned int optlen) { + struct sctp_event_subscribe subscribe; + __u8 *sn_type = (__u8 *)&subscribe; + struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_ulpevent *event; + int i; if (optlen > sizeof(struct sctp_event_subscribe)) return -EINVAL; - if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) + + if (copy_from_user(&subscribe, optval, optlen)) return -EFAULT; + for (i = 0; i < optlen; i++) + sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i, + sn_type[i]); + + list_for_each_entry(asoc, &sp->ep->asocs, asocs) + asoc->subscribe = sctp_sk(sk)->subscribe; + /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. */ - if (sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT, - &sctp_sk(sk)->subscribe)) { - asoc = sctp_id2assoc(sk, 0); + if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) { + struct sctp_ulpevent *event; + asoc = sctp_id2assoc(sk, 0); if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_USER | __GFP_NOWARN); @@ -3324,8 +3335,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) : sizeof(struct sctp_data_chunk); - min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, - datasize); + min_len = sctp_min_frag_point(sp, datasize); max_len = SCTP_MAX_CHUNK_LEN - datasize; if (val < min_len || val > max_len) @@ -4261,6 +4271,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, return 0; } +static int sctp_setsockopt_event(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_ulpevent *event; + struct sctp_event param; + int retval = 0; + + if (optlen < sizeof(param)) { + retval = -EINVAL; + goto out; + } + + optlen = sizeof(param); + if (copy_from_user(¶m, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (param.se_type < SCTP_SN_TYPE_BASE || + param.se_type > SCTP_SN_TYPE_MAX) { + retval = -EINVAL; + goto out; + } + + asoc = sctp_id2assoc(sk, param.se_assoc_id); + if (!asoc) { + sctp_ulpevent_type_set(&sctp_sk(sk)->subscribe, + param.se_type, param.se_on); + goto out; + } + + sctp_ulpevent_type_set(&asoc->subscribe, param.se_type, param.se_on); + + if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) { + if (sctp_outq_is_empty(&asoc->outqueue)) { + event = sctp_ulpevent_make_sender_dry_event(asoc, + GFP_USER | __GFP_NOWARN); + if (!event) { + retval = -ENOMEM; + goto out; + } + + asoc->stream.si->enqueue_event(&asoc->ulpq, event); + } + } + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4458,6 +4519,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_REUSE_PORT: retval = sctp_setsockopt_reuse_port(sk, optval, optlen); break; + case SCTP_EVENT: + retval = sctp_setsockopt_event(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4706,7 +4770,7 @@ static int sctp_init_sock(struct sock *sk) /* Initialize default event subscriptions. By default, all the * options are off. */ - memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe)); + sp->subscribe = 0; /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS @@ -5251,14 +5315,24 @@ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len, static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, int __user *optlen) { + struct sctp_event_subscribe subscribe; + __u8 *sn_type = (__u8 *)&subscribe; + int i; + if (len == 0) return -EINVAL; if (len > sizeof(struct sctp_event_subscribe)) len = sizeof(struct sctp_event_subscribe); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len)) + + for (i = 0; i < len; i++) + sn_type[i] = sctp_ulpevent_type_enabled(sctp_sk(sk)->subscribe, + SCTP_SN_TYPE_BASE + i); + + if (copy_to_user(optval, &subscribe, len)) return -EFAULT; + return 0; } @@ -7393,6 +7467,37 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval, + int __user *optlen) +{ + struct sctp_association *asoc; + struct sctp_event param; + __u16 subscribe; + + if (len < sizeof(param)) + return -EINVAL; + + len = sizeof(param); + if (copy_from_user(¶m, optval, len)) + return -EFAULT; + + if (param.se_type < SCTP_SN_TYPE_BASE || + param.se_type > SCTP_SN_TYPE_MAX) + return -EINVAL; + + asoc = sctp_id2assoc(sk, param.se_assoc_id); + subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe; + param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type); + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, ¶m, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7591,6 +7696,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_REUSE_PORT: retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); break; + case SCTP_EVENT: + retval = sctp_getsockopt_event(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7628,8 +7736,10 @@ static struct sctp_bind_bucket *sctp_bucket_create( static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { - bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse); + struct sctp_sock *sp = sctp_sk(sk); + bool reuse = (sk->sk_reuse || sp->reuse); struct sctp_bind_hashbucket *head; /* hash list */ + kuid_t uid = sock_i_uid(sk); struct sctp_bind_bucket *pp; unsigned short snum; int ret; @@ -7705,7 +7815,10 @@ pp_found: pr_debug("%s: found a possible match\n", __func__); - if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) + if ((pp->fastreuse && reuse && + sk->sk_state != SCTP_SS_LISTENING) || + (pp->fastreuseport && sk->sk_reuseport && + uid_eq(pp->fastuid, uid))) goto success; /* Run through the list of sockets bound to the port @@ -7719,16 +7832,18 @@ pp_found: * in an endpoint. */ sk_for_each_bound(sk2, &pp->owner) { - struct sctp_endpoint *ep2; - ep2 = sctp_sk(sk2)->ep; + struct sctp_sock *sp2 = sctp_sk(sk2); + struct sctp_endpoint *ep2 = sp2->ep; if (sk == sk2 || - (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) && - sk2->sk_state != SCTP_SS_LISTENING)) + (reuse && (sk2->sk_reuse || sp2->reuse) && + sk2->sk_state != SCTP_SS_LISTENING) || + (sk->sk_reuseport && sk2->sk_reuseport && + uid_eq(uid, sock_i_uid(sk2)))) continue; - if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, - sctp_sk(sk2), sctp_sk(sk))) { + if (sctp_bind_addr_conflict(&ep2->base.bind_addr, + addr, sp2, sp)) { ret = (long)sk2; goto fail_unlock; } @@ -7751,19 +7866,32 @@ pp_not_found: pp->fastreuse = 1; else pp->fastreuse = 0; - } else if (pp->fastreuse && - (!reuse || sk->sk_state == SCTP_SS_LISTENING)) - pp->fastreuse = 0; + + if (sk->sk_reuseport) { + pp->fastreuseport = 1; + pp->fastuid = uid; + } else { + pp->fastreuseport = 0; + } + } else { + if (pp->fastreuse && + (!reuse || sk->sk_state == SCTP_SS_LISTENING)) + pp->fastreuse = 0; + + if (pp->fastreuseport && + (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid))) + pp->fastreuseport = 0; + } /* We are set, so fill up all the data in the hash table * entry, tie the socket list information with the rest of the * sockets FIXME: Blurry, NPI (ipg). */ success: - if (!sctp_sk(sk)->bind_hash) { + if (!sp->bind_hash) { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &pp->owner); - sctp_sk(sk)->bind_hash = pp; + sp->bind_hash = pp; } ret = 0; @@ -7836,8 +7964,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) } sk->sk_max_ack_backlog = backlog; - sctp_hash_endpoint(ep); - return 0; + return sctp_hash_endpoint(ep); } /* diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 0a78cdf86463..a6bf21579466 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -140,7 +140,7 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; - struct sk_buff *pos; + struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { @@ -166,23 +166,30 @@ static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, return; } + loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && - MID_lt(event->mid, cevent->mid))) + MID_lt(event->mid, cevent->mid))) { + loc = pos; break; - + } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || - event->fsn < cevent->fsn)) + event->fsn < cevent->fsn)) { + loc = pos; break; + } } - __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); + if (!loc) + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + else + __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( @@ -383,7 +390,7 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; - struct sk_buff *pos; + struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { @@ -403,18 +410,25 @@ static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, return; } + loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; - if (cevent->stream > event->stream) + if (cevent->stream > event->stream) { + loc = pos; break; - + } if (cevent->stream == event->stream && - MID_lt(event->mid, cevent->mid)) + MID_lt(event->mid, cevent->mid)) { + loc = pos; break; + } } - __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); + if (!loc) + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + else + __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, @@ -489,7 +503,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq, sk_incoming_cpu_update(sk); } - if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) + if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; if (skb_list) @@ -980,17 +994,19 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; - if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, - &sctp_sk(sk)->subscribe)) + if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, + SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { + struct sctp_sock *sp = sctp_sk(sk); + __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); - if (!sctp_sk(sk)->data_ready_signalled) { - sctp_sk(sk)->data_ready_signalled = 1; + if (!sp->data_ready_signalled) { + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 331cc734e3db..5dde92101743 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -219,7 +219,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sk_incoming_cpu_update(sk); } /* Check if the user wishes to receive this event. */ - if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) + if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; /* If we are in partial delivery mode, post to the lobby until @@ -1129,16 +1129,16 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *ev = NULL; - struct sock *sk; struct sctp_sock *sp; + struct sock *sk; if (!ulpq->pd_mode) return; sk = ulpq->asoc->base.sk; sp = sctp_sk(sk); - if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, - &sctp_sk(sk)->subscribe)) + if (sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, + SCTP_PARTIAL_DELIVERY_EVENT)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, 0, 0, 0, gfp); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5fbaf1901571..c4da4a78d369 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -147,8 +147,14 @@ static int smc_release(struct socket *sock) sk->sk_shutdown |= SHUTDOWN_MASK; } if (smc->clcsock) { + if (smc->use_fallback && sk->sk_state == SMC_LISTEN) { + /* wake up clcsock accept */ + rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + } + mutex_lock(&smc->clcsock_release_lock); sock_release(smc->clcsock); smc->clcsock = NULL; + mutex_unlock(&smc->clcsock_release_lock); } if (smc->use_fallback) { if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) @@ -205,6 +211,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); + mutex_init(&smc->clcsock_release_lock); return sk; } @@ -301,14 +308,17 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, optionally send confirm_rkey msg to register with peer */ +/* register a new rmb, send confirm_rkey msg to register with peer */ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, bool conf_rkey) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { - rmb_desc->regerr = 1; - return -EFAULT; + if (!rmb_desc->wr_reg) { + /* register memory region for new rmb */ + if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { + rmb_desc->regerr = 1; + return -EFAULT; + } + rmb_desc->wr_reg = 1; } if (!conf_rkey) return 0; @@ -337,8 +347,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } if (link->llc_confirm_rc) @@ -365,8 +375,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; } /* send add link reject message, only one link supported for now */ @@ -535,7 +545,8 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type, if (rc) return rc; /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT); + return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT, + CLC_WAIT_TIME); } /* setup for RDMA connection of client */ @@ -583,8 +594,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, local_contact); } else { - if (!smc->conn.rmb_desc->reused && - smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, local_contact); } @@ -821,7 +831,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct socket *new_clcsock = NULL; struct sock *lsk = &lsmc->sk; struct sock *new_sk; - int rc; + int rc = -EINVAL; release_sock(lsk); new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); @@ -834,7 +844,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) } *new_smc = smc_sk(new_sk); - rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); + mutex_lock(&lsmc->clcsock_release_lock); + if (lsmc->clcsock) + rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); + mutex_unlock(&lsmc->clcsock_release_lock); lock_sock(lsk); if (rc < 0) lsk->sk_err = -rc; @@ -968,8 +981,8 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } if (link->llc_confirm_resp_rc) @@ -989,8 +1002,8 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; } smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); @@ -1145,10 +1158,8 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; if (local_contact != SMC_FIRST_CONTACT) { - if (!new_smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) - return SMC_CLC_DECL_ERR_REGRMB; - } + if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1184,7 +1195,6 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, return 0; decline: - mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); return reason_code; } @@ -1225,7 +1235,7 @@ static void smc_listen_work(struct work_struct *work) */ pclc = (struct smc_clc_msg_proposal *)&buf; reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, - SMC_CLC_PROPOSAL); + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (reason_code) { smc_listen_decline(new_smc, reason_code, 0); return; @@ -1275,7 +1285,7 @@ static void smc_listen_work(struct work_struct *work) /* receive SMC Confirm CLC message */ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM); + SMC_CLC_CONFIRM, CLC_WAIT_TIME); if (reason_code) { mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); @@ -1284,8 +1294,10 @@ static void smc_listen_work(struct work_struct *work) /* finish worker */ if (!ism_supported) { - if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) + if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) { + mutex_unlock(&smc_create_lgr_pending); return; + } } smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); @@ -1357,7 +1369,6 @@ static int smc_listen(struct socket *sock, int backlog) sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; - INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); sock_hold(sk); /* sock_hold in tcp_listen_worker */ if (!schedule_work(&smc->tcp_listen_work)) sock_put(sk); diff --git a/net/smc/smc.h b/net/smc/smc.h index 08786ace6010..5721416d0605 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -219,6 +219,10 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ + struct mutex clcsock_release_lock; + /* protects clcsock of a listen + * socket + * */ }; static inline struct smc_sock *smc_sk(const struct sock *sk) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 89c3a8c7859a..776e9dfc915d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -265,7 +265,7 @@ out: * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. */ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, - u8 expected_type) + u8 expected_type, unsigned long timeout) { long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; struct sock *clc_sk = smc->clcsock->sk; @@ -285,7 +285,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, * sizeof(struct smc_clc_msg_hdr) */ krflags = MSG_PEEK | MSG_WAITALL; - smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + clc_sk->sk_rcvtimeo = timeout; iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); @@ -297,7 +297,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clc_sk->sk_err) { reason_code = -clc_sk->sk_err; - smc->sk.sk_err = clc_sk->sk_err; + if (clc_sk->sk_err == EAGAIN && + expected_type == SMC_CLC_DECLINE) + clc_sk->sk_err = 0; /* reset for fallback usage */ + else + smc->sk.sk_err = clc_sk->sk_err; goto out; } if (!len) { /* peer has performed orderly shutdown */ @@ -306,7 +310,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, goto out; } if (len < 0) { - smc->sk.sk_err = -len; + if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE) + smc->sk.sk_err = -len; reason_code = len; goto out; } @@ -346,7 +351,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } out: - smc->clcsock->sk->sk_rcvtimeo = rcvtimeo; + clc_sk->sk_rcvtimeo = rcvtimeo; return reason_code; } @@ -374,10 +379,8 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(struct smc_clc_msg_decline)); if (len < sizeof(struct smc_clc_msg_decline)) - smc->sk.sk_err = EPROTO; - if (len < 0) - smc->sk.sk_err = -len; - return sock_error(&smc->sk); + len = -EPROTO; + return len > 0 ? 0 : len; } /* send CLC PROPOSAL message across internal TCP socket */ @@ -536,7 +539,6 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) struct smc_link *link; struct msghdr msg; struct kvec vec; - int rc = 0; int len; memset(&aclc, 0, sizeof(aclc)); @@ -589,13 +591,8 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) vec.iov_len = ntohs(aclc.hdr.length); len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, ntohs(aclc.hdr.length)); - if (len < ntohs(aclc.hdr.length)) { - if (len >= 0) - new_smc->sk.sk_err = EPROTO; - else - new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err; - rc = sock_error(&new_smc->sk); - } + if (len < ntohs(aclc.hdr.length)) + len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; - return rc; + return len > 0 ? 0 : len; } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 18da89b681c2..24658e8c0de4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -27,6 +27,7 @@ #define SMC_TYPE_D 1 /* SMC-D only */ #define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ +#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ #define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */ #define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */ @@ -182,7 +183,7 @@ struct smcd_dev; int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, - u8 expected_type); + u8 expected_type, unsigned long timeout); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1c9fa7f0261a..35c1cdc93e1c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -149,6 +149,8 @@ static int smc_link_send_delete(struct smc_link *lnk) return -ENOTCONN; } +static void smc_lgr_free(struct smc_link_group *lgr); + static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), @@ -171,8 +173,11 @@ free: spin_unlock_bh(&smc_lgr_list.lock); if (!lgr->is_smcd && !lgr->terminating) { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* try to send del link msg, on error free lgr immediately */ - if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) { + if (lnk->state == SMC_LNK_ACTIVE && + !smc_link_send_delete(lnk)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); return; @@ -295,8 +300,13 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - conn->rmb_desc->reused = 1; conn->rmb_desc->used = 0; + if (!lgr->is_smcd) { + /* unregister rmb with peer */ + smc_llc_do_delete_rkey( + &lgr->lnk[SMC_SINGLE_LINK], + conn->rmb_desc); + } } else { /* buf registration failed, reuse not possible */ write_lock_bh(&lgr->rmbs_lock); @@ -410,7 +420,7 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) } /* remove a link group */ -void smc_lgr_free(struct smc_link_group *lgr) +static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); if (lgr->is_smcd) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index cf98f4d6093e..b00287989a3d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -109,6 +109,9 @@ struct smc_link { int llc_testlink_time; /* testlink interval */ struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ + struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ + int llc_delete_rkey_rc; /* rc from del rkey msg */ + struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -127,7 +130,7 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 reused : 1; /* new created / reused */ + u8 wr_reg : 1; /* mem region registered */ u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ @@ -243,7 +246,6 @@ struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; -void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9c916c709ca7..a6d3623d06f4 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -238,6 +238,29 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, return rc; } +/* send LLC delete rkey request */ +static int smc_llc_send_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_llc_msg_delete_rkey *rkeyllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; + memset(rkeyllc, 0, sizeof(*rkeyllc)); + rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; + rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); + rkeyllc->num_rkeys = 1; + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + /* prepare an add link message */ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, struct smc_link *link, u8 mac[], u8 gid[], @@ -509,7 +532,9 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, int i, max; if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ + link->llc_delete_rkey_rc = llc->hd.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_delete_rkey); } else { max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { @@ -610,6 +635,8 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_add); init_completion(&link->llc_add_resp); init_completion(&link->llc_confirm_rkey); + init_completion(&link->llc_delete_rkey); + mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; @@ -650,8 +677,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, { int rc; + /* protected by mutex smc_create_lgr_pending */ reinit_completion(&link->llc_confirm_rkey); - smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_send_confirm_rkey(link, rmb_desc); + if (rc) + return rc; /* receive CONFIRM RKEY response from server over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, SMC_LLC_WAIT_TIME); @@ -660,6 +690,29 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, return 0; } +/* unregister an rtoken at the remote peer */ +int smc_llc_do_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + int rc; + + mutex_lock(&link->llc_delete_rkey_mutex); + reinit_completion(&link->llc_delete_rkey); + rc = smc_llc_send_delete_rkey(link, rmb_desc); + if (rc) + goto out; + /* receive DELETE RKEY response from server over RoCE fabric */ + rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, + SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_delete_rkey_rc) + rc = -EFAULT; + else + rc = 0; +out: + mutex_unlock(&link->llc_delete_rkey_mutex); + return rc; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 9e2ff088e301..461c0c3ef76e 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -49,6 +49,8 @@ void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smc_llc_do_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/socket.c b/net/socket.c index 334fcc617ef2..e89884e2197b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2341,8 +2341,9 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, * Linux recvmmsg interface */ -int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, - unsigned int flags, struct timespec64 *timeout) +static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct timespec64 *timeout) { int fput_needed, err, datagrams; struct socket *sock; @@ -2451,25 +2452,32 @@ out_put: return datagrams; } -static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, - unsigned int vlen, unsigned int flags, - struct __kernel_timespec __user *timeout) +int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct __kernel_timespec __user *timeout, + struct old_timespec32 __user *timeout32) { int datagrams; struct timespec64 timeout_sys; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - - if (!timeout) - return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); + if (timeout && get_timespec64(&timeout_sys, timeout)) + return -EFAULT; - if (get_timespec64(&timeout_sys, timeout)) + if (timeout32 && get_old_timespec32(&timeout_sys, timeout32)) return -EFAULT; - datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); + if (!timeout && !timeout32) + return do_recvmmsg(fd, mmsg, vlen, flags, NULL); + + datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); - if (datagrams > 0 && put_timespec64(&timeout_sys, timeout)) + if (datagrams <= 0) + return datagrams; + + if (timeout && put_timespec64(&timeout_sys, timeout)) + datagrams = -EFAULT; + + if (timeout32 && put_old_timespec32(&timeout_sys, timeout32)) datagrams = -EFAULT; return datagrams; @@ -2479,8 +2487,23 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct __kernel_timespec __user *, timeout) { - return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + + return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL); +} + +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct old_timespec32 __user *, timeout) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + + return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout); } +#endif #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ @@ -2600,8 +2623,15 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) a[2], true); break; case SYS_RECVMMSG: - err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], - a[3], (struct __kernel_timespec __user *)a[4]); + if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) + err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, + a[2], a[3], + (struct __kernel_timespec __user *)a[4], + NULL); + else + err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, + a[2], a[3], NULL, + (struct old_timespec32 __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 5d3f252659f1..ba765473d1f0 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1791,6 +1791,7 @@ priv_release_snd_buf(struct rpc_rqst *rqstp) for (i=0; i < rqstp->rq_enc_pages_num; i++) __free_page(rqstp->rq_enc_pages[i]); kfree(rqstp->rq_enc_pages); + rqstp->rq_release_snd_buf = NULL; } static int @@ -1799,6 +1800,9 @@ alloc_enc_pages(struct rpc_rqst *rqstp) struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; int first, last, i; + if (rqstp->rq_release_snd_buf) + rqstp->rq_release_snd_buf(rqstp); + if (snd_buf->page_len == 0) { rqstp->rq_enc_pages_num = 0; return 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index ae3b8145da35..24cbddc44c88 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1915,6 +1915,13 @@ call_connect_status(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; + /* Check if the task was already transmitted */ + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { + xprt_end_transmit(task); + task->tk_action = call_transmit_status; + return; + } + dprint_status(task); trace_rpc_connect_status(task); @@ -1945,6 +1952,7 @@ call_connect_status(struct rpc_task *task) /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); /* fall through */ + case -ENOTCONN: case -EAGAIN: /* Check for timeouts before looping back to call_bind */ case -ETIMEDOUT: @@ -2302,6 +2310,7 @@ out_retry: task->tk_status = 0; /* Note: rpc_verify_header() may have freed the RPC slot */ if (task->tk_rqstp == req) { + xdr_free_bvec(&req->rq_rcv_buf); req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 9062967575c4..7e55cfc69697 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -175,7 +175,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); return 0; no_checksum: if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 86bea4520c4d..73547d17d3c6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -67,7 +67,6 @@ */ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); -static void xprt_connect_status(struct rpc_task *task); static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); @@ -680,7 +679,9 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(xprtiod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + else if (xprt->snd_task) + rpc_wake_up_queued_task_set_status(&xprt->pending, + xprt->snd_task, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); @@ -820,46 +821,25 @@ void xprt_connect(struct rpc_task *task) if (!xprt_connected(xprt)) { task->tk_timeout = task->tk_rqstp->rq_timeout; task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; - rpc_sleep_on(&xprt->pending, task, xprt_connect_status); + rpc_sleep_on(&xprt->pending, task, NULL); if (test_bit(XPRT_CLOSING, &xprt->state)) return; if (xprt_test_and_set_connecting(xprt)) return; - xprt->stat.connect_start = jiffies; - xprt->ops->connect(xprt, task); + /* Race breaker */ + if (!xprt_connected(xprt)) { + xprt->stat.connect_start = jiffies; + xprt->ops->connect(xprt, task); + } else { + xprt_clear_connecting(xprt); + task->tk_status = 0; + rpc_wake_up_queued_task(&xprt->pending, task); + } } xprt_release_write(xprt, task); } -static void xprt_connect_status(struct rpc_task *task) -{ - switch (task->tk_status) { - case 0: - dprintk("RPC: %5u xprt_connect_status: connection established\n", - task->tk_pid); - break; - case -ECONNREFUSED: - case -ECONNRESET: - case -ECONNABORTED: - case -ENETUNREACH: - case -EHOSTUNREACH: - case -EPIPE: - case -EAGAIN: - dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); - break; - case -ETIMEDOUT: - dprintk("RPC: %5u xprt_connect_status: connect attempt timed " - "out\n", task->tk_pid); - break; - default: - dprintk("RPC: %5u xprt_connect_status: error %d connecting to " - "server %s\n", task->tk_pid, -task->tk_status, - task->tk_rqstp->rq_xprt->servername); - task->tk_status = -EIO; - } -} - enum xprt_xid_rb_cmp { XID_RB_EQUAL, XID_RB_LEFT, @@ -1623,6 +1603,8 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.buflen = 0; req->rq_rcv_buf.len = 0; req->rq_rcv_buf.buflen = 0; + req->rq_snd_buf.bvec = NULL; + req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; xprt_reset_majortimeo(req); dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ae77c71c1f64..f0b3700cec95 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -330,18 +330,16 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) { size_t i,n; - if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + if (!want || !(buf->flags & XDRBUF_SPARSE_PAGES)) return want; - if (want > buf->page_len) - want = buf->page_len; n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < n; i++) { if (buf->pages[i]) continue; buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp); if (!buf->pages[i]) { - buf->page_len = (i * PAGE_SIZE) - buf->page_base; - return buf->page_len; + i *= PAGE_SIZE; + return i > buf->page_base ? i - buf->page_base : 0; } } return want; @@ -378,8 +376,8 @@ static ssize_t xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { - struct kvec kvec = { 0 }; - return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0); + iov_iter_discard(&msg->msg_iter, READ, count); + return sock_recvmsg(sock, msg, flags); } static ssize_t @@ -398,16 +396,17 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) - goto eagain; + goto out; seek = 0; } else { seek -= buf->head[0].iov_len; offset += buf->head[0].iov_len; } - if (seek < buf->page_len) { - want = xs_alloc_sparse_pages(buf, - min_t(size_t, count - offset, buf->page_len), - GFP_NOWAIT); + + want = xs_alloc_sparse_pages(buf, + min_t(size_t, count - offset, buf->page_len), + GFP_NOWAIT); + if (seek < want) { ret = xs_read_bvec(sock, msg, flags, buf->bvec, xdr_buf_pagecount(buf), want + buf->page_base, @@ -418,12 +417,13 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) - goto eagain; + goto out; seek = 0; } else { - seek -= buf->page_len; - offset += buf->page_len; + seek -= want; + offset += want; } + if (seek < buf->tail[0].iov_len) { want = min_t(size_t, count - offset, buf->tail[0].iov_len); ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek); @@ -433,17 +433,13 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) - goto eagain; + goto out; } else offset += buf->tail[0].iov_len; ret = -EMSGSIZE; - msg->msg_flags |= MSG_TRUNC; out: *read = offset - seek_init; return ret; -eagain: - ret = -EAGAIN; - goto out; sock_err: offset += seek; goto out; @@ -486,19 +482,20 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, if (transport->recv.offset == transport->recv.len) { if (xs_read_stream_request_done(transport)) msg->msg_flags |= MSG_EOR; - return transport->recv.copied; + return read; } switch (ret) { + default: + break; + case -EFAULT: case -EMSGSIZE: - return transport->recv.copied; + msg->msg_flags |= MSG_TRUNC; + return read; case 0: return -ESHUTDOWN; - default: - if (ret < 0) - return ret; } - return -EAGAIN; + return ret < 0 ? ret : read; } static size_t @@ -537,7 +534,7 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) ret = xs_read_stream_request(transport, msg, flags, req); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) - xprt_complete_bc_request(req, ret); + xprt_complete_bc_request(req, transport->recv.copied); return ret; } @@ -570,7 +567,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) spin_lock(&xprt->queue_lock); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) - xprt_complete_rqst(req->rq_task, ret); + xprt_complete_rqst(req->rq_task, transport->recv.copied); xprt_unpin_rqst(req); out: spin_unlock(&xprt->queue_lock); @@ -591,10 +588,8 @@ xs_read_stream(struct sock_xprt *transport, int flags) if (ret <= 0) goto out_err; transport->recv.offset = ret; - if (ret != want) { - ret = -EAGAIN; - goto out_err; - } + if (transport->recv.offset != want) + return transport->recv.offset; transport->recv.len = be32_to_cpu(transport->recv.fraghdr) & RPC_FRAGMENT_SIZE_MASK; transport->recv.offset -= sizeof(transport->recv.fraghdr); @@ -602,6 +597,9 @@ xs_read_stream(struct sock_xprt *transport, int flags) } switch (be32_to_cpu(transport->recv.calldir)) { + default: + msg.msg_flags |= MSG_TRUNC; + break; case RPC_CALL: ret = xs_read_stream_call(transport, &msg, flags); break; @@ -616,6 +614,9 @@ xs_read_stream(struct sock_xprt *transport, int flags) goto out_err; read += ret; if (transport->recv.offset < transport->recv.len) { + if (!(msg.msg_flags & MSG_TRUNC)) + return read; + msg.msg_flags = 0; ret = xs_read_discard(transport->sock, &msg, flags, transport->recv.len - transport->recv.offset); if (ret <= 0) @@ -623,7 +624,7 @@ xs_read_stream(struct sock_xprt *transport, int flags) transport->recv.offset += ret; read += ret; if (transport->recv.offset != transport->recv.len) - return -EAGAIN; + return read; } if (xs_read_stream_request_done(transport)) { trace_xs_stream_read_request(transport); @@ -633,13 +634,7 @@ xs_read_stream(struct sock_xprt *transport, int flags) transport->recv.len = 0; return read; out_err: - switch (ret) { - case 0: - case -ESHUTDOWN: - xprt_force_disconnect(&transport->xprt); - return -ESHUTDOWN; - } - return ret; + return ret != 0 ? ret : -ESHUTDOWN; } static void xs_stream_data_receive(struct sock_xprt *transport) @@ -648,12 +643,12 @@ static void xs_stream_data_receive(struct sock_xprt *transport) ssize_t ret = 0; mutex_lock(&transport->recv_mutex); + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); if (transport->sock == NULL) goto out; - clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); for (;;) { ret = xs_read_stream(transport, MSG_DONTWAIT); - if (ret <= 0) + if (ret < 0) break; read += ret; cond_resched(); @@ -1222,6 +1217,8 @@ static void xs_reset_transport(struct sock_xprt *transport) trace_rpc_socket_close(xprt, sock); sock_release(sock); + + xprt_disconnect_done(xprt); } /** @@ -1242,8 +1239,6 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); xprt->reestablish_timeout = 0; - - xprt_disconnect_done(xprt); } static void xs_inject_disconnect(struct rpc_xprt *xprt) @@ -1345,10 +1340,10 @@ static void xs_udp_data_receive(struct sock_xprt *transport) int err; mutex_lock(&transport->recv_mutex); + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); sk = transport->inet; if (sk == NULL) goto out; - clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); for (;;) { skb = skb_recv_udp(sk, 0, 1, &err); if (skb == NULL) @@ -1494,8 +1489,6 @@ static void xs_tcp_state_change(struct sock *sk) &transport->sock_state)) xprt_clear_connecting(xprt); clear_bit(XPRT_CLOSING, &xprt->state); - if (sk->sk_err) - xprt_wake_pending_tasks(xprt, -sk->sk_err); /* Trigger the socket release */ xs_tcp_force_close(xprt); } @@ -2097,8 +2090,8 @@ static void xs_udp_setup_socket(struct work_struct *work) trace_rpc_socket_connect(xprt, sock, 0); status = 0; out: - xprt_unlock_connect(xprt, transport); xprt_clear_connecting(xprt); + xprt_unlock_connect(xprt, transport); xprt_wake_pending_tasks(xprt, status); } @@ -2334,8 +2327,8 @@ static void xs_tcp_setup_socket(struct work_struct *work) } status = -EAGAIN; out: - xprt_unlock_connect(xprt, transport); xprt_clear_connecting(xprt); + xprt_unlock_connect(xprt, transport); xprt_wake_pending_tasks(xprt, status); } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 74b9d916a58b..5df9d1138ac9 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -353,34 +353,35 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) return 0; } -static int __switchdev_port_obj_add(struct net_device *dev, - const struct switchdev_obj *obj, - struct switchdev_trans *trans) +static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, + struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans, + struct netlink_ext_ack *extack) { - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct list_head *iter; - int err = -EOPNOTSUPP; - - if (ops && ops->switchdev_port_obj_add) - return ops->switchdev_port_obj_add(dev, obj, trans); + int rc; + int err; - /* Switch device port(s) may be stacked under - * bond/team/vlan dev, so recurse down to add object on - * each port. - */ + struct switchdev_notifier_port_obj_info obj_info = { + .obj = obj, + .trans = trans, + .handled = false, + }; - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = __switchdev_port_obj_add(lower_dev, obj, trans); - if (err) - break; + rc = call_switchdev_blocking_notifiers(nt, dev, &obj_info.info, extack); + err = notifier_to_errno(rc); + if (err) { + WARN_ON(!obj_info.handled); + return err; } - - return err; + if (!obj_info.handled) + return -EOPNOTSUPP; + return 0; } static int switchdev_port_obj_add_now(struct net_device *dev, - const struct switchdev_obj *obj) + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack) { struct switchdev_trans trans; int err; @@ -397,7 +398,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev, */ trans.ph_prepare = true; - err = __switchdev_port_obj_add(dev, obj, &trans); + err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, &trans, extack); if (err) { /* Prepare phase failed: abort the transaction. Any * resources reserved in the prepare phase are @@ -416,7 +418,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev, */ trans.ph_prepare = false; - err = __switchdev_port_obj_add(dev, obj, &trans); + err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, &trans, extack); WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); switchdev_trans_items_warn_destroy(dev, &trans); @@ -429,7 +432,7 @@ static void switchdev_port_obj_add_deferred(struct net_device *dev, const struct switchdev_obj *obj = data; int err; - err = switchdev_port_obj_add_now(dev, obj); + err = switchdev_port_obj_add_now(dev, obj, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", err, obj->id); @@ -459,38 +462,21 @@ static int switchdev_port_obj_add_defer(struct net_device *dev, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_obj_add(struct net_device *dev, - const struct switchdev_obj *obj) + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack) { if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_add_defer(dev, obj); ASSERT_RTNL(); - return switchdev_port_obj_add_now(dev, obj); + return switchdev_port_obj_add_now(dev, obj, extack); } EXPORT_SYMBOL_GPL(switchdev_port_obj_add); static int switchdev_port_obj_del_now(struct net_device *dev, const struct switchdev_obj *obj) { - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct list_head *iter; - int err = -EOPNOTSUPP; - - if (ops && ops->switchdev_port_obj_del) - return ops->switchdev_port_obj_del(dev, obj); - - /* Switch device port(s) may be stacked under - * bond/team/vlan dev, so recurse down to delete object on - * each port. - */ - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_obj_del_now(lower_dev, obj); - if (err) - break; - } - - return err; + return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL, + dev, obj, NULL, NULL); } static void switchdev_port_obj_del_deferred(struct net_device *dev, @@ -535,6 +521,7 @@ int switchdev_port_obj_del(struct net_device *dev, EXPORT_SYMBOL_GPL(switchdev_port_obj_del); static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); +static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain); /** * register_switchdev_notifier - Register notifier @@ -572,10 +559,38 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info) { info->dev = dev; + info->extack = NULL; return atomic_notifier_call_chain(&switchdev_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); +int register_switchdev_blocking_notifier(struct notifier_block *nb) +{ + struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + + return blocking_notifier_chain_register(chain, nb); +} +EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier); + +int unregister_switchdev_blocking_notifier(struct notifier_block *nb) +{ + struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + + return blocking_notifier_chain_unregister(chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier); + +int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info, + struct netlink_ext_ack *extack) +{ + info->dev = dev; + info->extack = extack; + return blocking_notifier_call_chain(&switchdev_blocking_notif_chain, + val, info); +} +EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); + bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { @@ -595,3 +610,109 @@ bool switchdev_port_same_parent_id(struct net_device *a, return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); } EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); + +static int __switchdev_handle_port_obj_add(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*add_cb)(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans, + struct netlink_ext_ack *extack)) +{ + struct netlink_ext_ack *extack; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + extack = switchdev_notifier_info_to_extack(&port_obj_info->info); + + if (check_cb(dev)) { + /* This flag is only checked if the return value is success. */ + port_obj_info->handled = true; + return add_cb(dev, port_obj_info->obj, port_obj_info->trans, + extack); + } + + /* Switch ports might be stacked under e.g. a LAG. Ignore the + * unsupported devices, another driver might be able to handle them. But + * propagate to the callers any hard errors. + * + * If the driver does its own bookkeeping of stacked ports, it's not + * necessary to go through this helper. + */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info, + check_cb, add_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return err; +} + +int switchdev_handle_port_obj_add(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*add_cb)(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans, + struct netlink_ext_ack *extack)) +{ + int err; + + err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb, + add_cb); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add); + +static int __switchdev_handle_port_obj_del(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*del_cb)(struct net_device *dev, + const struct switchdev_obj *obj)) +{ + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (check_cb(dev)) { + /* This flag is only checked if the return value is success. */ + port_obj_info->handled = true; + return del_cb(dev, port_obj_info->obj); + } + + /* Switch ports might be stacked under e.g. a LAG. Ignore the + * unsupported devices, another driver might be able to handle them. But + * propagate to the callers any hard errors. + * + * If the driver does its own bookkeeping of stacked ports, it's not + * necessary to go through this helper. + */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info, + check_cb, del_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return err; +} + +int switchdev_handle_port_obj_del(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + int (*del_cb)(struct net_device *dev, + const struct switchdev_obj *obj)) +{ + int err; + + err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb, + del_cb); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del); diff --git a/net/tipc/Makefile b/net/tipc/Makefile index aca168f2abb1..c86aba0282af 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -9,7 +9,9 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - topsrv.o socket.o group.o + topsrv.o socket.o group.o trace.o + +CFLAGS_trace.o += -I$(src) tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index e65c3a8551e4..fb2c0d8f359f 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -43,6 +43,7 @@ #include "bcast.h" #include "netlink.h" #include "udp_media.h" +#include "trace.h" #define MAX_ADDR_STR 60 @@ -99,7 +100,7 @@ static struct tipc_media *media_find_id(u8 type) /** * tipc_media_addr_printf - record media address in print buffer */ -void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) +int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) { char addr_str[MAX_ADDR_STR]; struct tipc_media *m; @@ -114,9 +115,10 @@ void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) ret = scnprintf(buf, len, "UNKNOWN(%u)", a->media_id); for (i = 0; i < sizeof(a->value); i++) - ret += scnprintf(buf - ret, len + ret, - "-%02x", a->value[i]); + ret += scnprintf(buf + ret, len - ret, + "-%x", a->value[i]); } + return ret; } /** @@ -607,6 +609,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, if (!b) return NOTIFY_DONE; + trace_tipc_l2_device_event(dev, b, evt); switch (evt) { case NETDEV_CHANGE: if (netif_carrier_ok(dev) && netif_oper_up(dev)) { diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 394290cbbb1d..7f4c569594a5 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -207,7 +207,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); -void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); +int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); void tipc_disable_l2_media(struct tipc_bearer *b); diff --git a/net/tipc/link.c b/net/tipc/link.c index 836727e363c4..2792a3cae682 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -43,6 +43,7 @@ #include "discover.h" #include "netlink.h" #include "monitor.h" +#include "trace.h" #include <linux/pkt_sched.h> @@ -105,7 +106,7 @@ struct tipc_stats { * @transmitq: queue for sent, non-acked messages * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages - * @last_retransmitted: sequence number of most recently retransmitted message + * @prev_from: sequence number of most previous retransmission request * @stale_cnt: counter for number of identical retransmit attempts * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released @@ -163,7 +164,7 @@ struct tipc_link { u16 limit; } backlog[5]; u16 snd_nxt; - u16 last_retransm; + u16 prev_from; u16 window; u16 stale_cnt; unsigned long stale_limit; @@ -186,9 +187,6 @@ struct tipc_link { u16 acked; struct tipc_link *bc_rcvlink; struct tipc_link *bc_sndlink; - unsigned long prev_retr; - u16 prev_from; - u16 prev_to; u8 nack_state; bool bc_peer_is_up; @@ -210,7 +208,7 @@ enum { BC_NACK_SND_SUPPRESS, }; -#define TIPC_BC_RETR_LIMIT 10 /* [ms] */ +#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */ /* * Interval between NACKs when packets arrive out of order @@ -359,9 +357,11 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, rcv_l->bc_peer_is_up = true; rcv_l->state = LINK_ESTABLISHED; tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); + trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!"); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; if (!snd_l->ackers) { + trace_tipc_link_reset(snd_l, TIPC_DUMP_ALL, "zero ackers!"); tipc_link_reset(snd_l); snd_l->state = LINK_RESET; __skb_queue_purge(xmitq); @@ -525,6 +525,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, l = *link; strcpy(l->name, tipc_bclink_name); + trace_tipc_link_reset(l, TIPC_DUMP_ALL, "bclink created!"); tipc_link_reset(l); l->state = LINK_RESET; l->ackers = 0; @@ -549,6 +550,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, int tipc_link_fsm_evt(struct tipc_link *l, int evt) { int rc = 0; + int old_state = l->state; switch (l->state) { case LINK_RESETTING: @@ -695,10 +697,12 @@ int tipc_link_fsm_evt(struct tipc_link *l, int evt) default: pr_err("Unknown FSM state %x in %s\n", l->state, l->name); } + trace_tipc_link_fsm(l->name, old_state, l->state, evt); return rc; illegal_evt: pr_err("Illegal FSM event %x in state %x on link %s\n", evt, l->state, l->name); + trace_tipc_link_fsm(l->name, old_state, l->state, evt); return rc; } @@ -743,6 +747,18 @@ static void link_profile_stats(struct tipc_link *l) l->stats.msg_length_profile[6]++; } +/** + * tipc_link_too_silent - check if link is "too silent" + * @l: tipc link to be checked + * + * Returns true if the link 'silent_intv_cnt' is about to reach the + * 'abort_limit' value, otherwise false + */ +bool tipc_link_too_silent(struct tipc_link *l) +{ + return (l->silent_intv_cnt + 2 > l->abort_limit); +} + /* tipc_link_timeout - perform periodic task as instructed from node timeout */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) @@ -756,6 +772,8 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) u16 bc_acked = l->bc_rcvlink->acked; struct tipc_mon_state *mstate = &l->mon_state; + trace_tipc_link_timeout(l, TIPC_DUMP_NONE, " "); + trace_tipc_link_too_silent(l, TIPC_DUMP_ALL, " "); switch (l->state) { case LINK_ESTABLISHED: case LINK_SYNCHING: @@ -818,6 +836,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr); skb_queue_tail(&l->wakeupq, skb); l->stats.link_congs++; + trace_tipc_link_conges(l, TIPC_DUMP_ALL, "wakeup scheduled!"); return -ELINKCONG; } @@ -948,6 +967,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, } __skb_dequeue(list); __skb_queue_tail(transmq, skb); + /* next retransmit attempt */ + if (link_is_bc_sndlink(l)) + TIPC_SKB_CB(skb)->nxt_retr = + jiffies + TIPC_BC_RETR_LIM; __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; l->rcv_unacked = 0; @@ -995,6 +1018,10 @@ static void tipc_link_advance_backlog(struct tipc_link *l, hdr = buf_msg(skb); l->backlog[msg_importance(hdr)].len--; __skb_queue_tail(&l->transmq, skb); + /* next retransmit attempt */ + if (link_is_bc_sndlink(l)) + TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; msg_set_seqno(hdr, seqno); @@ -1036,14 +1063,20 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, if (!skb) return 0; + if (less(to, from)) + return 0; + trace_tipc_link_retrans(r, from, to, &l->transmq); /* Detect repeated retransmit failures on same packet */ - if (r->last_retransm != buf_seqno(skb)) { - r->last_retransm = buf_seqno(skb); + if (r->prev_from != from) { + r->prev_from = from; r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); r->stale_cnt = 0; } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { link_retransmit_failure(l, skb); + trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); + trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); + trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); if (link_is_bc_sndlink(l)) return TIPC_LINK_DOWN_EVT; return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); @@ -1055,6 +1088,11 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, continue; if (more(msg_seqno(hdr), to)) break; + if (link_is_bc_sndlink(l)) { + if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) + continue; + TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + } _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; @@ -1398,6 +1436,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, l->stats.sent_nacks++; skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, skb); + trace_tipc_proto_build(skb, false, l->name); } void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, @@ -1561,6 +1600,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, char *if_name; int rc = 0; + trace_tipc_proto_rcv(skb, false, l->name); if (tipc_link_is_blocked(l) || !xmitq) goto exit; @@ -1571,8 +1611,11 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, hdr = buf_msg(skb); data = msg_data(hdr); - if (!tipc_link_validate_msg(l, hdr)) + if (!tipc_link_validate_msg(l, hdr)) { + trace_tipc_skb_dump(skb, false, "PROTO invalid (1)!"); + trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (1)!"); goto exit; + } switch (mtyp) { case RESET_MSG: @@ -1737,42 +1780,6 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) l->rcv_nxt = peers_snd_nxt; } -/* link_bc_retr eval()- check if the indicated range can be retransmitted now - * - Adjust permitted range if there is overlap with previous retransmission - */ -static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to) -{ - unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr); - - if (less(*to, *from)) - return false; - - /* New retransmission request */ - if ((elapsed > TIPC_BC_RETR_LIMIT) || - less(*to, l->prev_from) || more(*from, l->prev_to)) { - l->prev_from = *from; - l->prev_to = *to; - l->prev_retr = jiffies; - return true; - } - - /* Inside range of previous retransmit */ - if (!less(*from, l->prev_from) && !more(*to, l->prev_to)) - return false; - - /* Fully or partially outside previous range => exclude overlap */ - if (less(*from, l->prev_from)) { - *to = l->prev_from - 1; - l->prev_from = *from; - } - if (more(*to, l->prev_to)) { - *from = l->prev_to + 1; - l->prev_to = *to; - } - l->prev_retr = jiffies; - return true; -} - /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state */ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, @@ -1803,8 +1810,7 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - if (link_bc_retr_eval(snd_l, &from, &to)) - rc = tipc_link_retrans(snd_l, l, from, to, xmitq); + rc = tipc_link_retrans(snd_l, l, from, to, xmitq); l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) @@ -1852,6 +1858,7 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, if (!more(acked, l->acked)) return; + trace_tipc_link_bc_ack(l, l->acked, acked, &snd_l->transmq); /* Skip over packets peer has already acked */ skb_queue_walk(&snd_l->transmq, skb) { if (more(buf_seqno(skb), l->acked)) @@ -2255,3 +2262,122 @@ void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) { l->abort_limit = limit; } + +char *tipc_link_name_ext(struct tipc_link *l, char *buf) +{ + if (!l) + scnprintf(buf, TIPC_MAX_LINK_NAME, "null"); + else if (link_is_bc_sndlink(l)) + scnprintf(buf, TIPC_MAX_LINK_NAME, "broadcast-sender"); + else if (link_is_bc_rcvlink(l)) + scnprintf(buf, TIPC_MAX_LINK_NAME, + "broadcast-receiver, peer %x", l->addr); + else + memcpy(buf, l->name, TIPC_MAX_LINK_NAME); + + return buf; +} + +/** + * tipc_link_dump - dump TIPC link data + * @l: tipc link to be dumped + * @dqueues: bitmask to decide if any link queue to be dumped? + * - TIPC_DUMP_NONE: don't dump link queues + * - TIPC_DUMP_TRANSMQ: dump link transmq queue + * - TIPC_DUMP_BACKLOGQ: dump link backlog queue + * - TIPC_DUMP_DEFERDQ: dump link deferd queue + * - TIPC_DUMP_INPUTQ: dump link input queue + * - TIPC_DUMP_WAKEUP: dump link wakeup queue + * - TIPC_DUMP_ALL: dump all the link queues above + * @buf: returned buffer of dump data in format + */ +int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf) +{ + int i = 0; + size_t sz = (dqueues) ? LINK_LMAX : LINK_LMIN; + struct sk_buff_head *list; + struct sk_buff *hskb, *tskb; + u32 len; + + if (!l) { + i += scnprintf(buf, sz, "link data: (null)\n"); + return i; + } + + i += scnprintf(buf, sz, "link data: %x", l->addr); + i += scnprintf(buf + i, sz - i, " %x", l->state); + i += scnprintf(buf + i, sz - i, " %u", l->in_session); + i += scnprintf(buf + i, sz - i, " %u", l->session); + i += scnprintf(buf + i, sz - i, " %u", l->peer_session); + i += scnprintf(buf + i, sz - i, " %u", l->snd_nxt); + i += scnprintf(buf + i, sz - i, " %u", l->rcv_nxt); + i += scnprintf(buf + i, sz - i, " %u", l->snd_nxt_state); + i += scnprintf(buf + i, sz - i, " %u", l->rcv_nxt_state); + i += scnprintf(buf + i, sz - i, " %x", l->peer_caps); + i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt); + i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt); + i += scnprintf(buf + i, sz - i, " %u", l->prev_from); + i += scnprintf(buf + i, sz - i, " %u", l->stale_cnt); + i += scnprintf(buf + i, sz - i, " %u", l->acked); + + list = &l->transmq; + len = skb_queue_len(list); + hskb = skb_peek(list); + tskb = skb_peek_tail(list); + i += scnprintf(buf + i, sz - i, " | %u %u %u", len, + (hskb) ? msg_seqno(buf_msg(hskb)) : 0, + (tskb) ? msg_seqno(buf_msg(tskb)) : 0); + + list = &l->deferdq; + len = skb_queue_len(list); + hskb = skb_peek(list); + tskb = skb_peek_tail(list); + i += scnprintf(buf + i, sz - i, " | %u %u %u", len, + (hskb) ? msg_seqno(buf_msg(hskb)) : 0, + (tskb) ? msg_seqno(buf_msg(tskb)) : 0); + + list = &l->backlogq; + len = skb_queue_len(list); + hskb = skb_peek(list); + tskb = skb_peek_tail(list); + i += scnprintf(buf + i, sz - i, " | %u %u %u", len, + (hskb) ? msg_seqno(buf_msg(hskb)) : 0, + (tskb) ? msg_seqno(buf_msg(tskb)) : 0); + + list = l->inputq; + len = skb_queue_len(list); + hskb = skb_peek(list); + tskb = skb_peek_tail(list); + i += scnprintf(buf + i, sz - i, " | %u %u %u\n", len, + (hskb) ? msg_seqno(buf_msg(hskb)) : 0, + (tskb) ? msg_seqno(buf_msg(tskb)) : 0); + + if (dqueues & TIPC_DUMP_TRANSMQ) { + i += scnprintf(buf + i, sz - i, "transmq: "); + i += tipc_list_dump(&l->transmq, false, buf + i); + } + if (dqueues & TIPC_DUMP_BACKLOGQ) { + i += scnprintf(buf + i, sz - i, + "backlogq: <%u %u %u %u %u>, ", + l->backlog[TIPC_LOW_IMPORTANCE].len, + l->backlog[TIPC_MEDIUM_IMPORTANCE].len, + l->backlog[TIPC_HIGH_IMPORTANCE].len, + l->backlog[TIPC_CRITICAL_IMPORTANCE].len, + l->backlog[TIPC_SYSTEM_IMPORTANCE].len); + i += tipc_list_dump(&l->backlogq, false, buf + i); + } + if (dqueues & TIPC_DUMP_DEFERDQ) { + i += scnprintf(buf + i, sz - i, "deferdq: "); + i += tipc_list_dump(&l->deferdq, false, buf + i); + } + if (dqueues & TIPC_DUMP_INPUTQ) { + i += scnprintf(buf + i, sz - i, "inputq: "); + i += tipc_list_dump(l->inputq, false, buf + i); + } + if (dqueues & TIPC_DUMP_WAKEUP) { + i += scnprintf(buf + i, sz - i, "wakeup: "); + i += tipc_list_dump(&l->wakeupq, false, buf + i); + } + + return i; +} diff --git a/net/tipc/link.h b/net/tipc/link.h index 90488c538a4e..8439e0ee53a8 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -109,6 +109,7 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); char *tipc_link_name(struct tipc_link *l); +char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); @@ -147,4 +148,5 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq); int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq); +bool tipc_link_too_silent(struct tipc_link *l); #endif diff --git a/net/tipc/msg.h b/net/tipc/msg.h index a2879e6ec5b6..a0924956bb61 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -105,6 +105,7 @@ struct tipc_skb_cb { u32 bytes_read; u32 orig_member; struct sk_buff *tail; + unsigned long nxt_retr; bool validated; u16 chain_imp; u16 ackers; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 6376467e78f8..21f6ccc89401 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -951,8 +951,11 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg, u32 node; struct nlattr *con[TIPC_NLA_CON_MAX + 1]; - nla_parse_nested(con, TIPC_NLA_CON_MAX, - sock[TIPC_NLA_SOCK_CON], NULL, NULL); + err = nla_parse_nested(con, TIPC_NLA_CON_MAX, + sock[TIPC_NLA_SOCK_CON], NULL, NULL); + + if (err) + return err; node = nla_get_u32(con[TIPC_NLA_CON_NODE]); tipc_tlv_sprintf(msg->rep, " connected to <%u.%u.%u:%u>", diff --git a/net/tipc/node.c b/net/tipc/node.c index 2afc4f8c37a7..db2a6c3e0be9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -43,6 +43,7 @@ #include "monitor.h" #include "discover.h" #include "netlink.h" +#include "trace.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 @@ -432,6 +433,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, break; } list_add_tail_rcu(&n->list, &temp_node->list); + trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); return n; @@ -459,6 +461,7 @@ static void tipc_node_delete_from_list(struct tipc_node *node) static void tipc_node_delete(struct tipc_node *node) { + trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); del_timer_sync(&node->timer); @@ -584,12 +587,15 @@ static void tipc_node_clear_links(struct tipc_node *node) /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ -static int tipc_node_cleanup(struct tipc_node *peer) +static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; - spin_lock_bh(&tn->node_list_lock); + /* If lock held by tipc_node_stop() the node will be deleted anyway */ + if (!spin_trylock_bh(&tn->node_list_lock)) + return false; + tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { @@ -613,6 +619,7 @@ static void tipc_node_timeout(struct timer_list *t) int bearer_id; int rc = 0; + trace_tipc_node_timeout(n, false, " "); if (!node_is_up(n) && tipc_node_cleanup(n)) { /*Removing the reference of Timer*/ tipc_node_put(n); @@ -621,6 +628,12 @@ static void tipc_node_timeout(struct timer_list *t) __skb_queue_head_init(&xmitq); + /* Initial node interval to value larger (10 seconds), then it will be + * recalculated with link lowest tolerance + */ + tipc_node_read_lock(n); + n->keepalive_intv = 10000; + tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; @@ -672,6 +685,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); + trace_tipc_node_link_up(n, true, " "); /* Ensure that a STATE message goes first */ tipc_link_build_state_msg(nl, xmitq); @@ -774,6 +788,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); + trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); @@ -791,6 +806,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); + trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); @@ -823,6 +839,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) /* Defuse pending tipc_node_link_up() */ tipc_link_fsm_evt(l, LINK_RESET_EVT); } + trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); @@ -1012,6 +1029,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, *respond = false; goto exit; } + trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) @@ -1051,6 +1069,7 @@ static void tipc_node_reset_links(struct tipc_node *n) pr_warn("Resetting all links to %x\n", n->addr); + trace_tipc_node_reset_links(n, true, " "); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); } @@ -1226,11 +1245,13 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) pr_err("Unknown node fsm state %x\n", state); break; } + trace_tipc_node_fsm(n->peer_id, n->state, state, evt); n->state = state; return; illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); + trace_tipc_node_fsm(n->peer_id, n->state, state, evt); } static void node_lost_contact(struct tipc_node *n, @@ -1244,6 +1265,7 @@ static void node_lost_contact(struct tipc_node *n, pr_debug("Lost contact with %x\n", n->addr); n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); + trace_tipc_node_lost_contact(n, true, " "); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); @@ -1540,6 +1562,10 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); + /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ + if (!skb_queue_empty(&n->bc_entry.namedq)) + tipc_named_rcv(net, &n->bc_entry.namedq); + /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) tipc_node_reset_links(n); @@ -1568,6 +1594,10 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, struct tipc_media_addr *maddr; int pb_id; + if (trace_tipc_node_check_state_enabled()) { + trace_tipc_skb_dump(skb, false, "skb for node state check"); + trace_tipc_node_check_state(n, true, " "); + } l = n->links[bearer_id].link; if (!l) return false; @@ -1585,8 +1615,11 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, } } - if (!tipc_link_validate_msg(l, hdr)) + if (!tipc_link_validate_msg(l, hdr)) { + trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!"); + trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!"); return false; + } /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { @@ -1616,6 +1649,8 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, syncpt = oseqno + exp_pkts - 1; if (pl && tipc_link_is_up(pl)) { __tipc_node_link_down(n, &pb_id, xmitq, &maddr); + trace_tipc_node_link_down(n, true, + "node link down <- failover!"); tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } @@ -2422,3 +2457,65 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, return skb->len; } + +u32 tipc_node_get_addr(struct tipc_node *node) +{ + return (node) ? node->addr : 0; +} + +/** + * tipc_node_dump - dump TIPC node data + * @n: tipc node to be dumped + * @more: dump more? + * - false: dump only tipc node data + * - true: dump node link data as well + * @buf: returned buffer of dump data in format + */ +int tipc_node_dump(struct tipc_node *n, bool more, char *buf) +{ + int i = 0; + size_t sz = (more) ? NODE_LMAX : NODE_LMIN; + + if (!n) { + i += scnprintf(buf, sz, "node data: (null)\n"); + return i; + } + + i += scnprintf(buf, sz, "node data: %x", n->addr); + i += scnprintf(buf + i, sz - i, " %x", n->state); + i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]); + i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]); + i += scnprintf(buf + i, sz - i, " %x", n->action_flags); + i += scnprintf(buf + i, sz - i, " %u", n->failover_sent); + i += scnprintf(buf + i, sz - i, " %u", n->sync_point); + i += scnprintf(buf + i, sz - i, " %d", n->link_cnt); + i += scnprintf(buf + i, sz - i, " %u", n->working_links); + i += scnprintf(buf + i, sz - i, " %x", n->capabilities); + i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv); + + if (!more) + return i; + + i += scnprintf(buf + i, sz - i, "link_entry[0]:\n"); + i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu); + i += scnprintf(buf + i, sz - i, " media: "); + i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr); + i += scnprintf(buf + i, sz - i, "\n"); + i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i); + i += scnprintf(buf + i, sz - i, " inputq: "); + i += tipc_list_dump(&n->links[0].inputq, false, buf + i); + + i += scnprintf(buf + i, sz - i, "link_entry[1]:\n"); + i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu); + i += scnprintf(buf + i, sz - i, " media: "); + i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr); + i += scnprintf(buf + i, sz - i, "\n"); + i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i); + i += scnprintf(buf + i, sz - i, " inputq: "); + i += tipc_list_dump(&n->links[1].inputq, false, buf + i); + + i += scnprintf(buf + i, sz - i, "bclink:\n "); + i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i); + + return i; +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 03f5efb62cfb..4f59a30e989a 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -65,6 +65,7 @@ enum { void tipc_node_stop(struct net *net); bool tipc_node_get_id(struct net *net, u32 addr, u8 *id); +u32 tipc_node_get_addr(struct tipc_node *node); u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b57b1be7252b..1217c90a363b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -46,6 +46,7 @@ #include "bcast.h" #include "netlink.h" #include "group.h" +#include "trace.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ @@ -233,6 +234,7 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen) */ static void tsk_advance_rx_queue(struct sock *sk) { + trace_tipc_sk_advance_rx(sk, NULL, TIPC_DUMP_SK_RCVQ, " "); kfree_skb(__skb_dequeue(&sk->sk_receive_queue)); } @@ -247,6 +249,7 @@ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err) if (!tipc_msg_reverse(onode, &skb, err)) return; + trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_NONE, "@sk_respond!"); dnode = msg_destnode(buf_msg(skb)); selector = msg_origport(buf_msg(skb)); tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); @@ -482,6 +485,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk_set_unreliable(tsk, true); } + trace_tipc_sk_create(sk, NULL, TIPC_DUMP_NONE, " "); return 0; } @@ -571,6 +575,7 @@ static int tipc_release(struct socket *sock) tsk = tipc_sk(sk); lock_sock(sk); + trace_tipc_sk_release(sk, NULL, TIPC_DUMP_ALL, " "); __tipc_shutdown(sock, TIPC_ERR_NO_PORT); sk->sk_shutdown = SHUTDOWN_MASK; tipc_sk_leave(tsk); @@ -718,6 +723,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, __poll_t revents = 0; sock_poll_wait(file, sock, wait); + trace_tipc_sk_poll(sk, NULL, TIPC_DUMP_ALL, " "); if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; @@ -804,9 +810,12 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts); /* Send message if build was successful */ - if (unlikely(rc == dlen)) + if (unlikely(rc == dlen)) { + trace_tipc_sk_sendmcast(sk, skb_peek(&pkts), + TIPC_DUMP_SK_SNDQ, " "); rc = tipc_mcast_xmit(net, &pkts, method, &dsts, &tsk->cong_link_cnt); + } tipc_nlist_purge(&dsts); @@ -880,7 +889,6 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_group *grp = tsk->group; struct net *net = sock_net(sk); struct tipc_member *mb = NULL; u32 node, port; @@ -894,7 +902,9 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, /* Block or return if destination link or member is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tipc_dest_find(&tsk->cong_links, node, 0) && - !tipc_group_cong(grp, node, port, blks, &mb)); + tsk->group && + !tipc_group_cong(tsk->group, node, port, blks, + &mb)); if (unlikely(rc)) return rc; @@ -924,7 +934,6 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, struct tipc_sock *tsk = tipc_sk(sk); struct list_head *cong_links = &tsk->cong_links; int blks = tsk_blocks(GROUP_H_SIZE + dlen); - struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_member *first = NULL; struct tipc_member *mbr = NULL; @@ -941,9 +950,10 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, type = msg_nametype(hdr); inst = dest->addr.name.name.instance; scope = msg_lookup_scope(hdr); - exclude = tipc_group_exclude(grp); while (++lookups < 4) { + exclude = tipc_group_exclude(tsk->group); + first = NULL; /* Look for a non-congested destination member, if any */ @@ -952,7 +962,8 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, &dstcnt, exclude, false)) return -EHOSTUNREACH; tipc_dest_pop(&dsts, &node, &port); - cong = tipc_group_cong(grp, node, port, blks, &mbr); + cong = tipc_group_cong(tsk->group, node, port, blks, + &mbr); if (!cong) break; if (mbr == first) @@ -971,7 +982,8 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, /* Block or return if destination link or member is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tipc_dest_find(cong_links, node, 0) && - !tipc_group_cong(grp, node, port, + tsk->group && + !tipc_group_cong(tsk->group, node, port, blks, &mbr)); if (unlikely(rc)) return rc; @@ -1006,8 +1018,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_group *grp = tsk->group; - struct tipc_nlist *dsts = tipc_group_dests(grp); + struct tipc_nlist *dsts; struct tipc_mc_method *method = &tsk->mc_method; bool ack = method->mandatory && method->rcast; int blks = tsk_blocks(MCAST_H_SIZE + dlen); @@ -1016,15 +1027,17 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, struct sk_buff_head pkts; int rc = -EHOSTUNREACH; - if (!dsts->local && !dsts->remote) - return -EHOSTUNREACH; - /* Block or return if any destination link or member is congested */ - rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt && - !tipc_group_bc_cong(grp, blks)); + rc = tipc_wait_for_cond(sock, &timeout, + !tsk->cong_link_cnt && tsk->group && + !tipc_group_bc_cong(tsk->group, blks)); if (unlikely(rc)) return rc; + dsts = tipc_group_dests(tsk->group); + if (!dsts->local && !dsts->remote) + return -EHOSTUNREACH; + /* Complete message header */ if (dest) { msg_set_type(hdr, TIPC_GRP_MCAST_MSG); @@ -1036,7 +1049,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, 0); msg_set_destnode(hdr, 0); - msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); + msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(tsk->group)); /* Avoid getting stuck with repeated forced replicasts */ msg_set_grp_bc_ack_req(hdr, ack); @@ -1208,8 +1221,10 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, bool conn_cong; /* Ignore if connection cannot be validated: */ - if (!tsk_peer_msg(tsk, hdr)) + if (!tsk_peer_msg(tsk, hdr)) { + trace_tipc_sk_drop_msg(sk, skb, TIPC_DUMP_NONE, "@proto_rcv!"); goto exit; + } if (unlikely(msg_errcode(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); @@ -1377,6 +1392,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) return -ENOMEM; + trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " "); rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tipc_dest_push(clinks, dnode, 0); @@ -1454,6 +1470,8 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(rc != send)) break; + trace_tipc_sk_sendstream(sk, skb_peek(&pkts), + TIPC_DUMP_SK_SNDQ, " "); rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tsk->cong_link_cnt = 1; @@ -2128,6 +2146,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, struct sk_buff_head inputq; int limit, err = TIPC_OK; + trace_tipc_sk_filter_rcv(sk, skb, TIPC_DUMP_ALL, " "); TIPC_SKB_CB(skb)->bytes_read = 0; __skb_queue_head_init(&inputq); __skb_queue_tail(&inputq, skb); @@ -2147,17 +2166,25 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { + trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, + "err_overload2!"); atomic_inc(&sk->sk_drops); err = TIPC_ERR_OVERLOAD; } if (unlikely(err)) { - tipc_skb_reject(net, err, skb, xmitq); + if (tipc_msg_reverse(tipc_own_addr(net), &skb, err)) { + trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_NONE, + "@filter_rcv!"); + __skb_queue_tail(xmitq, skb); + } err = TIPC_OK; continue; } __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); + trace_tipc_sk_overlimit2(sk, skb, TIPC_DUMP_ALL, + "rcvq >90% allocated!"); sk->sk_data_ready(sk); } } @@ -2223,14 +2250,21 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, if (!sk->sk_backlog.len) atomic_set(dcnt, 0); lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); - if (likely(!sk_add_backlog(sk, skb, lim))) + if (likely(!sk_add_backlog(sk, skb, lim))) { + trace_tipc_sk_overlimit1(sk, skb, TIPC_DUMP_ALL, + "bklg & rcvq >90% allocated!"); continue; + } + trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!"); /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); atomic_inc(&sk->sk_drops); - if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) + if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) { + trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL, + "@sk_enqueue!"); __skb_queue_tail(xmitq, skb); + } break; } } @@ -2279,6 +2313,8 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) /* Prepare for message rejection */ if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err)) continue; + + trace_tipc_sk_rej_msg(NULL, skb, TIPC_DUMP_NONE, "@sk_rcv!"); xmit: dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, dport); @@ -2552,6 +2588,7 @@ static int tipc_shutdown(struct socket *sock, int how) lock_sock(sk); + trace_tipc_sk_shutdown(sk, NULL, TIPC_DUMP_ALL, " "); __tipc_shutdown(sock, TIPC_CONN_SHUTDOWN); sk->sk_shutdown = SEND_SHUTDOWN; @@ -2724,11 +2761,15 @@ void tipc_sk_reinit(struct net *net) rhashtable_walk_start(&iter); while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { - spin_lock_bh(&tsk->sk.sk_lock.slock); + sock_hold(&tsk->sk); + rhashtable_walk_stop(&iter); + lock_sock(&tsk->sk); msg = &tsk->phdr; msg_set_prevnode(msg, tipc_own_addr(net)); msg_set_orignode(msg, tipc_own_addr(net)); - spin_unlock_bh(&tsk->sk.sk_lock.slock); + release_sock(&tsk->sk); + rhashtable_walk_start(&iter); + sock_put(&tsk->sk); } rhashtable_walk_stop(&iter); @@ -3564,3 +3605,187 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } + +/** + * tipc_sk_filtering - check if a socket should be traced + * @sk: the socket to be examined + * @sysctl_tipc_sk_filter[]: the socket tuple for filtering, + * (portid, sock type, name type, name lower, name upper) + * + * Returns true if the socket meets the socket tuple data + * (value 0 = 'any') or when there is no tuple set (all = 0), + * otherwise false + */ +bool tipc_sk_filtering(struct sock *sk) +{ + struct tipc_sock *tsk; + struct publication *p; + u32 _port, _sktype, _type, _lower, _upper; + u32 type = 0, lower = 0, upper = 0; + + if (!sk) + return true; + + tsk = tipc_sk(sk); + + _port = sysctl_tipc_sk_filter[0]; + _sktype = sysctl_tipc_sk_filter[1]; + _type = sysctl_tipc_sk_filter[2]; + _lower = sysctl_tipc_sk_filter[3]; + _upper = sysctl_tipc_sk_filter[4]; + + if (!_port && !_sktype && !_type && !_lower && !_upper) + return true; + + if (_port) + return (_port == tsk->portid); + + if (_sktype && _sktype != sk->sk_type) + return false; + + if (tsk->published) { + p = list_first_entry_or_null(&tsk->publications, + struct publication, binding_sock); + if (p) { + type = p->type; + lower = p->lower; + upper = p->upper; + } + } + + if (!tipc_sk_type_connectionless(sk)) { + type = tsk->conn_type; + lower = tsk->conn_instance; + upper = tsk->conn_instance; + } + + if ((_type && _type != type) || (_lower && _lower != lower) || + (_upper && _upper != upper)) + return false; + + return true; +} + +u32 tipc_sock_get_portid(struct sock *sk) +{ + return (sk) ? (tipc_sk(sk))->portid : 0; +} + +/** + * tipc_sk_overlimit1 - check if socket rx queue is about to be overloaded, + * both the rcv and backlog queues are considered + * @sk: tipc sk to be checked + * @skb: tipc msg to be checked + * + * Returns true if the socket rx queue allocation is > 90%, otherwise false + */ + +bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb) +{ + atomic_t *dcnt = &tipc_sk(sk)->dupl_rcvcnt; + unsigned int lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); + unsigned int qsize = sk->sk_backlog.len + sk_rmem_alloc_get(sk); + + return (qsize > lim * 90 / 100); +} + +/** + * tipc_sk_overlimit2 - check if socket rx queue is about to be overloaded, + * only the rcv queue is considered + * @sk: tipc sk to be checked + * @skb: tipc msg to be checked + * + * Returns true if the socket rx queue allocation is > 90%, otherwise false + */ + +bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb) +{ + unsigned int lim = rcvbuf_limit(sk, skb); + unsigned int qsize = sk_rmem_alloc_get(sk); + + return (qsize > lim * 90 / 100); +} + +/** + * tipc_sk_dump - dump TIPC socket + * @sk: tipc sk to be dumped + * @dqueues: bitmask to decide if any socket queue to be dumped? + * - TIPC_DUMP_NONE: don't dump socket queues + * - TIPC_DUMP_SK_SNDQ: dump socket send queue + * - TIPC_DUMP_SK_RCVQ: dump socket rcv queue + * - TIPC_DUMP_SK_BKLGQ: dump socket backlog queue + * - TIPC_DUMP_ALL: dump all the socket queues above + * @buf: returned buffer of dump data in format + */ +int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) +{ + int i = 0; + size_t sz = (dqueues) ? SK_LMAX : SK_LMIN; + struct tipc_sock *tsk; + struct publication *p; + bool tsk_connected; + + if (!sk) { + i += scnprintf(buf, sz, "sk data: (null)\n"); + return i; + } + + tsk = tipc_sk(sk); + tsk_connected = !tipc_sk_type_connectionless(sk); + + i += scnprintf(buf, sz, "sk data: %u", sk->sk_type); + i += scnprintf(buf + i, sz - i, " %d", sk->sk_state); + i += scnprintf(buf + i, sz - i, " %x", tsk_own_node(tsk)); + i += scnprintf(buf + i, sz - i, " %u", tsk->portid); + i += scnprintf(buf + i, sz - i, " | %u", tsk_connected); + if (tsk_connected) { + i += scnprintf(buf + i, sz - i, " %x", tsk_peer_node(tsk)); + i += scnprintf(buf + i, sz - i, " %u", tsk_peer_port(tsk)); + i += scnprintf(buf + i, sz - i, " %u", tsk->conn_type); + i += scnprintf(buf + i, sz - i, " %u", tsk->conn_instance); + } + i += scnprintf(buf + i, sz - i, " | %u", tsk->published); + if (tsk->published) { + p = list_first_entry_or_null(&tsk->publications, + struct publication, binding_sock); + i += scnprintf(buf + i, sz - i, " %u", (p) ? p->type : 0); + i += scnprintf(buf + i, sz - i, " %u", (p) ? p->lower : 0); + i += scnprintf(buf + i, sz - i, " %u", (p) ? p->upper : 0); + } + i += scnprintf(buf + i, sz - i, " | %u", tsk->snd_win); + i += scnprintf(buf + i, sz - i, " %u", tsk->rcv_win); + i += scnprintf(buf + i, sz - i, " %u", tsk->max_pkt); + i += scnprintf(buf + i, sz - i, " %x", tsk->peer_caps); + i += scnprintf(buf + i, sz - i, " %u", tsk->cong_link_cnt); + i += scnprintf(buf + i, sz - i, " %u", tsk->snt_unacked); + i += scnprintf(buf + i, sz - i, " %u", tsk->rcv_unacked); + i += scnprintf(buf + i, sz - i, " %u", atomic_read(&tsk->dupl_rcvcnt)); + i += scnprintf(buf + i, sz - i, " %u", sk->sk_shutdown); + i += scnprintf(buf + i, sz - i, " | %d", sk_wmem_alloc_get(sk)); + i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf); + i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk)); + i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf); + i += scnprintf(buf + i, sz - i, " | %d\n", sk->sk_backlog.len); + + if (dqueues & TIPC_DUMP_SK_SNDQ) { + i += scnprintf(buf + i, sz - i, "sk_write_queue: "); + i += tipc_list_dump(&sk->sk_write_queue, false, buf + i); + } + + if (dqueues & TIPC_DUMP_SK_RCVQ) { + i += scnprintf(buf + i, sz - i, "sk_receive_queue: "); + i += tipc_list_dump(&sk->sk_receive_queue, false, buf + i); + } + + if (dqueues & TIPC_DUMP_SK_BKLGQ) { + i += scnprintf(buf + i, sz - i, "sk_backlog:\n head "); + i += tipc_skb_dump(sk->sk_backlog.head, false, buf + i); + if (sk->sk_backlog.tail != sk->sk_backlog.head) { + i += scnprintf(buf + i, sz - i, " tail "); + i += tipc_skb_dump(sk->sk_backlog.tail, false, + buf + i); + } + } + + return i; +} diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 5e575f205afe..235b9679acee 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -71,4 +71,8 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, int tipc_dump_start(struct netlink_callback *cb); int __tipc_dump_start(struct netlink_callback *cb, struct net *net); int tipc_dump_done(struct netlink_callback *cb); +u32 tipc_sock_get_portid(struct sock *sk); +bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb); +bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb); + #endif diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 1a779b1e8510..3481e4906bd6 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -34,6 +34,7 @@ */ #include "core.h" +#include "trace.h" #include <linux/sysctl.h> @@ -54,6 +55,13 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sk_filter", + .data = &sysctl_tipc_sk_filter, + .maxlen = sizeof(sysctl_tipc_sk_filter), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, {} }; diff --git a/net/tipc/trace.c b/net/tipc/trace.c new file mode 100644 index 000000000000..964823841efe --- /dev/null +++ b/net/tipc/trace.c @@ -0,0 +1,206 @@ +/* + * net/tipc/trace.c: TIPC tracepoints code + * + * Copyright (c) 2018, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#define CREATE_TRACE_POINTS +#include "trace.h" + +/** + * socket tuples for filtering in socket traces: + * (portid, sock type, name type, name lower, name upper) + */ +unsigned long sysctl_tipc_sk_filter[5] __read_mostly = {0, }; + +/** + * tipc_skb_dump - dump TIPC skb data + * @skb: skb to be dumped + * @more: dump more? + * - false: dump only tipc msg data + * - true: dump kernel-related skb data and tipc cb[] array as well + * @buf: returned buffer of dump data in format + */ +int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf) +{ + int i = 0; + size_t sz = (more) ? SKB_LMAX : SKB_LMIN; + struct tipc_msg *hdr; + struct tipc_skb_cb *skbcb; + + if (!skb) { + i += scnprintf(buf, sz, "msg: (null)\n"); + return i; + } + + hdr = buf_msg(skb); + skbcb = TIPC_SKB_CB(skb); + + /* tipc msg data section */ + i += scnprintf(buf, sz, "msg: %u", msg_user(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_type(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_hdr_sz(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_data_sz(hdr)); + i += scnprintf(buf + i, sz - i, " %x", msg_orignode(hdr)); + i += scnprintf(buf + i, sz - i, " %x", msg_destnode(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_seqno(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_ack(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_bcast_ack(hdr)); + switch (msg_user(hdr)) { + case LINK_PROTOCOL: + i += scnprintf(buf + i, sz - i, " %c", msg_net_plane(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_probe(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_peer_stopping(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_session(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_next_sent(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_seq_gap(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_bc_snd_nxt(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_bc_gap(hdr)); + break; + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + case CONN_MANAGER: + case SOCK_WAKEUP: + i += scnprintf(buf + i, sz - i, " | %u", msg_origport(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_destport(hdr)); + switch (msg_type(hdr)) { + case TIPC_NAMED_MSG: + i += scnprintf(buf + i, sz - i, " %u", + msg_nametype(hdr)); + i += scnprintf(buf + i, sz - i, " %u", + msg_nameinst(hdr)); + break; + case TIPC_MCAST_MSG: + i += scnprintf(buf + i, sz - i, " %u", + msg_nametype(hdr)); + i += scnprintf(buf + i, sz - i, " %u", + msg_namelower(hdr)); + i += scnprintf(buf + i, sz - i, " %u", + msg_nameupper(hdr)); + break; + default: + break; + }; + i += scnprintf(buf + i, sz - i, " | %u", + msg_src_droppable(hdr)); + i += scnprintf(buf + i, sz - i, " %u", + msg_dest_droppable(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_errcode(hdr)); + i += scnprintf(buf + i, sz - i, " %u", msg_reroute_cnt(hdr)); + break; + default: + /* need more? */ + break; + }; + + i += scnprintf(buf + i, sz - i, "\n"); + if (!more) + return i; + + /* kernel-related skb data section */ + i += scnprintf(buf + i, sz - i, "skb: %s", + (skb->dev) ? skb->dev->name : "n/a"); + i += scnprintf(buf + i, sz - i, " %u", skb->len); + i += scnprintf(buf + i, sz - i, " %u", skb->data_len); + i += scnprintf(buf + i, sz - i, " %u", skb->hdr_len); + i += scnprintf(buf + i, sz - i, " %u", skb->truesize); + i += scnprintf(buf + i, sz - i, " %u", skb_cloned(skb)); + i += scnprintf(buf + i, sz - i, " %p", skb->sk); + i += scnprintf(buf + i, sz - i, " %u", skb_shinfo(skb)->nr_frags); + i += scnprintf(buf + i, sz - i, " %llx", + ktime_to_ms(skb_get_ktime(skb))); + i += scnprintf(buf + i, sz - i, " %llx\n", + ktime_to_ms(skb_hwtstamps(skb)->hwtstamp)); + + /* tipc skb cb[] data section */ + i += scnprintf(buf + i, sz - i, "cb[]: %u", skbcb->bytes_read); + i += scnprintf(buf + i, sz - i, " %u", skbcb->orig_member); + i += scnprintf(buf + i, sz - i, " %u", + jiffies_to_msecs(skbcb->nxt_retr)); + i += scnprintf(buf + i, sz - i, " %u", skbcb->validated); + i += scnprintf(buf + i, sz - i, " %u", skbcb->chain_imp); + i += scnprintf(buf + i, sz - i, " %u\n", skbcb->ackers); + + return i; +} + +/** + * tipc_list_dump - dump TIPC skb list/queue + * @list: list of skbs to be dumped + * @more: dump more? + * - false: dump only the head & tail skbs + * - true: dump the first & last 5 skbs + * @buf: returned buffer of dump data in format + */ +int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf) +{ + int i = 0; + size_t sz = (more) ? LIST_LMAX : LIST_LMIN; + u32 count, len; + struct sk_buff *hskb, *tskb, *skb, *tmp; + + if (!list) { + i += scnprintf(buf, sz, "(null)\n"); + return i; + } + + len = skb_queue_len(list); + i += scnprintf(buf, sz, "len = %d\n", len); + + if (!len) + return i; + + if (!more) { + hskb = skb_peek(list); + i += scnprintf(buf + i, sz - i, " head "); + i += tipc_skb_dump(hskb, false, buf + i); + if (len > 1) { + tskb = skb_peek_tail(list); + i += scnprintf(buf + i, sz - i, " tail "); + i += tipc_skb_dump(tskb, false, buf + i); + } + } else { + count = 0; + skb_queue_walk_safe(list, skb, tmp) { + count++; + if (count == 6) + i += scnprintf(buf + i, sz - i, " .\n .\n"); + if (count > 5 && count <= len - 5) + continue; + i += scnprintf(buf + i, sz - i, " #%d ", count); + i += tipc_skb_dump(skb, false, buf + i); + } + } + return i; +} diff --git a/net/tipc/trace.h b/net/tipc/trace.h new file mode 100644 index 000000000000..4d8e00483afc --- /dev/null +++ b/net/tipc/trace.h @@ -0,0 +1,431 @@ +/* + * net/tipc/trace.h: TIPC tracepoints + * + * Copyright (c) 2018, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tipc + +#if !defined(_TIPC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TIPC_TRACE_H + +#include <linux/tracepoint.h> +#include "core.h" +#include "link.h" +#include "socket.h" +#include "node.h" + +#define SKB_LMIN (100) +#define SKB_LMAX (SKB_LMIN * 2) +#define LIST_LMIN (SKB_LMIN * 3) +#define LIST_LMAX (SKB_LMIN * 11) +#define SK_LMIN (SKB_LMIN * 2) +#define SK_LMAX (SKB_LMIN * 11) +#define LINK_LMIN (SKB_LMIN) +#define LINK_LMAX (SKB_LMIN * 16) +#define NODE_LMIN (SKB_LMIN) +#define NODE_LMAX (SKB_LMIN * 11) + +#ifndef __TIPC_TRACE_ENUM +#define __TIPC_TRACE_ENUM +enum { + TIPC_DUMP_NONE = 0, + + TIPC_DUMP_TRANSMQ = 1, + TIPC_DUMP_BACKLOGQ = (1 << 1), + TIPC_DUMP_DEFERDQ = (1 << 2), + TIPC_DUMP_INPUTQ = (1 << 3), + TIPC_DUMP_WAKEUP = (1 << 4), + + TIPC_DUMP_SK_SNDQ = (1 << 8), + TIPC_DUMP_SK_RCVQ = (1 << 9), + TIPC_DUMP_SK_BKLGQ = (1 << 10), + TIPC_DUMP_ALL = 0xffffu +}; +#endif + +/* Link & Node FSM states: */ +#define state_sym(val) \ + __print_symbolic(val, \ + {(0xe), "ESTABLISHED" },\ + {(0xe << 4), "ESTABLISHING" },\ + {(0x1 << 8), "RESET" },\ + {(0x2 << 12), "RESETTING" },\ + {(0xd << 16), "PEER_RESET" },\ + {(0xf << 20), "FAILINGOVER" },\ + {(0xc << 24), "SYNCHING" },\ + {(0xdd), "SELF_DOWN_PEER_DOWN" },\ + {(0xaa), "SELF_UP_PEER_UP" },\ + {(0xd1), "SELF_DOWN_PEER_LEAVING" },\ + {(0xac), "SELF_UP_PEER_COMING" },\ + {(0xca), "SELF_COMING_PEER_UP" },\ + {(0x1d), "SELF_LEAVING_PEER_DOWN" },\ + {(0xf0), "FAILINGOVER" },\ + {(0xcc), "SYNCHING" }) + +/* Link & Node FSM events: */ +#define evt_sym(val) \ + __print_symbolic(val, \ + {(0xec1ab1e), "ESTABLISH_EVT" },\ + {(0x9eed0e), "PEER_RESET_EVT" },\ + {(0xfa110e), "FAILURE_EVT" },\ + {(0x10ca1d0e), "RESET_EVT" },\ + {(0xfa110bee), "FAILOVER_BEGIN_EVT" },\ + {(0xfa110ede), "FAILOVER_END_EVT" },\ + {(0xc1ccbee), "SYNCH_BEGIN_EVT" },\ + {(0xc1ccede), "SYNCH_END_EVT" },\ + {(0xece), "SELF_ESTABL_CONTACT_EVT" },\ + {(0x1ce), "SELF_LOST_CONTACT_EVT" },\ + {(0x9ece), "PEER_ESTABL_CONTACT_EVT" },\ + {(0x91ce), "PEER_LOST_CONTACT_EVT" },\ + {(0xfbe), "FAILOVER_BEGIN_EVT" },\ + {(0xfee), "FAILOVER_END_EVT" },\ + {(0xcbe), "SYNCH_BEGIN_EVT" },\ + {(0xcee), "SYNCH_END_EVT" }) + +/* Bearer, net device events: */ +#define dev_evt_sym(val) \ + __print_symbolic(val, \ + {(NETDEV_CHANGE), "NETDEV_CHANGE" },\ + {(NETDEV_GOING_DOWN), "NETDEV_GOING_DOWN" },\ + {(NETDEV_UP), "NETDEV_UP" },\ + {(NETDEV_CHANGEMTU), "NETDEV_CHANGEMTU" },\ + {(NETDEV_CHANGEADDR), "NETDEV_CHANGEADDR" },\ + {(NETDEV_UNREGISTER), "NETDEV_UNREGISTER" },\ + {(NETDEV_CHANGENAME), "NETDEV_CHANGENAME" }) + +extern unsigned long sysctl_tipc_sk_filter[5] __read_mostly; + +int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf); +int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf); +int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf); +int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf); +int tipc_node_dump(struct tipc_node *n, bool more, char *buf); +bool tipc_sk_filtering(struct sock *sk); + +DECLARE_EVENT_CLASS(tipc_skb_class, + + TP_PROTO(struct sk_buff *skb, bool more, const char *header), + + TP_ARGS(skb, more, header), + + TP_STRUCT__entry( + __string(header, header) + __dynamic_array(char, buf, (more) ? SKB_LMAX : SKB_LMIN) + ), + + TP_fast_assign( + __assign_str(header, header); + tipc_skb_dump(skb, more, __get_str(buf)); + ), + + TP_printk("%s\n%s", __get_str(header), __get_str(buf)) +) + +#define DEFINE_SKB_EVENT(name) \ +DEFINE_EVENT(tipc_skb_class, name, \ + TP_PROTO(struct sk_buff *skb, bool more, const char *header), \ + TP_ARGS(skb, more, header)) +DEFINE_SKB_EVENT(tipc_skb_dump); +DEFINE_SKB_EVENT(tipc_proto_build); +DEFINE_SKB_EVENT(tipc_proto_rcv); + +DECLARE_EVENT_CLASS(tipc_list_class, + + TP_PROTO(struct sk_buff_head *list, bool more, const char *header), + + TP_ARGS(list, more, header), + + TP_STRUCT__entry( + __string(header, header) + __dynamic_array(char, buf, (more) ? LIST_LMAX : LIST_LMIN) + ), + + TP_fast_assign( + __assign_str(header, header); + tipc_list_dump(list, more, __get_str(buf)); + ), + + TP_printk("%s\n%s", __get_str(header), __get_str(buf)) +); + +#define DEFINE_LIST_EVENT(name) \ +DEFINE_EVENT(tipc_list_class, name, \ + TP_PROTO(struct sk_buff_head *list, bool more, const char *header), \ + TP_ARGS(list, more, header)) +DEFINE_LIST_EVENT(tipc_list_dump); + +DECLARE_EVENT_CLASS(tipc_sk_class, + + TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, + const char *header), + + TP_ARGS(sk, skb, dqueues, header), + + TP_STRUCT__entry( + __string(header, header) + __field(u32, portid) + __dynamic_array(char, buf, (dqueues) ? SK_LMAX : SK_LMIN) + __dynamic_array(char, skb_buf, (skb) ? SKB_LMIN : 1) + ), + + TP_fast_assign( + __assign_str(header, header); + __entry->portid = tipc_sock_get_portid(sk); + tipc_sk_dump(sk, dqueues, __get_str(buf)); + if (skb) + tipc_skb_dump(skb, false, __get_str(skb_buf)); + else + *(__get_str(skb_buf)) = '\0'; + ), + + TP_printk("<%u> %s\n%s%s", __entry->portid, __get_str(header), + __get_str(skb_buf), __get_str(buf)) +); + +#define DEFINE_SK_EVENT_FILTER(name) \ +DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ + TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ + const char *header), \ + TP_ARGS(sk, skb, dqueues, header), \ + TP_CONDITION(tipc_sk_filtering(sk))) +DEFINE_SK_EVENT_FILTER(tipc_sk_dump); +DEFINE_SK_EVENT_FILTER(tipc_sk_create); +DEFINE_SK_EVENT_FILTER(tipc_sk_sendmcast); +DEFINE_SK_EVENT_FILTER(tipc_sk_sendmsg); +DEFINE_SK_EVENT_FILTER(tipc_sk_sendstream); +DEFINE_SK_EVENT_FILTER(tipc_sk_poll); +DEFINE_SK_EVENT_FILTER(tipc_sk_filter_rcv); +DEFINE_SK_EVENT_FILTER(tipc_sk_advance_rx); +DEFINE_SK_EVENT_FILTER(tipc_sk_rej_msg); +DEFINE_SK_EVENT_FILTER(tipc_sk_drop_msg); +DEFINE_SK_EVENT_FILTER(tipc_sk_release); +DEFINE_SK_EVENT_FILTER(tipc_sk_shutdown); + +#define DEFINE_SK_EVENT_FILTER_COND(name, cond) \ +DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ + TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ + const char *header), \ + TP_ARGS(sk, skb, dqueues, header), \ + TP_CONDITION(tipc_sk_filtering(sk) && (cond))) +DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit1, tipc_sk_overlimit1(sk, skb)); +DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit2, tipc_sk_overlimit2(sk, skb)); + +DECLARE_EVENT_CLASS(tipc_link_class, + + TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), + + TP_ARGS(l, dqueues, header), + + TP_STRUCT__entry( + __string(header, header) + __array(char, name, TIPC_MAX_LINK_NAME) + __dynamic_array(char, buf, (dqueues) ? LINK_LMAX : LINK_LMIN) + ), + + TP_fast_assign( + __assign_str(header, header); + tipc_link_name_ext(l, __entry->name); + tipc_link_dump(l, dqueues, __get_str(buf)); + ), + + TP_printk("<%s> %s\n%s", __entry->name, __get_str(header), + __get_str(buf)) +); + +#define DEFINE_LINK_EVENT(name) \ +DEFINE_EVENT(tipc_link_class, name, \ + TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ + TP_ARGS(l, dqueues, header)) +DEFINE_LINK_EVENT(tipc_link_dump); +DEFINE_LINK_EVENT(tipc_link_conges); +DEFINE_LINK_EVENT(tipc_link_timeout); +DEFINE_LINK_EVENT(tipc_link_reset); + +#define DEFINE_LINK_EVENT_COND(name, cond) \ +DEFINE_EVENT_CONDITION(tipc_link_class, name, \ + TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ + TP_ARGS(l, dqueues, header), \ + TP_CONDITION(cond)) +DEFINE_LINK_EVENT_COND(tipc_link_too_silent, tipc_link_too_silent(l)); + +DECLARE_EVENT_CLASS(tipc_link_transmq_class, + + TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), + + TP_ARGS(r, f, t, tq), + + TP_STRUCT__entry( + __array(char, name, TIPC_MAX_LINK_NAME) + __field(u16, from) + __field(u16, to) + __field(u32, len) + __field(u16, fseqno) + __field(u16, lseqno) + ), + + TP_fast_assign( + tipc_link_name_ext(r, __entry->name); + __entry->from = f; + __entry->to = t; + __entry->len = skb_queue_len(tq); + __entry->fseqno = msg_seqno(buf_msg(skb_peek(tq))); + __entry->lseqno = msg_seqno(buf_msg(skb_peek_tail(tq))); + ), + + TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", + __entry->name, __entry->from, __entry->to, + __entry->len, __entry->fseqno, __entry->lseqno) +); + +DEFINE_EVENT(tipc_link_transmq_class, tipc_link_retrans, + TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), + TP_ARGS(r, f, t, tq) +); + +DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, + TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), + TP_ARGS(r, f, t, tq), + TP_printk("<%s> acked: [%u-%u] transmq: %u [%u-%u]\n", + __entry->name, __entry->from, __entry->to, + __entry->len, __entry->fseqno, __entry->lseqno) +); + +DECLARE_EVENT_CLASS(tipc_node_class, + + TP_PROTO(struct tipc_node *n, bool more, const char *header), + + TP_ARGS(n, more, header), + + TP_STRUCT__entry( + __string(header, header) + __field(u32, addr) + __dynamic_array(char, buf, (more) ? NODE_LMAX : NODE_LMIN) + ), + + TP_fast_assign( + __assign_str(header, header); + __entry->addr = tipc_node_get_addr(n); + tipc_node_dump(n, more, __get_str(buf)); + ), + + TP_printk("<%x> %s\n%s", __entry->addr, __get_str(header), + __get_str(buf)) +); + +#define DEFINE_NODE_EVENT(name) \ +DEFINE_EVENT(tipc_node_class, name, \ + TP_PROTO(struct tipc_node *n, bool more, const char *header), \ + TP_ARGS(n, more, header)) +DEFINE_NODE_EVENT(tipc_node_dump); +DEFINE_NODE_EVENT(tipc_node_create); +DEFINE_NODE_EVENT(tipc_node_delete); +DEFINE_NODE_EVENT(tipc_node_lost_contact); +DEFINE_NODE_EVENT(tipc_node_timeout); +DEFINE_NODE_EVENT(tipc_node_link_up); +DEFINE_NODE_EVENT(tipc_node_link_down); +DEFINE_NODE_EVENT(tipc_node_reset_links); +DEFINE_NODE_EVENT(tipc_node_check_state); + +DECLARE_EVENT_CLASS(tipc_fsm_class, + + TP_PROTO(const char *name, u32 os, u32 ns, int evt), + + TP_ARGS(name, os, ns, evt), + + TP_STRUCT__entry( + __string(name, name) + __field(u32, os) + __field(u32, ns) + __field(u32, evt) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->os = os; + __entry->ns = ns; + __entry->evt = evt; + ), + + TP_printk("<%s> %s--(%s)->%s\n", __get_str(name), + state_sym(__entry->os), evt_sym(__entry->evt), + state_sym(__entry->ns)) +); + +#define DEFINE_FSM_EVENT(fsm_name) \ +DEFINE_EVENT(tipc_fsm_class, fsm_name, \ + TP_PROTO(const char *name, u32 os, u32 ns, int evt), \ + TP_ARGS(name, os, ns, evt)) +DEFINE_FSM_EVENT(tipc_link_fsm); +DEFINE_FSM_EVENT(tipc_node_fsm); + +TRACE_EVENT(tipc_l2_device_event, + + TP_PROTO(struct net_device *dev, struct tipc_bearer *b, + unsigned long evt), + + TP_ARGS(dev, b, evt), + + TP_STRUCT__entry( + __string(dev_name, dev->name) + __string(b_name, b->name) + __field(unsigned long, evt) + __field(u8, b_up) + __field(u8, carrier) + __field(u8, oper) + ), + + TP_fast_assign( + __assign_str(dev_name, dev->name); + __assign_str(b_name, b->name); + __entry->evt = evt; + __entry->b_up = test_bit(0, &b->up); + __entry->carrier = netif_carrier_ok(dev); + __entry->oper = netif_oper_up(dev); + ), + + TP_printk("%s on: <%s>/<%s> oper: %s carrier: %s bearer: %s\n", + dev_evt_sym(__entry->evt), __get_str(dev_name), + __get_str(b_name), (__entry->oper) ? "up" : "down", + (__entry->carrier) ? "ok" : "notok", + (__entry->b_up) ? "up" : "down") +); + +#endif /* _TIPC_TRACE_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 10dc59ce9c82..4d85d71f16e2 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -245,10 +245,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, } err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr); - if (err) { - kfree_skb(_skb); + if (err) goto out; - } } err = 0; out: @@ -681,6 +679,11 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (err) goto err; + if (remote.proto != local.proto) { + err = -EINVAL; + goto err; + } + /* Checking remote ip address */ rmcast = tipc_udp_is_mcast_addr(&remote); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 311cec8e533d..78cb4a584080 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -55,8 +55,10 @@ enum { static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); +static struct proto *saved_tcpv4_prot; +static DEFINE_MUTEX(tcpv4_prot_mutex); static LIST_HEAD(device_list); -static DEFINE_MUTEX(device_mutex); +static DEFINE_SPINLOCK(device_spinlock); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; @@ -538,11 +540,14 @@ static struct tls_context *create_ctx(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); if (!ctx) return NULL; icsk->icsk_ulp_data = ctx; + ctx->setsockopt = sk->sk_prot->setsockopt; + ctx->getsockopt = sk->sk_prot->getsockopt; + ctx->sk_proto_close = sk->sk_prot->close; return ctx; } @@ -552,7 +557,7 @@ static int tls_hw_prot(struct sock *sk) struct tls_device *dev; int rc = 0; - mutex_lock(&device_mutex); + spin_lock_bh(&device_spinlock); list_for_each_entry(dev, &device_list, dev_list) { if (dev->feature && dev->feature(dev)) { ctx = create_ctx(sk); @@ -570,7 +575,7 @@ static int tls_hw_prot(struct sock *sk) } } out: - mutex_unlock(&device_mutex); + spin_unlock_bh(&device_spinlock); return rc; } @@ -579,12 +584,17 @@ static void tls_hw_unhash(struct sock *sk) struct tls_context *ctx = tls_get_ctx(sk); struct tls_device *dev; - mutex_lock(&device_mutex); + spin_lock_bh(&device_spinlock); list_for_each_entry(dev, &device_list, dev_list) { - if (dev->unhash) + if (dev->unhash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); dev->unhash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } } - mutex_unlock(&device_mutex); + spin_unlock_bh(&device_spinlock); ctx->unhash(sk); } @@ -595,12 +605,17 @@ static int tls_hw_hash(struct sock *sk) int err; err = ctx->hash(sk); - mutex_lock(&device_mutex); + spin_lock_bh(&device_spinlock); list_for_each_entry(dev, &device_list, dev_list) { - if (dev->hash) + if (dev->hash) { + kref_get(&dev->kref); + spin_unlock_bh(&device_spinlock); err |= dev->hash(dev, sk); + kref_put(&dev->kref, dev->release); + spin_lock_bh(&device_spinlock); + } } - mutex_unlock(&device_mutex); + spin_unlock_bh(&device_spinlock); if (err) tls_hw_unhash(sk); @@ -675,9 +690,6 @@ static int tls_init(struct sock *sk) rc = -ENOMEM; goto out; } - ctx->setsockopt = sk->sk_prot->setsockopt; - ctx->getsockopt = sk->sk_prot->getsockopt; - ctx->sk_proto_close = sk->sk_prot->close; /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */ if (ip_ver == TLSV6 && @@ -690,6 +702,16 @@ static int tls_init(struct sock *sk) mutex_unlock(&tcpv6_prot_mutex); } + if (ip_ver == TLSV4 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) { + mutex_lock(&tcpv4_prot_mutex); + if (likely(sk->sk_prot != saved_tcpv4_prot)) { + build_protos(tls_prots[TLSV4], sk->sk_prot); + smp_store_release(&saved_tcpv4_prot, sk->sk_prot); + } + mutex_unlock(&tcpv4_prot_mutex); + } + ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; update_sk_prot(sk, ctx); @@ -699,17 +721,17 @@ out: void tls_register_device(struct tls_device *device) { - mutex_lock(&device_mutex); + spin_lock_bh(&device_spinlock); list_add_tail(&device->dev_list, &device_list); - mutex_unlock(&device_mutex); + spin_unlock_bh(&device_spinlock); } EXPORT_SYMBOL(tls_register_device); void tls_unregister_device(struct tls_device *device) { - mutex_lock(&device_mutex); + spin_lock_bh(&device_spinlock); list_del(&device->dev_list); - mutex_unlock(&device_mutex); + spin_unlock_bh(&device_spinlock); } EXPORT_SYMBOL(tls_unregister_device); @@ -721,8 +743,6 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static int __init tls_register(void) { - build_protos(tls_prots[TLSV4], &tcp_prot); - tls_sw_proto_ops = inet_stream_ops; tls_sw_proto_ops.splice_read = tls_sw_splice_read; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7b1af8b59cd2..11cdc8f7db63 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -686,16 +686,24 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; + bool enospc, policy; int err = 0, send; - bool enospc; + u32 delta = 0; + policy = !(flags & MSG_SENDPAGE_NOPOLICY); psock = sk_psock_get(sk); - if (!psock) + if (!psock || !policy) return tls_push_record(sk, flags, record_type); more_data: enospc = sk_msg_full(msg); - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (delta < msg->sg.size) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { err = -ENOSPC; @@ -743,7 +751,7 @@ more_data: msg->apply_bytes -= send; if (msg->sg.size == 0) tls_free_open_rec(sk); - *copied -= send; + *copied -= (send + delta); err = -EACCES; } @@ -935,10 +943,12 @@ fallback_to_reg_send: tls_ctx->tx.overhead_size); } - ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl, - try_to_copy); - if (ret < 0) - goto trim_sgl; + if (try_to_copy) { + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, + msg_pl, try_to_copy); + if (ret < 0) + goto trim_sgl; + } /* Open records defined only if successfully copied, otherwise * we would trim the sg but not reset the open record frags. @@ -1010,8 +1020,8 @@ send_end: return copied ? copied : ret; } -int tls_sw_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) +int tls_sw_do_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) { long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -1026,15 +1036,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, int ret = 0; bool eor; - if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_SENDPAGE_NOTLAST)) - return -ENOTSUPP; - - /* No MSG_EOR from splice, only look at MSG_MORE */ eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); - - lock_sock(sk); - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Wait till there is any pending write on socket */ @@ -1138,10 +1140,34 @@ wait_for_memory: } sendpage_end: ret = sk_stream_error(sk, flags, ret); - release_sock(sk); return copied ? copied : ret; } +int tls_sw_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) + return -ENOTSUPP; + + return tls_sw_do_sendpage(sk, page, offset, size, flags); +} + +int tls_sw_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + int ret; + + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) + return -ENOTSUPP; + + lock_sock(sk); + ret = tls_sw_do_sendpage(sk, page, offset, size, flags); + release_sock(sk); + return ret; +} + static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, int flags, long timeo, int *err) { diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ab27a2872935..43a1dec08825 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -107,6 +107,7 @@ #include <linux/mutex.h> #include <linux/net.h> #include <linux/poll.h> +#include <linux/random.h> #include <linux/skbuff.h> #include <linux/smp.h> #include <linux/socket.h> @@ -504,9 +505,13 @@ out: static int __vsock_bind_stream(struct vsock_sock *vsk, struct sockaddr_vm *addr) { - static u32 port = LAST_RESERVED_PORT + 1; + static u32 port = 0; struct sockaddr_vm new_addr; + if (!port) + port = LAST_RESERVED_PORT + 1 + + prandom_u32_max(U32_MAX - LAST_RESERVED_PORT); + vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); if (addr->svm_port == VMADDR_PORT_ANY) { diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index cb332adb84cd..c361ce782412 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -264,6 +264,31 @@ vmci_transport_send_control_pkt_bh(struct sockaddr_vm *src, } static int +vmci_transport_alloc_send_control_pkt(struct sockaddr_vm *src, + struct sockaddr_vm *dst, + enum vmci_transport_packet_type type, + u64 size, + u64 mode, + struct vmci_transport_waiting_info *wait, + u16 proto, + struct vmci_handle handle) +{ + struct vmci_transport_packet *pkt; + int err; + + pkt = kmalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return -ENOMEM; + + err = __vmci_transport_send_control_pkt(pkt, src, dst, type, size, + mode, wait, proto, handle, + true); + kfree(pkt); + + return err; +} + +static int vmci_transport_send_control_pkt(struct sock *sk, enum vmci_transport_packet_type type, u64 size, @@ -272,9 +297,7 @@ vmci_transport_send_control_pkt(struct sock *sk, u16 proto, struct vmci_handle handle) { - struct vmci_transport_packet *pkt; struct vsock_sock *vsk; - int err; vsk = vsock_sk(sk); @@ -284,17 +307,10 @@ vmci_transport_send_control_pkt(struct sock *sk, if (!vsock_addr_bound(&vsk->remote_addr)) return -EINVAL; - pkt = kmalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) - return -ENOMEM; - - err = __vmci_transport_send_control_pkt(pkt, &vsk->local_addr, - &vsk->remote_addr, type, size, - mode, wait, proto, handle, - true); - kfree(pkt); - - return err; + return vmci_transport_alloc_send_control_pkt(&vsk->local_addr, + &vsk->remote_addr, + type, size, mode, + wait, proto, handle); } static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst, @@ -312,12 +328,29 @@ static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst, static int vmci_transport_send_reset(struct sock *sk, struct vmci_transport_packet *pkt) { + struct sockaddr_vm *dst_ptr; + struct sockaddr_vm dst; + struct vsock_sock *vsk; + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST) return 0; - return vmci_transport_send_control_pkt(sk, - VMCI_TRANSPORT_PACKET_TYPE_RST, - 0, 0, NULL, VSOCK_PROTO_INVALID, - VMCI_INVALID_HANDLE); + + vsk = vsock_sk(sk); + + if (!vsock_addr_bound(&vsk->local_addr)) + return -EINVAL; + + if (vsock_addr_bound(&vsk->remote_addr)) { + dst_ptr = &vsk->remote_addr; + } else { + vsock_addr_init(&dst, pkt->dg.src.context, + pkt->src_port); + dst_ptr = &dst; + } + return vmci_transport_alloc_send_control_pkt(&vsk->local_addr, dst_ptr, + VMCI_TRANSPORT_PACKET_TYPE_RST, + 0, 0, NULL, VSOCK_PROTO_INVALID, + VMCI_INVALID_HANDLE); } static int vmci_transport_send_negotiate(struct sock *sk, size_t size) diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 1d84f91bbfb0..72a224ce8e0a 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o +cfg80211-y += pmsr.o cfg80211-$(CONFIG_OF) += of.o cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 2db713d18f71..7dc1bbd0888f 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,6 +6,7 @@ * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2018 Intel Corporation */ #include <linux/export.h> @@ -747,6 +748,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_20: if (!ht_cap->ht_supported) return false; + /* fall through */ case NL80211_CHAN_WIDTH_20_NOHT: prohibited_flags |= IEEE80211_CHAN_NO_20MHZ; width = 20; @@ -769,6 +771,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; + /* fall through */ case NL80211_CHAN_WIDTH_80: if (!vht_cap->vht_supported) return false; diff --git a/net/wireless/core.c b/net/wireless/core.c index 5bd01058b9e6..623dfe5e211c 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -4,6 +4,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -190,11 +191,25 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, return err; } + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + if (!wdev->netdev) + continue; + nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); + } + nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); + wiphy_net_set(&rdev->wiphy, net); err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev)); WARN_ON(err); + nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + if (!wdev->netdev) + continue; + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); + } + return 0; } @@ -664,6 +679,34 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; #endif + if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported)) + return -EINVAL; + + if (wiphy->pmsr_capa && wiphy->pmsr_capa->ftm.supported) { + if (WARN_ON(!wiphy->pmsr_capa->ftm.asap && + !wiphy->pmsr_capa->ftm.non_asap)) + return -EINVAL; + if (WARN_ON(!wiphy->pmsr_capa->ftm.preambles || + !wiphy->pmsr_capa->ftm.bandwidths)) + return -EINVAL; + if (WARN_ON(wiphy->pmsr_capa->ftm.preambles & + ~(BIT(NL80211_PREAMBLE_LEGACY) | + BIT(NL80211_PREAMBLE_HT) | + BIT(NL80211_PREAMBLE_VHT) | + BIT(NL80211_PREAMBLE_DMG)))) + return -EINVAL; + if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths & + ~(BIT(NL80211_CHAN_WIDTH_20_NOHT) | + BIT(NL80211_CHAN_WIDTH_20) | + BIT(NL80211_CHAN_WIDTH_40) | + BIT(NL80211_CHAN_WIDTH_80) | + BIT(NL80211_CHAN_WIDTH_80P80) | + BIT(NL80211_CHAN_WIDTH_160) | + BIT(NL80211_CHAN_WIDTH_5) | + BIT(NL80211_CHAN_WIDTH_10)))) + return -EINVAL; + } + /* * if a wiphy has unsupported modes for regulatory channel enforcement, * opt-out of enforcement checking @@ -1087,6 +1130,8 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); + cfg80211_pmsr_wdev_down(wdev); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: __cfg80211_leave_ibss(rdev, dev, true); @@ -1174,6 +1219,9 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); spin_lock_init(&wdev->mgmt_registrations_lock); + INIT_LIST_HEAD(&wdev->pmsr_list); + spin_lock_init(&wdev->pmsr_lock); + INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); /* * We get here also when the interface changes network namespaces, diff --git a/net/wireless/core.h b/net/wireless/core.h index c61dbba8bf47..c5d6f3418601 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -3,6 +3,7 @@ * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright (C) 2018 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -530,4 +531,8 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, void cfg80211_cqm_config_free(struct wireless_dev *wdev); +void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid); +void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev); +void cfg80211_pmsr_free_wk(struct work_struct *work); + #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index 6beab0cfcb99..55214fe925b2 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -75,7 +75,7 @@ static void *lib80211_ccmp_init(int key_idx) goto fail; priv->key_idx = key_idx; - priv->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + priv->tfm = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(priv->tfm)) { priv->tfm = NULL; goto fail; diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index b5e235573c8a..35f06563207d 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -99,7 +99,7 @@ static void *lib80211_tkip_init(int key_idx) priv->key_idx = key_idx; - priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(priv->tx_tfm_arc4)) { priv->tx_tfm_arc4 = NULL; goto fail; @@ -111,7 +111,7 @@ static void *lib80211_tkip_init(int key_idx) goto fail; } - priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(priv->rx_tfm_arc4)) { priv->rx_tfm_arc4 = NULL; goto fail; diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c index 6015f6b542a6..20c1ad63ad44 100644 --- a/net/wireless/lib80211_crypt_wep.c +++ b/net/wireless/lib80211_crypt_wep.c @@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx) goto fail; priv->key_idx = keyidx; - priv->tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + priv->tx_tfm = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(priv->tx_tfm)) { priv->tx_tfm = NULL; goto fail; } - priv->rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + priv->rx_tfm = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(priv->rx_tfm)) { priv->rx_tfm = NULL; goto fail; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 12b3edf70a7b..1615e503f8e3 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -272,11 +272,11 @@ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, p1 = (u8*)(ht_capa); p2 = (u8*)(ht_capa_mask); - for (i = 0; i<sizeof(*ht_capa); i++) + for (i = 0; i < sizeof(*ht_capa); i++) p1[i] &= p2[i]; } -/* Do a logical ht_capa &= ht_capa_mask. */ +/* Do a logical vht_capa &= vht_capa_mask. */ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, const struct ieee80211_vht_cap *vht_capa_mask) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 744b5851bbf9..5e49492d5911 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -240,7 +240,63 @@ nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = { .len = U8_MAX }, }; -static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { +static const struct nla_policy +nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { + [NL80211_PMSR_FTM_REQ_ATTR_ASAP] = { .type = NLA_FLAG }, + [NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE] = { .type = NLA_U32 }, + [NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP] = + NLA_POLICY_MAX(NLA_U8, 15), + [NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD] = { .type = NLA_U16 }, + [NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION] = + NLA_POLICY_MAX(NLA_U8, 15), + [NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST] = + NLA_POLICY_MAX(NLA_U8, 15), + [NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES] = { .type = NLA_U8 }, + [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI] = { .type = NLA_FLAG }, + [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_pmsr_req_data_policy[NL80211_PMSR_TYPE_MAX + 1] = { + [NL80211_PMSR_TYPE_FTM] = + NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX, + nl80211_pmsr_ftm_req_attr_policy), +}; + +static const struct nla_policy +nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = { + [NL80211_PMSR_REQ_ATTR_DATA] = + NLA_POLICY_NESTED(NL80211_PMSR_TYPE_MAX, + nl80211_pmsr_req_data_policy), + [NL80211_PMSR_REQ_ATTR_GET_AP_TSF] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { + [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR, + /* + * we could specify this again to be the top-level policy, + * but that would open us up to recursion problems ... + */ + [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_NESTED }, + [NL80211_PMSR_PEER_ATTR_REQ] = + NLA_POLICY_NESTED(NL80211_PMSR_REQ_ATTR_MAX, + nl80211_pmsr_req_attr_policy), + [NL80211_PMSR_PEER_ATTR_RESP] = { .type = NLA_REJECT }, +}; + +static const struct nla_policy +nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = { + [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_REJECT }, + [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_REJECT }, + [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT }, + [NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT }, + [NL80211_PMSR_ATTR_PEERS] = + NLA_POLICY_NESTED_ARRAY(NL80211_PMSR_PEER_ATTR_MAX, + nl80211_psmr_peer_attr_policy), +}; + +const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, .len = 20-1 }, @@ -497,6 +553,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .type = NLA_NESTED, .validation_data = nl80211_ftm_responder_policy, }, + [NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1), + [NL80211_ATTR_PEER_MEASUREMENTS] = + NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX, + nl80211_pmsr_attr_policy), }; /* policy for the key attributes */ @@ -637,9 +697,9 @@ nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = { [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, }; -static int nl80211_prepare_wdev_dump(struct netlink_callback *cb, - struct cfg80211_registered_device **rdev, - struct wireless_dev **wdev) +int nl80211_prepare_wdev_dump(struct netlink_callback *cb, + struct cfg80211_registered_device **rdev, + struct wireless_dev **wdev) { int err; @@ -684,8 +744,8 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb, } /* message building helper */ -static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, - int flags, u8 cmd) +void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, + int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd); @@ -1615,6 +1675,91 @@ static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev, return -ENOBUFS; } +static int +nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap, + struct sk_buff *msg) +{ + struct nlattr *ftm; + + if (!cap->ftm.supported) + return 0; + + ftm = nla_nest_start(msg, NL80211_PMSR_TYPE_FTM); + if (!ftm) + return -ENOBUFS; + + if (cap->ftm.asap && nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_ASAP)) + return -ENOBUFS; + if (cap->ftm.non_asap && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP)) + return -ENOBUFS; + if (cap->ftm.request_lci && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI)) + return -ENOBUFS; + if (cap->ftm.request_civicloc && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES, + cap->ftm.preambles)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS, + cap->ftm.bandwidths)) + return -ENOBUFS; + if (cap->ftm.max_bursts_exponent >= 0 && + nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT, + cap->ftm.max_bursts_exponent)) + return -ENOBUFS; + if (cap->ftm.max_ftms_per_burst && + nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST, + cap->ftm.max_ftms_per_burst)) + return -ENOBUFS; + + nla_nest_end(msg, ftm); + return 0; +} + +static int nl80211_send_pmsr_capa(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) +{ + const struct cfg80211_pmsr_capabilities *cap = rdev->wiphy.pmsr_capa; + struct nlattr *pmsr, *caps; + + if (!cap) + return 0; + + /* + * we don't need to clean up anything here since the caller + * will genlmsg_cancel() if we fail + */ + + pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS); + if (!pmsr) + return -ENOBUFS; + + if (nla_put_u32(msg, NL80211_PMSR_ATTR_MAX_PEERS, cap->max_peers)) + return -ENOBUFS; + + if (cap->report_ap_tsf && + nla_put_flag(msg, NL80211_PMSR_ATTR_REPORT_AP_TSF)) + return -ENOBUFS; + + if (cap->randomize_mac_addr && + nla_put_flag(msg, NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR)) + return -ENOBUFS; + + caps = nla_nest_start(msg, NL80211_PMSR_ATTR_TYPE_CAPA); + if (!caps) + return -ENOBUFS; + + if (nl80211_send_pmsr_ftm_capa(cap, msg)) + return -ENOBUFS; + + nla_nest_end(msg, caps); + nla_nest_end(msg, pmsr); + + return 0; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -1706,6 +1851,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, state->split_start++; if (state->split) break; + /* fall through */ case 1: if (nla_put(msg, NL80211_ATTR_CIPHER_SUITES, sizeof(u32) * rdev->wiphy.n_cipher_suites, @@ -1752,6 +1898,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, state->split_start++; if (state->split) break; + /* fall through */ case 2: if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES, rdev->wiphy.interface_modes)) @@ -1759,6 +1906,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, state->split_start++; if (state->split) break; + /* fall through */ case 3: nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS); if (!nl_bands) @@ -1784,6 +1932,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, state->chan_start++; if (state->split) break; + /* fall through */ default: /* add frequencies */ nl_freqs = nla_nest_start( @@ -1837,6 +1986,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, state->split_start++; if (state->split) break; + /* fall through */ case 4: nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS); if (!nl_cmds) @@ -1863,6 +2013,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, state->split_start++; if (state->split) break; + /* fall through */ case 5: if (rdev->ops->remain_on_channel && (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) && @@ -1880,6 +2031,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, state->split_start++; if (state->split) break; + /* fall through */ case 6: #ifdef CONFIG_PM if (nl80211_send_wowlan(msg, rdev, state->split)) @@ -1890,6 +2042,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, #else state->split_start++; #endif + /* fall through */ case 7: if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES, rdev->wiphy.software_iftypes)) @@ -1902,6 +2055,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, state->split_start++; if (state->split) break; + /* fall through */ case 8: if ((rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) && nla_put_u32(msg, NL80211_ATTR_DEVICE_AP_SME, @@ -2118,6 +2272,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, goto nla_put_failure; } + state->split_start++; + break; + case 14: + if (nl80211_send_pmsr_capa(rdev, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -2318,9 +2478,9 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev) wdev->iftype == NL80211_IFTYPE_P2P_GO; } -static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, - struct genl_info *info, - struct cfg80211_chan_def *chandef) +int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, + struct genl_info *info, + struct cfg80211_chan_def *chandef) { struct netlink_ext_ack *extack = info->extack; struct nlattr **attrs = info->attrs; @@ -2794,12 +2954,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) return 0; } -static inline u64 wdev_id(struct wireless_dev *wdev) -{ - return (u64)wdev->identifier | - ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32); -} - static int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef) { @@ -2832,14 +2986,15 @@ static int nl80211_send_chandef(struct sk_buff *msg, static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, bool removal) + struct wireless_dev *wdev, + enum nl80211_commands cmd) { struct net_device *dev = wdev->netdev; - u8 cmd = NL80211_CMD_NEW_INTERFACE; void *hdr; - if (removal) - cmd = NL80211_CMD_DEL_INTERFACE; + WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE && + cmd != NL80211_CMD_DEL_INTERFACE && + cmd != NL80211_CMD_SET_INTERFACE); hdr = nl80211hdr_put(msg, portid, seq, flags, cmd); if (!hdr) @@ -2987,7 +3142,8 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * } if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - rdev, wdev, false) < 0) { + rdev, wdev, + NL80211_CMD_NEW_INTERFACE) < 0) { goto out; } if_idx++; @@ -3017,7 +3173,7 @@ static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0, - rdev, wdev, false) < 0) { + rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -3207,6 +3363,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (!err && params.use_4addr != -1) dev->ieee80211_ptr->use_4addr = params.use_4addr; + if (change && !err) { + struct wireless_dev *wdev = dev->ieee80211_ptr; + + nl80211_notify_iface(rdev, wdev, NL80211_CMD_SET_INTERFACE); + } + return err; } @@ -3298,7 +3460,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) } if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0, - rdev, wdev, false) < 0) { + rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -4521,8 +4683,7 @@ static int parse_station_flags(struct genl_info *info, return 0; } -static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, - int attr) +bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) { struct nlattr *rate; u32 bitrate; @@ -4731,6 +4892,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(LOCAL_PM, local_pm, u32); PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); + PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8); if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM); @@ -6122,7 +6284,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u16(msg, NL80211_MESHCONF_AWAKE_WINDOW, cur_params.dot11MeshAwakeWindowDuration) || nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT, - cur_params.plink_timeout)) + cur_params.plink_timeout) || + nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, + cur_params.dot11MeshConnectedToMeshGate)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6179,6 +6343,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { NL80211_MESH_POWER_MAX), [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, + [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct nla_policy @@ -6290,6 +6455,9 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, mask, NL80211_MESHCONF_RSSI_THRESHOLD, nla_get_s32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask, + NL80211_MESHCONF_CONNECTED_TO_GATE, + nla_get_u8); /* * Check HT operation mode based on * IEEE 802.11-2016 9.4.2.57 HT Operation element. @@ -6855,8 +7023,8 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy, return 0; } -static int nl80211_parse_random_mac(struct nlattr **attrs, - u8 *mac_addr, u8 *mac_addr_mask) +int nl80211_parse_random_mac(struct nlattr **attrs, + u8 *mac_addr, u8 *mac_addr_mask) { int i; @@ -7822,6 +7990,60 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, return err; } +static int nl80211_notify_radar_detection(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_chan_def chandef; + enum nl80211_dfs_regions dfs_region; + int err; + + dfs_region = reg_get_dfs_region(wiphy); + if (dfs_region == NL80211_DFS_UNSET) { + GENL_SET_ERR_MSG(info, + "DFS Region is not set. Unexpected Radar indication"); + return -EINVAL; + } + + err = nl80211_parse_chandef(rdev, info, &chandef); + if (err) { + GENL_SET_ERR_MSG(info, "Unable to extract chandef info"); + return err; + } + + err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype); + if (err < 0) { + GENL_SET_ERR_MSG(info, "chandef is invalid"); + return err; + } + + if (err == 0) { + GENL_SET_ERR_MSG(info, + "Unexpected Radar indication for chandef/iftype"); + return -EINVAL; + } + + /* Do not process this notification if radar is already detected + * by kernel on this channel, and return success. + */ + if (chandef.chan->dfs_state == NL80211_DFS_UNAVAILABLE) + return 0; + + cfg80211_set_dfs_state(wiphy, &chandef, NL80211_DFS_UNAVAILABLE); + + cfg80211_sched_dfs_chan_update(rdev); + + memcpy(&rdev->radar_chandef, &chandef, sizeof(chandef)); + + /* Propagate this notification to other radios as well */ + queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk); + + return 0; +} + static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -7870,6 +8092,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) } memset(¶ms, 0, sizeof(params)); + params.beacon_csa.ftm_responder = -1; if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]) @@ -8929,8 +9152,10 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) { int r = validate_pae_over_nl80211(rdev, info); - if (r < 0) + if (r < 0) { + kzfree(connkeys); return r; + } ibss.control_port_over_nl80211 = true; } @@ -13898,6 +14123,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_PEER_MEASUREMENT_START, + .doit = nl80211_pmsr_start, + .policy = nl80211_policy, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_NOTIFY_RADAR, + .doit = nl80211_notify_radar_detection, + .policy = nl80211_policy, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -13945,15 +14186,11 @@ void nl80211_notify_iface(struct cfg80211_registered_device *rdev, { struct sk_buff *msg; - WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE && - cmd != NL80211_CMD_DEL_INTERFACE); - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - if (nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, - cmd == NL80211_CMD_DEL_INTERFACE) < 0) { + if (nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, cmd) < 0) { nlmsg_free(msg); return; } @@ -14572,7 +14809,8 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, } void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, - const u8* ie, u8 ie_len, gfp_t gfp) + const u8 *ie, u8 ie_len, + int sig_dbm, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -14598,7 +14836,9 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || (ie_len && ie && - nla_put(msg, NL80211_ATTR_IE, ie_len , ie))) + nla_put(msg, NL80211_ATTR_IE, ie_len, ie)) || + (sig_dbm && + nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -15881,6 +16121,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb, } else if (wdev->conn_owner_nlportid == notify->portid) { schedule_work(&wdev->disconnect_wk); } + + cfg80211_release_pmsr(wdev, notify->portid); } spin_lock_bh(&rdev->beacon_registrations_lock); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 79e47fe60c35..531c82dcba6b 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -1,4 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions of this file + * Copyright (C) 2018 Intel Corporation + */ #ifndef __NET_WIRELESS_NL80211_H #define __NET_WIRELESS_NL80211_H @@ -6,6 +10,30 @@ int nl80211_init(void); void nl80211_exit(void); + +extern const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; + +void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, + int flags, u8 cmd); +bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, + int attr); + +static inline u64 wdev_id(struct wireless_dev *wdev) +{ + return (u64)wdev->identifier | + ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32); +} + +int nl80211_prepare_wdev_dump(struct netlink_callback *cb, + struct cfg80211_registered_device **rdev, + struct wireless_dev **wdev); + +int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, + struct genl_info *info, + struct cfg80211_chan_def *chandef); +int nl80211_parse_random_mac(struct nlattr **attrs, + u8 *mac_addr, u8 *mac_addr_mask); + void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, enum nl80211_commands cmd); void nl80211_notify_iface(struct cfg80211_registered_device *rdev, @@ -95,4 +123,8 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev); void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev); +/* peer measurement */ +int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info); +int nl80211_pmsr_dump_results(struct sk_buff *skb, struct netlink_callback *cb); + #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c new file mode 100644 index 000000000000..de9286703280 --- /dev/null +++ b/net/wireless/pmsr.c @@ -0,0 +1,590 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Intel Corporation + */ +#ifndef __PMSR_H +#define __PMSR_H +#include <net/cfg80211.h> +#include "core.h" +#include "nl80211.h" +#include "rdev-ops.h" + +static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, + struct nlattr *ftmreq, + struct cfg80211_pmsr_request_peer *out, + struct genl_info *info) +{ + const struct cfg80211_pmsr_capabilities *capa = rdev->wiphy.pmsr_capa; + struct nlattr *tb[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1]; + u32 preamble = NL80211_PREAMBLE_DMG; /* only optional in DMG */ + + /* validate existing data */ + if (!(rdev->wiphy.pmsr_capa->ftm.bandwidths & BIT(out->chandef.width))) { + NL_SET_ERR_MSG(info->extack, "FTM: unsupported bandwidth"); + return -EINVAL; + } + + /* no validation needed - was already done via nested policy */ + nla_parse_nested(tb, NL80211_PMSR_FTM_REQ_ATTR_MAX, ftmreq, NULL, NULL); + + if (tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) + preamble = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]); + + /* set up values - struct is 0-initialized */ + out->ftm.requested = true; + + switch (out->chandef.chan->band) { + case NL80211_BAND_60GHZ: + /* optional */ + break; + default: + if (!tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) { + NL_SET_ERR_MSG(info->extack, + "FTM: must specify preamble"); + return -EINVAL; + } + } + + if (!(capa->ftm.preambles & BIT(preamble))) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE], + "FTM: invalid preamble"); + return -EINVAL; + } + + out->ftm.preamble = preamble; + + out->ftm.burst_period = 0; + if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]) + out->ftm.burst_period = + nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]); + + out->ftm.asap = !!tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP]; + if (out->ftm.asap && !capa->ftm.asap) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP], + "FTM: ASAP mode not supported"); + return -EINVAL; + } + + if (!out->ftm.asap && !capa->ftm.non_asap) { + NL_SET_ERR_MSG(info->extack, + "FTM: non-ASAP mode not supported"); + return -EINVAL; + } + + out->ftm.num_bursts_exp = 0; + if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]) + out->ftm.num_bursts_exp = + nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]); + + if (capa->ftm.max_bursts_exponent >= 0 && + out->ftm.num_bursts_exp > capa->ftm.max_bursts_exponent) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP], + "FTM: max NUM_BURSTS_EXP must be set lower than the device limit"); + return -EINVAL; + } + + out->ftm.burst_duration = 15; + if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) + out->ftm.burst_duration = + nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); + + out->ftm.ftms_per_burst = 0; + if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) + out->ftm.ftms_per_burst = + nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); + + if (capa->ftm.max_ftms_per_burst && + (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst || + out->ftm.ftms_per_burst == 0)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST], + "FTM: FTMs per burst must be set lower than the device limit but non-zero"); + return -EINVAL; + } + + out->ftm.ftmr_retries = 3; + if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]) + out->ftm.ftmr_retries = + nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]); + + out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI]; + if (out->ftm.request_lci && !capa->ftm.request_lci) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI], + "FTM: LCI request not supported"); + } + + out->ftm.request_civicloc = + !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC]; + if (out->ftm.request_civicloc && !capa->ftm.request_civicloc) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC], + "FTM: civic location request not supported"); + } + + return 0; +} + +static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, + struct nlattr *peer, + struct cfg80211_pmsr_request_peer *out, + struct genl_info *info) +{ + struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1]; + struct nlattr *req[NL80211_PMSR_REQ_ATTR_MAX + 1]; + struct nlattr *treq; + int err, rem; + + /* no validation needed - was already done via nested policy */ + nla_parse_nested(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, NULL, NULL); + + if (!tb[NL80211_PMSR_PEER_ATTR_ADDR] || + !tb[NL80211_PMSR_PEER_ATTR_CHAN] || + !tb[NL80211_PMSR_PEER_ATTR_REQ]) { + NL_SET_ERR_MSG_ATTR(info->extack, peer, + "insufficient peer data"); + return -EINVAL; + } + + memcpy(out->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]), ETH_ALEN); + + /* reuse info->attrs */ + memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1)); + /* need to validate here, we don't want to have validation recursion */ + err = nla_parse_nested(info->attrs, NL80211_ATTR_MAX, + tb[NL80211_PMSR_PEER_ATTR_CHAN], + nl80211_policy, info->extack); + if (err) + return err; + + err = nl80211_parse_chandef(rdev, info, &out->chandef); + if (err) + return err; + + /* no validation needed - was already done via nested policy */ + nla_parse_nested(req, NL80211_PMSR_REQ_ATTR_MAX, + tb[NL80211_PMSR_PEER_ATTR_REQ], + NULL, NULL); + + if (!req[NL80211_PMSR_REQ_ATTR_DATA]) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_PEER_ATTR_REQ], + "missing request type/data"); + return -EINVAL; + } + + if (req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF]) + out->report_ap_tsf = true; + + if (out->report_ap_tsf && !rdev->wiphy.pmsr_capa->report_ap_tsf) { + NL_SET_ERR_MSG_ATTR(info->extack, + req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF], + "reporting AP TSF is not supported"); + return -EINVAL; + } + + nla_for_each_nested(treq, req[NL80211_PMSR_REQ_ATTR_DATA], rem) { + switch (nla_type(treq)) { + case NL80211_PMSR_TYPE_FTM: + err = pmsr_parse_ftm(rdev, treq, out, info); + break; + default: + NL_SET_ERR_MSG_ATTR(info->extack, treq, + "unsupported measurement type"); + err = -EINVAL; + } + } + + if (err) + return err; + + return 0; +} + +int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *reqattr = info->attrs[NL80211_ATTR_PEER_MEASUREMENTS]; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + struct cfg80211_pmsr_request *req; + struct nlattr *peers, *peer; + int count, rem, err, idx; + + if (!rdev->wiphy.pmsr_capa) + return -EOPNOTSUPP; + + if (!reqattr) + return -EINVAL; + + peers = nla_find(nla_data(reqattr), nla_len(reqattr), + NL80211_PMSR_ATTR_PEERS); + if (!peers) + return -EINVAL; + + count = 0; + nla_for_each_nested(peer, peers, rem) { + count++; + + if (count > rdev->wiphy.pmsr_capa->max_peers) { + NL_SET_ERR_MSG_ATTR(info->extack, peer, + "Too many peers used"); + return -EINVAL; + } + } + + req = kzalloc(struct_size(req, peers, count), GFP_KERNEL); + if (!req) + return -ENOMEM; + + if (info->attrs[NL80211_ATTR_TIMEOUT]) + req->timeout = nla_get_u32(info->attrs[NL80211_ATTR_TIMEOUT]); + + if (info->attrs[NL80211_ATTR_MAC]) { + if (!rdev->wiphy.pmsr_capa->randomize_mac_addr) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[NL80211_ATTR_MAC], + "device cannot randomize MAC address"); + err = -EINVAL; + goto out_err; + } + + err = nl80211_parse_random_mac(info->attrs, req->mac_addr, + req->mac_addr_mask); + if (err) + goto out_err; + } else { + memcpy(req->mac_addr, nla_data(info->attrs[NL80211_ATTR_MAC]), + ETH_ALEN); + memset(req->mac_addr_mask, 0xff, ETH_ALEN); + } + + idx = 0; + nla_for_each_nested(peer, peers, rem) { + /* NB: this reuses info->attrs, but we no longer need it */ + err = pmsr_parse_peer(rdev, peer, &req->peers[idx], info); + if (err) + goto out_err; + idx++; + } + + req->n_peers = count; + req->cookie = cfg80211_assign_cookie(rdev); + + err = rdev_start_pmsr(rdev, wdev, req); + if (err) + goto out_err; + + list_add_tail(&req->list, &wdev->pmsr_list); + + nl_set_extack_cookie_u64(info->extack, req->cookie); + return 0; +out_err: + kfree(req); + return err; +} + +void cfg80211_pmsr_complete(struct wireless_dev *wdev, + struct cfg80211_pmsr_request *req, + gfp_t gfp) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_pmsr_complete(wdev->wiphy, wdev, req->cookie); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + goto free_request; + + hdr = nl80211hdr_put(msg, 0, 0, 0, + NL80211_CMD_PEER_MEASUREMENT_COMPLETE); + if (!hdr) + goto free_msg; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) + goto free_msg; + + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie, + NL80211_ATTR_PAD)) + goto free_msg; + + genlmsg_end(msg, hdr); + genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid); + goto free_request; +free_msg: + nlmsg_free(msg); +free_request: + spin_lock_bh(&wdev->pmsr_lock); + list_del(&req->list); + spin_unlock_bh(&wdev->pmsr_lock); + kfree(req); +} +EXPORT_SYMBOL_GPL(cfg80211_pmsr_complete); + +static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg, + struct cfg80211_pmsr_result *res) +{ + if (res->status == NL80211_PMSR_STATUS_FAILURE) { + if (nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON, + res->ftm.failure_reason)) + goto error; + + if (res->ftm.failure_reason == + NL80211_PMSR_FTM_FAILURE_PEER_BUSY && + res->ftm.busy_retry_time && + nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME, + res->ftm.busy_retry_time)) + goto error; + + return 0; + } + +#define PUT(tp, attr, val) \ + do { \ + if (nla_put_##tp(msg, \ + NL80211_PMSR_FTM_RESP_ATTR_##attr, \ + res->ftm.val)) \ + goto error; \ + } while (0) + +#define PUTOPT(tp, attr, val) \ + do { \ + if (res->ftm.val##_valid) \ + PUT(tp, attr, val); \ + } while (0) + +#define PUT_U64(attr, val) \ + do { \ + if (nla_put_u64_64bit(msg, \ + NL80211_PMSR_FTM_RESP_ATTR_##attr,\ + res->ftm.val, \ + NL80211_PMSR_FTM_RESP_ATTR_PAD)) \ + goto error; \ + } while (0) + +#define PUTOPT_U64(attr, val) \ + do { \ + if (res->ftm.val##_valid) \ + PUT_U64(attr, val); \ + } while (0) + + if (res->ftm.burst_index >= 0) + PUT(u32, BURST_INDEX, burst_index); + PUTOPT(u32, NUM_FTMR_ATTEMPTS, num_ftmr_attempts); + PUTOPT(u32, NUM_FTMR_SUCCESSES, num_ftmr_successes); + PUT(u8, NUM_BURSTS_EXP, num_bursts_exp); + PUT(u8, BURST_DURATION, burst_duration); + PUT(u8, FTMS_PER_BURST, ftms_per_burst); + PUTOPT(s32, RSSI_AVG, rssi_avg); + PUTOPT(s32, RSSI_SPREAD, rssi_spread); + if (res->ftm.tx_rate_valid && + !nl80211_put_sta_rate(msg, &res->ftm.tx_rate, + NL80211_PMSR_FTM_RESP_ATTR_TX_RATE)) + goto error; + if (res->ftm.rx_rate_valid && + !nl80211_put_sta_rate(msg, &res->ftm.rx_rate, + NL80211_PMSR_FTM_RESP_ATTR_RX_RATE)) + goto error; + PUTOPT_U64(RTT_AVG, rtt_avg); + PUTOPT_U64(RTT_VARIANCE, rtt_variance); + PUTOPT_U64(RTT_SPREAD, rtt_spread); + PUTOPT_U64(DIST_AVG, dist_avg); + PUTOPT_U64(DIST_VARIANCE, dist_variance); + PUTOPT_U64(DIST_SPREAD, dist_spread); + if (res->ftm.lci && res->ftm.lci_len && + nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_LCI, + res->ftm.lci_len, res->ftm.lci)) + goto error; + if (res->ftm.civicloc && res->ftm.civicloc_len && + nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC, + res->ftm.civicloc_len, res->ftm.civicloc)) + goto error; +#undef PUT +#undef PUTOPT +#undef PUT_U64 +#undef PUTOPT_U64 + + return 0; +error: + return -ENOSPC; +} + +static int nl80211_pmsr_send_result(struct sk_buff *msg, + struct cfg80211_pmsr_result *res) +{ + struct nlattr *pmsr, *peers, *peer, *resp, *data, *typedata; + + pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS); + if (!pmsr) + goto error; + + peers = nla_nest_start(msg, NL80211_PMSR_ATTR_PEERS); + if (!peers) + goto error; + + peer = nla_nest_start(msg, 1); + if (!peer) + goto error; + + if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, res->addr)) + goto error; + + resp = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_RESP); + if (!resp) + goto error; + + if (nla_put_u32(msg, NL80211_PMSR_RESP_ATTR_STATUS, res->status) || + nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_HOST_TIME, + res->host_time, NL80211_PMSR_RESP_ATTR_PAD)) + goto error; + + if (res->ap_tsf_valid && + nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_AP_TSF, + res->host_time, NL80211_PMSR_RESP_ATTR_PAD)) + goto error; + + if (res->final && nla_put_flag(msg, NL80211_PMSR_RESP_ATTR_FINAL)) + goto error; + + data = nla_nest_start(msg, NL80211_PMSR_RESP_ATTR_DATA); + if (!data) + goto error; + + typedata = nla_nest_start(msg, res->type); + if (!typedata) + goto error; + + switch (res->type) { + case NL80211_PMSR_TYPE_FTM: + if (nl80211_pmsr_send_ftm_res(msg, res)) + goto error; + break; + default: + WARN_ON(1); + } + + nla_nest_end(msg, typedata); + nla_nest_end(msg, data); + nla_nest_end(msg, resp); + nla_nest_end(msg, peer); + nla_nest_end(msg, peers); + nla_nest_end(msg, pmsr); + + return 0; +error: + return -ENOSPC; +} + +void cfg80211_pmsr_report(struct wireless_dev *wdev, + struct cfg80211_pmsr_request *req, + struct cfg80211_pmsr_result *result, + gfp_t gfp) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + int err; + + trace_cfg80211_pmsr_report(wdev->wiphy, wdev, req->cookie, + result->addr); + + /* + * Currently, only variable items are LCI and civic location, + * both of which are reasonably short so we don't need to + * worry about them here for the allocation. + */ + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PEER_MEASUREMENT_RESULT); + if (!hdr) + goto free; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) + goto free; + + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie, + NL80211_ATTR_PAD)) + goto free; + + err = nl80211_pmsr_send_result(msg, result); + if (err) { + pr_err_ratelimited("peer measurement result: message didn't fit!"); + goto free; + } + + genlmsg_end(msg, hdr); + genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid); + return; +free: + nlmsg_free(msg); +} +EXPORT_SYMBOL_GPL(cfg80211_pmsr_report); + +void cfg80211_pmsr_free_wk(struct work_struct *work) +{ + struct wireless_dev *wdev = container_of(work, struct wireless_dev, + pmsr_free_wk); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_pmsr_request *req, *tmp; + LIST_HEAD(free_list); + + spin_lock_bh(&wdev->pmsr_lock); + list_for_each_entry_safe(req, tmp, &wdev->pmsr_list, list) { + if (req->nl_portid) + continue; + list_move_tail(&req->list, &free_list); + } + spin_unlock_bh(&wdev->pmsr_lock); + + list_for_each_entry_safe(req, tmp, &free_list, list) { + wdev_lock(wdev); + rdev_abort_pmsr(rdev, wdev, req); + wdev_unlock(wdev); + + kfree(req); + } +} + +void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) +{ + struct cfg80211_pmsr_request *req; + bool found = false; + + spin_lock_bh(&wdev->pmsr_lock); + list_for_each_entry(req, &wdev->pmsr_list, list) { + found = true; + req->nl_portid = 0; + } + spin_unlock_bh(&wdev->pmsr_lock); + + if (found) + schedule_work(&wdev->pmsr_free_wk); + flush_work(&wdev->pmsr_free_wk); + WARN_ON(!list_empty(&wdev->pmsr_list)); +} + +void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid) +{ + struct cfg80211_pmsr_request *req; + + spin_lock_bh(&wdev->pmsr_lock); + list_for_each_entry(req, &wdev->pmsr_list, list) { + if (req->nl_portid == portid) { + req->nl_portid = 0; + schedule_work(&wdev->pmsr_free_wk); + } + } + spin_unlock_bh(&wdev->pmsr_lock); +} + +#endif /* __PMSR_H */ diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 51380b5c32f2..5cb48d135fab 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1247,4 +1247,29 @@ rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_start_pmsr(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_pmsr_request *request) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie); + if (rdev->ops->start_pmsr) + ret = rdev->ops->start_pmsr(&rdev->wiphy, wdev, request); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline void +rdev_abort_pmsr(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_pmsr_request *request) +{ + trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie); + if (rdev->ops->abort_pmsr) + rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request); + trace_rdev_return_void(&rdev->wiphy); +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d0e7472dd9fd..5123667f4569 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1183,7 +1183,7 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, switch (ftype) { case CFG80211_BSS_FTYPE_BEACON: ies->from_beacon = true; - /* fall through to assign */ + /* fall through */ case CFG80211_BSS_FTYPE_UNKNOWN: rcu_assign_pointer(tmp.pub.beacon_ies, ies); break; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index d536b07582f8..f741d8376a46 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -642,11 +642,15 @@ static bool cfg80211_is_all_idle(void) * All devices must be idle as otherwise if you are actively * scanning some new beacon hints could be learned and would * count as new regulatory hints. + * Also if there is any other active beaconing interface we + * need not issue a disconnect hint and reset any info such + * as chan dfs state, etc. */ list_for_each_entry(rdev, &cfg80211_rdev_list, list) { list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); - if (wdev->conn || wdev->current_bss) + if (wdev->conn || wdev->current_bss || + cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; wdev_unlock(wdev); } @@ -1171,6 +1175,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, cfg80211_oper_and_ht_capa(&connect->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); + cfg80211_oper_and_vht_capa(&connect->vht_capa_mask, + rdev->wiphy.vht_capa_mod_mask); if (connkeys && connkeys->def >= 0) { int idx; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index c6a9446b4e6b..44b2ce1bb13a 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -361,6 +361,24 @@ DECLARE_EVENT_CLASS(wiphy_wdev_evt, TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) ); +DECLARE_EVENT_CLASS(wiphy_wdev_cookie_evt, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), + TP_ARGS(wiphy, wdev, cookie), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u64, cookie) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->cookie = cookie; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %lld", + WIPHY_PR_ARG, WDEV_PR_ARG, + (unsigned long long)__entry->cookie) +); + DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) @@ -770,9 +788,9 @@ DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_set_wds_peer, ); TRACE_EVENT(rdev_dump_station, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *mac), - TP_ARGS(wiphy, netdev, idx, mac), + TP_ARGS(wiphy, netdev, _idx, mac), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -783,7 +801,7 @@ TRACE_EVENT(rdev_dump_station, WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(sta_mac, mac); - __entry->idx = idx; + __entry->idx = _idx; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", idx: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac), @@ -847,9 +865,9 @@ DEFINE_EVENT(mpath_evt, rdev_get_mpath, ); TRACE_EVENT(rdev_dump_mpath, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *dst, u8 *next_hop), - TP_ARGS(wiphy, netdev, idx, dst, next_hop), + TP_ARGS(wiphy, netdev, _idx, dst, next_hop), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -862,7 +880,7 @@ TRACE_EVENT(rdev_dump_mpath, NETDEV_ASSIGN; MAC_ASSIGN(dst, dst); MAC_ASSIGN(next_hop, next_hop); - __entry->idx = idx; + __entry->idx = _idx; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: " MAC_PR_FMT ", next hop: " MAC_PR_FMT, @@ -892,9 +910,9 @@ TRACE_EVENT(rdev_get_mpp, ); TRACE_EVENT(rdev_dump_mpp, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *dst, u8 *mpp), - TP_ARGS(wiphy, netdev, idx, mpp, dst), + TP_ARGS(wiphy, netdev, _idx, mpp, dst), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -907,7 +925,7 @@ TRACE_EVENT(rdev_dump_mpp, NETDEV_ASSIGN; MAC_ASSIGN(dst, dst); MAC_ASSIGN(mpp, mpp); - __entry->idx = idx; + __entry->idx = _idx; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: " MAC_PR_FMT ", mpp: " MAC_PR_FMT, @@ -1673,8 +1691,8 @@ TRACE_EVENT(rdev_tdls_mgmt, ); TRACE_EVENT(rdev_dump_survey, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx), - TP_ARGS(wiphy, netdev, idx), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx), + TP_ARGS(wiphy, netdev, _idx), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -1683,7 +1701,7 @@ TRACE_EVENT(rdev_dump_survey, TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; - __entry->idx = idx; + __entry->idx = _idx; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx) @@ -2502,6 +2520,16 @@ TRACE_EVENT(rdev_get_ftm_responder_stats, __entry->out_of_window) ); +DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_start_pmsr, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), + TP_ARGS(wiphy, wdev, cookie) +); + +DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), + TP_ARGS(wiphy, wdev, cookie) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ @@ -3294,6 +3322,46 @@ TRACE_EVENT(cfg80211_stop_iface, TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) ); + +TRACE_EVENT(cfg80211_pmsr_report, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + u64 cookie, const u8 *addr), + TP_ARGS(wiphy, wdev, cookie, addr), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u64, cookie) + MAC_ENTRY(addr) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->cookie = cookie; + MAC_ASSIGN(addr, addr); + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld, " MAC_PR_FMT, + WIPHY_PR_ARG, WDEV_PR_ARG, + (unsigned long long)__entry->cookie, + MAC_PR_ARG(addr)) +); + +TRACE_EVENT(cfg80211_pmsr_complete, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), + TP_ARGS(wiphy, wdev, cookie), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u64, cookie) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->cookie = cookie; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld", + WIPHY_PR_ARG, WDEV_PR_ARG, + (unsigned long long)__entry->cookie) +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index ef14d80ca03e..cd48cdd582c0 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1421,6 +1421,8 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); + else + break; } } else { pos = skip_ie(ies, ielen, pos); @@ -2013,33 +2015,32 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, case IEEE80211_VHT_CHANWIDTH_160MHZ: if (supp_width == 0 && (ext_nss_bw == 1 || ext_nss_bw == 2)) - return DIV_ROUND_UP(max_vht_nss, 2); + return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) - return DIV_ROUND_UP(3 * max_vht_nss, 4); + return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: - if (supp_width == 0 && - (ext_nss_bw == 1 || ext_nss_bw == 2)) + if (supp_width == 0 && ext_nss_bw == 1) return 0; /* not possible */ if (supp_width == 0 && ext_nss_bw == 2) - return DIV_ROUND_UP(max_vht_nss, 2); + return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) - return DIV_ROUND_UP(3 * max_vht_nss, 4); + return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 0) return 0; /* not possible */ if (supp_width == 1 && ext_nss_bw == 1) - return DIV_ROUND_UP(max_vht_nss, 2); + return max_vht_nss / 2; if (supp_width == 1 && ext_nss_bw == 2) - return DIV_ROUND_UP(3 * max_vht_nss, 4); + return (3 * max_vht_nss) / 4; break; } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d49aa79b7997..5121729b8b63 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -100,7 +100,7 @@ int x25_parse_address_block(struct sk_buff *skb, } len = *skb->data; - needed = 1 + (len >> 4) + (len & 0x0f); + needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims @@ -288,7 +288,7 @@ static struct sock *x25_find_listener(struct x25_address *addr, sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || - !strcmp(addr->x25_addr, + !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* @@ -688,11 +688,15 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - len = strlen(addr->sx25_addr.x25_addr); - for (i = 0; i < len; i++) { - if (!isdigit(addr->sx25_addr.x25_addr[i])) { - rc = -EINVAL; - goto out; + /* check for the null_x25_address */ + if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { + + len = strlen(addr->sx25_addr.x25_addr); + for (i = 0; i < len; i++) { + if (!isdigit(addr->sx25_addr.x25_addr[i])) { + rc = -EINVAL; + goto out; + } } } diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 3c12cae32001..afb26221d8a8 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -142,6 +142,15 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->sk_state_change(sk); break; } + case X25_CALL_REQUEST: + /* call collision */ + x25->causediag.cause = 0x01; + x25->causediag.diagnostic = 0x48; + + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25_disconnect(sk, EISCONN, 0x01, 0x48); + break; + case X25_CLEAR_REQUEST: if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2)) goto out_clear; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 07156f43d295..a03268454a27 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -366,6 +366,7 @@ static int xsk_release(struct socket *sock) xskq_destroy(xs->rx); xskq_destroy(xs->tx); + xdp_put_umem(xs->umem); sock_orphan(sk); sock->sk = NULL; @@ -713,18 +714,6 @@ static const struct proto_ops xsk_proto_ops = { .sendpage = sock_no_sendpage, }; -static void xsk_destruct(struct sock *sk) -{ - struct xdp_sock *xs = xdp_sk(sk); - - if (!sock_flag(sk, SOCK_DEAD)) - return; - - xdp_put_umem(xs->umem); - - sk_refcnt_debug_dec(sk); -} - static int xsk_create(struct net *net, struct socket *sock, int protocol, int kern) { @@ -751,9 +740,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_family = PF_XDP; - sk->sk_destruct = xsk_destruct; - sk_refcnt_debug_inc(sk); - sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 140270a13d54..5d43aaa17027 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -5,6 +5,7 @@ config XFRM bool depends on NET select GRO_CELLS + select SKB_EXTENSIONS config XFRM_OFFLOAD bool diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 144c137886b1..b8736f56e7f7 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -32,6 +32,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur struct softnet_data *sd; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return skb; @@ -39,7 +40,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) return skb; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 684c0bc01e2c..b3b613660d44 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -38,8 +38,6 @@ struct xfrm_trans_cb { #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) -static struct kmem_cache *secpath_cachep __ro_after_init; - static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; @@ -111,56 +109,24 @@ static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, return ret; } -void __secpath_destroy(struct sec_path *sp) +struct sec_path *secpath_set(struct sk_buff *skb) { - int i; - for (i = 0; i < sp->len; i++) - xfrm_state_put(sp->xvec[i]); - kmem_cache_free(secpath_cachep, sp); -} -EXPORT_SYMBOL(__secpath_destroy); + struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH); -struct sec_path *secpath_dup(struct sec_path *src) -{ - struct sec_path *sp; - - sp = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC); + sp = skb_ext_add(skb, SKB_EXT_SEC_PATH); if (!sp) return NULL; - sp->len = 0; - sp->olen = 0; + if (tmp) /* reused existing one (was COW'd if needed) */ + return sp; + /* allocated new secpath */ memset(sp->ovec, 0, sizeof(sp->ovec)); + sp->olen = 0; + sp->len = 0; - if (src) { - int i; - - memcpy(sp, src, sizeof(*sp)); - for (i = 0; i < sp->len; i++) - xfrm_state_hold(sp->xvec[i]); - } - refcount_set(&sp->refcnt, 1); return sp; } -EXPORT_SYMBOL(secpath_dup); - -int secpath_set(struct sk_buff *skb) -{ - struct sec_path *sp; - - /* Allocate new secpath or COW existing one. */ - if (!skb->sp || refcount_read(&skb->sp->refcnt) != 1) { - sp = secpath_dup(skb->sp); - if (!sp) - return -ENOMEM; - - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; - } - return 0; -} EXPORT_SYMBOL(secpath_set); /* Fetch spi and seq from ipsec header */ @@ -236,6 +202,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) bool xfrm_gro = false; bool crypto_done = false; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (encap_type < 0) { x = xfrm_input_state(skb); @@ -312,8 +279,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) break; } - err = secpath_set(skb); - if (err) { + sp = secpath_set(skb); + if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } @@ -328,7 +295,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); do { - if (skb->sp->len == XFRM_MAX_DEPTH) { + sp = skb_sec_path(skb); + + if (sp->len == XFRM_MAX_DEPTH) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; @@ -344,7 +313,13 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->mark = xfrm_smark_get(skb->mark, x); - skb->sp->xvec[skb->sp->len++] = x; + sp->xvec[sp->len++] = x; + + skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); + goto drop; + } lock: spin_lock(&x->lock); @@ -385,7 +360,6 @@ lock: XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; - skb_dst_force(skb); dev_hold(skb->dev); if (crypto_done) @@ -468,8 +442,9 @@ resume: nf_reset(skb); if (decaps) { - if (skb->sp) - skb->sp->olen = 0; + sp = skb_sec_path(skb); + if (sp) + sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; @@ -480,8 +455,9 @@ resume: err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { - if (skb->sp) - skb->sp->olen = 0; + sp = skb_sec_path(skb); + if (sp) + sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; @@ -546,11 +522,6 @@ void __init xfrm_input_init(void) if (err) gro_cells.cells = NULL; - secpath_cachep = kmem_cache_create("secpath_cache", - sizeof(struct sec_path), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); - for_each_possible_cpu(i) { struct xfrm_trans_tasklet *trans; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index d679fa0f44b3..6be8c7df15bb 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -251,7 +251,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) struct xfrm_if *xi; bool xnet; - if (err && !skb->sp) + if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 4ae87c5ce2e3..9333153bafda 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -102,6 +102,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + err = -EHOSTUNREACH; goto error_nolock; } @@ -218,19 +219,16 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) if (xfrm_dev_offload_ok(skb, x)) { struct sec_path *sp; - sp = secpath_dup(skb->sp); + sp = secpath_set(skb); if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); kfree_skb(skb); return -ENOMEM; } - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; skb->encapsulation = 1; sp->olen++; - sp->xvec[skb->sp->len++] = x; + sp->xvec[sp->len++] = x; xfrm_state_hold(x); if (skb_is_gso(skb)) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 119a427d9b2b..934492bad8e0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -26,6 +26,7 @@ #include <linux/cache.h> #include <linux/cpu.h> #include <linux/audit.h> +#include <linux/rhashtable.h> #include <net/dst.h> #include <net/flow.h> #include <net/xfrm.h> @@ -45,6 +46,99 @@ struct xfrm_flo { u8 flags; }; +/* prefixes smaller than this are stored in lists, not trees. */ +#define INEXACT_PREFIXLEN_IPV4 16 +#define INEXACT_PREFIXLEN_IPV6 48 + +struct xfrm_pol_inexact_node { + struct rb_node node; + union { + xfrm_address_t addr; + struct rcu_head rcu; + }; + u8 prefixlen; + + struct rb_root root; + + /* the policies matching this node, can be empty list */ + struct hlist_head hhead; +}; + +/* xfrm inexact policy search tree: + * xfrm_pol_inexact_bin = hash(dir,type,family,if_id); + * | + * +---- root_d: sorted by daddr:prefix + * | | + * | xfrm_pol_inexact_node + * | | + * | +- root: sorted by saddr/prefix + * | | | + * | | xfrm_pol_inexact_node + * | | | + * | | + root: unused + * | | | + * | | + hhead: saddr:daddr policies + * | | + * | +- coarse policies and all any:daddr policies + * | + * +---- root_s: sorted by saddr:prefix + * | | + * | xfrm_pol_inexact_node + * | | + * | + root: unused + * | | + * | + hhead: saddr:any policies + * | + * +---- coarse policies and all any:any policies + * + * Lookups return four candidate lists: + * 1. any:any list from top-level xfrm_pol_inexact_bin + * 2. any:daddr list from daddr tree + * 3. saddr:daddr list from 2nd level daddr tree + * 4. saddr:any list from saddr tree + * + * This result set then needs to be searched for the policy with + * the lowest priority. If two results have same prio, youngest one wins. + */ + +struct xfrm_pol_inexact_key { + possible_net_t net; + u32 if_id; + u16 family; + u8 dir, type; +}; + +struct xfrm_pol_inexact_bin { + struct xfrm_pol_inexact_key k; + struct rhash_head head; + /* list containing '*:*' policies */ + struct hlist_head hhead; + + seqcount_t count; + /* tree sorted by daddr/prefix */ + struct rb_root root_d; + + /* tree sorted by saddr/prefix */ + struct rb_root root_s; + + /* slow path below */ + struct list_head inexact_bins; + struct rcu_head rcu; +}; + +enum xfrm_pol_inexact_candidate_type { + XFRM_POL_CAND_BOTH, + XFRM_POL_CAND_SADDR, + XFRM_POL_CAND_DADDR, + XFRM_POL_CAND_ANY, + + XFRM_POL_CAND_MAX, +}; + +struct xfrm_pol_inexact_candidates { + struct hlist_head *res[XFRM_POL_CAND_MAX]; +}; + static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; @@ -55,6 +149,9 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] static struct kmem_cache *xfrm_dst_cache __ro_after_init; static __read_mostly seqcount_t xfrm_policy_hash_generation; +static struct rhashtable xfrm_policy_inexact_table; +static const struct rhashtable_params xfrm_pol_inexact_params; + static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); @@ -64,6 +161,25 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, + u32 if_id); + +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup_rcu(struct net *net, + u8 type, u16 family, u8 dir, u32 if_id); +static struct xfrm_policy * +xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, + bool excl); +static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, + struct xfrm_policy *policy); + +static bool +xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_pol_inexact_bin *b, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr); + static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); @@ -269,6 +385,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); + INIT_HLIST_NODE(&policy->bydst_inexact_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); @@ -365,7 +482,7 @@ static struct hlist_head *policy_hash_bysel(struct net *net, hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) - return &net->xfrm.policy_inexact[dir]; + return NULL; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; @@ -563,6 +680,533 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_unlock(&hash_resize_mutex); } +static void xfrm_hash_reset_inexact_table(struct net *net) +{ + struct xfrm_pol_inexact_bin *b; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins) + INIT_HLIST_HEAD(&b->hhead); +} + +/* Make sure *pol can be inserted into fastbin. + * Useful to check that later insert requests will be sucessful + * (provided xfrm_policy_lock is held throughout). + */ +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) +{ + struct xfrm_pol_inexact_bin *bin, *prev; + struct xfrm_pol_inexact_key k = { + .family = pol->family, + .type = pol->type, + .dir = dir, + .if_id = pol->if_id, + }; + struct net *net = xp_net(pol); + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + write_pnet(&k.net, net); + bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, + xfrm_pol_inexact_params); + if (bin) + return bin; + + bin = kzalloc(sizeof(*bin), GFP_ATOMIC); + if (!bin) + return NULL; + + bin->k = k; + INIT_HLIST_HEAD(&bin->hhead); + bin->root_d = RB_ROOT; + bin->root_s = RB_ROOT; + seqcount_init(&bin->count); + + prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, + &bin->k, &bin->head, + xfrm_pol_inexact_params); + if (!prev) { + list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); + return bin; + } + + kfree(bin); + + return IS_ERR(prev) ? NULL : prev; +} + +static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr, + int family, u8 prefixlen) +{ + if (xfrm_addr_any(addr, family)) + return true; + + if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6) + return true; + + if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4) + return true; + + return false; +} + +static bool +xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) +{ + const xfrm_address_t *addr; + bool saddr_any, daddr_any; + u8 prefixlen; + + addr = &policy->selector.saddr; + prefixlen = policy->selector.prefixlen_s; + + saddr_any = xfrm_pol_inexact_addr_use_any_list(addr, + policy->family, + prefixlen); + addr = &policy->selector.daddr; + prefixlen = policy->selector.prefixlen_d; + daddr_any = xfrm_pol_inexact_addr_use_any_list(addr, + policy->family, + prefixlen); + return saddr_any && daddr_any; +} + +static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node, + const xfrm_address_t *addr, u8 prefixlen) +{ + node->addr = *addr; + node->prefixlen = prefixlen; +} + +static struct xfrm_pol_inexact_node * +xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) +{ + struct xfrm_pol_inexact_node *node; + + node = kzalloc(sizeof(*node), GFP_ATOMIC); + if (node) + xfrm_pol_inexact_node_init(node, addr, prefixlen); + + return node; +} + +static int xfrm_policy_addr_delta(const xfrm_address_t *a, + const xfrm_address_t *b, + u8 prefixlen, u16 family) +{ + unsigned int pdw, pbi; + int delta = 0; + + switch (family) { + case AF_INET: + if (sizeof(long) == 4 && prefixlen == 0) + return ntohl(a->a4) - ntohl(b->a4); + return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) - + (ntohl(b->a4) & ((~0UL << (32 - prefixlen)))); + case AF_INET6: + pdw = prefixlen >> 5; + pbi = prefixlen & 0x1f; + + if (pdw) { + delta = memcmp(a->a6, b->a6, pdw << 2); + if (delta) + return delta; + } + if (pbi) { + u32 mask = ~0u << (32 - pbi); + + delta = (ntohl(a->a6[pdw]) & mask) - + (ntohl(b->a6[pdw]) & mask); + } + break; + default: + break; + } + + return delta; +} + +static void xfrm_policy_inexact_list_reinsert(struct net *net, + struct xfrm_pol_inexact_node *n, + u16 family) +{ + unsigned int matched_s, matched_d; + struct hlist_node *newpos = NULL; + struct xfrm_policy *policy, *p; + + matched_s = 0; + matched_d = 0; + + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + bool matches_s, matches_d; + + if (!policy->bydst_reinsert) + continue; + + WARN_ON_ONCE(policy->family != family); + + policy->bydst_reinsert = false; + hlist_for_each_entry(p, &n->hhead, bydst) { + if (policy->priority >= p->priority) + newpos = &p->bydst; + else + break; + } + + if (newpos) + hlist_add_behind(&policy->bydst, newpos); + else + hlist_add_head(&policy->bydst, &n->hhead); + + /* paranoia checks follow. + * Check that the reinserted policy matches at least + * saddr or daddr for current node prefix. + * + * Matching both is fine, matching saddr in one policy + * (but not daddr) and then matching only daddr in another + * is a bug. + */ + matches_s = xfrm_policy_addr_delta(&policy->selector.saddr, + &n->addr, + n->prefixlen, + family) == 0; + matches_d = xfrm_policy_addr_delta(&policy->selector.daddr, + &n->addr, + n->prefixlen, + family) == 0; + if (matches_s && matches_d) + continue; + + WARN_ON_ONCE(!matches_s && !matches_d); + if (matches_s) + matched_s++; + if (matches_d) + matched_d++; + WARN_ON_ONCE(matched_s && matched_d); + } +} + +static void xfrm_policy_inexact_node_reinsert(struct net *net, + struct xfrm_pol_inexact_node *n, + struct rb_root *new, + u16 family) +{ + struct rb_node **p, *parent = NULL; + struct xfrm_pol_inexact_node *node; + + /* we should not have another subtree here */ + WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); + + p = &new->rb_node; + while (*p) { + u8 prefixlen; + int delta; + + parent = *p; + node = rb_entry(*p, struct xfrm_pol_inexact_node, node); + + prefixlen = min(node->prefixlen, n->prefixlen); + + delta = xfrm_policy_addr_delta(&n->addr, &node->addr, + prefixlen, family); + if (delta < 0) { + p = &parent->rb_left; + } else if (delta > 0) { + p = &parent->rb_right; + } else { + struct xfrm_policy *tmp; + + hlist_for_each_entry(tmp, &node->hhead, bydst) + tmp->bydst_reinsert = true; + hlist_for_each_entry(tmp, &n->hhead, bydst) + tmp->bydst_reinsert = true; + + INIT_HLIST_HEAD(&node->hhead); + xfrm_policy_inexact_list_reinsert(net, node, family); + + if (node->prefixlen == n->prefixlen) { + kfree_rcu(n, rcu); + return; + } + + rb_erase(*p, new); + kfree_rcu(n, rcu); + n = node; + n->prefixlen = prefixlen; + *p = new->rb_node; + parent = NULL; + } + } + + rb_link_node_rcu(&n->node, parent, p); + rb_insert_color(&n->node, new); +} + +/* merge nodes v and n */ +static void xfrm_policy_inexact_node_merge(struct net *net, + struct xfrm_pol_inexact_node *v, + struct xfrm_pol_inexact_node *n, + u16 family) +{ + struct xfrm_pol_inexact_node *node; + struct xfrm_policy *tmp; + struct rb_node *rnode; + + /* To-be-merged node v has a subtree. + * + * Dismantle it and insert its nodes to n->root. + */ + while ((rnode = rb_first(&v->root)) != NULL) { + node = rb_entry(rnode, struct xfrm_pol_inexact_node, node); + rb_erase(&node->node, &v->root); + xfrm_policy_inexact_node_reinsert(net, node, &n->root, + family); + } + + hlist_for_each_entry(tmp, &v->hhead, bydst) + tmp->bydst_reinsert = true; + hlist_for_each_entry(tmp, &n->hhead, bydst) + tmp->bydst_reinsert = true; + + INIT_HLIST_HEAD(&n->hhead); + xfrm_policy_inexact_list_reinsert(net, n, family); +} + +static struct xfrm_pol_inexact_node * +xfrm_policy_inexact_insert_node(struct net *net, + struct rb_root *root, + xfrm_address_t *addr, + u16 family, u8 prefixlen, u8 dir) +{ + struct xfrm_pol_inexact_node *cached = NULL; + struct rb_node **p, *parent = NULL; + struct xfrm_pol_inexact_node *node; + + p = &root->rb_node; + while (*p) { + int delta; + + parent = *p; + node = rb_entry(*p, struct xfrm_pol_inexact_node, node); + + delta = xfrm_policy_addr_delta(addr, &node->addr, + node->prefixlen, + family); + if (delta == 0 && prefixlen >= node->prefixlen) { + WARN_ON_ONCE(cached); /* ipsec policies got lost */ + return node; + } + + if (delta < 0) + p = &parent->rb_left; + else + p = &parent->rb_right; + + if (prefixlen < node->prefixlen) { + delta = xfrm_policy_addr_delta(addr, &node->addr, + prefixlen, + family); + if (delta) + continue; + + /* This node is a subnet of the new prefix. It needs + * to be removed and re-inserted with the smaller + * prefix and all nodes that are now also covered + * by the reduced prefixlen. + */ + rb_erase(&node->node, root); + + if (!cached) { + xfrm_pol_inexact_node_init(node, addr, + prefixlen); + cached = node; + } else { + /* This node also falls within the new + * prefixlen. Merge the to-be-reinserted + * node and this one. + */ + xfrm_policy_inexact_node_merge(net, node, + cached, family); + kfree_rcu(node, rcu); + } + + /* restart */ + p = &root->rb_node; + parent = NULL; + } + } + + node = cached; + if (!node) { + node = xfrm_pol_inexact_node_alloc(addr, prefixlen); + if (!node) + return NULL; + } + + rb_link_node_rcu(&node->node, parent, p); + rb_insert_color(&node->node, root); + + return node; +} + +static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) +{ + struct xfrm_pol_inexact_node *node; + struct rb_node *rn = rb_first(r); + + while (rn) { + node = rb_entry(rn, struct xfrm_pol_inexact_node, node); + + xfrm_policy_inexact_gc_tree(&node->root, rm); + rn = rb_next(rn); + + if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) { + WARN_ON_ONCE(rm); + continue; + } + + rb_erase(&node->node, r); + kfree_rcu(node, rcu); + } +} + +static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) +{ + write_seqcount_begin(&b->count); + xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); + xfrm_policy_inexact_gc_tree(&b->root_s, net_exit); + write_seqcount_end(&b->count); + + if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) || + !hlist_empty(&b->hhead)) { + WARN_ON_ONCE(net_exit); + return; + } + + if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, + xfrm_pol_inexact_params) == 0) { + list_del(&b->inexact_bins); + kfree_rcu(b, rcu); + } +} + +static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b) +{ + struct net *net = read_pnet(&b->k.net); + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + __xfrm_policy_inexact_prune_bin(b, false); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); +} + +static void __xfrm_policy_inexact_flush(struct net *net) +{ + struct xfrm_pol_inexact_bin *bin, *t; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins) + __xfrm_policy_inexact_prune_bin(bin, false); +} + +static struct hlist_head * +xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, + struct xfrm_policy *policy, u8 dir) +{ + struct xfrm_pol_inexact_node *n; + struct net *net; + + net = xp_net(policy); + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + if (xfrm_policy_inexact_insert_use_any_list(policy)) + return &bin->hhead; + + if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, + policy->family, + policy->selector.prefixlen_d)) { + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &bin->root_s, + &policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s, + dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + + return &n->hhead; + } + + /* daddr is fixed */ + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &bin->root_d, + &policy->selector.daddr, + policy->family, + policy->selector.prefixlen_d, dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + + /* saddr is wildcard */ + if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s)) + return &n->hhead; + + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &n->root, + &policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s, dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + + return &n->hhead; +} + +static struct xfrm_policy * +xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) +{ + struct xfrm_pol_inexact_bin *bin; + struct xfrm_policy *delpol; + struct hlist_head *chain; + struct net *net; + + bin = xfrm_policy_inexact_alloc_bin(policy, dir); + if (!bin) + return ERR_PTR(-ENOMEM); + + net = xp_net(policy); + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir); + if (!chain) { + __xfrm_policy_inexact_prune_bin(bin, false); + return ERR_PTR(-ENOMEM); + } + + delpol = xfrm_policy_insert_list(chain, policy, excl); + if (delpol && excl) { + __xfrm_policy_inexact_prune_bin(bin, false); + return ERR_PTR(-EEXIST); + } + + chain = &net->xfrm.policy_inexact[dir]; + xfrm_policy_insert_inexact_list(chain, policy); + + if (delpol) + __xfrm_policy_inexact_prune_bin(bin, false); + + return delpol; +} + static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, @@ -592,7 +1236,50 @@ static void xfrm_hash_rebuild(struct work_struct *work) spin_lock_bh(&net->xfrm.xfrm_policy_lock); + /* make sure that we can insert the indirect policies again before + * we start with destructive action. + */ + list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { + struct xfrm_pol_inexact_bin *bin; + u8 dbits, sbits; + + dir = xfrm_policy_id2dir(policy->index); + if (policy->walk.dead || dir >= XFRM_POLICY_MAX) + continue; + + if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { + if (policy->family == AF_INET) { + dbits = rbits4; + sbits = lbits4; + } else { + dbits = rbits6; + sbits = lbits6; + } + } else { + if (policy->family == AF_INET) { + dbits = lbits4; + sbits = rbits4; + } else { + dbits = lbits6; + sbits = rbits6; + } + } + + if (policy->selector.prefixlen_d < dbits || + policy->selector.prefixlen_s < sbits) + continue; + + bin = xfrm_policy_inexact_alloc_bin(policy, dir); + if (!bin) + goto out_unlock; + + if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir)) + goto out_unlock; + } + /* reset the bydst and inexact table in all directions */ + xfrm_hash_reset_inexact_table(net); + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); hmask = net->xfrm.policy_bydst[dir].hmask; @@ -616,15 +1303,23 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { - if (policy->walk.dead || - xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + if (policy->walk.dead) + continue; + dir = xfrm_policy_id2dir(policy->index); + if (dir >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, - policy->family, - xfrm_policy_id2dir(policy->index)); + policy->family, dir); + if (!chain) { + void *p = xfrm_policy_inexact_insert(policy, dir, 0); + + WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); + continue; + } + hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; @@ -637,6 +1332,8 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_add_head_rcu(&policy->bydst, chain); } +out_unlock: + __xfrm_policy_inexact_flush(net); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); @@ -740,18 +1437,97 @@ static bool xfrm_policy_mark_match(struct xfrm_policy *policy, return false; } -int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) { - struct net *net = xp_net(policy); - struct xfrm_policy *pol; - struct xfrm_policy *delpol; - struct hlist_head *chain; - struct hlist_node *newpos; + const struct xfrm_pol_inexact_key *k = data; + u32 a = k->type << 24 | k->dir << 16 | k->family; + + return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)), + seed); +} + +static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) +{ + const struct xfrm_pol_inexact_bin *b = data; + + return xfrm_pol_bin_key(&b->k, 0, seed); +} + +static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xfrm_pol_inexact_key *key = arg->key; + const struct xfrm_pol_inexact_bin *b = ptr; + int ret; + + if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) + return -1; + + ret = b->k.dir ^ key->dir; + if (ret) + return ret; + + ret = b->k.type ^ key->type; + if (ret) + return ret; + + ret = b->k.family ^ key->family; + if (ret) + return ret; + + return b->k.if_id ^ key->if_id; +} + +static const struct rhashtable_params xfrm_pol_inexact_params = { + .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), + .hashfn = xfrm_pol_bin_key, + .obj_hashfn = xfrm_pol_bin_obj, + .obj_cmpfn = xfrm_pol_bin_cmp, + .automatic_shrinking = true, +}; + +static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, + struct xfrm_policy *policy) +{ + struct xfrm_policy *pol, *delpol = NULL; + struct hlist_node *newpos = NULL; + int i = 0; + + hlist_for_each_entry(pol, chain, bydst_inexact_list) { + if (pol->type == policy->type && + pol->if_id == policy->if_id && + !selector_cmp(&pol->selector, &policy->selector) && + xfrm_policy_mark_match(policy, pol) && + xfrm_sec_ctx_match(pol->security, policy->security) && + !WARN_ON(delpol)) { + delpol = pol; + if (policy->priority > pol->priority) + continue; + } else if (policy->priority >= pol->priority) { + newpos = &pol->bydst_inexact_list; + continue; + } + if (delpol) + break; + } + + if (newpos) + hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); + else + hlist_add_head_rcu(&policy->bydst_inexact_list, chain); + + hlist_for_each_entry(pol, chain, bydst_inexact_list) { + pol->pos = i; + i++; + } +} + +static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, + struct xfrm_policy *policy, + bool excl) +{ + struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL; - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); - delpol = NULL; - newpos = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && @@ -759,24 +1535,45 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_policy_mark_match(policy, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { - if (excl) { - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - return -EEXIST; - } + if (excl) + return ERR_PTR(-EEXIST); delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { - newpos = &pol->bydst; + newpos = pol; continue; } if (delpol) break; } + if (newpos) - hlist_add_behind_rcu(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else hlist_add_head_rcu(&policy->bydst, chain); + + return delpol; +} + +int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +{ + struct net *net = xp_net(policy); + struct xfrm_policy *delpol; + struct hlist_head *chain; + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); + if (chain) + delpol = xfrm_policy_insert_list(chain, policy, excl); + else + delpol = xfrm_policy_inexact_insert(policy, dir, excl); + + if (IS_ERR(delpol)) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return PTR_ERR(delpol); + } + __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ @@ -806,43 +1603,96 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); +static struct xfrm_policy * +__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id, + u8 type, int dir, + struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx) +{ + struct xfrm_policy *pol; + + if (!chain) + return NULL; + + hlist_for_each_entry(pol, chain, bydst) { + if (pol->type == type && + pol->if_id == if_id && + (mark & pol->mark.m) == pol->mark.v && + !selector_cmp(sel, &pol->selector) && + xfrm_sec_ctx_match(ctx, pol->security)) + return pol; + } + + return NULL; +} + struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { - struct xfrm_policy *pol, *ret; + struct xfrm_pol_inexact_bin *bin = NULL; + struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); - ret = NULL; - hlist_for_each_entry(pol, chain, bydst) { - if (pol->type == type && - pol->if_id == if_id && - (mark & pol->mark.m) == pol->mark.v && - !selector_cmp(sel, &pol->selector) && - xfrm_sec_ctx_match(ctx, pol->security)) { - xfrm_pol_hold(pol); - if (delete) { - *err = security_xfrm_policy_delete( - pol->security); - if (*err) { - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - return pol; - } - __xfrm_policy_unlink(pol, dir); + if (!chain) { + struct xfrm_pol_inexact_candidates cand; + int i; + + bin = xfrm_policy_inexact_lookup(net, type, + sel->family, dir, if_id); + if (!bin) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return NULL; + } + + if (!xfrm_policy_find_inexact_candidates(&cand, bin, + &sel->saddr, + &sel->daddr)) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return NULL; + } + + pol = NULL; + for (i = 0; i < ARRAY_SIZE(cand.res); i++) { + struct xfrm_policy *tmp; + + tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, + if_id, type, dir, + sel, ctx); + if (!tmp) + continue; + + if (!pol || tmp->pos < pol->pos) + pol = tmp; + } + } else { + pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir, + sel, ctx); + } + + if (pol) { + xfrm_pol_hold(pol); + if (delete) { + *err = security_xfrm_policy_delete(pol->security); + if (*err) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return pol; } - ret = pol; - break; + __xfrm_policy_unlink(pol, dir); } + ret = pol; } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); + if (bin && delete) + xfrm_policy_inexact_prune_bin(bin); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); @@ -892,36 +1742,19 @@ EXPORT_SYMBOL(xfrm_policy_byid); static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { - int dir, err = 0; + struct xfrm_policy *pol; + int err = 0; - for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - struct xfrm_policy *pol; - int i; + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead || + xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || + pol->type != type) + continue; - hlist_for_each_entry(pol, - &net->xfrm.policy_inexact[dir], bydst) { - if (pol->type != type) - continue; - err = security_xfrm_policy_delete(pol->security); - if (err) { - xfrm_audit_policy_delete(pol, 0, task_valid); - return err; - } - } - for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { - hlist_for_each_entry(pol, - net->xfrm.policy_bydst[dir].table + i, - bydst) { - if (pol->type != type) - continue; - err = security_xfrm_policy_delete( - pol->security); - if (err) { - xfrm_audit_policy_delete(pol, 0, - task_valid); - return err; - } - } + err = security_xfrm_policy_delete(pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, task_valid); + return err; } } return err; @@ -937,6 +1770,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; + struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); @@ -944,48 +1778,25 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (err) goto out; - for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - struct xfrm_policy *pol; - int i; - - again1: - hlist_for_each_entry(pol, - &net->xfrm.policy_inexact[dir], bydst) { - if (pol->type != type) - continue; - __xfrm_policy_unlink(pol, dir); - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - cnt++; - - xfrm_audit_policy_delete(pol, 1, task_valid); - - xfrm_policy_kill(pol); - - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - goto again1; - } - - for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { - again2: - hlist_for_each_entry(pol, - net->xfrm.policy_bydst[dir].table + i, - bydst) { - if (pol->type != type) - continue; - __xfrm_policy_unlink(pol, dir); - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - cnt++; - - xfrm_audit_policy_delete(pol, 1, task_valid); - xfrm_policy_kill(pol); - - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - goto again2; - } - } +again: + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + dir = xfrm_policy_id2dir(pol->index); + if (pol->walk.dead || + dir >= XFRM_POLICY_MAX || + pol->type != type) + continue; + __xfrm_policy_unlink(pol, dir); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + cnt++; + xfrm_audit_policy_delete(pol, 1, task_valid); + xfrm_policy_kill(pol); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + goto again; } - if (!cnt) + if (cnt) + __xfrm_policy_inexact_flush(net); + else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -1084,21 +1895,188 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, dir); - return ret; } +static struct xfrm_pol_inexact_node * +xfrm_policy_lookup_inexact_addr(const struct rb_root *r, + seqcount_t *count, + const xfrm_address_t *addr, u16 family) +{ + const struct rb_node *parent; + int seq; + +again: + seq = read_seqcount_begin(count); + + parent = rcu_dereference_raw(r->rb_node); + while (parent) { + struct xfrm_pol_inexact_node *node; + int delta; + + node = rb_entry(parent, struct xfrm_pol_inexact_node, node); + + delta = xfrm_policy_addr_delta(addr, &node->addr, + node->prefixlen, family); + if (delta < 0) { + parent = rcu_dereference_raw(parent->rb_left); + continue; + } else if (delta > 0) { + parent = rcu_dereference_raw(parent->rb_right); + continue; + } + + return node; + } + + if (read_seqcount_retry(count, seq)) + goto again; + + return NULL; +} + +static bool +xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_pol_inexact_bin *b, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + struct xfrm_pol_inexact_node *n; + u16 family; + + if (!b) + return false; + + family = b->k.family; + memset(cand, 0, sizeof(*cand)); + cand->res[XFRM_POL_CAND_ANY] = &b->hhead; + + n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, + family); + if (n) { + cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; + n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr, + family); + if (n) + cand->res[XFRM_POL_CAND_BOTH] = &n->hhead; + } + + n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, + family); + if (n) + cand->res[XFRM_POL_CAND_SADDR] = &n->hhead; + + return true; +} + +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, + u8 dir, u32 if_id) +{ + struct xfrm_pol_inexact_key k = { + .family = family, + .type = type, + .dir = dir, + .if_id = if_id, + }; + + write_pnet(&k.net, net); + + return rhashtable_lookup(&xfrm_policy_inexact_table, &k, + xfrm_pol_inexact_params); +} + +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, + u8 dir, u32 if_id) +{ + struct xfrm_pol_inexact_bin *bin; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + rcu_read_lock(); + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); + rcu_read_unlock(); + + return bin; +} + +static struct xfrm_policy * +__xfrm_policy_eval_candidates(struct hlist_head *chain, + struct xfrm_policy *prefer, + const struct flowi *fl, + u8 type, u16 family, int dir, u32 if_id) +{ + u32 priority = prefer ? prefer->priority : ~0u; + struct xfrm_policy *pol; + + if (!chain) + return NULL; + + hlist_for_each_entry_rcu(pol, chain, bydst) { + int err; + + if (pol->priority > priority) + break; + + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); + if (err) { + if (err != -ESRCH) + return ERR_PTR(err); + + continue; + } + + if (prefer) { + /* matches. Is it older than *prefer? */ + if (pol->priority == priority && + prefer->pos < pol->pos) + return prefer; + } + + return pol; + } + + return NULL; +} + +static struct xfrm_policy * +xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_policy *prefer, + const struct flowi *fl, + u8 type, u16 family, int dir, u32 if_id) +{ + struct xfrm_policy *tmp; + int i; + + for (i = 0; i < ARRAY_SIZE(cand->res); i++) { + tmp = __xfrm_policy_eval_candidates(cand->res[i], + prefer, + fl, type, family, dir, + if_id); + if (!tmp) + continue; + + if (IS_ERR(tmp)) + return tmp; + prefer = tmp; + } + + return prefer; +} + static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { - int err; - struct xfrm_policy *pol, *ret; + struct xfrm_pol_inexact_candidates cand; const xfrm_address_t *daddr, *saddr; + struct xfrm_pol_inexact_bin *bin; + struct xfrm_policy *pol, *ret; struct hlist_head *chain; unsigned int sequence; - u32 priority; + int err; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); @@ -1112,7 +2090,6 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, chain = policy_hash_direct(net, daddr, saddr, family, dir); } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); - priority = ~0U; ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir, if_id); @@ -1125,29 +2102,23 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } } else { ret = pol; - priority = ret->priority; break; } } - chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry_rcu(pol, chain, bydst) { - if ((pol->priority >= priority) && ret) - break; + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); + if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, + daddr)) + goto skip_inexact; - err = xfrm_policy_match(pol, fl, type, family, dir, if_id); - if (err) { - if (err == -ESRCH) - continue; - else { - ret = ERR_PTR(err); - goto fail; - } - } else { - ret = pol; - break; - } + pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, + family, dir, if_id); + if (pol) { + ret = pol; + if (IS_ERR(pol)) + goto fail; } +skip_inexact: if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) goto retry; @@ -1239,6 +2210,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); + hlist_del_init(&pol->bydst_inexact_list); hlist_del(&pol->byidx); } @@ -1811,7 +2783,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); - goto out; + goto out; } dst_release(dst); @@ -2225,11 +3197,12 @@ EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { + struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; - if (!skb->sp || idx < 0 || idx >= skb->sp->len) + if (!sp || idx < 0 || idx >= sp->len) return 0; - x = skb->sp->xvec[idx]; + x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); @@ -2329,6 +3302,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; + struct sec_path *sp; struct xfrm_if *xi; u32 if_id = 0; @@ -2353,11 +3327,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ - if (skb->sp) { + sp = skb_sec_path(skb); + if (sp) { int i; - for (i = skb->sp->len-1; i >= 0; i--) { - struct xfrm_state *x = skb->sp->xvec[i]; + for (i = sp->len - 1; i >= 0; i--) { + struct xfrm_state *x = sp->xvec[i]; if (!xfrm_selector_match(&x->sel, &fl, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; @@ -2384,7 +3359,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) { - if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) { + if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; @@ -2413,7 +3388,6 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, #endif if (pol->action == XFRM_POLICY_ALLOW) { - struct sec_path *sp; static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; @@ -2421,7 +3395,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int ti = 0; int i, k; - if ((sp = skb->sp) == NULL) + sp = skb_sec_path(skb); + if (!sp) sp = &dummy; for (pi = 0; pi < npols; pi++) { @@ -2816,13 +3791,17 @@ static void xfrm_statistics_fini(struct net *net) static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; - int dir; + int dir, err; - if (net_eq(net, &init_net)) + if (net_eq(net, &init_net)) { xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", sizeof(struct xfrm_dst), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + err = rhashtable_init(&xfrm_policy_inexact_table, + &xfrm_pol_inexact_params); + BUG_ON(err); + } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); @@ -2857,6 +3836,7 @@ static int __net_init xfrm_policy_init(struct net *net) seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); + INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; @@ -2875,6 +3855,7 @@ out_byidx: static void xfrm_policy_fini(struct net *net) { + struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; @@ -2900,6 +3881,11 @@ static void xfrm_policy_fini(struct net *net) sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) + __xfrm_policy_inexact_prune_bin(b, true); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) @@ -3065,7 +4051,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * } } chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry(pol, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst_inexact_list) { if ((pol->priority >= priority) && ret) break; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index dc4a9f1fb941..23c92891758a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -426,6 +426,12 @@ static void xfrm_put_mode(struct xfrm_mode *mode) module_put(mode->owner); } +void xfrm_state_free(struct xfrm_state *x) +{ + kmem_cache_free(xfrm_state_cache, x); +} +EXPORT_SYMBOL(xfrm_state_free); + static void xfrm_state_gc_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); @@ -452,7 +458,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) } xfrm_dev_state_free(x); security_xfrm_state_free(x); - kmem_cache_free(xfrm_state_cache, x); + xfrm_state_free(x); } static void xfrm_state_gc_task(struct work_struct *work) @@ -788,7 +794,7 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) { spin_lock_bh(&net->xfrm.xfrm_state_lock); si->sadcnt = net->xfrm.state_num; - si->sadhcnt = net->xfrm.state_hmask; + si->sadhcnt = net->xfrm.state_hmask + 1; si->sadhmcnt = xfrm_state_hashmax; spin_unlock_bh(&net->xfrm.xfrm_state_lock); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c9a84e22f5d5..277c1c46fe94 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2288,13 +2288,13 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, } - kfree(x); + xfrm_state_free(x); kfree(xp); return 0; free_state: - kfree(x); + xfrm_state_free(x); nomem: return err; } |