Diffstat (limited to 'net')
262 files changed, 6652 insertions, 3927 deletions
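The first hunk below (net/8021q/vlan_dev.c) moves VLAN hardware timestamping off the SIOCSHWTSTAMP/SIOCGHWTSTAMP ioctl path and onto the dedicated ndo_hwtstamp_get()/ndo_hwtstamp_set() callbacks, delegating to the real device through the generic lower-device helpers. A minimal sketch of that delegation pattern for a hypothetical stacked driver follows; the "foo" names and struct foo_priv are illustrative, while generic_hwtstamp_get_lower()/generic_hwtstamp_set_lower() are the helpers the hunk itself uses.

	/* Editorial sketch, not part of the commit: forwarding hwtstamp
	 * requests from a stacked netdev to its lower device. Only the
	 * generic_hwtstamp_*_lower() helpers are real kernel API here.
	 */
	#include <linux/netdevice.h>
	#include <linux/net_tstamp.h>
	#include <net/net_namespace.h>

	struct foo_priv {
		struct net_device *lower_dev;	/* device doing the timestamping */
	};

	static int foo_hwtstamp_get(struct net_device *dev,
				    struct kernel_hwtstamp_config *cfg)
	{
		struct foo_priv *priv = netdev_priv(dev);

		return generic_hwtstamp_get_lower(priv->lower_dev, cfg);
	}

	static int foo_hwtstamp_set(struct net_device *dev,
				    struct kernel_hwtstamp_config *cfg,
				    struct netlink_ext_ack *extack)
	{
		struct foo_priv *priv = netdev_priv(dev);

		/* Same cross-netns guard the vlan hunk carries over from
		 * the old ioctl path.
		 */
		if (!net_eq(dev_net(dev), dev_net(priv->lower_dev)))
			return -EOPNOTSUPP;

		return generic_hwtstamp_set_lower(priv->lower_dev, cfg, extack);
	}

As in the vlan hunk, the two callbacks are then wired into the driver's net_device_ops.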
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b90781b9ece6..2a7f1b15714a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -354,6 +354,26 @@ out: return 0; } +static int vlan_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + + return generic_hwtstamp_get_lower(real_dev, cfg); +} + +static int vlan_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + + if (!net_eq(dev_net(dev), dev_net(real_dev))) + return -EOPNOTSUPP; + + return generic_hwtstamp_set_lower(real_dev, cfg, extack); +} + static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; @@ -365,14 +385,9 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { - case SIOCSHWTSTAMP: - if (!net_eq(dev_net(dev), dev_net(real_dev))) - break; - fallthrough; case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: - case SIOCGHWTSTAMP: if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; @@ -1081,6 +1096,8 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, .ndo_fill_forward_path = vlan_dev_fill_forward_path, + .ndo_hwtstamp_get = vlan_hwtstamp_get, + .ndo_hwtstamp_set = vlan_hwtstamp_set, }; static void vlan_dev_free(struct net_device *dev) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 00b684616e8d..c4015f30f9fa 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -1019,7 +1019,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } } - err = csocket->ops->connect(csocket, + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sin_server, sizeof(struct sockaddr_in), 0); if (err < 0) { @@ -1060,7 +1060,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) return err; } - err = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server, + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server, sizeof(struct sockaddr_un) - 1, 0); if (err < 0) { pr_err("%s (%d): problem connecting socket: %s: %d\n", diff --git a/net/Kconfig b/net/Kconfig index 2fb25b534df5..d532ec33f1fe 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -52,6 +52,11 @@ config NET_INGRESS config NET_EGRESS bool +config NET_XGRESS + select NET_INGRESS + select NET_EGRESS + bool + config NET_REDIRECT bool diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 828fb393ee94..74b49c35ddc1 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2516,6 +2516,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { }, .gw = { .init_sel_class = batadv_iv_init_sel_class, + .sel_class_max = BATADV_TQ_MAX_VALUE, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, .dump = batadv_iv_gw_dump, diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 54e41fc709c3..ac11f1f08db0 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> @@ -34,7 +35,6 @@ 
#include "bat_v_elp.h" #include "bat_v_ogm.h" #include "gateway_client.h" -#include "gateway_common.h" #include "hard-interface.h" #include "hash.h" #include "log.h" @@ -512,25 +512,6 @@ static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) atomic_set(&bat_priv->gw.sel_class, 50); } -static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv, - char *buff, size_t count) -{ - u32 old_class, class; - - if (!batadv_parse_throughput(bat_priv->soft_iface, buff, - "B.A.T.M.A.N. V GW selection class", - &class)) - return -EINVAL; - - old_class = atomic_read(&bat_priv->gw.sel_class); - atomic_set(&bat_priv->gw.sel_class, class); - - if (old_class != class) - batadv_gw_reselect(bat_priv); - - return count; -} - /** * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW * @gw_node: the GW to retrieve the metric for @@ -818,7 +799,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { }, .gw = { .init_sel_class = batadv_v_init_sel_class, - .store_sel_class = batadv_v_store_sel_class, + .sel_class_max = U32_MAX, .get_best_gw_node = batadv_v_gw_get_best_gw_node, .is_eligible = batadv_v_gw_is_eligible, .dump = batadv_v_gw_dump, diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 6a964a773f57..2dd36ef03c84 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -9,124 +9,15 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> -#include <linux/errno.h> -#include <linux/kstrtox.h> -#include <linux/limits.h> -#include <linux/math64.h> -#include <linux/netdevice.h> #include <linux/stddef.h> -#include <linux/string.h> +#include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "gateway_client.h" -#include "log.h" #include "tvlv.h" /** - * batadv_parse_throughput() - parse supplied string buffer to extract - * throughput information - * @net_dev: the soft interface net device - * @buff: string buffer to parse - * @description: text shown when throughput string cannot be parsed - * @throughput: pointer holding the returned throughput information - * - * Return: false on parse error and true otherwise. 
- */ -bool batadv_parse_throughput(struct net_device *net_dev, char *buff, - const char *description, u32 *throughput) -{ - enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT; - u64 lthroughput; - char *tmp_ptr; - int ret; - - if (strlen(buff) > 4) { - tmp_ptr = buff + strlen(buff) - 4; - - if (strncasecmp(tmp_ptr, "mbit", 4) == 0) - bw_unit_type = BATADV_BW_UNIT_MBIT; - - if (strncasecmp(tmp_ptr, "kbit", 4) == 0 || - bw_unit_type == BATADV_BW_UNIT_MBIT) - *tmp_ptr = '\0'; - } - - ret = kstrtou64(buff, 10, &lthroughput); - if (ret) { - batadv_err(net_dev, - "Invalid throughput speed for %s: %s\n", - description, buff); - return false; - } - - switch (bw_unit_type) { - case BATADV_BW_UNIT_MBIT: - /* prevent overflow */ - if (U64_MAX / 10 < lthroughput) { - batadv_err(net_dev, - "Throughput speed for %s too large: %s\n", - description, buff); - return false; - } - - lthroughput *= 10; - break; - case BATADV_BW_UNIT_KBIT: - default: - lthroughput = div_u64(lthroughput, 100); - break; - } - - if (lthroughput > U32_MAX) { - batadv_err(net_dev, - "Throughput speed for %s too large: %s\n", - description, buff); - return false; - } - - *throughput = lthroughput; - - return true; -} - -/** - * batadv_parse_gw_bandwidth() - parse supplied string buffer to extract - * download and upload bandwidth information - * @net_dev: the soft interface net device - * @buff: string buffer to parse - * @down: pointer holding the returned download bandwidth information - * @up: pointer holding the returned upload bandwidth information - * - * Return: false on parse error and true otherwise. - */ -static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, - u32 *down, u32 *up) -{ - char *slash_ptr; - bool ret; - - slash_ptr = strchr(buff, '/'); - if (slash_ptr) - *slash_ptr = 0; - - ret = batadv_parse_throughput(net_dev, buff, "download gateway speed", - down); - if (!ret) - return false; - - /* we also got some upload info */ - if (slash_ptr) { - ret = batadv_parse_throughput(net_dev, slash_ptr + 1, - "upload gateway speed", up); - if (!ret) - return false; - } - - return true; -} - -/** * batadv_gw_tvlv_container_update() - update the gw tvlv container after * gateway setting change * @bat_priv: the bat priv with all the soft interface information @@ -156,57 +47,6 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) } /** - * batadv_gw_bandwidth_set() - Parse and set download/upload gateway bandwidth - * from supplied string buffer - * @net_dev: netdev struct of the soft interface - * @buff: the buffer containing the user data - * @count: number of bytes in the buffer - * - * Return: 'count' on success or a negative error code in case of failure - */ -ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, - size_t count) -{ - struct batadv_priv *bat_priv = netdev_priv(net_dev); - u32 down_curr; - u32 up_curr; - u32 down_new = 0; - u32 up_new = 0; - bool ret; - - down_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_down); - up_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_up); - - ret = batadv_parse_gw_bandwidth(net_dev, buff, &down_new, &up_new); - if (!ret) - return -EINVAL; - - if (!down_new) - down_new = 1; - - if (!up_new) - up_new = down_new / 5; - - if (!up_new) - up_new = 1; - - if (down_curr == down_new && up_curr == up_new) - return count; - - batadv_gw_reselect(bat_priv); - batadv_info(net_dev, - "Changing gateway bandwidth from: '%u.%u/%u.%u MBit' to: '%u.%u/%u.%u MBit'\n", - down_curr / 10, down_curr % 10, up_curr / 10,
up_curr % 10, - down_new / 10, down_new % 10, up_new / 10, up_new % 10); - - atomic_set(&bat_priv->gw.bandwidth_down, down_new); - atomic_set(&bat_priv->gw.bandwidth_up, up_new); - batadv_gw_tvlv_container_update(bat_priv); - - return count; -} - -/** * batadv_gw_tvlv_ogm_handler_v1() - process incoming gateway tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 87c37f907261..5d097d6a1dd9 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -9,9 +9,6 @@ #include "main.h" -#include <linux/netdevice.h> -#include <linux/types.h> - /** * enum batadv_bandwidth_units - bandwidth unit types */ @@ -27,12 +24,8 @@ enum batadv_bandwidth_units { #define BATADV_GW_MODE_CLIENT_NAME "client" #define BATADV_GW_MODE_SERVER_NAME "server" -ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, - size_t count); void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv); void batadv_gw_init(struct batadv_priv *bat_priv); void batadv_gw_free(struct batadv_priv *bat_priv); -bool batadv_parse_throughput(struct net_device *net_dev, char *buff, - const char *description, u32 *throughput); #endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 24c9c0c3f316..96a412beab2d 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -9,6 +9,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> +#include <linux/compiler.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/gfp.h> @@ -711,9 +712,14 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct batadv_priv *bat_priv; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); + unsigned int required_mtu; + unsigned int hardif_mtu; int ret; - if (hard_iface->net_dev->mtu < ETH_MIN_MTU + max_header_len) + hardif_mtu = READ_ONCE(hard_iface->net_dev->mtu); + required_mtu = READ_ONCE(soft_iface->mtu) + max_header_len; + + if (hardif_mtu < ETH_MIN_MTU + max_header_len) return -EINVAL; if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) @@ -746,18 +752,18 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->net_dev->name); if (atomic_read(&bat_priv->fragmentation) && - hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len) + hardif_mtu < required_mtu) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n", - hard_iface->net_dev->name, hard_iface->net_dev->mtu, - ETH_DATA_LEN + max_header_len); + hard_iface->net_dev->name, hardif_mtu, + required_mtu); if (!atomic_read(&bat_priv->fragmentation) && - hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len) + hardif_mtu < required_mtu) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. 
If you experience problems getting traffic through try increasing the MTU to %i.\n", - hard_iface->net_dev->name, hard_iface->net_dev->mtu, - ETH_DATA_LEN + max_header_len); + hard_iface->net_dev->name, hardif_mtu, + required_mtu); if (batadv_hardif_is_iface_up(hard_iface)) batadv_hardif_activate_interface(hard_iface); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 156ed39eded1..10007c5894a1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2023.1" +#define BATADV_SOURCE_VERSION "2023.3" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 6efbc9275aec..0c64d81a7761 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -377,7 +377,7 @@ nla_put_failure: * * Return: 0 on success, < 0 on error */ -int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) +static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) { struct sk_buff *msg; int ret; @@ -551,15 +551,12 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) * algorithm in use implements the GW API */ - u32 sel_class_max = 0xffffffffu; + u32 sel_class_max = bat_priv->algo_ops->gw.sel_class_max; u32 sel_class; attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS]; sel_class = nla_get_u32(attr); - if (!bat_priv->algo_ops->gw.store_sel_class) - sel_class_max = BATADV_TQ_MAX_VALUE; - if (sel_class >= 1 && sel_class <= sel_class_max) { atomic_set(&bat_priv->gw.sel_class, sel_class); batadv_gw_reselect(bat_priv); @@ -861,8 +858,8 @@ nla_put_failure: * * Return: 0 on success, < 0 on error */ -int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, - struct batadv_hard_iface *hard_iface) +static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface) { struct sk_buff *msg; int ret; @@ -1076,8 +1073,8 @@ nla_put_failure: * * Return: 0 on success, < 0 on error */ -int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan) +static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, + struct batadv_softif_vlan *vlan) { struct sk_buff *msg; int ret; diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 48102cc7490c..876d2806a67d 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -21,12 +21,6 @@ int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, u32 cookie); -int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv); -int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, - struct batadv_hard_iface *hard_iface); -int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan); - extern struct genl_family batadv_netlink_family; #endif /* _NET_BATMAN_ADV_NETLINK_H_ */ diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 5f387786e9a7..afd15b3879f1 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -27,10 +27,6 @@ int batadv_recv_frag_packet(struct sk_buff *skb, struct batadv_hard_iface *iface); int batadv_recv_bcast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); -int batadv_recv_tt_query(struct sk_buff *skb, - struct batadv_hard_iface *recv_if); -int batadv_recv_roam_adv(struct sk_buff *skb, - struct batadv_hard_iface *recv_if); int batadv_recv_unicast_tvlv(struct sk_buff *skb, struct 
batadv_hard_iface *recv_if); int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 85d00dc9ce32..1bf1232a4f75 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -156,7 +156,7 @@ static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) struct batadv_priv *bat_priv = netdev_priv(dev); /* check ranges */ - if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) + if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index cf1a0eafe3ab..17d5ea1d8e84 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -2197,11 +2197,10 @@ struct batadv_algo_gw_ops { void (*init_sel_class)(struct batadv_priv *bat_priv); /** - * @store_sel_class: parse and stores a new GW selection class - * (optional) + * @sel_class_max: maximum allowed GW selection class */ - ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, - size_t count); + u32 sel_class_max; + /** * @get_best_gw_node: select the best GW from the list of available * nodes (optional) diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1c3c7ff5c3c6..336a76165454 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -140,6 +140,35 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto, return err; } +struct sock *bt_sock_alloc(struct net *net, struct socket *sock, + struct proto *prot, int proto, gfp_t prio, int kern) +{ + struct sock *sk; + + sk = sk_alloc(net, PF_BLUETOOTH, prio, prot, kern); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = BT_OPEN; + + /* Init peer information so it can be properly monitored */ + if (!kern) { + spin_lock(&sk->sk_peer_lock); + sk->sk_peer_pid = get_pid(task_tgid(current)); + sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + } + + return sk; +} +EXPORT_SYMBOL(bt_sock_alloc); + void bt_sock_link(struct bt_sock_list *l, struct sock *sk) { write_lock(&l->lock); @@ -158,6 +187,9 @@ EXPORT_SYMBOL(bt_sock_unlink); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { + const struct cred *old_cred; + struct pid *old_pid; + BT_DBG("parent %p, sk %p", parent, sk); sock_hold(sk); @@ -170,6 +202,19 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; + /* Copy credentials from parent since for incoming connections the + * socket is allocated by the kernel. 
+ */ + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; + sk->sk_peer_pid = get_pid(parent->sk_peer_pid); + sk->sk_peer_cred = get_cred(parent->sk_peer_cred); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); + if (bh) bh_unlock_sock(sk); else @@ -288,8 +333,12 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, bt_sk(sk)->skb_msg_name(skb, msg->msg_name, &msg->msg_namelen); - if (bt_sk(sk)->skb_put_cmsg) - bt_sk(sk)->skb_put_cmsg(skb, msg, sk); + if (test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags)) { + u8 pkt_status = hci_skb_pkt_status(skb); + + put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, + sizeof(pkt_status), &pkt_status); + } } skb_free_datagram(sk, skb); diff --git a/net/bluetooth/amp.h b/net/bluetooth/amp.h index 832764dfbfb3..97c87abd129f 100644 --- a/net/bluetooth/amp.h +++ b/net/bluetooth/amp.h @@ -28,7 +28,6 @@ struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr, int phylink_gen_key(struct hci_conn *hcon, u8 *data, u8 *len, u8 *type); -void amp_read_loc_info(struct hci_dev *hdev, struct amp_mgr *mgr); void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle); void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr); void amp_read_loc_assoc_final_data(struct hci_dev *hdev, diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 57d509d77cb4..00d47bcf4d7d 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -205,21 +205,13 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern); + sk = bt_sock_alloc(net, sock, &bnep_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; - sock_init_data(sock, sk); - sock->ops = &bnep_sock_ops; - sock->state = SS_UNCONNECTED; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = protocol; - sk->sk_state = BT_OPEN; - bt_sock_link(&bnep_sk_list, sk); return 0; } diff --git a/net/bluetooth/coredump.c b/net/bluetooth/coredump.c index d2d2624ec708..ec97a4bab1c9 100644 --- a/net/bluetooth/coredump.c +++ b/net/bluetooth/coredump.c @@ -100,8 +100,7 @@ void hci_devcd_reset(struct hci_dev *hdev) /* Call with hci_dev_lock only. 
*/ static void hci_devcd_free(struct hci_dev *hdev) { - if (hdev->dump.head) - vfree(hdev->dump.head); + vfree(hdev->dump.head); hci_devcd_reset(hdev); } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 76222565e2df..234746721047 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -178,57 +178,6 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_conn_put(conn); } -static void le_scan_cleanup(struct work_struct *work) -{ - struct hci_conn *conn = container_of(work, struct hci_conn, - le_scan_cleanup); - struct hci_dev *hdev = conn->hdev; - struct hci_conn *c = NULL; - - BT_DBG("%s hcon %p", hdev->name, conn); - - hci_dev_lock(hdev); - - /* Check that the hci_conn is still around */ - rcu_read_lock(); - list_for_each_entry_rcu(c, &hdev->conn_hash.list, list) { - if (c == conn) - break; - } - rcu_read_unlock(); - - if (c == conn) { - hci_connect_le_scan_cleanup(conn, 0x00); - hci_conn_cleanup(conn); - } - - hci_dev_unlock(hdev); - hci_dev_put(hdev); - hci_conn_put(conn); -} - -static void hci_connect_le_scan_remove(struct hci_conn *conn) -{ - BT_DBG("%s hcon %p", conn->hdev->name, conn); - - /* We can't call hci_conn_del/hci_conn_cleanup here since that - * could deadlock with another hci_conn_del() call that's holding - * hci_dev_lock and doing cancel_delayed_work_sync(&conn->disc_work). - * Instead, grab temporary extra references to the hci_dev and - * hci_conn and perform the necessary cleanup in a separate work - * callback. - */ - - hci_dev_hold(conn->hdev); - hci_conn_get(conn); - - /* Even though we hold a reference to the hdev, many other - * things might get cleaned up meanwhile, including the hdev's - * own workqueue, so we can't use that for scheduling. - */ - schedule_work(&conn->le_scan_cleanup); -} - static void hci_acl_create_connection(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -679,13 +628,6 @@ static void hci_conn_timeout(struct work_struct *work) if (refcnt > 0) return; - /* LE connections in scanning state need special handling */ - if (conn->state == BT_CONNECT && conn->type == LE_LINK && - test_bit(HCI_CONN_SCANNING, &conn->flags)) { - hci_connect_le_scan_remove(conn); - return; - } - hci_abort_conn(conn, hci_proto_disconn_ind(conn)); } @@ -791,7 +733,8 @@ struct iso_list_data { u16 sync_handle; }; int count; - struct iso_cig_params pdu; + bool big_term; + bool big_sync_term; }; static void bis_list(struct hci_conn *conn, void *data) @@ -809,17 +752,6 @@ static void bis_list(struct hci_conn *conn, void *data) d->count++; } -static void find_bis(struct hci_conn *conn, void *data) -{ - struct iso_list_data *d = data; - - /* Ignore unicast */ - if (bacmp(&conn->dst, BDADDR_ANY)) - return; - - d->count++; -} - static int terminate_big_sync(struct hci_dev *hdev, void *data) { struct iso_list_data *d = data; @@ -828,11 +760,8 @@ static int terminate_big_sync(struct hci_dev *hdev, void *data) hci_remove_ext_adv_instance_sync(hdev, d->bis, NULL); - /* Check if ISO connection is a BIS and terminate BIG if there are - * no other connections using it. 
- */ - hci_conn_hash_list_state(hdev, find_bis, ISO_LINK, BT_CONNECTED, d); - if (d->count) + /* Only terminate BIG if it has been created */ + if (!d->big_term) return 0; return hci_le_terminate_big_sync(hdev, d->big, @@ -844,19 +773,21 @@ static void terminate_big_destroy(struct hci_dev *hdev, void *data, int err) kfree(data); } -static int hci_le_terminate_big(struct hci_dev *hdev, u8 big, u8 bis) +static int hci_le_terminate_big(struct hci_dev *hdev, struct hci_conn *conn) { struct iso_list_data *d; int ret; - bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", big, bis); + bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", conn->iso_qos.bcast.big, + conn->iso_qos.bcast.bis); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - d->big = big; - d->bis = bis; + d->big = conn->iso_qos.bcast.big; + d->bis = conn->iso_qos.bcast.bis; + d->big_term = test_and_clear_bit(HCI_CONN_BIG_CREATED, &conn->flags); ret = hci_cmd_sync_queue(hdev, terminate_big_sync, d, terminate_big_destroy); @@ -873,31 +804,26 @@ static int big_terminate_sync(struct hci_dev *hdev, void *data) bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", d->big, d->sync_handle); - /* Check if ISO connection is a BIS and terminate BIG if there are - * no other connections using it. - */ - hci_conn_hash_list_state(hdev, find_bis, ISO_LINK, BT_CONNECTED, d); - if (d->count) - return 0; - - hci_le_big_terminate_sync(hdev, d->big); + if (d->big_sync_term) + hci_le_big_terminate_sync(hdev, d->big); return hci_le_pa_terminate_sync(hdev, d->sync_handle); } -static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, u16 sync_handle) +static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn) { struct iso_list_data *d; int ret; - bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, sync_handle); + bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, conn->sync_handle); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; d->big = big; - d->sync_handle = sync_handle; + d->sync_handle = conn->sync_handle; + d->big_sync_term = test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags); ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); @@ -916,6 +842,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, u16 sync_handle) static void bis_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; + struct hci_conn *bis; bt_dev_dbg(hdev, "conn %p", conn); @@ -923,17 +850,29 @@ static void bis_cleanup(struct hci_conn *conn) if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags)) return; - hci_le_terminate_big(hdev, conn->iso_qos.bcast.big, - conn->iso_qos.bcast.bis); + /* Check if ISO connection is a BIS and terminate advertising + * set and BIG if there are no other connections using it. 
+ */ + bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); + if (bis) + return; + + hci_le_terminate_big(hdev, conn); } else { + bis = hci_conn_hash_lookup_big_any_dst(hdev, + conn->iso_qos.bcast.big); + + if (bis) + return; + hci_le_big_terminate(hdev, conn->iso_qos.bcast.big, - conn->sync_handle); + conn); } } static int remove_cig_sync(struct hci_dev *hdev, void *data) { - u8 handle = PTR_ERR(data); + u8 handle = PTR_UINT(data); return hci_le_remove_cig_sync(hdev, handle); } @@ -942,7 +881,8 @@ static int hci_le_remove_cig(struct hci_dev *hdev, u8 handle) { bt_dev_dbg(hdev, "handle 0x%2.2x", handle); - return hci_cmd_sync_queue(hdev, remove_cig_sync, ERR_PTR(handle), NULL); + return hci_cmd_sync_queue(hdev, remove_cig_sync, UINT_PTR(handle), + NULL); } static void find_cis(struct hci_conn *conn, void *data) @@ -983,6 +923,25 @@ static void cis_cleanup(struct hci_conn *conn) hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig); } +static u16 hci_conn_hash_alloc_unset(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + u16 handle = HCI_CONN_HANDLE_MAX + 1; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + /* Find the first unused handle */ + if (handle == 0xffff || c->handle != handle) + break; + handle++; + } + rcu_read_unlock(); + + return handle; +} + struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role) { @@ -996,7 +955,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, bacpy(&conn->dst, dst); bacpy(&conn->src, &hdev->bdaddr); - conn->handle = HCI_CONN_HANDLE_UNSET; + conn->handle = hci_conn_hash_alloc_unset(hdev); conn->hdev = hdev; conn->type = type; conn->role = role; @@ -1059,7 +1018,6 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept); INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle); INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout); - INIT_WORK(&conn->le_scan_cleanup, le_scan_cleanup); atomic_set(&conn->refcnt, 0); @@ -1109,7 +1067,7 @@ static void hci_conn_unlink(struct hci_conn *conn) */ if ((child->type == SCO_LINK || child->type == ESCO_LINK) && - child->handle == HCI_CONN_HANDLE_UNSET) + HCI_CONN_HANDLE_UNSET(child->handle)) hci_conn_del(child); } @@ -1273,9 +1231,41 @@ void hci_conn_failed(struct hci_conn *conn, u8 status) hci_conn_del(conn); } +/* This function requires the caller holds hdev->lock */ +u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle) +{ + struct hci_dev *hdev = conn->hdev; + + bt_dev_dbg(hdev, "hcon %p handle 0x%4.4x", conn, handle); + + if (conn->handle == handle) + return 0; + + if (handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", + handle, HCI_CONN_HANDLE_MAX); + return HCI_ERROR_INVALID_PARAMETERS; + } + + /* If abort_reason has been sent it means the connection is being + * aborted and the handle shall not be changed. 
+ */ + if (conn->abort_reason) + return conn->abort_reason; + + conn->handle = handle; + + return 0; +} + static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { - struct hci_conn *conn = data; + struct hci_conn *conn; + u16 handle = PTR_UINT(data); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + return; bt_dev_dbg(hdev, "err %d", err); @@ -1300,10 +1290,17 @@ done: static int hci_connect_le_sync(struct hci_dev *hdev, void *data) { - struct hci_conn *conn = data; + struct hci_conn *conn; + u16 handle = PTR_UINT(data); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + return 0; bt_dev_dbg(hdev, "conn %p", conn); + conn->state = BT_CONNECT; + return hci_le_create_conn_sync(hdev, conn); } @@ -1373,10 +1370,10 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; - conn->state = BT_CONNECT; clear_bit(HCI_CONN_SCANNING, &conn->flags); - err = hci_cmd_sync_queue(hdev, hci_connect_le_sync, conn, + err = hci_cmd_sync_queue(hdev, hci_connect_le_sync, + UINT_PTR(conn->handle), create_le_conn_complete); if (err) { hci_conn_del(conn); @@ -1440,25 +1437,23 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev, static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos) { - struct iso_list_data data; + struct hci_conn *conn; + u8 big; /* Allocate a BIG if not set */ if (qos->bcast.big == BT_ISO_QOS_BIG_UNSET) { - for (data.big = 0x00; data.big < 0xef; data.big++) { - data.count = 0; - data.bis = 0xff; + for (big = 0x00; big < 0xef; big++) { - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, - BT_BOUND, &data); - if (!data.count) + conn = hci_conn_hash_lookup_big(hdev, big); + if (!conn) break; } - if (data.big == 0xef) + if (big == 0xef) return -EADDRNOTAVAIL; /* Update BIG */ - qos->bcast.big = data.big; + qos->bcast.big = big; } return 0; @@ -1466,28 +1461,27 @@ static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos) static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) { - struct iso_list_data data; + struct hci_conn *conn; + u8 bis; /* Allocate BIS if not set */ if (qos->bcast.bis == BT_ISO_QOS_BIS_UNSET) { /* Find an unused adv set to advertise BIS, skip instance 0x00 * since it is reserved as general purpose set. 
*/ - for (data.bis = 0x01; data.bis < hdev->le_num_of_adv_sets; - data.bis++) { - data.count = 0; + for (bis = 0x01; bis < hdev->le_num_of_adv_sets; + bis++) { - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, - BT_BOUND, &data); - if (!data.count) + conn = hci_conn_hash_lookup_bis(hdev, BDADDR_ANY, bis); + if (!conn) break; } - if (data.bis == hdev->le_num_of_adv_sets) + if (bis == hdev->le_num_of_adv_sets) return -EADDRNOTAVAIL; /* Update BIS */ - qos->bcast.bis = data.bis; + qos->bcast.bis = bis; } return 0; @@ -1495,10 +1489,10 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, - struct bt_iso_qos *qos) + struct bt_iso_qos *qos, __u8 base_len, + __u8 *base) { struct hci_conn *conn; - struct iso_list_data data; int err; /* Let's make sure that le is enabled.*/ @@ -1516,24 +1510,26 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, if (err) return ERR_PTR(err); - data.big = qos->bcast.big; - data.bis = qos->bcast.bis; - data.count = 0; - - /* Check if there is already a matching BIG/BIS */ - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, BT_BOUND, &data); - if (data.count) + /* Check if the LE Create BIG command has already been sent */ + conn = hci_conn_hash_lookup_per_adv_bis(hdev, dst, qos->bcast.big, + qos->bcast.big); + if (conn) return ERR_PTR(-EADDRINUSE); - conn = hci_conn_hash_lookup_bis(hdev, dst, qos->bcast.big, qos->bcast.bis); - if (conn) + /* Check BIS settings against other bound BISes, since all + * BISes in a BIG must have the same value for all parameters + */ + conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big); + + if (conn && (memcmp(qos, &conn->iso_qos, sizeof(*qos)) || + base_len != conn->le_per_adv_data_len || + memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); conn = hci_conn_add(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); if (!conn) return ERR_PTR(-ENOMEM); - set_bit(HCI_CONN_PER_ADV, &conn->flags); conn->state = BT_CONNECT; hci_conn_hold(conn); @@ -1707,52 +1703,25 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, return sco; } -static void cis_add(struct iso_list_data *d, struct bt_iso_qos *qos) -{ - struct hci_cis_params *cis = &d->pdu.cis[d->pdu.cp.num_cis]; - - cis->cis_id = qos->ucast.cis; - cis->c_sdu = cpu_to_le16(qos->ucast.out.sdu); - cis->p_sdu = cpu_to_le16(qos->ucast.in.sdu); - cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy : qos->ucast.in.phy; - cis->p_phy = qos->ucast.in.phy ? 
qos->ucast.in.phy : qos->ucast.out.phy; - cis->c_rtn = qos->ucast.out.rtn; - cis->p_rtn = qos->ucast.in.rtn; - - d->pdu.cp.num_cis++; -} - -static void cis_list(struct hci_conn *conn, void *data) -{ - struct iso_list_data *d = data; - - /* Skip if broadcast/ANY address */ - if (!bacmp(&conn->dst, BDADDR_ANY)) - return; - - if (d->cig != conn->iso_qos.ucast.cig || d->cis == BT_ISO_QOS_CIS_UNSET || - d->cis != conn->iso_qos.ucast.cis) - return; - - d->count++; - - if (d->pdu.cp.cig_id == BT_ISO_QOS_CIG_UNSET || - d->count >= ARRAY_SIZE(d->pdu.cis)) - return; - - cis_add(d, &conn->iso_qos); -} - static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_create_big cp; + struct iso_list_data data; memset(&cp, 0, sizeof(cp)); + data.big = qos->bcast.big; + data.bis = qos->bcast.bis; + data.count = 0; + + /* Create a BIS for each bound connection */ + hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, + BT_BOUND, &data); + cp.handle = qos->bcast.big; cp.adv_handle = qos->bcast.bis; - cp.num_bis = 0x01; + cp.num_bis = data.count; hci_cpu_to_le24(qos->bcast.out.interval, cp.bis.sdu_interval); cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu); cp.bis.latency = cpu_to_le16(qos->bcast.out.latency); @@ -1766,25 +1735,62 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp); } -static void set_cig_params_complete(struct hci_dev *hdev, void *data, int err) +static int set_cig_params_sync(struct hci_dev *hdev, void *data) { - struct iso_cig_params *pdu = data; + u8 cig_id = PTR_UINT(data); + struct hci_conn *conn; + struct bt_iso_qos *qos; + struct iso_cig_params pdu; + u8 cis_id; - bt_dev_dbg(hdev, ""); + conn = hci_conn_hash_lookup_cig(hdev, cig_id); + if (!conn) + return 0; - if (err) - bt_dev_err(hdev, "Unable to set CIG parameters: %d", err); + memset(&pdu, 0, sizeof(pdu)); - kfree(pdu); -} + qos = &conn->iso_qos; + pdu.cp.cig_id = cig_id; + hci_cpu_to_le24(qos->ucast.out.interval, pdu.cp.c_interval); + hci_cpu_to_le24(qos->ucast.in.interval, pdu.cp.p_interval); + pdu.cp.sca = qos->ucast.sca; + pdu.cp.packing = qos->ucast.packing; + pdu.cp.framing = qos->ucast.framing; + pdu.cp.c_latency = cpu_to_le16(qos->ucast.out.latency); + pdu.cp.p_latency = cpu_to_le16(qos->ucast.in.latency); + + /* Reprogram all CIS(s) with the same CIG, valid range are: + * num_cis: 0x00 to 0x1F + * cis_id: 0x00 to 0xEF + */ + for (cis_id = 0x00; cis_id < 0xf0 && + pdu.cp.num_cis < ARRAY_SIZE(pdu.cis); cis_id++) { + struct hci_cis_params *cis; -static int set_cig_params_sync(struct hci_dev *hdev, void *data) -{ - struct iso_cig_params *pdu = data; - u32 plen; + conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, cig_id, cis_id); + if (!conn) + continue; + + qos = &conn->iso_qos; + + cis = &pdu.cis[pdu.cp.num_cis++]; + cis->cis_id = cis_id; + cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu); + cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu); + cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy : + qos->ucast.in.phy; + cis->p_phy = qos->ucast.in.phy ? 
qos->ucast.in.phy : + qos->ucast.out.phy; + cis->c_rtn = qos->ucast.out.rtn; + cis->p_rtn = qos->ucast.in.rtn; + } + + if (!pdu.cp.num_cis) + return 0; - plen = sizeof(pdu->cp) + pdu->cp.num_cis * sizeof(pdu->cis[0]); - return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS, plen, pdu, + return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS, + sizeof(pdu.cp) + + pdu.cp.num_cis * sizeof(pdu.cis[0]), &pdu, HCI_CMD_TIMEOUT); } @@ -1792,7 +1798,6 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev *hdev = conn->hdev; struct iso_list_data data; - struct iso_cig_params *pdu; memset(&data, 0, sizeof(data)); @@ -1819,58 +1824,31 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) qos->ucast.cig = data.cig; } - data.pdu.cp.cig_id = qos->ucast.cig; - hci_cpu_to_le24(qos->ucast.out.interval, data.pdu.cp.c_interval); - hci_cpu_to_le24(qos->ucast.in.interval, data.pdu.cp.p_interval); - data.pdu.cp.sca = qos->ucast.sca; - data.pdu.cp.packing = qos->ucast.packing; - data.pdu.cp.framing = qos->ucast.framing; - data.pdu.cp.c_latency = cpu_to_le16(qos->ucast.out.latency); - data.pdu.cp.p_latency = cpu_to_le16(qos->ucast.in.latency); - if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) { - data.count = 0; - data.cig = qos->ucast.cig; - data.cis = qos->ucast.cis; - - hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND, - &data); - if (data.count) + if (hci_conn_hash_lookup_cis(hdev, NULL, 0, qos->ucast.cig, + qos->ucast.cis)) return false; - - cis_add(&data, qos); + goto done; } - /* Reprogram all CIS(s) with the same CIG */ - for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0x11; + /* Allocate first available CIS if not set */ + for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0xf0; data.cis++) { - data.count = 0; - - hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND, - &data); - if (data.count) - continue; - - /* Allocate a CIS if not set */ - if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET) { + if (!hci_conn_hash_lookup_cis(hdev, NULL, 0, data.cig, + data.cis)) { /* Update CIS */ qos->ucast.cis = data.cis; - cis_add(&data, qos); + break; } } - if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET || !data.pdu.cp.num_cis) + if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET) return false; - pdu = kmemdup(&data.pdu, sizeof(*pdu), GFP_KERNEL); - if (!pdu) - return false; - - if (hci_cmd_sync_queue(hdev, set_cig_params_sync, pdu, - set_cig_params_complete) < 0) { - kfree(pdu); +done: + if (hci_cmd_sync_queue(hdev, set_cig_params_sync, + UINT_PTR(qos->ucast.cig), NULL) < 0) return false; - } return true; } @@ -1888,6 +1866,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-ENOMEM); cis->cleanup = cis_cleanup; cis->dst_type = dst_type; + cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; + cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; } if (cis->state == BT_CONNECTED) @@ -1931,6 +1911,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-EINVAL); } + hci_conn_hold(cis); + cis->iso_qos = *qos; cis->state = BT_BOUND; @@ -1969,59 +1951,47 @@ bool hci_iso_setup_path(struct hci_conn *conn) return true; } -static int hci_create_cis_sync(struct hci_dev *hdev, void *data) +int hci_conn_check_create_cis(struct hci_conn *conn) { - return hci_le_create_cis_sync(hdev, data); -} + if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY)) + return -EINVAL; -int hci_le_create_cis(struct hci_conn *conn) -{ - struct hci_conn *cis; - struct hci_link *link, *t; 
- struct hci_dev *hdev = conn->hdev; - int err; + if (!conn->parent || conn->parent->state != BT_CONNECTED || + conn->state != BT_CONNECT || HCI_CONN_HANDLE_UNSET(conn->handle)) + return 1; - bt_dev_dbg(hdev, "hcon %p", conn); + return 0; +} - switch (conn->type) { - case LE_LINK: - if (conn->state != BT_CONNECTED || list_empty(&conn->link_list)) - return -EINVAL; +static int hci_create_cis_sync(struct hci_dev *hdev, void *data) +{ + return hci_le_create_cis_sync(hdev); +} - cis = NULL; +int hci_le_create_cis_pending(struct hci_dev *hdev) +{ + struct hci_conn *conn; + bool pending = false; - /* hci_conn_link uses list_add_tail_rcu so the list is in - * the same order as the connections are requested. - */ - list_for_each_entry_safe(link, t, &conn->link_list, list) { - if (link->conn->state == BT_BOUND) { - err = hci_le_create_cis(link->conn); - if (err) - return err; + rcu_read_lock(); - cis = link->conn; - } + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) { + rcu_read_unlock(); + return -EBUSY; } - return cis ? 0 : -EINVAL; - case ISO_LINK: - cis = conn; - break; - default: - return -EINVAL; + if (!hci_conn_check_create_cis(conn)) + pending = true; } - if (cis->state == BT_CONNECT) + rcu_read_unlock(); + + if (!pending) return 0; /* Queue Create CIS */ - err = hci_cmd_sync_queue(hdev, hci_create_cis_sync, cis, NULL); - if (err) - return err; - - cis->state = BT_CONNECT; - - return 0; + return hci_cmd_sync_queue(hdev, hci_create_cis_sync, NULL, NULL); } static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, @@ -2051,16 +2021,6 @@ static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, qos->latency = conn->le_conn_latency; } -static void hci_bind_bis(struct hci_conn *conn, - struct bt_iso_qos *qos) -{ - /* Update LINK PHYs according to QoS preference */ - conn->le_tx_phy = qos->bcast.out.phy; - conn->le_tx_phy = qos->bcast.out.phy; - conn->iso_qos = *qos; - conn->state = BT_BOUND; -} - static int create_big_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; @@ -2183,27 +2143,80 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) } } -struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, + struct bt_iso_qos *qos, + __u8 base_len, __u8 *base) { struct hci_conn *conn; - int err; + __u8 eir[HCI_MAX_PER_AD_LENGTH]; + + if (base_len && base) + base_len = eir_append_service_data(eir, 0, 0x1851, + base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, qos); + conn = hci_add_bis(hdev, dst, qos, base_len, eir); if (IS_ERR(conn)) return conn; - hci_bind_bis(conn, qos); + /* Update LINK PHYs according to QoS preference */ + conn->le_tx_phy = qos->bcast.out.phy; + conn->le_tx_phy = qos->bcast.out.phy; /* Add Basic Announcement into Peridic Adv Data if BASE is set */ if (base_len && base) { - base_len = eir_append_service_data(conn->le_per_adv_data, 0, - 0x1851, base, base_len); + memcpy(conn->le_per_adv_data, eir, sizeof(eir)); conn->le_per_adv_data_len = base_len; } + hci_iso_qos_setup(hdev, conn, &qos->bcast.out, + conn->le_tx_phy ? 
conn->le_tx_phy : + hdev->le_tx_def_phys); + + conn->iso_qos = *qos; + conn->state = BT_BOUND; + + return conn; +} + +static void bis_mark_per_adv(struct hci_conn *conn, void *data) +{ + struct iso_list_data *d = data; + + /* Skip if not broadcast/ANY address */ + if (bacmp(&conn->dst, BDADDR_ANY)) + return; + + if (d->big != conn->iso_qos.bcast.big || + d->bis == BT_ISO_QOS_BIS_UNSET || + d->bis != conn->iso_qos.bcast.bis) + return; + + set_bit(HCI_CONN_PER_ADV, &conn->flags); +} + +struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, struct bt_iso_qos *qos, + __u8 base_len, __u8 *base) +{ + struct hci_conn *conn; + int err; + struct iso_list_data data; + + conn = hci_bind_bis(hdev, dst, qos, base_len, base); + if (IS_ERR(conn)) + return conn; + + data.big = qos->bcast.big; + data.bis = qos->bcast.bis; + + /* Set HCI_CONN_PER_ADV for all bound connections, to mark that + * the start periodic advertising and create BIG commands have + * been queued + */ + hci_conn_hash_list_state(hdev, bis_mark_per_adv, ISO_LINK, + BT_BOUND, &data); + /* Queue start periodic advertising and create BIG */ err = hci_cmd_sync_queue(hdev, create_big_sync, conn, create_big_complete); @@ -2212,10 +2225,6 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(err); } - hci_iso_qos_setup(hdev, conn, &qos->bcast.out, - conn->le_tx_phy ? conn->le_tx_phy : - hdev->le_tx_def_phys); - return conn; } @@ -2257,11 +2266,12 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-ENOLINK); } - /* If LE is already connected and CIS handle is already set proceed to - * Create CIS immediately. - */ - if (le->state == BT_CONNECTED && cis->handle != HCI_CONN_HANDLE_UNSET) - hci_le_create_cis(cis); + /* Link takes the refcount */ + hci_conn_drop(cis); + + cis->state = BT_CONNECT; + + hci_le_create_cis_pending(hdev); return cis; } @@ -2848,81 +2858,49 @@ u32 hci_conn_get_phy(struct hci_conn *conn) return phys; } -int hci_abort_conn(struct hci_conn *conn, u8 reason) +static int abort_conn_sync(struct hci_dev *hdev, void *data) { - int r = 0; + struct hci_conn *conn; + u16 handle = PTR_UINT(data); - if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) return 0; - switch (conn->state) { - case BT_CONNECTED: - case BT_CONFIG: - if (conn->type == AMP_LINK) { - struct hci_cp_disconn_phy_link cp; + return hci_abort_conn_sync(hdev, conn, conn->abort_reason); +} - cp.phy_handle = HCI_PHY_HANDLE(conn->handle); - cp.reason = reason; - r = hci_send_cmd(conn->hdev, HCI_OP_DISCONN_PHY_LINK, - sizeof(cp), &cp); - } else { - struct hci_cp_disconnect dc; +int hci_abort_conn(struct hci_conn *conn, u8 reason) +{ + struct hci_dev *hdev = conn->hdev; - dc.handle = cpu_to_le16(conn->handle); - dc.reason = reason; - r = hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, - sizeof(dc), &dc); - } + /* If abort_reason has already been set it means the connection is + * already being aborted so don't attempt to overwrite it. 
+ */ + if (conn->abort_reason) + return 0; - conn->state = BT_DISCONN; + bt_dev_dbg(hdev, "handle 0x%2.2x reason 0x%2.2x", conn->handle, reason); - break; - case BT_CONNECT: - if (conn->type == LE_LINK) { - if (test_bit(HCI_CONN_SCANNING, &conn->flags)) - break; - r = hci_send_cmd(conn->hdev, - HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL); - } else if (conn->type == ACL_LINK) { - if (conn->hdev->hci_ver < BLUETOOTH_VER_1_2) - break; - r = hci_send_cmd(conn->hdev, - HCI_OP_CREATE_CONN_CANCEL, - 6, &conn->dst); - } - break; - case BT_CONNECT2: - if (conn->type == ACL_LINK) { - struct hci_cp_reject_conn_req rej; - - bacpy(&rej.bdaddr, &conn->dst); - rej.reason = reason; - - r = hci_send_cmd(conn->hdev, - HCI_OP_REJECT_CONN_REQ, - sizeof(rej), &rej); - } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { - struct hci_cp_reject_sync_conn_req rej; - - bacpy(&rej.bdaddr, &conn->dst); - - /* SCO rejection has its own limited set of - * allowed error values (0x0D-0x0F) which isn't - * compatible with most values passed to this - * function. To be safe hard-code one of the - * values that's suitable for SCO. - */ - rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; + conn->abort_reason = reason; - r = hci_send_cmd(conn->hdev, - HCI_OP_REJECT_SYNC_CONN_REQ, - sizeof(rej), &rej); + /* If the connection is pending check the command opcode since that + * might be blocking on hci_cmd_sync_work while waiting its respective + * event so we need to hci_cmd_sync_cancel to cancel it. + * + * hci_connect_le serializes the connection attempts so only one + * connection can be in BT_CONNECT at time. + */ + if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { + switch (hci_skb_event(hdev->sent_cmd)) { + case HCI_EV_LE_CONN_COMPLETE: + case HCI_EV_LE_ENHANCED_CONN_COMPLETE: + case HCI_EVT_LE_CIS_ESTABLISHED: + hci_cmd_sync_cancel(hdev, -ECANCELED); + break; } - break; - default: - conn->state = BT_CLOSED; - break; } - return r; + return hci_cmd_sync_queue(hdev, abort_conn_sync, UINT_PTR(conn->handle), + NULL); } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1ec83985f1ab..a5992f1b3c9b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1074,9 +1074,9 @@ void hci_uuids_clear(struct hci_dev *hdev) void hci_link_keys_clear(struct hci_dev *hdev) { - struct link_key *key; + struct link_key *key, *tmp; - list_for_each_entry(key, &hdev->link_keys, list) { + list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } @@ -1084,9 +1084,9 @@ void hci_link_keys_clear(struct hci_dev *hdev) void hci_smp_ltks_clear(struct hci_dev *hdev) { - struct smp_ltk *k; + struct smp_ltk *k, *tmp; - list_for_each_entry(k, &hdev->long_term_keys, list) { + list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } @@ -1094,9 +1094,9 @@ void hci_smp_ltks_clear(struct hci_dev *hdev) void hci_smp_irks_clear(struct hci_dev *hdev) { - struct smp_irk *k; + struct smp_irk *k, *tmp; - list_for_each_entry(k, &hdev->identity_resolving_keys, list) { + list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } @@ -1104,9 +1104,9 @@ void hci_smp_irks_clear(struct hci_dev *hdev) void hci_blocked_keys_clear(struct hci_dev *hdev) { - struct blocked_key *b; + struct blocked_key *b, *tmp; - list_for_each_entry(b, &hdev->blocked_keys, list) { + list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, 
rcu); } @@ -1949,15 +1949,15 @@ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: - bt_dev_dbg(hdev, "%s add monitor %d status %d", hdev->name, + bt_dev_dbg(hdev, "add monitor %d status %d", monitor->handle, status); /* Message was not forwarded to controller - not an error */ break; case HCI_ADV_MONITOR_EXT_MSFT: status = msft_add_monitor_pattern(hdev, monitor); - bt_dev_dbg(hdev, "%s add monitor %d msft status %d", hdev->name, - monitor->handle, status); + bt_dev_dbg(hdev, "add monitor %d msft status %d", + handle, status); break; } @@ -1976,15 +1976,15 @@ static int hci_remove_adv_monitor(struct hci_dev *hdev, switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ - bt_dev_dbg(hdev, "%s remove monitor %d status %d", hdev->name, + bt_dev_dbg(hdev, "remove monitor %d status %d", monitor->handle, status); goto free_monitor; case HCI_ADV_MONITOR_EXT_MSFT: handle = monitor->handle; status = msft_remove_monitor(hdev, monitor); - bt_dev_dbg(hdev, "%s remove monitor %d msft status %d", - hdev->name, handle, status); + bt_dev_dbg(hdev, "remove monitor %d msft status %d", + handle, status); break; } @@ -2436,6 +2436,9 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return NOTIFY_DONE; + /* To avoid a potential race with hci_unregister_dev. */ + hci_dev_hold(hdev); + if (action == PM_SUSPEND_PREPARE) ret = hci_suspend_dev(hdev); else if (action == PM_POST_SUSPEND) @@ -2445,6 +2448,7 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); + hci_dev_put(hdev); return NOTIFY_DONE; } @@ -3891,7 +3895,7 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) if (conn) { /* Send to upper protocol */ - bt_cb(skb)->sco.pkt_status = flags & 0x03; + hci_skb_pkt_status(skb) = flags & 0x03; sco_recv_scodata(conn, skb); return; } else { diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index ec0df2f9188e..6b7741f6e95b 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -22,6 +22,7 @@ */ #include <linux/debugfs.h> +#include <linux/kstrtox.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> @@ -1152,7 +1153,7 @@ static ssize_t force_no_mitm_write(struct file *file, return -EFAULT; buf[buf_size] = '\0'; - if (strtobool(buf, &enable)) + if (kstrtobool(buf, &enable)) return -EINVAL; if (enable == hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM)) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 31ca320ce38d..559b6080706c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1639,7 +1639,7 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, hci_dev_set_flag(hdev, HCI_LE_ADV); - if (adv) + if (adv && !adv->periodic) adv->enabled = true; conn = hci_lookup_le_connect(hdev); @@ -1747,7 +1747,7 @@ static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct discovery_state *d = &hdev->discovery; - if (len > HCI_MAX_AD_LENGTH) + if (len > max_adv_len(hdev)) return; bacpy(&d->last_adv_addr, bdaddr); @@ -3173,19 +3173,15 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, * As the connection handle is set here for the first time, it indicates * whether the connection is already set 
up. */ - if (conn->handle != HCI_CONN_HANDLE_UNSET) { + if (!HCI_CONN_HANDLE_UNSET(conn->handle)) { bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection"); goto unlock; } if (!status) { - conn->handle = __le16_to_cpu(ev->handle); - if (conn->handle > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", - conn->handle, HCI_CONN_HANDLE_MAX); - status = HCI_ERROR_INVALID_PARAMETERS; + status = hci_conn_set_handle(conn, __le16_to_cpu(ev->handle)); + if (status) goto done; - } if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; @@ -3803,6 +3799,22 @@ static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data, return rp->status; } +static void hci_unbound_cis_failed(struct hci_dev *hdev, u8 cig, u8 status) +{ + struct hci_conn *conn, *tmp; + + lockdep_assert_held(&hdev->lock); + + list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) { + if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY) || + conn->state == BT_OPEN || conn->iso_qos.ucast.cig != cig) + continue; + + if (HCI_CONN_HANDLE_UNSET(conn->handle)) + hci_conn_failed(conn, status); + } +} + static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -3810,6 +3822,7 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, struct hci_cp_le_set_cig_params *cp; struct hci_conn *conn; u8 status = rp->status; + bool pending = false; int i; bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); @@ -3823,12 +3836,15 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + /* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E page 2554 + * + * If the Status return parameter is non-zero, then the state of the CIG + * and its CIS configurations shall not be changed by the command. If + * the CIG did not already exist, it shall not be created. 
+ */ if (status) { - while ((conn = hci_conn_hash_lookup_cig(hdev, rp->cig_id))) { - conn->state = BT_CLOSED; - hci_connect_cfm(conn, status); - hci_conn_del(conn); - } + /* Keep current configuration, fail only the unbound CIS */ + hci_unbound_cis_failed(hdev, rp->cig_id, status); goto unlock; } @@ -3848,17 +3864,17 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, if (conn->state != BT_BOUND && conn->state != BT_CONNECT) continue; - conn->handle = __le16_to_cpu(rp->handle[i]); - - bt_dev_dbg(hdev, "%p handle 0x%4.4x parent %p", conn, - conn->handle, conn->parent); + if (hci_conn_set_handle(conn, __le16_to_cpu(rp->handle[i]))) + continue; - /* Create CIS if LE is already connected */ - if (conn->parent && conn->parent->state == BT_CONNECTED) - hci_le_create_cis(conn); + if (conn->state == BT_CONNECT) + pending = true; } unlock: + if (pending) + hci_le_create_cis_pending(hdev); + hci_dev_unlock(hdev); return rp->status; @@ -3938,24 +3954,47 @@ static u8 hci_cc_le_set_per_adv_enable(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_ev_status *rp = data; - __u8 *sent; + struct hci_cp_le_set_per_adv_enable *cp; + struct adv_info *adv = NULL, *n; + u8 per_adv_cnt = 0; bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); if (rp->status) return rp->status; - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE); - if (!sent) + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE); + if (!cp) return rp->status; hci_dev_lock(hdev); - if (*sent) + adv = hci_find_adv_instance(hdev, cp->handle); + + if (cp->enable) { hci_dev_set_flag(hdev, HCI_LE_PER_ADV); - else + + if (adv) + adv->enabled = true; + } else { + /* If just one instance was disabled check if there are + * any other instance enabled before clearing HCI_LE_PER_ADV. + * The current periodic adv instance will be marked as + * disabled once extended advertising is also disabled. + */ + list_for_each_entry_safe(adv, n, &hdev->adv_instances, + list) { + if (adv->periodic && adv->enabled) + per_adv_cnt++; + } + + if (per_adv_cnt > 1) + goto unlock; + hci_dev_clear_flag(hdev, HCI_LE_PER_ADV); + } +unlock: hci_dev_unlock(hdev); return rp->status; @@ -4224,6 +4263,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data, static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status) { struct hci_cp_le_create_cis *cp; + bool pending = false; int i; bt_dev_dbg(hdev, "status 0x%2.2x", status); @@ -4246,12 +4286,18 @@ static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status) conn = hci_conn_hash_lookup_handle(hdev, handle); if (conn) { + if (test_and_clear_bit(HCI_CONN_CREATE_CIS, + &conn->flags)) + pending = true; conn->state = BT_CLOSED; hci_connect_cfm(conn, status); hci_conn_del(conn); } } + if (pending) + hci_le_create_cis_pending(hdev); + hci_dev_unlock(hdev); } @@ -4999,18 +5045,15 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, * As the connection handle is set here for the first time, it indicates * whether the connection is already set up. 
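Several hunks in this patch replace these open-coded handle checks with hci_conn_set_handle(), whose body is outside the diff shown here. A minimal sketch consistent with the call sites, offered as an inference rather than the actual implementation: the helper validates the range and assigns the handle once, and callers treat a non-zero return as an HCI status code.

    /* Inferred sketch, not the actual kernel implementation */
    static u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle)
    {
    	if (handle > HCI_CONN_HANDLE_MAX) {
    		bt_dev_err(conn->hdev, "Invalid handle: 0x%4.4x > 0x%4.4x",
    			   handle, HCI_CONN_HANDLE_MAX);
    		return HCI_ERROR_INVALID_PARAMETERS;
    	}

    	conn->handle = handle;

    	return 0;
    }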
*/ - if (conn->handle != HCI_CONN_HANDLE_UNSET) { + if (!HCI_CONN_HANDLE_UNSET(conn->handle)) { bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete event for existing connection"); goto unlock; } switch (status) { case 0x00: - conn->handle = __le16_to_cpu(ev->handle); - if (conn->handle > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", - conn->handle, HCI_CONN_HANDLE_MAX); - status = HCI_ERROR_INVALID_PARAMETERS; + status = hci_conn_set_handle(conn, __le16_to_cpu(ev->handle)); + if (status) { conn->state = BT_CLOSED; break; } @@ -5863,7 +5906,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, * As the connection handle is set here for the first time, it indicates * whether the connection is already set up. */ - if (conn->handle != HCI_CONN_HANDLE_UNSET) { + if (!HCI_CONN_HANDLE_UNSET(conn->handle)) { bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection"); goto unlock; } @@ -6216,8 +6259,9 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, return; } - if (!ext_adv && len > HCI_MAX_AD_LENGTH) { - bt_dev_err_ratelimited(hdev, "legacy adv larger than 31 bytes"); + if (len > max_adv_len(hdev)) { + bt_dev_err_ratelimited(hdev, + "adv larger than maximum supported"); return; } @@ -6282,7 +6326,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, */ conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, bdaddr_resolved, type); - if (!ext_adv && conn && type == LE_ADV_IND && len <= HCI_MAX_AD_LENGTH) { + if (!ext_adv && conn && type == LE_ADV_IND && + len <= max_adv_len(hdev)) { /* Store report for later inclusion by * mgmt_device_connected */ @@ -6423,7 +6468,7 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data, info->length + 1)) break; - if (info->length <= HCI_MAX_AD_LENGTH) { + if (info->length <= max_adv_len(hdev)) { rssi = info->data[info->length]; process_adv_report(hdev, info->type, &info->bdaddr, info->bdaddr_type, NULL, 0, rssi, @@ -6790,6 +6835,7 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, struct hci_evt_le_cis_established *ev = data; struct hci_conn *conn; struct bt_iso_qos *qos; + bool pending = false; u16 handle = __le16_to_cpu(ev->handle); bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -6813,6 +6859,8 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, qos = &conn->iso_qos; + pending = test_and_clear_bit(HCI_CONN_CREATE_CIS, &conn->flags); + /* Convert ISO Interval (1.25 ms slots) to SDU Interval (us) */ qos->ucast.in.interval = le16_to_cpu(ev->interval) * 1250; qos->ucast.out.interval = qos->ucast.in.interval; @@ -6854,10 +6902,14 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; } + conn->state = BT_CLOSED; hci_connect_cfm(conn, ev->status); hci_conn_del(conn); unlock: + if (pending) + hci_le_create_cis_pending(hdev); + hci_dev_unlock(hdev); } @@ -6936,6 +6988,7 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, { struct hci_evt_le_create_big_complete *ev = data; struct hci_conn *conn; + __u8 i = 0; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -6944,33 +6997,46 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, return; hci_dev_lock(hdev); + rcu_read_lock(); - conn = hci_conn_hash_lookup_big(hdev, ev->handle); - if (!conn) - goto unlock; + /* Connect all BISes that are bound to the BIG */ + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if 
(bacmp(&conn->dst, BDADDR_ANY) || + conn->type != ISO_LINK || + conn->iso_qos.bcast.big != ev->handle) + continue; - if (conn->type != ISO_LINK) { - bt_dev_err(hdev, - "Invalid connection link type handle 0x%2.2x", - ev->handle); - goto unlock; - } + if (hci_conn_set_handle(conn, + __le16_to_cpu(ev->bis_handle[i++]))) + continue; - if (ev->num_bis) - conn->handle = __le16_to_cpu(ev->bis_handle[0]); + if (!ev->status) { + conn->state = BT_CONNECTED; + set_bit(HCI_CONN_BIG_CREATED, &conn->flags); + rcu_read_unlock(); + hci_debugfs_create_conn(conn); + hci_conn_add_sysfs(conn); + hci_iso_setup_path(conn); + rcu_read_lock(); + continue; + } - if (!ev->status) { - conn->state = BT_CONNECTED; - hci_debugfs_create_conn(conn); - hci_conn_add_sysfs(conn); - hci_iso_setup_path(conn); - goto unlock; + hci_connect_cfm(conn, ev->status); + rcu_read_unlock(); + hci_conn_del(conn); + rcu_read_lock(); } - hci_connect_cfm(conn, ev->status); - hci_conn_del(conn); + if (!ev->status && !i) + /* If no BISes have been connected for the BIG, + * terminate. This is in case all bound connections + * have been closed before the BIG creation + * has completed. + */ + hci_le_terminate_big_sync(hdev, ev->handle, + HCI_ERROR_LOCAL_HOST_TERM); -unlock: + rcu_read_unlock(); hci_dev_unlock(hdev); } @@ -6987,9 +7053,6 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, flex_array_size(ev, bis, ev->num_bis))) return; - if (ev->status) - return; - hci_dev_lock(hdev); for (i = 0; i < ev->num_bis; i++) { @@ -7013,9 +7076,25 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bis->iso_qos.bcast.in.latency = le16_to_cpu(ev->interval) * 125 / 100; bis->iso_qos.bcast.in.sdu = le16_to_cpu(ev->max_pdu); - hci_iso_setup_path(bis); + if (!ev->status) { + set_bit(HCI_CONN_BIG_SYNC, &bis->flags); + hci_iso_setup_path(bis); + } } + /* In case BIG sync failed, notify each failed connection to + * the user after all hci connections have been added + */ + if (ev->status) + for (i = 0; i < ev->num_bis; i++) { + u16 handle = le16_to_cpu(ev->bis[i]); + + bis = hci_conn_hash_lookup_handle(hdev, handle); + + set_bit(HCI_CONN_BIG_SYNC_FAILED, &bis->flags); + hci_connect_cfm(bis, ev->status); + } + hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index f7e006a36382..6e023b0104b0 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -629,27 +629,6 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, } } -/* Returns true if an le connection is in the scanning state */ -static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) -{ - struct hci_conn_hash *h = &hdev->conn_hash; - struct hci_conn *c; - - rcu_read_lock(); - - list_for_each_entry_rcu(c, &h->list, list) { - if (c->type == LE_LINK && c->state == BT_CONNECT && - test_bit(HCI_CONN_SCANNING, &c->flags)) { - rcu_read_unlock(); - return true; - } - } - - rcu_read_unlock(); - - return false; -} - static void set_random_addr(struct hci_request *req, bdaddr_t *rpa); static int hci_update_random_address(struct hci_request *req, bool require_privacy, bool use_rpa, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1d249d839819..5e4f718073b7 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -264,6 +264,53 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb_copy); } +static void hci_sock_copy_creds(struct sock *sk, struct sk_buff *skb) +{ + struct scm_creds *creds; + + if 
(!sk || WARN_ON(!skb)) + return; + + creds = &bt_cb(skb)->creds; + + /* Check if peer credentials is set */ + if (!sk->sk_peer_pid) { + /* Check if parent peer credentials is set */ + if (bt_sk(sk)->parent && bt_sk(sk)->parent->sk_peer_pid) + sk = bt_sk(sk)->parent; + else + return; + } + + /* Check if scm_creds already set */ + if (creds->pid == pid_vnr(sk->sk_peer_pid)) + return; + + memset(creds, 0, sizeof(*creds)); + + creds->pid = pid_vnr(sk->sk_peer_pid); + if (sk->sk_peer_cred) { + creds->uid = sk->sk_peer_cred->uid; + creds->gid = sk->sk_peer_cred->gid; + } +} + +static struct sk_buff *hci_skb_clone(struct sk_buff *skb) +{ + struct sk_buff *nskb; + + if (!skb) + return NULL; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return NULL; + + hci_sock_copy_creds(skb->sk, nskb); + + return nskb; +} + /* Send frame to sockets with specific channel */ static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) @@ -289,7 +336,7 @@ static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb, if (hci_pi(sk)->channel != channel) continue; - nskb = skb_clone(skb, GFP_ATOMIC); + nskb = hci_skb_clone(skb); if (!nskb) continue; @@ -356,6 +403,8 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) if (!skb_copy) return; + hci_sock_copy_creds(skb->sk, skb_copy); + /* Put header before the data */ hdr = skb_push(skb_copy, HCI_MON_HDR_SIZE); hdr->opcode = opcode; @@ -531,10 +580,12 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) return NULL; } - skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); + skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC); if (!skb) return NULL; + hci_sock_copy_creds(sk, skb); + flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); @@ -580,6 +631,8 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) if (!skb) return NULL; + hci_sock_copy_creds(sk, skb); + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); __net_timestamp(skb); @@ -606,6 +659,8 @@ static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, if (!skb) return NULL; + hci_sock_copy_creds(sk, skb); + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(opcode, skb_put(skb, 2)); @@ -638,6 +693,8 @@ send_monitor_note(struct sock *sk, const char *fmt, ...) if (!skb) return; + hci_sock_copy_creds(sk, skb); + va_start(args, fmt); vsprintf(skb_put(skb, len), fmt, args); *(u8 *)skb_put(skb, 1) = 0; @@ -1494,6 +1551,7 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { + struct scm_cookie scm; struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; @@ -1538,11 +1596,16 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, break; } + memset(&scm, 0, sizeof(scm)); + scm.creds = bt_cb(skb)->creds; + skb_free_datagram(sk, skb); if (flags & MSG_TRUNC) copied = skblen; + scm_recv(sock, msg, &scm, flags); + return err ? 
: copied; } @@ -2143,18 +2206,12 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &hci_sock_ops; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern); + sk = bt_sock_alloc(net, sock, &hci_sk_proto, protocol, GFP_ATOMIC, + kern); if (!sk) return -ENOMEM; - sock_init_data(sock, sk); - - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = protocol; - sock->state = SS_UNCONNECTED; - sk->sk_state = BT_OPEN; sk->sk_destruct = hci_sock_destruct; bt_sock_link(&hci_sk_list, sk); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 4d1e32bb6a9c..5eb30ba21370 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3,6 +3,7 @@ * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2021 Intel Corporation + * Copyright 2023 NXP */ #include <linux/property.h> @@ -1319,9 +1320,11 @@ int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance) static int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; + struct adv_info *adv = NULL; /* If periodic advertising already disabled there is nothing to do. */ - if (!hci_dev_test_flag(hdev, HCI_LE_PER_ADV)) + adv = hci_find_adv_instance(hdev, instance); + if (!adv || !adv->periodic || !adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); @@ -1386,9 +1389,11 @@ static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance) static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; + struct adv_info *adv = NULL; /* If periodic advertising already enabled there is nothing to do. */ - if (hci_dev_test_flag(hdev, HCI_LE_PER_ADV)) + adv = hci_find_adv_instance(hdev, instance); + if (adv && adv->periodic && adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); @@ -1458,22 +1463,19 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, sync_interval); if (IS_ERR(adv)) return PTR_ERR(adv); + adv->pending = false; added = true; } } - /* Only start advertising if instance 0 or if a dedicated instance has - * been added. 
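Before continuing in hci_sync.c: the hci_sock.c hunks above stash the sending socket's credentials in each cloned monitor skb (hci_sock_copy_creds()) and hand them to scm_recv() on the receive side, so a monitor client can now request standard SCM_CREDENTIALS ancillary data. A minimal userspace sketch, assuming `fd` is an already-open AF_BLUETOOTH HCI socket, with error handling elided:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    static void recv_with_creds(int fd)
    {
    	char buf[512];
    	char cbuf[CMSG_SPACE(sizeof(struct ucred))];
    	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
    	struct msghdr msg = {
    		.msg_iov = &iov, .msg_iovlen = 1,
    		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
    	};
    	struct cmsghdr *cmsg;
    	int one = 1;

    	/* scm_recv() only emits SCM_CREDENTIALS if SO_PASSCRED is set */
    	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));

    	if (recvmsg(fd, &msg, 0) < 0)
    		return;

    	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
    		if (cmsg->cmsg_level == SOL_SOCKET &&
    		    cmsg->cmsg_type == SCM_CREDENTIALS) {
    			struct ucred uc;

    			memcpy(&uc, CMSG_DATA(cmsg), sizeof(uc));
    			printf("sender pid %d uid %u gid %u\n",
    			       uc.pid, uc.uid, uc.gid);
    		}
    	}
    }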
- */ - if (!adv || added) { - err = hci_start_ext_adv_sync(hdev, instance); - if (err < 0) - goto fail; + /* Start advertising */ + err = hci_start_ext_adv_sync(hdev, instance); + if (err < 0) + goto fail; - err = hci_adv_bcast_annoucement(hdev, adv); - if (err < 0) - goto fail; - } + err = hci_adv_bcast_annoucement(hdev, adv); + if (err < 0) + goto fail; err = hci_set_per_adv_params_sync(hdev, instance, min_interval, max_interval); @@ -2670,27 +2672,6 @@ done: return filter_policy; } -/* Returns true if an le connection is in the scanning state */ -static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) -{ - struct hci_conn_hash *h = &hdev->conn_hash; - struct hci_conn *c; - - rcu_read_lock(); - - list_for_each_entry_rcu(c, &h->list, list) { - if (c->type == LE_LINK && c->state == BT_CONNECT && - test_bit(HCI_CONN_SCANNING, &c->flags)) { - rcu_read_unlock(); - return true; - } - } - - rcu_read_unlock(); - - return false; -} - static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) @@ -4133,10 +4114,13 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) } if (bis_capable(hdev)) { + events[1] |= 0x20; /* LE PA Report */ + events[1] |= 0x40; /* LE PA Sync Established */ events[3] |= 0x04; /* LE Create BIG Complete */ events[3] |= 0x08; /* LE Terminate BIG Complete */ events[3] |= 0x10; /* LE BIG Sync Established */ events[3] |= 0x20; /* LE BIG Sync Loss */ + events[4] |= 0x02; /* LE BIG Info Advertising Report */ } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK, @@ -5269,26 +5253,64 @@ static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn, } static int hci_le_connect_cancel_sync(struct hci_dev *hdev, - struct hci_conn *conn) + struct hci_conn *conn, u8 reason) { + /* Return reason if scanning since the connection shall probably be + * cleaned up directly. + */ if (test_bit(HCI_CONN_SCANNING, &conn->flags)) - return 0; + return reason; - if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) + if (conn->role == HCI_ROLE_SLAVE || + test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); } -static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn) +static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, + u8 reason) { if (conn->type == LE_LINK) - return hci_le_connect_cancel_sync(hdev, conn); + return hci_le_connect_cancel_sync(hdev, conn, reason); + + if (conn->type == ISO_LINK) { + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 1857: + * + * If this command is issued for a CIS on the Central and the + * CIS is successfully terminated before being established, + * then an HCI_LE_CIS_Established event shall also be sent for + * this CIS with the Status Operation Cancelled by Host (0x44). + */ + if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) + return hci_disconnect_sync(hdev, conn, reason); + + /* A CIS with no Create CIS sent has nothing to cancel */ + if (bacmp(&conn->dst, BDADDR_ANY)) + return HCI_ERROR_LOCAL_HOST_TERM; + + /* There is no way to cancel a BIS without terminating the BIG + * which is done later on connection cleanup. + */ + return 0; + } if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; + /* Wait for HCI_EV_CONN_COMPLETE, not HCI_EV_CMD_STATUS, when the + * reason is anything but HCI_ERROR_REMOTE_POWER_OFF.
This reason is + * used when suspending or powering off, where we don't want to wait + * for the peer's response. + */ + if (reason != HCI_ERROR_REMOTE_POWER_OFF) + return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN_CANCEL, + 6, &conn->dst, + HCI_EV_CONN_COMPLETE, + HCI_CMD_TIMEOUT, NULL); + return __hci_cmd_sync_status(hdev, HCI_OP_CREATE_CONN_CANCEL, 6, &conn->dst, HCI_CMD_TIMEOUT); } @@ -5312,11 +5334,27 @@ static int hci_reject_sco_sync(struct hci_dev *hdev, struct hci_conn *conn, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } +static int hci_le_reject_cis_sync(struct hci_dev *hdev, struct hci_conn *conn, + u8 reason) +{ + struct hci_cp_le_reject_cis cp; + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(conn->handle); + cp.reason = reason; + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_REJECT_CIS, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_reject_conn_req cp; + if (conn->type == ISO_LINK) + return hci_le_reject_cis_sync(hdev, conn, reason); + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) return hci_reject_sco_sync(hdev, conn, reason); @@ -5330,31 +5368,52 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { - int err; + int err = 0; + u16 handle = conn->handle; switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: - return hci_disconnect_sync(hdev, conn, reason); + err = hci_disconnect_sync(hdev, conn, reason); + break; case BT_CONNECT: - err = hci_connect_cancel_sync(hdev, conn); - /* Cleanup hci_conn object if it cannot be cancelled as it - * likelly means the controller and host stack are out of sync. - */ - if (err) { - hci_dev_lock(hdev); - hci_conn_failed(conn, err); - hci_dev_unlock(hdev); - } - return err; + err = hci_connect_cancel_sync(hdev, conn, reason); + break; case BT_CONNECT2: - return hci_reject_conn_sync(hdev, conn, reason); + err = hci_reject_conn_sync(hdev, conn, reason); + break; + case BT_OPEN: + case BT_BOUND: + hci_dev_lock(hdev); + hci_conn_failed(conn, reason); + hci_dev_unlock(hdev); + return 0; default: conn->state = BT_CLOSED; - break; + return 0; } - return 0; + /* Clean up the hci_conn object if it cannot be cancelled, as that + * likely means the controller and host stack are out of sync, + * or in case of LE it was still scanning so it can be cleaned up + * safely. + */ + if (err) { + struct hci_conn *c; + + /* Check that the connection hasn't been cleaned up while + * waiting for commands to complete. + */ + c = hci_conn_hash_lookup_handle(hdev, handle); + if (!c || c != conn) + return 0; + + hci_dev_lock(hdev); + hci_conn_failed(conn, err); + hci_dev_unlock(hdev); + } + + return err; } static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) @@ -6253,63 +6312,99 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) done: if (err == -ETIMEDOUT) - hci_le_connect_cancel_sync(hdev, conn); + hci_le_connect_cancel_sync(hdev, conn, 0x00); /* Re-enable advertising after the connection attempt is finished.
*/ hci_resume_advertising_sync(hdev); return err; } -int hci_le_create_cis_sync(struct hci_dev *hdev, struct hci_conn *conn) +int hci_le_create_cis_sync(struct hci_dev *hdev) { struct { struct hci_cp_le_create_cis cp; struct hci_cis cis[0x1f]; } cmd; - u8 cig; - struct hci_conn *hcon = conn; + struct hci_conn *conn; + u8 cig = BT_ISO_QOS_CIG_UNSET; + + /* The spec allows only one pending LE Create CIS command at a time. If + * the command is pending now, don't do anything. We check for pending + * connections after each CIS Established event. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2566: + * + * If the Host issues this command before all the + * HCI_LE_CIS_Established events from the previous use of the + * command have been generated, the Controller shall return the + * error code Command Disallowed (0x0C). + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2567: + * + * When the Controller receives the HCI_LE_Create_CIS command, the + * Controller sends the HCI_Command_Status event to the Host. An + * HCI_LE_CIS_Established event will be generated for each CIS when it + * is established or if it is disconnected or considered lost before + * being established; until all the events are generated, the command + * remains pending. + */ memset(&cmd, 0, sizeof(cmd)); - cmd.cis[0].acl_handle = cpu_to_le16(conn->parent->handle); - cmd.cis[0].cis_handle = cpu_to_le16(conn->handle); - cmd.cp.num_cis++; - cig = conn->iso_qos.ucast.cig; hci_dev_lock(hdev); rcu_read_lock(); + /* Wait until previous Create CIS has completed */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis]; + if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) + goto done; + } - if (conn == hcon || conn->type != ISO_LINK || - conn->state == BT_CONNECTED || - conn->iso_qos.ucast.cig != cig) + /* Find CIG with all CIS ready */ + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + struct hci_conn *link; + + if (hci_conn_check_create_cis(conn)) continue; - /* Check if all CIS(s) belonging to a CIG are ready */ - if (!conn->parent || conn->parent->state != BT_CONNECTED || - conn->state != BT_CONNECT) { - cmd.cp.num_cis = 0; - break; + cig = conn->iso_qos.ucast.cig; + + list_for_each_entry_rcu(link, &hdev->conn_hash.list, list) { + if (hci_conn_check_create_cis(link) > 0 && + link->iso_qos.ucast.cig == cig && + link->state != BT_CONNECTED) { + cig = BT_ISO_QOS_CIG_UNSET; + break; + } } - /* Group all CIS with state BT_CONNECT since the spec don't - * allow to send them individually: - * - * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E - * page 2566: - * - * If the Host issues this command before all the - * HCI_LE_CIS_Established events from the previous use of the - * command have been generated, the Controller shall return the - * error code Command Disallowed (0x0C). 
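The rewritten hci_le_create_cis_sync() below keys both scans of the connection hash off hci_conn_check_create_cis(), which this diff does not show. From the call sites (a return of 0 selects a CIS for the command, a positive value marks a valid CIS that is not ready yet, a negative value excludes the connection entirely), its contract can be sketched as follows; treat this as an inference, not the real helper:

    /* Inferred sketch of the classifier used by the loops below:
     *  < 0: not a Create CIS candidate at all
     *    0: CIS ready to be placed into the Create CIS command
     *  > 0: valid CIS whose ACL or state is not ready yet
     */
    static int hci_conn_check_create_cis(struct hci_conn *conn)
    {
    	if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY))
    		return -EINVAL;

    	if (!conn->parent || conn->parent->state != BT_CONNECTED ||
    	    conn->state != BT_CONNECT || HCI_CONN_HANDLE_UNSET(conn->handle))
    		return 1;

    	return 0;
    }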
- */ + if (cig != BT_ISO_QOS_CIG_UNSET) + break; + } + + if (cig == BT_ISO_QOS_CIG_UNSET) + goto done; + + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis]; + + if (hci_conn_check_create_cis(conn) || + conn->iso_qos.ucast.cig != cig) + continue; + + set_bit(HCI_CONN_CREATE_CIS, &conn->flags); cis->acl_handle = cpu_to_le16(conn->parent->handle); cis->cis_handle = cpu_to_le16(conn->handle); cmd.cp.num_cis++; + + if (cmd.cp.num_cis >= ARRAY_SIZE(cmd.cis)) + break; } +done: rcu_read_unlock(); hci_dev_unlock(hdev); @@ -6433,7 +6528,7 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, static int _update_adv_data_sync(struct hci_dev *hdev, void *data) { - u8 instance = PTR_ERR(data); + u8 instance = PTR_UINT(data); return hci_update_adv_data_sync(hdev, instance); } @@ -6441,5 +6536,5 @@ static int _update_adv_data_sync(struct hci_dev *hdev, void *data) int hci_update_adv_data(struct hci_dev *hdev, u8 instance) { return hci_cmd_sync_queue(hdev, _update_adv_data_sync, - ERR_PTR(instance), NULL); + UINT_PTR(instance), NULL); } diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 369ed92dac99..c93aaeb3a3fa 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -256,21 +256,13 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern); + sk = bt_sock_alloc(net, sock, &hidp_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; - sock_init_data(sock, sk); - sock->ops = &hidp_sock_ops; - sock->state = SS_UNCONNECTED; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = protocol; - sk->sk_state = BT_OPEN; - bt_sock_link(&hidp_sk_list, sk); return 0; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 505d62247268..3c03e49422c7 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -48,6 +48,11 @@ static void iso_sock_kill(struct sock *sk); #define EIR_SERVICE_DATA_LENGTH 4 #define BASE_MAX_LENGTH (HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH) +/* iso_pinfo flags values */ +enum { + BT_SK_BIG_SYNC, +}; + struct iso_pinfo { struct bt_sock bt; bdaddr_t src; @@ -58,7 +63,7 @@ struct iso_pinfo { __u8 bc_num_bis; __u8 bc_bis[ISO_MAX_NUM_BIS]; __u16 sync_handle; - __u32 flags; + unsigned long flags; struct bt_iso_qos qos; bool qos_user_set; __u8 base_len; @@ -287,13 +292,24 @@ static int iso_connect_bis(struct sock *sk) goto unlock; } - hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, - le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); - if (IS_ERR(hcon)) { - err = PTR_ERR(hcon); - goto unlock; + /* Just bind if DEFER_SETUP has been set */ + if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { + hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, + &iso_pi(sk)->qos, iso_pi(sk)->base_len, + iso_pi(sk)->base); + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto unlock; + } + } else { + hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, + le_addr_type(iso_pi(sk)->dst_type), + &iso_pi(sk)->qos, iso_pi(sk)->base_len, + iso_pi(sk)->base); + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto unlock; + } } conn = iso_conn_add(hcon); @@ -317,6 +333,9 @@ static int iso_connect_bis(struct sock *sk) if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; + } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { + iso_sock_clear_timer(sk); + sk->sk_state = 
BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, sk->sk_sndtimeo); @@ -600,18 +619,6 @@ static void iso_sock_kill(struct sock *sk) sock_put(sk); } -static void iso_conn_defer_reject(struct hci_conn *conn) -{ - struct hci_cp_le_reject_cis cp; - - BT_DBG("conn %p", conn); - - memset(&cp, 0, sizeof(cp)); - cp.handle = cpu_to_le16(conn->handle); - cp.reason = HCI_ERROR_REJ_BAD_ADDR; - hci_send_cmd(conn->hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp); -} - static void __iso_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); @@ -621,6 +628,7 @@ static void __iso_sock_close(struct sock *sk) iso_sock_cleanup_listen(sk); break; + case BT_CONNECT: case BT_CONNECTED: case BT_CONFIG: if (iso_pi(sk)->conn->hcon) { @@ -636,21 +644,6 @@ static void __iso_sock_close(struct sock *sk) break; case BT_CONNECT2: - if (iso_pi(sk)->conn->hcon) - iso_conn_defer_reject(iso_pi(sk)->conn->hcon); - iso_chan_del(sk, ECONNRESET); - break; - case BT_CONNECT: - /* In case of DEFER_SETUP the hcon would be bound to CIG which - * needs to be removed so just call hci_conn_del so the cleanup - * callback do what is needed. - */ - if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && - iso_pi(sk)->conn->hcon) { - hci_conn_del(iso_pi(sk)->conn->hcon); - iso_pi(sk)->conn->hcon = NULL; - } - iso_chan_del(sk, ECONNRESET); break; case BT_DISCONN: @@ -724,21 +717,13 @@ static struct sock *iso_sock_alloc(struct net *net, struct socket *sock, { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &iso_proto, kern); + sk = bt_sock_alloc(net, sock, &iso_proto, proto, prio, kern); if (!sk) return NULL; - sock_init_data(sock, sk); - INIT_LIST_HEAD(&bt_sk(sk)->accept_q); - sk->sk_destruct = iso_sock_destruct; sk->sk_sndtimeo = ISO_CONN_TIMEOUT; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = proto; - sk->sk_state = BT_OPEN; - /* Set address type as public as default src address is BDADDR_ANY */ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; @@ -1202,6 +1187,12 @@ static bool check_io_qos(struct bt_iso_io_qos *qos) static bool check_ucast_qos(struct bt_iso_qos *qos) { + if (qos->ucast.cig > 0xef && qos->ucast.cig != BT_ISO_QOS_CIG_UNSET) + return false; + + if (qos->ucast.cis > 0xef && qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) + return false; + if (qos->ucast.sca > 0x07) return false; @@ -1291,6 +1282,18 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; + case BT_PKT_STATUS: + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { + err = -EFAULT; + break; + } + + if (opt) + set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); + else + clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); + break; + case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { @@ -1376,6 +1379,12 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname, break; + case BT_PKT_STATUS: + if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), + (int __user *)optval)) + err = -EFAULT; + break; + case BT_ISO_QOS: qos = iso_sock_get_qos(sk); @@ -1466,7 +1475,7 @@ static int iso_sock_release(struct socket *sock) iso_sock_close(sk); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); @@ -1563,6 +1572,12 @@ static void iso_conn_ready(struct iso_conn *conn) hci_conn_hold(hcon); 
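With packet status now carried in hci_skb_pkt_status() and gated by the per-socket BT_SK_PKT_STATUS flag, ISO sockets gain the BT_PKT_STATUS option that SCO already had. A userspace sketch, assuming `sk` is a connected ISO socket and that the generic AF_BLUETOOTH receive path (modified outside these hunks) emits BT_SCM_PKT_STATUS ancillary data when the option is enabled; error handling elided:

    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <bluetooth/bluetooth.h> /* SOL_BLUETOOTH, BT_PKT_STATUS, BT_SCM_PKT_STATUS */

    static ssize_t iso_recv_with_status(int sk, void *buf, size_t len,
    				    uint8_t *pkt_status)
    {
    	uint32_t opt = 1;
    	char cbuf[CMSG_SPACE(sizeof(uint8_t))];
    	struct iovec iov = { .iov_base = buf, .iov_len = len };
    	struct msghdr msg = {
    		.msg_iov = &iov, .msg_iovlen = 1,
    		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
    	};
    	struct cmsghdr *cmsg;
    	ssize_t n;

    	/* The kernel setsockopt handler reads a u32 for this option */
    	setsockopt(sk, SOL_BLUETOOTH, BT_PKT_STATUS, &opt, sizeof(opt));

    	n = recvmsg(sk, &msg, 0);
    	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
    		if (cmsg->cmsg_level == SOL_BLUETOOTH &&
    		    cmsg->cmsg_type == BT_SCM_PKT_STATUS)
    			memcpy(pkt_status, CMSG_DATA(cmsg),
    			       sizeof(*pkt_status));

    	return n;
    }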
iso_chan_add(conn, sk, parent); + if (ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) { + /* Trigger error signal on child socket */ + sk->sk_err = ECONNREFUSED; + sk->sk_error_report(sk); + } + if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else @@ -1631,15 +1646,17 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; - err = hci_le_big_create_sync(hdev, - &iso_pi(sk)->qos, - iso_pi(sk)->sync_handle, - iso_pi(sk)->bc_num_bis, - iso_pi(sk)->bc_bis); - if (err) { - bt_dev_err(hdev, "hci_le_big_create_sync: %d", - err); - sk = NULL; + if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { + err = hci_le_big_create_sync(hdev, + &iso_pi(sk)->qos, + iso_pi(sk)->sync_handle, + iso_pi(sk)->bc_num_bis, + iso_pi(sk)->bc_bis); + if (err) { + bt_dev_err(hdev, "hci_le_big_create_sync: %d", + err); + sk = NULL; + } } } } else { @@ -1676,13 +1693,18 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) } /* Create CIS if pending */ - hci_le_create_cis(hcon); + hci_le_create_cis_pending(hcon->hdev); return; } BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); - if (!status) { + /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED is set, + * queue the failed bis connection into the accept queue of the + * listening socket and wake up userspace, to inform the user about + * the BIG sync failed event. + */ + if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { struct iso_conn *conn; conn = iso_conn_add(hcon); @@ -1757,6 +1779,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ + hci_skb_pkt_status(skb) = flags & 0x03; iso_recv_frame(conn, skb); return; } @@ -1778,6 +1801,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (!conn->rx_skb) goto drop; + hci_skb_pkt_status(conn->rx_skb) = flags & 0x03; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len = len - skb->len; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 947ca580bb9a..3bdfc3f1e73d 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -178,21 +178,6 @@ done: return err; } -static void l2cap_sock_init_pid(struct sock *sk) -{ - struct l2cap_chan *chan = l2cap_pi(sk)->chan; - - /* Only L2CAP_MODE_EXT_FLOWCTL ever need to access the PID in order to - * group the channels being requested. 
- */ - if (chan->mode != L2CAP_MODE_EXT_FLOWCTL) - return; - - spin_lock(&sk->sk_peer_lock); - sk->sk_peer_pid = get_pid(task_tgid(current)); - spin_unlock(&sk->sk_peer_lock); -} - static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { @@ -268,8 +253,6 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, chan->mode != L2CAP_MODE_EXT_FLOWCTL) chan->mode = L2CAP_MODE_LE_FLOWCTL; - l2cap_sock_init_pid(sk); - err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), &la.l2_bdaddr, la.l2_bdaddr_type); if (err) @@ -325,8 +308,6 @@ static int l2cap_sock_listen(struct socket *sock, int backlog) goto done; } - l2cap_sock_init_pid(sk); - sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; @@ -1858,21 +1839,13 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, struct sock *sk; struct l2cap_chan *chan; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern); + sk = bt_sock_alloc(net, sock, &l2cap_proto, proto, prio, kern); if (!sk) return NULL; - sock_init_data(sock, sk); - INIT_LIST_HEAD(&bt_sk(sk)->accept_q); - sk->sk_destruct = l2cap_sock_destruct; sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = proto; - sk->sk_state = BT_OPEN; - chan = l2cap_chan_create(); if (!chan) { sk_free(sk); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d4498037fadc..d6c9b7bc8592 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -944,6 +944,12 @@ static u32 get_current_settings(struct hci_dev *hdev) if (cis_peripheral_capable(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; + if (bis_capable(hdev)) + settings |= MGMT_SETTING_ISO_BROADCASTER; + + if (sync_recv_capable(hdev)) + settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; + return settings; } @@ -3580,18 +3586,6 @@ unlock: return err; } -static int abort_conn_sync(struct hci_dev *hdev, void *data) -{ - struct hci_conn *conn; - u16 handle = PTR_ERR(data); - - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return 0; - - return hci_abort_conn_sync(hdev, conn, HCI_ERROR_REMOTE_USER_TERM); -} - static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { @@ -3642,8 +3636,7 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, le_addr_type(addr->type)); if (conn->conn_reason == CONN_REASON_PAIR_DEVICE) - hci_cmd_sync_queue(hdev, abort_conn_sync, ERR_PTR(conn->handle), - NULL); + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); unlock: hci_dev_unlock(hdev); @@ -8435,8 +8428,8 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, supported_flags = get_supported_adv_flags(hdev); rp->supported_flags = cpu_to_le32(supported_flags); - rp->max_adv_data_len = HCI_MAX_AD_LENGTH; - rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH; + rp->max_adv_data_len = max_adv_len(hdev); + rp->max_scan_rsp_len = max_adv_len(hdev); rp->max_instances = hdev->le_num_of_adv_sets; rp->num_instances = hdev->adv_instance_cnt; @@ -8472,7 +8465,7 @@ static u8 calculate_name_len(struct hci_dev *hdev) static u8 tlv_data_max_len(struct hci_dev *hdev, u32 adv_flags, bool is_adv_data) { - u8 max_len = HCI_MAX_AD_LENGTH; + u8 max_len = max_adv_len(hdev); if (is_adv_data) { if (adv_flags & (MGMT_ADV_FLAG_DISCOV | diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index bf5cee48916c..abbafa6194ca 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -91,6 +91,33 @@ struct msft_ev_le_monitor_device { struct 
msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; + __s8 rssi_high; + __s8 rssi_low; + __u8 rssi_low_interval; + __u8 rssi_sampling_period; + __u8 cond_type; + struct list_head list; +}; + +enum monitor_addr_filter_state { + AF_STATE_IDLE, + AF_STATE_ADDING, + AF_STATE_ADDED, + AF_STATE_REMOVING, +}; + +#define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04 +struct msft_monitor_addr_filter_data { + __u8 msft_handle; + __u8 pattern_handle; /* address filters pertain to */ + __u16 mgmt_handle; + int state; + __s8 rssi_high; + __s8 rssi_low; + __u8 rssi_low_interval; + __u8 rssi_sampling_period; + __u8 addr_type; + bdaddr_t bdaddr; struct list_head list; }; @@ -99,9 +126,12 @@ struct msft_data { __u8 evt_prefix_len; __u8 *evt_prefix; struct list_head handle_map; + struct list_head address_filters; __u8 resuming; __u8 suspending; __u8 filter_enabled; + /* To synchronize add/remove address filter and monitor device event.*/ + struct mutex filter_lock; }; bool msft_monitor_supported(struct hci_dev *hdev) @@ -180,6 +210,24 @@ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data return NULL; } +/* This function requires the caller holds msft->filter_lock */ +static struct msft_monitor_addr_filter_data *msft_find_address_data + (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr, + u8 pattern_handle) +{ + struct msft_monitor_addr_filter_data *entry; + struct msft_data *msft = hdev->msft_data; + + list_for_each_entry(entry, &msft->address_filters, list) { + if (entry->pattern_handle == pattern_handle && + addr_type == entry->addr_type && + !bacmp(addr, &entry->bdaddr)) + return entry; + } + + return NULL; +} + /* This function requires the caller holds hdev->lock */ static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle, bdaddr_t *bdaddr, __u8 addr_type, @@ -240,6 +288,7 @@ static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, handle_data->mgmt_handle = monitor->handle; handle_data->msft_handle = rp->handle; + handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; INIT_LIST_HEAD(&handle_data->list); list_add(&handle_data->list, &msft->handle_map); @@ -254,6 +303,70 @@ unlock: return status; } +/* This function requires the caller holds hci_req_sync_lock */ +static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle) +{ + struct msft_monitor_addr_filter_data *address_filter, *n; + struct msft_cp_le_cancel_monitor_advertisement cp; + struct msft_data *msft = hdev->msft_data; + struct list_head head; + struct sk_buff *skb; + + INIT_LIST_HEAD(&head); + + /* Cancel all corresponding address monitors */ + mutex_lock(&msft->filter_lock); + + list_for_each_entry_safe(address_filter, n, &msft->address_filters, + list) { + if (address_filter->pattern_handle != handle) + continue; + + list_del(&address_filter->list); + + /* Keep the address filter and let + * msft_add_address_filter_sync() remove and free the address + * filter. 
+ */ + if (address_filter->state == AF_STATE_ADDING) { + address_filter->state = AF_STATE_REMOVING; + continue; + } + + /* Keep the address filter and let + * msft_cancel_address_filter_sync() remove and free the address + * filter + */ + if (address_filter->state == AF_STATE_REMOVING) + continue; + + list_add_tail(&address_filter->list, &head); + } + + mutex_unlock(&msft->filter_lock); + + list_for_each_entry_safe(address_filter, n, &head, list) { + list_del(&address_filter->list); + + cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; + cp.handle = address_filter->msft_handle; + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, + HCI_CMD_TIMEOUT); + if (IS_ERR_OR_NULL(skb)) { + kfree(address_filter); + continue; + } + + kfree_skb(skb); + + bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", + &address_filter->bdaddr); + + kfree(address_filter); + } +} + static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, @@ -263,6 +376,7 @@ static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; + u8 msft_handle; rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { @@ -293,11 +407,17 @@ static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, NULL, 0, false); } + msft_handle = handle_data->msft_handle; + list_del(&handle_data->list); kfree(handle_data); - } - hci_dev_unlock(hdev); + hci_dev_unlock(hdev); + + msft_remove_addr_filters_sync(hdev, msft_handle); + } else { + hci_dev_unlock(hdev); + } done: return status; @@ -394,12 +514,14 @@ static int msft_add_monitor_sync(struct hci_dev *hdev, { struct msft_cp_le_monitor_advertisement *cp; struct msft_le_monitor_advertisement_pattern_data *pattern_data; + struct msft_monitor_advertisement_handle_data *handle_data; struct msft_le_monitor_advertisement_pattern *pattern; struct adv_pattern *entry; size_t total_size = sizeof(*cp) + sizeof(*pattern_data); ptrdiff_t offset = 0; u8 pattern_count = 0; struct sk_buff *skb; + int err; if (!msft_monitor_pattern_valid(monitor)) return -EINVAL; @@ -436,16 +558,31 @@ static int msft_add_monitor_sync(struct hci_dev *hdev, skb = __hci_cmd_sync(hdev, hdev->msft_opcode, total_size, cp, HCI_CMD_TIMEOUT); - kfree(cp); if (IS_ERR_OR_NULL(skb)) { - if (!skb) - return -EIO; - return PTR_ERR(skb); + err = PTR_ERR(skb); + goto out_free; } - return msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, - monitor, skb); + err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, + monitor, skb); + if (err) + goto out_free; + + handle_data = msft_find_handle_data(hdev, monitor->handle, true); + if (!handle_data) { + err = -ENODATA; + goto out_free; + } + + handle_data->rssi_high = cp->rssi_high; + handle_data->rssi_low = cp->rssi_low; + handle_data->rssi_low_interval = cp->rssi_low_interval; + handle_data->rssi_sampling_period = cp->rssi_sampling_period; + +out_free: + kfree(cp); + return err; } /* This function requires the caller holds hci_req_sync_lock */ @@ -538,6 +675,7 @@ void msft_do_close(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct msft_monitor_advertisement_handle_data *handle_data, *tmp; + struct msft_monitor_addr_filter_data *address_filter, *n; struct adv_monitor *monitor; if (!msft) @@ -559,6 +697,14 @@ void msft_do_close(struct hci_dev *hdev) kfree(handle_data); } + mutex_lock(&msft->filter_lock); + 
list_for_each_entry_safe(address_filter, n, &msft->address_filters, + list) { + list_del(&address_filter->list); + kfree(address_filter); + } + mutex_unlock(&msft->filter_lock); + hci_dev_lock(hdev); /* Clear any devices that are being monitored and notify device lost */ @@ -568,6 +714,49 @@ void msft_do_close(struct hci_dev *hdev) hci_dev_unlock(hdev); } +static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data) +{ + struct msft_monitor_addr_filter_data *address_filter = data; + struct msft_cp_le_cancel_monitor_advertisement cp; + struct msft_data *msft = hdev->msft_data; + struct sk_buff *skb; + int err = 0; + + if (!msft) { + bt_dev_err(hdev, "MSFT: msft data is freed"); + return -EINVAL; + } + + /* The address filter has been removed by hci dev close */ + if (!test_bit(HCI_UP, &hdev->flags)) + return 0; + + mutex_lock(&msft->filter_lock); + list_del(&address_filter->list); + mutex_unlock(&msft->filter_lock); + + cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; + cp.handle = address_filter->msft_handle; + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, + HCI_CMD_TIMEOUT); + if (IS_ERR_OR_NULL(skb)) { + bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter", + &address_filter->bdaddr); + err = -EIO; + goto done; + } + kfree_skb(skb); + + bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", + &address_filter->bdaddr); + +done: + kfree(address_filter); + + return err; +} + void msft_register(struct hci_dev *hdev) { struct msft_data *msft = NULL; @@ -581,7 +770,9 @@ void msft_register(struct hci_dev *hdev) } INIT_LIST_HEAD(&msft->handle_map); + INIT_LIST_HEAD(&msft->address_filters); hdev->msft_data = msft; + mutex_init(&msft->filter_lock); } void msft_unregister(struct hci_dev *hdev) @@ -596,6 +787,7 @@ void msft_unregister(struct hci_dev *hdev) hdev->msft_data = NULL; kfree(msft->evt_prefix); + mutex_destroy(&msft->filter_lock); kfree(msft); } @@ -645,11 +837,149 @@ static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, return data; } +static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data) +{ + struct msft_monitor_addr_filter_data *address_filter = data; + struct msft_rp_le_monitor_advertisement *rp; + struct msft_cp_le_monitor_advertisement *cp; + struct msft_data *msft = hdev->msft_data; + struct sk_buff *skb = NULL; + bool remove = false; + size_t size; + + if (!msft) { + bt_dev_err(hdev, "MSFT: msft data is freed"); + return -EINVAL; + } + + /* The address filter has been removed by hci dev close */ + if (!test_bit(HCI_UP, &hdev->flags)) + return -ENODEV; + + /* We are safe to use the address filter from now on. + * msft_monitor_device_evt() wouldn't delete this filter because it has + * not been added yet. + * And all other functions that require hci_req_sync_lock wouldn't + * touch this filter before this function completes because it's + * protected by hci_req_sync_lock.
+ */ + + if (address_filter->state == AF_STATE_REMOVING) { + mutex_lock(&msft->filter_lock); + list_del(&address_filter->list); + mutex_unlock(&msft->filter_lock); + kfree(address_filter); + return 0; + } + + size = sizeof(*cp) + + sizeof(address_filter->addr_type) + + sizeof(address_filter->bdaddr); + cp = kzalloc(size, GFP_KERNEL); + if (!cp) { + bt_dev_err(hdev, "MSFT: Alloc cmd param err"); + remove = true; + goto done; + } + cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; + cp->rssi_high = address_filter->rssi_high; + cp->rssi_low = address_filter->rssi_low; + cp->rssi_low_interval = address_filter->rssi_low_interval; + cp->rssi_sampling_period = address_filter->rssi_sampling_period; + cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR; + cp->data[0] = address_filter->addr_type; + memcpy(&cp->data[1], &address_filter->bdaddr, + sizeof(address_filter->bdaddr)); + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp, + HCI_CMD_TIMEOUT); + if (IS_ERR_OR_NULL(skb)) { + bt_dev_err(hdev, "Failed to enable address %pMR filter", + &address_filter->bdaddr); + skb = NULL; + remove = true; + goto done; + } + + rp = skb_pull_data(skb, sizeof(*rp)); + if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT || + rp->status) + remove = true; + +done: + mutex_lock(&msft->filter_lock); + + if (remove) { + bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter", + &address_filter->bdaddr); + list_del(&address_filter->list); + kfree(address_filter); + } else { + address_filter->state = AF_STATE_ADDED; + address_filter->msft_handle = rp->handle; + bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled", + &address_filter->bdaddr); + } + mutex_unlock(&msft->filter_lock); + + kfree_skb(skb); + + return 0; +} + +/* This function requires the caller holds msft->filter_lock */ +static struct msft_monitor_addr_filter_data *msft_add_address_filter + (struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr, + struct msft_monitor_advertisement_handle_data *handle_data) +{ + struct msft_monitor_addr_filter_data *address_filter = NULL; + struct msft_data *msft = hdev->msft_data; + int err; + + address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL); + if (!address_filter) + return NULL; + + address_filter->state = AF_STATE_ADDING; + address_filter->msft_handle = 0xff; + address_filter->pattern_handle = handle_data->msft_handle; + address_filter->mgmt_handle = handle_data->mgmt_handle; + address_filter->rssi_high = handle_data->rssi_high; + address_filter->rssi_low = handle_data->rssi_low; + address_filter->rssi_low_interval = handle_data->rssi_low_interval; + address_filter->rssi_sampling_period = handle_data->rssi_sampling_period; + address_filter->addr_type = addr_type; + bacpy(&address_filter->bdaddr, bdaddr); + + /* With the above AF_STATE_ADDING, duplicated address filter can be + * avoided when receiving monitor device event (found/lost) frequently + * for the same device. 
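Pulling the msft.c pieces together, each address filter moves through a small state machine. The transitions, as far as they can be read off the hunks in this patch (a reading aid, not an authoritative diagram):

    /*
     * ADDING ---(Monitor_Advertisement response ok)---------> ADDED
     * ADDING ---(pattern monitor removed before response)---> REMOVING
     * ADDED ----(Monitor Device 'lost' event)---------------> REMOVING
     *            (msft_cancel_address_filter_sync() queued)
     * REMOVING -(add/cancel sync work runs)-----------------> freed
     *
     * AF_STATE_IDLE is declared but no transition into it is visible
     * in the hunks shown here.
     */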
+ */ + list_add_tail(&address_filter->list, &msft->address_filters); + + err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync, + address_filter, NULL); + if (err < 0) { + bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr); + list_del(&address_filter->list); + kfree(address_filter); + return NULL; + } + + bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter", + &address_filter->bdaddr); + + return address_filter; +} + /* This function requires the caller holds hdev->lock */ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct msft_monitor_addr_filter_data *n, *address_filter = NULL; struct msft_ev_le_monitor_device *ev; struct msft_monitor_advertisement_handle_data *handle_data; + struct msft_data *msft = hdev->msft_data; + u16 mgmt_handle = 0xffff; u8 addr_type; ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev)); @@ -662,9 +992,53 @@ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) ev->monitor_state, &ev->bdaddr); handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); - if (!handle_data) + + if (!test_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks)) { + if (!handle_data) + return; + mgmt_handle = handle_data->mgmt_handle; + goto report_state; + } + + if (handle_data) { + /* Don't report any device found/lost event from pattern + * monitors. Pattern monitor always has its address filters for + * tracking devices. + */ + + address_filter = msft_find_address_data(hdev, ev->addr_type, + &ev->bdaddr, + handle_data->msft_handle); + if (address_filter) + return; + + if (ev->monitor_state && handle_data->cond_type == + MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN) + msft_add_address_filter(hdev, ev->addr_type, + &ev->bdaddr, handle_data); + return; + } + /* This device event is not from pattern monitor. + * Report it if there is a corresponding address_filter for it. 
+ */ + list_for_each_entry(n, &msft->address_filters, list) { + if (n->state == AF_STATE_ADDED && + n->msft_handle == ev->monitor_handle) { + mgmt_handle = n->mgmt_handle; + address_filter = n; + break; + } + } + + if (!address_filter) { + bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u", + &ev->bdaddr, ev->monitor_handle, ev->monitor_state); + return; + } + +report_state: switch (ev->addr_type) { case ADDR_LE_DEV_PUBLIC: addr_type = BDADDR_LE_PUBLIC; @@ -681,12 +1055,18 @@ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } - if (ev->monitor_state) - msft_device_found(hdev, &ev->bdaddr, addr_type, - handle_data->mgmt_handle); - else - msft_device_lost(hdev, &ev->bdaddr, addr_type, - handle_data->mgmt_handle); + if (ev->monitor_state) { + msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle); + } else { + if (address_filter && address_filter->state == AF_STATE_ADDED) { + address_filter->state = AF_STATE_REMOVING; + hci_cmd_sync_queue(hdev, + msft_cancel_address_filter_sync, + address_filter, + NULL); + } + msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle); + } } void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) @@ -724,7 +1104,9 @@ void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) switch (*evt) { case MSFT_EV_LE_MONITOR_DEVICE: + mutex_lock(&msft->filter_lock); msft_monitor_device_evt(hdev, skb); + mutex_unlock(&msft->filter_lock); break; default: diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 4397e14ff560..b54e8a530f55 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -268,18 +268,16 @@ static struct proto rfcomm_proto = { .obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) +static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, + int proto, gfp_t prio, int kern) { struct rfcomm_dlc *d; struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern); + sk = bt_sock_alloc(net, sock, &rfcomm_proto, proto, prio, kern); if (!sk) return NULL; - sock_init_data(sock, sk); - INIT_LIST_HEAD(&bt_sk(sk)->accept_q); - d = rfcomm_dlc_alloc(prio); if (!d) { sk_free(sk); @@ -298,11 +296,6 @@ static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int sk->sk_sndbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = proto; - sk->sk_state = BT_OPEN; - bt_sock_link(&rfcomm_sk_list, sk); BT_DBG("sk %p", sk); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 7762604ddfc0..c736186aba26 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -68,7 +68,6 @@ struct sco_pinfo { bdaddr_t dst; __u32 flags; __u16 setting; - __u8 cmsg_mask; struct bt_codec codec; struct sco_conn *conn; }; @@ -471,15 +470,6 @@ static void sco_sock_close(struct sock *sk) release_sock(sk); } -static void sco_skb_put_cmsg(struct sk_buff *skb, struct msghdr *msg, - struct sock *sk) -{ - if (sco_pi(sk)->cmsg_mask & SCO_CMSG_PKT_STATUS) - put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, - sizeof(bt_cb(skb)->sco.pkt_status), - &bt_cb(skb)->sco.pkt_status); -} - static void sco_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); @@ -488,8 +478,6 @@ static void sco_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; bt_sk(sk)->flags = 
bt_sk(parent)->flags; security_sk_clone(parent, sk); - } else { - bt_sk(sk)->skb_put_cmsg = sco_skb_put_cmsg; } } @@ -504,21 +492,13 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern); + sk = bt_sock_alloc(net, sock, &sco_proto, proto, prio, kern); if (!sk) return NULL; - sock_init_data(sock, sk); - INIT_LIST_HEAD(&bt_sk(sk)->accept_q); - sk->sk_destruct = sco_sock_destruct; sk->sk_sndtimeo = SCO_CONN_TIMEOUT; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = proto; - sk->sk_state = BT_OPEN; - sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; sco_pi(sk)->codec.id = BT_CODEC_CVSD; sco_pi(sk)->codec.cid = 0xffff; @@ -915,9 +895,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, } if (opt) - sco_pi(sk)->cmsg_mask |= SCO_CMSG_PKT_STATUS; + set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else - sco_pi(sk)->cmsg_mask &= SCO_CMSG_PKT_STATUS; + clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_CODEC: @@ -1048,7 +1028,6 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, int len, err = 0; struct bt_voice voice; u32 phys; - int pkt_status; int buf_len; struct codec_list *c; u8 num_codecs, i, __user *ptr; @@ -1102,9 +1081,8 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - pkt_status = (sco_pi(sk)->cmsg_mask & SCO_CMSG_PKT_STATUS); - - if (put_user(pkt_status, (int __user *)optval)) + if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), + (int __user *)optval)) err = -EFAULT; break; @@ -1267,7 +1245,7 @@ static int sco_sock_release(struct socket *sock) sco_sock_close(sk); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2321bd2f9964..57a7a64b84ed 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,11 +15,12 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/net_namespace.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> #include <linux/netfilter.h> +#include <net/netdev_rx_queue.h> #include <net/xdp.h> #include <net/netfilter/nf_bpf_link.h> @@ -555,12 +556,23 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } +void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) +{ +} + __bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; return a + *b; } +__bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, + void *e, char f, int g) +{ + *b += 1; + return a + *b + c + d + (long)e + f + g; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -596,6 +608,7 @@ __diag_pop(); BTF_SET8_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) +BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_SET8_END(bpf_test_modify_return_ids) @@ -663,7 +676,11 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, case BPF_MODIFY_RETURN: ret = bpf_modify_return_test(1, &b); if (b != 2) - side_effect = 1; + side_effect++; + b = 2; + ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7); + if (b != 2) + side_effect++; break; default: goto out; diff --git a/net/bridge/br.c b/net/bridge/br.c index 
4f5098d33a46..a6e94ceb7c9a 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -234,6 +234,14 @@ static int br_switchdev_blocking_event(struct notifier_block *nb, br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; + case SWITCHDEV_BRPORT_REPLAY: + brport_info = ptr; + b = &brport_info->brport; + + err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb, + b->blocking_nb, extack); + err = notifier_from_errno(err); + break; } out: diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6116eba1bd89..9d7bc8b96b53 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -154,6 +154,7 @@ void br_forward(const struct net_bridge_port *to, backup_port = rcu_dereference(to->backup_port); if (unlikely(!backup_port)) goto out; + BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid); to = backup_port; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 05c5863d2e20..10f0d33d8ccf 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -211,6 +211,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */ + 0; } @@ -319,6 +320,10 @@ static int br_port_fill_attrs(struct sk_buff *skb, backup_p->dev->ifindex); rcu_read_unlock(); + if (p->backup_nhid && + nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid)) + return -EMSGSIZE; + return 0; } @@ -895,6 +900,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), + [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -1065,6 +1071,12 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], return err; } + if (tb[IFLA_BRPORT_BACKUP_NHID]) { + u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]); + + WRITE_ONCE(p->backup_nhid, backup_nhid); + } + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a63b32c1638e..a1f4acfa6994 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -387,6 +387,7 @@ struct net_bridge_port { struct net_bridge_vlan_group __rcu *vlgrp; #endif struct net_bridge_port __rcu *backup_port; + u32 backup_nhid; /* STP */ u8 priority; @@ -605,6 +606,8 @@ struct br_input_skb_cb { */ unsigned long fwd_hwdoms; #endif + + u32 backup_nhid; }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) @@ -971,7 +974,6 @@ int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); -int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val); #if IS_ENABLED(CONFIG_IPV6) @@ -2115,6 +2117,12 @@ void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); +int br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct 
notifier_block *blocking_nb, + struct netlink_ext_ack *extack); + bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); @@ -2165,6 +2173,16 @@ br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, { } +static inline int +br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { return false; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index ba95c4d74a60..ee84e783e1df 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -727,6 +727,8 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, err = br_switchdev_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), action, ctx, extack); + if (err == -EOPNOTSUPP) + err = 0; if (err) goto out_free_mdb; } @@ -759,8 +761,10 @@ static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); - if (err && err != -EOPNOTSUPP) + if (err) { + /* -EOPNOTSUPP not propagated from MDB replay. */ return err; + } err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb); if (err && err != -EOPNOTSUPP) @@ -825,3 +829,12 @@ void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, nbp_switchdev_del(p); } + +int br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + return nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); +} diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 6399a8a69d07..81833ca7a2c7 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -201,6 +201,21 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, if (err) return err; + if (BR_INPUT_SKB_CB(skb)->backup_nhid) { + tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, + tunnel_id, 0); + if (!tunnel_dst) + return -ENOMEM; + + tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX | + IP_TUNNEL_INFO_BRIDGE; + tunnel_dst->u.tun_info.key.nhid = + BR_INPUT_SKB_CB(skb)->backup_nhid; + skb_dst_set(skb, &tunnel_dst->dst); + + return 0; + } + tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) skb_dst_set(skb, &tunnel_dst->dst); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 757ec46fc45a..aa23479b20b2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2115,8 +2115,7 @@ static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *ba return ret; offsets[0] = sizeof(struct ebt_entry); /* matches come first */ - memcpy(&offsets[1], &entry->watchers_offset, - sizeof(offsets) - sizeof(offsets[0])); + memcpy(&offsets[1], &entry->offsets, sizeof(entry->offsets)); if (state->buf_kern_start) { buf_start = state->buf_kern_start + state->buf_kern_offset; diff --git a/net/core/dev.c b/net/core/dev.c index 69a3e544676c..17e6281e408c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -107,6 +107,7 @@ #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/tcx.h> #include <linux/highmem.h> #include 
<linux/init.h> #include <linux/module.h> @@ -132,6 +133,7 @@ #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> +#include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> @@ -150,11 +152,11 @@ #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> +#include <net/netdev_rx_queue.h> #include "dev.h" #include "net-sysfs.h" - static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ @@ -388,6 +390,8 @@ static void list_netdevice(struct net_device *dev) hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock(&dev_base_lock); + /* We reserved the ifindex, this can't fail */ + WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } @@ -397,8 +401,12 @@ static void list_netdevice(struct net_device *dev) */ static void unlist_netdevice(struct net_device *dev, bool lock) { + struct net *net = dev_net(dev); + ASSERT_RTNL(); + xa_erase(&net->dev_by_index, dev->ifindex); + /* Unlink dev from the device chain */ if (lock) write_lock(&dev_base_lock); @@ -2384,8 +2392,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_map *map = NULL; int pos; - if (dev_maps) - map = xmap_dereference(dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -3882,69 +3889,198 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS -static struct sk_buff * -sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_NET_XGRESS +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb) +{ + int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); - struct tcf_result cl_res; + struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); + struct tcf_result res; if (!miniq) - return skb; + return ret; - /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; - mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + mini_qdisc_bstats_cpu_update(miniq, skb); + ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); + /* Only tcf related quirks below. 
*/ + switch (ret) { + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); + skb->tc_index = TC_H_MIN(res.classid); break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return ret; +} + +static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); + +void tcx_inc(void) +{ + static_branch_inc(&tcx_needed_key); +} + +void tcx_dec(void) +{ + static_branch_dec(&tcx_needed_key); +} + +static __always_inline enum tcx_action_base +tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + const bool needs_mac) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + int ret = TCX_NEXT; + + if (needs_mac) + __skb_push(skb, skb->mac_len); + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != TCX_NEXT) + break; + } + if (needs_mac) + __skb_pull(skb, skb->mac_len); + return tcx_action_code(skb, ret); +} + +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); + int sch_ret; + + if (!entry) + return skb; + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_skb_cb(skb)->pkt_len = skb->len; + tcx_set_ingress(skb, true); + + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, true); + if (sch_ret != TC_ACT_UNSPEC) + goto ingress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +ingress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by BPF, so we can safely + * push the L2 header back before redirecting to another + * netdev. + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + return NULL; case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - *ret = NET_XMIT_DROP; - kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; + /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: - *ret = NET_XMIT_SUCCESS; consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; + } + + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); + int sch_ret; + + if (!entry) + return skb; + + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was + * already set by the caller. + */ + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, false); + if (sch_ret != TC_ACT_UNSPEC) + goto egress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +egress_verdict: + switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! 
*/ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; return NULL; - default: - break; + case TC_ACT_SHOT: + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + *ret = NET_XMIT_DROP; + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *ret = NET_XMIT_SUCCESS; + return NULL; } -#endif /* CONFIG_NET_CLS_ACT */ return skb; } - -static struct netdev_queue * -netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) -{ - int qm = skb_get_queue_mapping(skb); - - return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); -} - -static bool netdev_xmit_txqueue_skipped(void) +#else +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) { - return __this_cpu_read(softnet_data.xmit.skip_txqueue); + return skb; } -void netdev_xmit_skip_txqueue(bool skip) +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); + return skb; } -EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); -#endif /* CONFIG_NET_EGRESS */ +#endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, @@ -4128,9 +4264,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); qdisc_pkt_len_init(skb); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_at_ingress = 0; -#endif + tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { @@ -5064,72 +5198,6 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff * -sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev, bool *another) -{ -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); - struct tcf_result cl_res; - - /* If there's at least one ingress present somewhere (so - * we get here via enabled static key), remaining devices - * that are not configured with an ingress qdisc will bail - * out here. 
- */ - if (!miniq) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - qdisc_skb_cb(skb)->pkt_len = skb->len; - tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; - skb->tc_at_ingress = 1; - mini_qdisc_bstats_cpu_update(miniq, skb); - - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { - case TC_ACT_OK: - case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); - break; - case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); - *ret = NET_RX_DROP; - return NULL; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - case TC_ACT_TRAP: - consume_skb(skb); - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_REDIRECT: - /* skb_mac_header check was done by cls/act_bpf, so - * we can safely push the L2 header back before - * redirecting to another netdev - */ - __skb_push(skb, skb->mac_len); - if (skb_do_redirect(skb) == -EAGAIN) { - __skb_pull(skb, skb->mac_len); - *another = true; - break; - } - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_CONSUMED: - *ret = NET_RX_SUCCESS; - return NULL; - default: - break; - } -#endif /* CONFIG_NET_CLS_ACT */ - return skb; -} - /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check @@ -6316,12 +6384,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (threaded) - set_bit(NAPI_STATE_THREADED, &napi->state); - else - clear_bit(NAPI_STATE_THREADED, &napi->state); - } + list_for_each_entry(napi, &dev->napi_list, dev_list) + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } @@ -9413,6 +9477,7 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; + struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; @@ -9440,12 +9505,13 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - err = dev_xdp_attach_link(dev, NULL, link); + err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); + trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } @@ -9509,23 +9575,40 @@ err_out: } /** - * dev_new_index - allocate an ifindex - * @net: the applicable net namespace + * dev_index_reserve() - allocate an ifindex in a namespace + * @net: the applicable net namespace + * @ifindex: requested ifindex, pass %0 to get one allocated + * + * Allocate a ifindex for a new device. Caller must either use the ifindex + * to store the device (via list_netdevice()) or call dev_index_release() + * to give the index up. * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. + * Return: a suitable unique value for a new device interface number or -errno. 
*/ -static int dev_new_index(struct net *net) +static int dev_index_reserve(struct net *net, u32 ifindex) { - int ifindex = net->ifindex; + int err; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return net->ifindex = ifindex; + if (ifindex > INT_MAX) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EINVAL; } + + if (!ifindex) + err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, + xa_limit_31b, &net->ifindex, GFP_KERNEL); + else + err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); + if (err < 0) + return err; + + return ifindex; +} + +static void dev_index_release(struct net *net, int ifindex) +{ + /* Expect only unused indexes, unlist_netdevice() removes the used */ + WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } /* Delayed registration/unregisteration */ @@ -9995,11 +10078,10 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } - ret = -EBUSY; - if (!dev->ifindex) - dev->ifindex = dev_new_index(net); - else if (__dev_get_by_index(net, dev->ifindex)) + ret = dev_index_reserve(net, dev->ifindex); + if (ret < 0) goto err_uninit; + dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). @@ -10046,7 +10128,7 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) - goto err_uninit; + goto err_ifindex_release; ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); @@ -10102,6 +10184,8 @@ out: err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); +err_ifindex_release: + dev_index_release(net, dev->ifindex); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10617,6 +10701,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; @@ -10838,7 +10923,7 @@ void unregister_netdevice_many_notify(struct list_head *head, /* Shutdown queueing discipline. */ dev_shutdown(dev); - + dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); @@ -10978,9 +11063,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, } /* Check that new_ifindex isn't used yet. */ - err = -EBUSY; - if (new_ifindex && __dev_get_by_index(net, new_ifindex)) - goto out; + if (new_ifindex) { + err = dev_index_reserve(net, new_ifindex); + if (err < 0) + goto out; + } else { + /* If there is an ifindex conflict assign a new one */ + err = dev_index_reserve(net, dev->ifindex); + if (err == -EBUSY) + err = dev_index_reserve(net, 0); + if (err < 0) + goto out; + new_ifindex = err; + } /* * And now a mini version of register_netdevice unregister_netdevice. 
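
[Annotation — illustrative, not part of the commit: the dev_index_reserve()/dev_index_release() pair above replaces the old linear ifindex scan with xarray-backed reservation, and list_netdevice() then publishes the device into the reserved slot. Below is a minimal, self-contained sketch of that same reserve/publish/release lifecycle; all demo_* names are hypothetical, only the xarray calls mirror the diff.]

#include <linux/xarray.h>

/* XA_FLAGS_ALLOC1, as used for net->dev_by_index: ids start at 1 */
static DEFINE_XARRAY_ALLOC1(demo_by_index);
static u32 demo_next_index;

static int demo_index_reserve(u32 index)
{
	int err;

	if (!index)
		/* 0 means "allocate one", cycling through the 31-bit range */
		err = xa_alloc_cyclic(&demo_by_index, &index, NULL,
				      xa_limit_31b, &demo_next_index,
				      GFP_KERNEL);
	else
		/* specific index requested; xa_insert() gives -EBUSY if taken */
		err = xa_insert(&demo_by_index, index, NULL, GFP_KERNEL);

	return err < 0 ? err : index;
}

static void demo_index_publish(u32 index, void *obj)
{
	/* the slot was reserved with a NULL entry, so this store cannot fail */
	WARN_ON(xa_store(&demo_by_index, index, obj, GFP_KERNEL));
}

static void demo_index_release(u32 index)
{
	/* expect only unused (still reserved) slots, like dev_index_release() */
	WARN_ON(xa_erase(&demo_by_index, index));
}
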
@@ -11008,13 +11103,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); - /* If there is an ifindex conflict assign a new one */ - if (!new_ifindex) { - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; - } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11192,6 +11280,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; @@ -11289,6 +11379,7 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 3730945ee294..b46aedc36939 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -5,6 +5,7 @@ #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> +#include <linux/phylib_stubs.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> @@ -252,14 +253,121 @@ static int dev_eth_ioctl(struct net_device *dev, return ops->ndo_eth_ioctl(dev, ifr, cmd); } +/** + * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. + * + * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and + * there only exists a phydev->mii_ts->hwtstamp() method. So this will return + * -EOPNOTSUPP for phylib for now, which is still more accurate than letting + * the netdev handle the GET request. 
+ */ +static int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + if (phy_has_hwtstamp(dev->phydev)) + return phy_hwtstamp_get(dev->phydev, cfg); + + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); +} + static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { - return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; + struct hwtstamp_config cfg; + int err; + + if (!ops->ndo_hwtstamp_get) + return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + kernel_cfg.ifr = ifr; + err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); + if (err) + return err; + + /* If the request was resolved through an unconverted driver, omit + * the copy_to_user(), since the implementation has already done that + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +/** + * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * @extack: Netlink extended ack message structure, for error reporting + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. If the netdev driver needs to perform specific actions even for PHY + * timestamping to work properly (a switch port must trap the timestamped + * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in + * dev->priv_flags. + */ +static int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + bool phy_ts = phy_has_hwtstamp(dev->phydev); + struct kernel_hwtstamp_config old_cfg = {}; + bool changed = false; + int err; + + cfg->source = phy_ts ? 
HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; + + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + err = ops->ndo_hwtstamp_get(dev, &old_cfg); + if (err) + return err; + } + + if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + err = ops->ndo_hwtstamp_set(dev, cfg, extack); + if (err) { + if (extack->_msg) + netdev_err(dev, "%s\n", extack->_msg); + return err; + } + } + + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) + changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); + + if (phy_ts) { + err = phy_hwtstamp_set(dev->phydev, cfg, extack); + if (err) { + if (changed) + ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); + return err; + } + } + + return 0; } static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { - struct kernel_hwtstamp_config kernel_cfg; + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; struct netlink_ext_ack extack = {}; struct hwtstamp_config cfg; int err; @@ -268,6 +376,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return -EFAULT; hwtstamp_config_to_kernel(&kernel_cfg, &cfg); + kernel_cfg.ifr = ifr; err = net_hwtstamp_validate(&kernel_cfg); if (err) @@ -280,8 +389,80 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return err; } - return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); + if (!ops->ndo_hwtstamp_set) + return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); + if (err) + return err; + + /* The driver may have modified the configuration, so copy the + * updated version of it back to user space + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, + struct kernel_hwtstamp_config *kernel_cfg) +{ + struct ifreq ifrr; + int err; + + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); + ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; + + err = dev_eth_ioctl(dev, &ifrr, cmd); + if (err) + return err; + + kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; + kernel_cfg->copied_to_user = true; + + return 0; +} + +int generic_hwtstamp_get_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_get) + return dev_get_hwtstamp_phylib(dev, kernel_cfg); + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); +} +EXPORT_SYMBOL(generic_hwtstamp_get_lower); + +int generic_hwtstamp_set_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_set) + return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); } +EXPORT_SYMBOL(generic_hwtstamp_set_lower); static int dev_siocbond(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) diff --git a/net/core/dst.c b/net/core/dst.c index 79d9306ad1ee..980e2fd2f013 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -152,7 +152,7 @@ void 
dst_dev_put(struct dst_entry *dst) dst->obsolete = DST_OBSOLETE_DEAD; if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, true); + dst->ops->ifdown(dst, dev); dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; diff --git a/net/core/filter.c b/net/core/filter.c index 28a59596987a..a094694899c9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4339,13 +4339,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; - if (map_type == BPF_MAP_TYPE_XSKMAP) { - /* XDP_REDIRECT is not supported AF_XDP yet. */ - if (unlikely(xdp_buff_has_frags(xdp))) - return -EOPNOTSUPP; - + if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); - } return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); @@ -7350,8 +7345,8 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; - if (unlikely(sk_fullsock(sk) && sk->sk_reuseport)) - return -ESOCKTNOSUPPORT; + if (sk_unhashed(sk)) + return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; @@ -9306,7 +9301,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. @@ -9340,7 +9335,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. 
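
[Annotation — illustrative, not part of the commit: the net/core/filter.c hunk above changes the error contract of the bpf_sk_assign() helper — an unhashed socket is now rejected with -EOPNOTSUPP, while the old -ESOCKTNOSUPPORT rejection of fullsock reuseport sockets is dropped. A hedged sketch of how a tc BPF program typically exercises this helper follows; the program name is hypothetical and the lookup tuple is a placeholder.]

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int demo_sk_assign(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};	/* placeholder IPv4 4-tuple */
	struct bpf_sock *sk;

	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return TC_ACT_OK;

	/* With the hunk above, this fails with -EOPNOTSUPP when the
	 * looked-up socket is unhashed; reuseport listeners now pass. */
	bpf_sk_assign(skb, sk, 0);

	bpf_sk_release(sk);	/* the lookup took a reference */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";
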
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 85a2d0d9bd39..89d15ceaf9af 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -40,7 +40,7 @@ static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { - flow_dissector->used_keys |= (1 << key_id); + flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, @@ -205,6 +205,50 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } +static void __skb_flow_dissect_ah(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_ah; + struct ip_auth_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_ah = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_ah->spi = hdr->spi; +} + +static void __skb_flow_dissect_esp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_esp; + struct ip_esp_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_esp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_esp->spi = hdr->spi; +} + static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, @@ -1571,7 +1615,14 @@ ip_proto_again: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; - + case IPPROTO_ESP: + __skb_flow_dissect_esp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_AH: + __skb_flow_dissect_ah(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; } diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index acfc1f88ea79..bc5169482710 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -146,6 +146,13 @@ void flow_rule_match_tcp(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_tcp); +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out); +} +EXPORT_SYMBOL(flow_rule_match_ipsec); + void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 15e3f4606b5f..fccaa5bac0ed 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> +#include <net/netdev_rx_queue.h> #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a4270fafdf11..c1aea8b756b6 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,11 +10,11 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, - u32 portid, u32 seq, int flags, u32 cmd) + const struct genl_info *info) { void *hdr; - hdr = genlmsg_put(rsp, portid, seq, &netdev_nl_family, flags, cmd); 
+ hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; @@ -25,6 +25,14 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, return -EINVAL; } + if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + netdev->xdp_zc_max_segs)) { + genlmsg_cancel(rsp, hdr); + return -EINVAL; + } + } + genlmsg_end(rsp, hdr); return 0; @@ -33,17 +41,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { + struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; - if (netdev_nl_dev_fill(netdev, ntf, 0, 0, 0, cmd)) { + if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } @@ -72,8 +83,7 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) - err = netdev_nl_dev_fill(netdev, rsp, info->snd_portid, - info->snd_seq, 0, info->genlhdr->cmd); + err = netdev_nl_dev_fill(netdev, rsp, info); else err = -ENODEV; @@ -93,43 +103,19 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *netdev; - int idx = 0, s_idx; - int h, s_h; - int err; - - s_h = cb->args[0]; - s_idx = cb->args[1]; + int err = 0; rtnl_lock(); - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(netdev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = netdev_nl_dev_fill(netdev, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NETDEV_CMD_DEV_GET); - if (err < 0) - break; -cont: - idx++; - } + for_each_netdev_dump(net, netdev, cb->args[0]) { + err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); + if (err < 0) + break; } - rtnl_unlock(); if (err != -EMSGSIZE) return err; - cb->args[1] = idx; - cb->args[0] = h; - cb->seq = net->dev_base_seq; - return skb->len; } diff --git a/net/core/of_net.c b/net/core/of_net.c index 55d3fe229269..93ea425b9248 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/of_net.h> #include <linux/of_platform.h> +#include <linux/platform_device.h> #include <linux/phy.h> #include <linux/export.h> #include <linux/device.h> diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a3e12a61d456..77cb75e63aca 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include <linux/device.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/xdp.h> #include <linux/dma-direction.h> @@ -58,6 +58,17 @@ static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_recycle_released_ref", }; +/** + * page_pool_get_stats() - fetch page pool stats + * @pool: pool from which page was allocated + * @stats: struct page_pool_stats to fill in + * + * Retrieve statistics about the page_pool. This API is only available + * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. + * A pointer to a caller allocated struct page_pool_stats structure + * is passed to this API which is filled in. The caller can then report + * those stats to the user (perhaps via ethtool, debugfs, etc.). 
+ */ bool page_pool_get_stats(struct page_pool *pool, struct page_pool_stats *stats) { @@ -224,6 +235,10 @@ static int page_pool_init(struct page_pool *pool, return 0; } +/** + * page_pool_create() - create a page pool. + * @params: parameters, see struct page_pool_params + */ struct page_pool *page_pool_create(const struct page_pool_params *params) { struct page_pool *pool; @@ -492,7 +507,7 @@ static s32 page_pool_inflight(struct page_pool *pool) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_release_page(struct page_pool *pool, struct page *page) +static void page_pool_return_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; @@ -518,13 +533,6 @@ skip_dma_unmap: */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); -} -EXPORT_SYMBOL(page_pool_release_page); - -/* Return a page to the page allocator, cleaning up our state */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) -{ - page_pool_release_page(pool, page); put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) @@ -579,6 +587,8 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + lockdep_assert_no_hardirq(); + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -616,9 +626,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); - /* Do not replace this with page_pool_return_page() */ - page_pool_release_page(pool, page); - put_page(page); + page_pool_return_page(pool, page); return NULL; } @@ -635,7 +643,21 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, } EXPORT_SYMBOL(page_pool_put_defragged_page); -/* Caller must not use data area after call, as this function overwrites it */ +/** + * page_pool_put_page_bulk() - release references on multiple pages + * @pool: pool from which pages were allocated + * @data: array holding page pointers + * @count: number of pages in @data + * + * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() + * will release leftover pages to the page allocator. + * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_page_bulk(), as this function overwrites it. + */ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { @@ -915,42 +937,3 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); - -bool page_pool_return_skb_page(struct page *page, bool napi_safe) -{ - struct napi_struct *napi; - struct page_pool *pp; - bool allow_direct; - - page = compound_head(page); - - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. 
- */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) - return false; - - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - */ - napi = READ_ONCE(pp->p.napi); - allow_direct = napi_safe && napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. - */ - page_pool_put_full_page(pp, page, allow_direct); - - return true; -} -EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 00c94d9622b4..4a2ec33bfb51 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -61,7 +61,7 @@ #include "dev.h" #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 43 +#define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; @@ -1273,7 +1273,6 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, - struct nlattr *vfinfo, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; @@ -1343,7 +1342,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) - goto nla_put_vfinfo_failure; + return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || @@ -1414,8 +1413,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_vf_failure: nla_nest_cancel(skb, vf); -nla_put_vfinfo_failure: - nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } @@ -1441,8 +1438,10 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) + if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { + nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; + } } nla_nest_end(skb, vfinfo); diff --git a/net/core/scm.c b/net/core/scm.c index 3cd7dd377e53..880027ecf516 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -130,6 +130,7 @@ EXPORT_SYMBOL(__scm_destroy); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct cmsghdr *cmsg; int err; @@ -153,7 +154,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) switch (cmsg->cmsg_type) { case SCM_RIGHTS: - if (!sock->ops || sock->ops->family != PF_UNIX) + if (!ops || ops->family != PF_UNIX) goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a298992060e6..faa6c86da2a5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -73,7 +73,7 @@ #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/dropreason.h> #include <linux/uaccess.h> @@ -879,11 +879,56 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page, bool napi_safe) +{ + bool allow_direct = false; + struct page_pool *pp; + + page = compound_head(page); 
+ + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + if (napi_safe || in_softirq()) { + const struct napi_struct *napi = READ_ONCE(pp->p.napi); + + allow_direct = napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + } + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page_pool_put_full_page(pp, page, allow_direct); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data), napi_safe); } static void skb_kfree_head(void *head, unsigned int end_offset) @@ -3656,20 +3701,23 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) EXPORT_SYMBOL(skb_dequeue_tail); /** - * skb_queue_purge - empty a list + * skb_queue_purge_reason - empty a list * @list: list to empty + * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function takes the list * lock and is atomic with respect to other list locking functions. */ -void skb_queue_purge(struct sk_buff_head *list) +void skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason) { struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); + kfree_skb_reason(skb, reason); } -EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_purge_reason); /** * skb_rbtree_purge - empty a skb rbtree @@ -3697,6 +3745,27 @@ unsigned int skb_rbtree_purge(struct rb_root *root) return sum; } +void skb_errqueue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb, *next; + struct sk_buff_head kill; + unsigned long flags; + + __skb_queue_head_init(&kill); + + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk_safe(list, skb, next) { + if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) + continue; + __skb_unlink(skb, list); + __skb_queue_tail(&kill, skb); + } + spin_unlock_irqrestore(&list->lock, flags); + __skb_queue_purge(&kill); +} +EXPORT_SYMBOL(skb_errqueue_purge); + /** * skb_queue_head - queue a buffer at the list head * @list: list to use @@ -4750,12 +4819,23 @@ static void skb_extensions_init(void) static void skb_extensions_init(void) {} #endif +/* The SKB kmem_cache slab is critical for network performance. Never + * merge/alias the slab with similar sized objects. This avoids fragmentation + * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. 
+ */ +#ifndef CONFIG_SLUB_TINY +#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE +#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ +#define FLAG_SKB_NO_MERGE 0 +#endif + void __init skb_init(void) { skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, + SLAB_HWCACHE_ALIGN|SLAB_PANIC| + FLAG_SKB_NO_MERGE, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); @@ -6204,7 +6284,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); * * @header_len: size of linear part * @data_len: needed length in frags - * @max_page_order: max page order desired. + * @order: max page order desired. * @errcode: pointer to error code if any * @gfp_mask: allocation mask * @@ -6212,21 +6292,17 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, - int max_page_order, + int order, int *errcode, gfp_t gfp_mask) { - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; unsigned long chunk; struct sk_buff *skb; struct page *page; - int i; + int nr_frags = 0; *errcode = -EMSGSIZE; - /* Note this test could be relaxed, if we succeed to allocate - * high order pages... - */ - if (npages > MAX_SKB_FRAGS) + if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; @@ -6234,34 +6310,32 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (!skb) return NULL; - skb->truesize += npages << PAGE_SHIFT; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | - __GFP_NOWARN, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } + while (data_len) { + if (nr_frags == MAX_SKB_FRAGS - 1) + goto failure; + while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; + + if (order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN, + order); + if (!page) { + order--; + continue; + } + } else { + page = alloc_page(gfp_mask); + if (!page) + goto failure; } - page = alloc_page(gfp_mask); - if (!page) - goto failure; -fill_page: chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); + skb_fill_page_desc(skb, nr_frags, page, 0, chunk); + nr_frags++; + skb->truesize += (PAGE_SIZE << order); data_len -= chunk; - npages -= 1 << order; } return skb; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ef1a2eb6520b..a0659fc29bcc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1204,13 +1204,17 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; + const struct proto_ops *ops; int copied; trace_sk_data_ready(sk); - if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) + if (unlikely(!sock)) return; - copied = sock->ops->read_skb(sk, sk_psock_verdict_recv); + ops = READ_ONCE(sock->ops); + if (!ops || !ops->read_skb) + return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; diff --git a/net/core/sock.c b/net/core/sock.c index c9cffb7acbea..666a17cab4f5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -767,7 +767,7 @@ bool sk_mc_loop(struct sock *sk) return true; switch (sk->sk_family) { case AF_INET: - return inet_sk(sk)->mc_loop; + return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: 
return inet6_sk(sk)->mc_loop; @@ -797,7 +797,7 @@ EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); - sk->sk_lingertime = 0; + WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } @@ -1230,15 +1230,15 @@ set_sndbuf: ret = -EFAULT; break; } - if (!ling.l_onoff) + if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); - else { -#if (BITS_PER_LONG == 32) - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + } else { + unsigned long t_sec = ling.l_linger; + + if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) + WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else -#endif - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; @@ -1247,17 +1247,11 @@ set_sndbuf: break; case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); - else - clear_bit(SOCK_PASSCRED, &sock->flags); + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); break; case SO_PASSPIDFD: - if (valbool) - set_bit(SOCK_PASSPIDFD, &sock->flags); - else - clear_bit(SOCK_PASSPIDFD, &sock->flags); + assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); break; case SO_TIMESTAMP_OLD: @@ -1283,14 +1277,19 @@ set_sndbuf: break; case SO_RCVLOWAT: + { + int (*set_rcvlowat)(struct sock *sk, int val) = NULL; + if (val < 0) val = INT_MAX; - if (sock && sock->ops->set_rcvlowat) - ret = sock->ops->set_rcvlowat(sk, val); + if (sock) + set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; + if (set_rcvlowat) + ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; - + } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, @@ -1361,10 +1360,7 @@ set_sndbuf: break; case SO_PASSSEC: - if (valbool) - set_bit(SOCK_PASSSEC, &sock->flags); - else - clear_bit(SOCK_PASSSEC, &sock->flags); + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && @@ -1388,11 +1384,16 @@ set_sndbuf: break; case SO_PEEK_OFF: - if (sock->ops->set_peek_off) - ret = sock->ops->set_peek_off(sk, val); + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; + } case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); @@ -1691,7 +1692,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); - v.ling.l_linger = sk->sk_lingertime / HZ; + v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: @@ -1823,14 +1824,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_PEERNAME: { - char address[128]; + struct sockaddr_storage address; - lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); + lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_sockptr(optval, address, len)) + if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } @@ -1867,7 +1868,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PEEK_OFF: - if (!sock->ops->set_peek_off) + if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); diff --git a/net/core/xdp.c b/net/core/xdp.c index 8362130bf085..a70670fe9a2d 100644 --- 
a/net/core/xdp.c +++ b/net/core/xdp.c @@ -14,7 +14,7 @@ #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/bug.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index d76c9be5bfca..57d9c026aa3f 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -105,7 +105,6 @@ extern int sysctl_dccp_rx_ccid; extern int sysctl_dccp_tx_ccid; int dccp_feat_init(struct sock *sk); -void dccp_feat_initialise_sysctls(void); int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a545ad71201c..1591b061105a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -247,7 +247,6 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) const u8 offset = iph->ihl << 2; const struct dccp_hdr *dh; struct dccp_sock *dp; - struct inet_sock *inet; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct sock *sk; @@ -361,8 +360,7 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) * --ANK (980905) */ - inet = inet_sk(sk); - if (!sock_owned_by_user(sk) && inet->recverr) { + if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ @@ -474,7 +472,8 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, .flowi4_oif = inet_iif(skb), .daddr = iph->saddr, .saddr = iph->daddr, - .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_tos = ip_sock_rt_tos(sk), + .flowi4_scope = ip_sock_rt_scope(sk), .flowi4_proto = sk->sk_protocol, .fl4_sport = dccp_hdr(skb)->dccph_dport, .fl4_dport = dccp_hdr(skb)->dccph_sport, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d29d1163203d..686090bc5945 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1056,6 +1056,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), + .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h index 7e4c2a3b322b..c5d14c48def1 100644 --- a/net/dccp/ipv6.h +++ b/net/dccp/ipv6.h @@ -13,10 +13,6 @@ struct dccp6_sock { struct dccp_sock dccp; - /* - * ipv6_pinfo has to be the last member of dccp6_sock, - * see inet6_sk_generic. 
- */ struct ipv6_pinfo inet6; }; diff --git a/net/devlink/Makefile b/net/devlink/Makefile index ef91a76646a3..a087af581847 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := leftover.o core.o netlink.o dev.o health.o +obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o health.o diff --git a/net/devlink/dev.c b/net/devlink/dev.c index bf1d6f1bcfc7..abf3393a7a17 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -196,7 +196,7 @@ void devlink_notify(struct devlink *devlink, enum devlink_command cmd) msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; @@ -217,17 +217,18 @@ int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) } static int -devlink_nl_cmd_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) { return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); } -const struct devlink_cmd devl_cmd_get = { - .dump_one = devlink_nl_cmd_get_dump_one, -}; +int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one); +} static void devlink_reload_failed_set(struct devlink *devlink, bool reload_failed) @@ -804,7 +805,7 @@ err_cancel_msg: return err; } -int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info) +int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; @@ -826,23 +827,24 @@ int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info) } static int -devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) { int err; err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh->nlmsg_seq, flags, cb->extack); if (err == -EOPNOTSUPP) err = 0; return err; } -const struct devlink_cmd devl_cmd_info_get = { - .dump_one = devlink_nl_cmd_info_get_dump_one, -}; +int devlink_nl_info_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(msg, cb, devlink_nl_info_get_dump_one); +} static int devlink_nl_flash_update_fill(struct sk_buff *msg, struct devlink *devlink, @@ -1204,8 +1206,7 @@ err_cancel_msg: return err; } -int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; @@ -1228,23 +1229,25 @@ int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int -devlink_nl_cmd_selftests_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_selftests_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { if (!devlink->ops->selftest_check) return 0; return 
devlink_nl_selftests_fill(msg, devlink, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh->nlmsg_seq, flags, cb->extack); } -const struct devlink_cmd devl_cmd_selftests_get = { - .dump_one = devlink_nl_cmd_selftests_get_dump_one, -}; +int devlink_nl_selftests_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_selftests_get_dump_one); +} static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, enum devlink_selftest_status test_status) diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 62921b2eb0d3..eb1d5066c73f 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -12,6 +12,8 @@ #include <net/devlink.h> #include <net/net_namespace.h> +#include "netlink_gen.h" + #define DEVLINK_REGISTERED XA_MARK_1 #define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ @@ -90,9 +92,6 @@ static inline bool devl_is_registered(struct devlink *devlink) /* Netlink */ #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) -#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) -#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) -#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) enum devlink_multicast_groups { DEVLINK_MCGRP_CONFIG, @@ -114,12 +113,12 @@ struct devlink_nl_dump_state { }; }; -struct devlink_cmd { - int (*dump_one)(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb); -}; +typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags); -extern const struct genl_small_ops devlink_nl_ops[56]; +extern const struct genl_small_ops devlink_nl_small_ops[40]; struct devlink * devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); @@ -127,8 +126,8 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); void devlink_notify_unregister(struct devlink *devlink); void devlink_notify_register(struct devlink *devlink); -int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, - struct netlink_callback *cb); +int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, + devlink_nl_dump_one_func_t *dump_one); static inline struct devlink_nl_dump_state * devlink_dump_state(struct netlink_callback *cb) @@ -148,24 +147,6 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } -/* Commands */ -extern const struct devlink_cmd devl_cmd_get; -extern const struct devlink_cmd devl_cmd_port_get; -extern const struct devlink_cmd devl_cmd_sb_get; -extern const struct devlink_cmd devl_cmd_sb_pool_get; -extern const struct devlink_cmd devl_cmd_sb_port_pool_get; -extern const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get; -extern const struct devlink_cmd devl_cmd_param_get; -extern const struct devlink_cmd devl_cmd_region_get; -extern const struct devlink_cmd devl_cmd_info_get; -extern const struct devlink_cmd devl_cmd_health_reporter_get; -extern const struct devlink_cmd devl_cmd_trap_get; -extern const struct devlink_cmd devl_cmd_trap_group_get; -extern const struct devlink_cmd devl_cmd_trap_policer_get; -extern const struct devlink_cmd devl_cmd_rate_get; -extern const struct devlink_cmd devl_cmd_linecard_get; -extern const struct devlink_cmd devl_cmd_selftests_get; - /* Notify */ void devlink_notify(struct devlink *devlink, enum devlink_command cmd); @@ -199,31 +180,16 @@ int devlink_resources_validate(struct devlink *devlink, struct devlink_resource *resource, struct genl_info *info); -/* Line cards */ -struct 
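A note on the devl_internal.h hunk above: the struct devlink_cmd table, which was indexed by command number at run time, is replaced by the devlink_nl_dump_one_func_t typedef. Each per-command dumpit becomes a thin wrapper that passes its dump_one callback to a shared iterator directly, so the command-to-callback binding happens at compile time. A hedged sketch of that shape, with an illustrative instance list in place of the devlink xarray:

#include <stddef.h>
#include <stdio.h>

struct instance { int id; };

typedef int dump_one_func_t(struct instance *inst, int flags);

static struct instance instances[] = { { 1 }, { 2 }, { 3 } };

/* Shared iterator: walks every instance and applies the callback,
 * mirroring what devlink_nl_dumpit() does for registered devlinks. */
static int dumpit(dump_one_func_t *dump_one, int flags)
{
    for (size_t i = 0; i < sizeof(instances) / sizeof(instances[0]); i++) {
        int err = dump_one(&instances[i], flags);
        if (err)
            return err;
    }
    return 0;
}

static int port_dump_one(struct instance *inst, int flags)
{
    printf("port dump: instance %d (flags %#x)\n", inst->id, flags);
    return 0;
}

/* Thin per-command wrapper, like devlink_nl_port_get_dumpit() above. */
static int port_get_dumpit(void)
{
    return dumpit(port_dump_one, 0x2 /* NLM_F_MULTI */);
}

int main(void) { return port_get_dumpit(); }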
devlink_linecard; - -struct devlink_linecard * -devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info); - /* Rates */ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack); -struct devlink_rate * -devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info); -struct devlink_rate * -devlink_rate_node_get_from_info(struct devlink *devlink, - struct genl_info *info); + /* Devlink nl cmds */ -int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, diff --git a/net/devlink/health.c b/net/devlink/health.c index 194340a8bb86..638cad8d5c65 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -356,8 +356,8 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, return devlink_health_reporter_get_from_attrs(devlink, info->attrs); } -int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; @@ -384,18 +384,29 @@ int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int -devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); + const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; + unsigned long port_index_end = ULONG_MAX; + struct nlattr **attrs = info->attrs; + unsigned long port_index_start = 0; struct devlink_port *port; unsigned long port_index; int idx = 0; int err; + if (attrs && attrs[DEVLINK_ATTR_PORT_INDEX]) { + port_index_start = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + port_index_end = port_index_start; + flags |= NLM_F_DUMP_FILTERED; + goto per_port_dump; + } + list_for_each_entry(reporter, &devlink->reporter_list, list) { if (idx < state->idx) { idx++; @@ -405,14 +416,16 @@ devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, DEVLINK_CMD_HEALTH_REPORTER_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); + flags); if (err) { state->idx = idx; return err; } idx++; } - xa_for_each(&devlink->ports, port_index, port) { +per_port_dump: + xa_for_each_range(&devlink->ports, port_index, port, + port_index_start, port_index_end) { list_for_each_entry(reporter, &port->reporter_list, list) { if (idx < state->idx) { idx++; @@ -422,7 +435,7 @@ 
devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, DEVLINK_CMD_HEALTH_REPORTER_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); + flags); if (err) { state->idx = idx; return err; @@ -434,9 +447,12 @@ devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, return 0; } -const struct devlink_cmd devl_cmd_health_reporter_get = { - .dump_one = devlink_nl_cmd_health_reporter_get_dump_one, -}; +int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, + devlink_nl_health_reporter_get_dump_one); +} int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info) @@ -1248,7 +1264,7 @@ out: static struct devlink_health_reporter * devlink_health_reporter_get_from_cb(struct netlink_callback *cb) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; struct nlattr **attrs = info->attrs; struct devlink *devlink; diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index bfed7929a904..e2cd13958cc2 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -232,13 +232,13 @@ devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return devlink_rate_node_get_by_name(devlink, rate_node_name); } -struct devlink_rate * +static struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } -struct devlink_rate * +static struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; @@ -285,7 +285,7 @@ devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return ERR_PTR(-EINVAL); } -struct devlink_linecard * +static struct devlink_linecard * devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_linecard_get_from_attrs(devlink, info->attrs); @@ -1005,8 +1005,8 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, } static int -devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; @@ -1022,8 +1022,7 @@ devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, continue; } err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, NULL); + cb->nlh->nlmsg_seq, flags, NULL); if (err) { state->idx = idx; break; @@ -1034,17 +1033,22 @@ devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_rate_get = { - .dump_one = devlink_nl_cmd_rate_get_dump_one, -}; +int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); +} -static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_rate *devlink_rate = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *devlink_rate; struct sk_buff *msg; int err; + devlink_rate = 
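A note on the health.c dump above: when the request carries DEVLINK_ATTR_PORT_INDEX, the per-port walk collapses from xa_for_each() to xa_for_each_range() over a single index, and replies are tagged NLM_F_DUMP_FILTERED. A minimal model of that start/end clamp follows, using a plain array instead of an xarray; the names are illustrative.

#include <limits.h>
#include <stdio.h>

#define NLM_F_DUMP_FILTERED 0x20 /* value from include/uapi/linux/netlink.h */

static const char *ports[] = { "p0", "p1", "p2", "p3" };

/* Walk [start, end]: an unfiltered dump uses 0..ULONG_MAX, a filtered one
 * sets start == end == the requested index, as the hunk above does. */
static void dump_reporters(unsigned long start, unsigned long end, int flags)
{
    for (unsigned long i = start;
         i < sizeof(ports) / sizeof(ports[0]) && i <= end; i++)
        printf("reporter on %s (flags %#x)\n", ports[i], flags);
}

int main(void)
{
    int have_port_attr = 1, port_index = 2, flags = 0;
    unsigned long start = 0, end = ULONG_MAX;

    if (have_port_attr) {
        start = end = (unsigned long)port_index;
        flags |= NLM_F_DUMP_FILTERED;
    }
    dump_reporters(start, end, flags);
    return 0;
}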
devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) + return PTR_ERR(devlink_rate); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -1072,8 +1076,7 @@ devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, return false; } -static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct sk_buff *msg; @@ -1095,8 +1098,8 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, } static int -devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; @@ -1107,8 +1110,8 @@ devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack); + cb->nlh->nlmsg_seq, flags, + cb->extack); if (err) { state->idx = port_index; break; @@ -1118,9 +1121,10 @@ devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_port_get = { - .dump_one = devlink_nl_cmd_port_get_dump_one, -}; +int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); +} static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) @@ -1629,11 +1633,16 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_rate *devlink_rate = info->user_ptr[1]; - struct devlink *devlink = devlink_rate->devlink; - const struct devlink_ops *ops = devlink->ops; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *devlink_rate; + const struct devlink_ops *ops; int err; + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) + return PTR_ERR(devlink_rate); + + ops = devlink->ops; if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) return -EOPNOTSUPP; @@ -1704,18 +1713,22 @@ err_strdup: static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_rate *rate_node = info->user_ptr[1]; - struct devlink *devlink = rate_node->devlink; - const struct devlink_ops *ops = devlink->ops; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *rate_node; int err; + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) + return PTR_ERR(rate_node); + if (refcount_read(&rate_node->refcnt) > 1) { NL_SET_ERR_MSG(info->extack, "Node has children. 
Cannot delete node."); return -EBUSY; } devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); - err = ops->rate_node_del(rate_node, rate_node->priv, info->extack); + err = devlink->ops->rate_node_del(rate_node, rate_node->priv, + info->extack); if (rate_node->parent) refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); @@ -1811,14 +1824,17 @@ static void devlink_linecard_notify(struct devlink_linecard *linecard, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_linecard *linecard = info->user_ptr[1]; - struct devlink *devlink = linecard->devlink; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_linecard *linecard; struct sk_buff *msg; int err; + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -1837,9 +1853,10 @@ static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_linecard *linecard; @@ -1855,8 +1872,7 @@ static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg, err = devlink_nl_linecard_fill(msg, devlink, linecard, DEVLINK_CMD_LINECARD_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, + cb->nlh->nlmsg_seq, flags, cb->extack); mutex_unlock(&linecard->state_lock); if (err) { @@ -1869,9 +1885,11 @@ static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_linecard_get = { - .dump_one = devlink_nl_cmd_linecard_get_dump_one, -}; +int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one); +} static struct devlink_linecard_type * devlink_linecard_type_lookup(struct devlink_linecard *linecard, @@ -2008,10 +2026,15 @@ out: static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_linecard *linecard = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_linecard *linecard; int err; + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); + if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { const char *type; @@ -2068,8 +2091,7 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_sb *devlink_sb; @@ -2096,8 +2118,8 @@ static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, } static int -devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb 
*devlink_sb; @@ -2112,8 +2134,7 @@ devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_sb_fill(msg, devlink, devlink_sb, DEVLINK_CMD_SB_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; @@ -2124,9 +2145,10 @@ devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_sb_get = { - .dump_one = devlink_nl_cmd_sb_get_dump_one, -}; +int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_get_dump_one); +} static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, @@ -2171,8 +2193,7 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_sb *devlink_sb; @@ -2210,7 +2231,7 @@ static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb, static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, - u32 portid, u32 seq) + u32 portid, u32 seq, int flags) { u16 pool_count = devlink_sb_pool_count(devlink_sb); u16 pool_index; @@ -2225,7 +2246,7 @@ static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, devlink_sb, pool_index, DEVLINK_CMD_SB_POOL_NEW, - portid, seq, NLM_F_MULTI); + portid, seq, flags); if (err) return err; (*p_idx)++; @@ -2234,9 +2255,8 @@ static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, } static int -devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2250,7 +2270,7 @@ devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg, err = __sb_pool_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -2262,9 +2282,11 @@ devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_sb_pool_get = { - .dump_one = devlink_nl_cmd_sb_pool_get_dump_one, -}; +int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_pool_get_dump_one); +} static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, @@ -2371,8 +2393,8 @@ sb_occ_get_failure: return err; } -static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; @@ -2412,7 +2434,7 @@ static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, - u32 portid, u32 seq) + u32 portid, u32 seq, int flags) { struct devlink_port *devlink_port; u16 pool_count = 
devlink_sb_pool_count(devlink_sb); @@ -2431,8 +2453,7 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, devlink_sb, pool_index, DEVLINK_CMD_SB_PORT_POOL_NEW, - portid, seq, - NLM_F_MULTI); + portid, seq, flags); if (err) return err; (*p_idx)++; @@ -2442,9 +2463,9 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, } static int -devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2458,7 +2479,7 @@ devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg, err = __sb_port_pool_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -2470,9 +2491,11 @@ devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_sb_port_pool_get = { - .dump_one = devlink_nl_cmd_sb_port_pool_get_dump_one, -}; +int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_port_pool_get_dump_one); +} static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, @@ -2580,8 +2603,8 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; @@ -2628,7 +2651,7 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, - u32 portid, u32 seq) + u32 portid, u32 seq, int flags) { struct devlink_port *devlink_port; unsigned long port_index; @@ -2649,7 +2672,7 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, DEVLINK_SB_POOL_TYPE_INGRESS, DEVLINK_CMD_SB_TC_POOL_BIND_NEW, portid, seq, - NLM_F_MULTI); + flags); if (err) return err; (*p_idx)++; @@ -2667,7 +2690,7 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, DEVLINK_SB_POOL_TYPE_EGRESS, DEVLINK_CMD_SB_TC_POOL_BIND_NEW, portid, seq, - NLM_F_MULTI); + flags); if (err) return err; (*p_idx)++; @@ -2676,10 +2699,10 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, return 0; } -static int -devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2693,7 +2716,7 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -2705,9 +2728,12 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get = { - .dump_one = 
devlink_nl_cmd_sb_tc_pool_bind_get_dump_one, -}; +int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, + devlink_nl_sb_tc_pool_bind_get_dump_one); +} static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, @@ -3946,7 +3972,7 @@ static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->get || devlink->reload_failed) + if (!param->get) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } @@ -3955,7 +3981,7 @@ static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->set || devlink->reload_failed) + if (!param->set) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); } @@ -4155,9 +4181,10 @@ static void devlink_param_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int -devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_param_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; @@ -4168,8 +4195,7 @@ devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -4181,9 +4207,11 @@ devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_param_get = { - .dump_one = devlink_nl_cmd_param_get_dump_one, -}; +int devlink_nl_param_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one); +} static int devlink_param_type_get_from_info(struct genl_info *info, @@ -4272,8 +4300,8 @@ devlink_param_get_from_info(struct xarray *params, struct genl_info *info) return devlink_param_find_by_name(params, param_name); } -static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_param_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_param_item *param_item; @@ -4770,8 +4798,7 @@ static void devlink_region_snapshot_del(struct devlink_region *region, kfree(snapshot); } -static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_port *port = NULL; @@ -4819,8 +4846,7 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, struct netlink_callback *cb, struct devlink_port *port, - int *idx, - int start) + int *idx, int start, int flags) { struct devlink_region *region; int err = 0; @@ -4834,7 +4860,7 @@ static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, region); + flags, region); if (err) goto out; (*idx)++; @@ -4844,9 +4870,10 @@ out: return err; } -static int 
-devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_region_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; @@ -4863,8 +4890,8 @@ devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, region); + cb->nlh->nlmsg_seq, flags, + region); if (err) { state->idx = idx; return err; @@ -4874,7 +4901,7 @@ devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, xa_for_each(&devlink->ports, port_index, port) { err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, - state->idx); + state->idx, flags); if (err) { state->idx = idx; return err; @@ -4884,9 +4911,11 @@ devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return 0; } -const struct devlink_cmd devl_cmd_region_get = { - .dump_one = devlink_nl_cmd_region_get_dump_one, -}; +int devlink_nl_region_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one); +} static int devlink_nl_cmd_region_del(struct sk_buff *skb, struct genl_info *info) @@ -5172,7 +5201,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; - struct nlattr **attrs = info->attrs; + struct nlattr **attrs = info->info.attrs; struct devlink_port *port = NULL; devlink_chunk_fill_t *region_cb; struct devlink_region *region; @@ -5195,8 +5224,8 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) { @@ -5632,8 +5661,7 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_trap_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; @@ -5667,9 +5695,9 @@ err_trap_fill: return err; } -static int -devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; @@ -5684,8 +5712,7 @@ devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_trap_fill(msg, devlink, trap_item, DEVLINK_CMD_TRAP_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; @@ -5696,9 +5723,10 @@ devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_trap_get = { - .dump_one = devlink_nl_cmd_trap_get_dump_one, -}; +int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct 
netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one); +} static int __devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, @@ -5843,8 +5871,7 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_trap_group_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; @@ -5878,10 +5905,10 @@ err_trap_group_fill: return err; } -static int -devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; @@ -5897,8 +5924,7 @@ devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg, err = devlink_nl_trap_group_fill(msg, devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; @@ -5909,9 +5935,11 @@ devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_trap_group_get = { - .dump_one = devlink_nl_cmd_trap_group_get_dump_one, -}; +int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one); +} static int __devlink_trap_group_action_set(struct devlink *devlink, @@ -6137,8 +6165,8 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; @@ -6172,10 +6200,10 @@ err_trap_policer_fill: return err; } -static int -devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; @@ -6190,8 +6218,7 @@ devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg, err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; @@ -6202,9 +6229,11 @@ devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_trap_policer_get = { - .dump_one = devlink_nl_cmd_trap_policer_get_dump_one, -}; +int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one); +} static int devlink_trap_policer_set(struct devlink *devlink, @@ -6278,22 +6307,7 @@ static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, return devlink_trap_policer_set(devlink, policer_item, info); } -const struct genl_small_ops devlink_nl_ops[56] = { - { - .cmd = DEVLINK_CMD_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - 
.doit = devlink_nl_cmd_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_PORT_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, +const struct genl_small_ops devlink_nl_small_ops[40] = { { .cmd = DEVLINK_CMD_PORT_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6302,17 +6316,9 @@ const struct genl_small_ops devlink_nl_ops[56] = { .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { - .cmd = DEVLINK_CMD_RATE_GET, - .doit = devlink_nl_cmd_rate_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_RATE_SET, .doit = devlink_nl_cmd_rate_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, }, { .cmd = DEVLINK_CMD_RATE_NEW, @@ -6323,7 +6329,6 @@ const struct genl_small_ops devlink_nl_ops[56] = { .cmd = DEVLINK_CMD_RATE_DEL, .doit = devlink_nl_cmd_rate_del_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE, }, { .cmd = DEVLINK_CMD_PORT_SPLIT, @@ -6350,32 +6355,11 @@ const struct genl_small_ops devlink_nl_ops[56] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, - { - .cmd = DEVLINK_CMD_LINECARD_GET, - .doit = devlink_nl_cmd_linecard_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, - /* can be retrieved by unprivileged users */ - }, + { .cmd = DEVLINK_CMD_LINECARD_SET, .doit = devlink_nl_cmd_linecard_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, - }, - { - .cmd = DEVLINK_CMD_SB_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_SB_POOL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_pool_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_POOL_SET, @@ -6384,14 +6368,6 @@ const struct genl_small_ops devlink_nl_ops[56] = { .flags = GENL_ADMIN_PERM, }, { - .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_port_pool_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_set_doit, @@ -6399,14 +6375,6 @@ const struct genl_small_ops devlink_nl_ops[56] = { .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { - .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, @@ -6480,13 +6448,6 
@@ const struct genl_small_ops devlink_nl_ops[56] = { .flags = GENL_ADMIN_PERM, }, { - .cmd = DEVLINK_CMD_PARAM_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_param_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_PARAM_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_param_set_doit, @@ -6508,13 +6469,6 @@ const struct genl_small_ops devlink_nl_ops[56] = { .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { - .cmd = DEVLINK_CMD_REGION_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - .flags = GENL_ADMIN_PERM, - }, - { .cmd = DEVLINK_CMD_REGION_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_new, @@ -6534,21 +6488,6 @@ const struct genl_small_ops devlink_nl_ops[56] = { .flags = GENL_ADMIN_PERM, }, { - .cmd = DEVLINK_CMD_INFO_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_info_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_set_doit, @@ -6597,45 +6536,21 @@ const struct genl_small_ops devlink_nl_ops[56] = { .flags = GENL_ADMIN_PERM, }, { - .cmd = DEVLINK_CMD_TRAP_GET, - .doit = devlink_nl_cmd_trap_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_TRAP_SET, .doit = devlink_nl_cmd_trap_set_doit, .flags = GENL_ADMIN_PERM, }, { - .cmd = DEVLINK_CMD_TRAP_GROUP_GET, - .doit = devlink_nl_cmd_trap_group_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_TRAP_GROUP_SET, .doit = devlink_nl_cmd_trap_group_set_doit, .flags = GENL_ADMIN_PERM, }, { - .cmd = DEVLINK_CMD_TRAP_POLICER_GET, - .doit = devlink_nl_cmd_trap_policer_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_TRAP_POLICER_SET, .doit = devlink_nl_cmd_trap_policer_set_doit, .flags = GENL_ADMIN_PERM, }, { - .cmd = DEVLINK_CMD_SELFTESTS_GET, - .doit = devlink_nl_cmd_selftests_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_SELFTESTS_RUN, .doit = devlink_nl_cmd_selftests_run, .flags = GENL_ADMIN_PERM, @@ -6846,8 +6761,10 @@ int devl_port_register_with_ops(struct devlink *devlink, spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); - if (err) + if (err) { + devlink_port->registered = false; return err; + } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 7a332eb70f70..72a5005a64cd 100644 --- 
a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -109,10 +109,9 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) return ERR_PTR(-ENODEV); } -static int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) +static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info, + u8 flags) { - struct devlink_linecard *linecard; struct devlink_port *devlink_port; struct devlink *devlink; int err; @@ -122,42 +121,17 @@ static int devlink_nl_pre_doit(const struct genl_split_ops *ops, return PTR_ERR(devlink); info->user_ptr[0] = devlink; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + if (flags & DEVLINK_NL_FLAG_NEED_PORT) { devlink_port = devlink_port_get_from_info(devlink, info); if (IS_ERR(devlink_port)) { err = PTR_ERR(devlink_port); goto unlock; } info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { + } else if (flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { devlink_port = devlink_port_get_from_info(devlink, info); if (!IS_ERR(devlink_port)) info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { - struct devlink_rate *devlink_rate; - - devlink_rate = devlink_rate_get_from_info(devlink, info); - if (IS_ERR(devlink_rate)) { - err = PTR_ERR(devlink_rate); - goto unlock; - } - info->user_ptr[1] = devlink_rate; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { - struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_from_info(devlink, info); - if (IS_ERR(rate_node)) { - err = PTR_ERR(rate_node); - goto unlock; - } - info->user_ptr[1] = rate_node; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) { - err = PTR_ERR(linecard); - goto unlock; - } - info->user_ptr[1] = linecard; } return 0; @@ -167,8 +141,27 @@ unlock: return err; } -static void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) +int devlink_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return __devlink_nl_pre_doit(skb, info, ops->internal_flags); +} + +int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_PORT); +} + +int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info) +{ + return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT); +} + +void devlink_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink; @@ -177,42 +170,41 @@ static void devlink_nl_post_doit(const struct genl_split_ops *ops, devlink_put(devlink); } -static const struct devlink_cmd *devl_cmds[] = { - [DEVLINK_CMD_GET] = &devl_cmd_get, - [DEVLINK_CMD_PORT_GET] = &devl_cmd_port_get, - [DEVLINK_CMD_SB_GET] = &devl_cmd_sb_get, - [DEVLINK_CMD_SB_POOL_GET] = &devl_cmd_sb_pool_get, - [DEVLINK_CMD_SB_PORT_POOL_GET] = &devl_cmd_sb_port_pool_get, - [DEVLINK_CMD_SB_TC_POOL_BIND_GET] = &devl_cmd_sb_tc_pool_bind_get, - [DEVLINK_CMD_PARAM_GET] = &devl_cmd_param_get, - [DEVLINK_CMD_REGION_GET] = &devl_cmd_region_get, - [DEVLINK_CMD_INFO_GET] = &devl_cmd_info_get, - [DEVLINK_CMD_HEALTH_REPORTER_GET] = &devl_cmd_health_reporter_get, - [DEVLINK_CMD_TRAP_GET] = &devl_cmd_trap_get, - 
[DEVLINK_CMD_TRAP_GROUP_GET] = &devl_cmd_trap_group_get, - [DEVLINK_CMD_TRAP_POLICER_GET] = &devl_cmd_trap_policer_get, - [DEVLINK_CMD_RATE_GET] = &devl_cmd_rate_get, - [DEVLINK_CMD_LINECARD_GET] = &devl_cmd_linecard_get, - [DEVLINK_CMD_SELFTESTS_GET] = &devl_cmd_selftests_get, -}; +static int devlink_nl_inst_single_dumpit(struct sk_buff *msg, + struct netlink_callback *cb, int flags, + devlink_nl_dump_one_func_t *dump_one, + struct nlattr **attrs) +{ + struct devlink *devlink; + int err; + + devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs); + if (IS_ERR(devlink)) + return PTR_ERR(devlink); + err = dump_one(msg, devlink, cb, flags | NLM_F_DUMP_FILTERED); + + devl_unlock(devlink); + devlink_put(devlink); + + if (err != -EMSGSIZE) + return err; + return msg->len; +} -int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int devlink_nl_inst_iter_dumpit(struct sk_buff *msg, + struct netlink_callback *cb, int flags, + devlink_nl_dump_one_func_t *dump_one) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_nl_dump_state *state = devlink_dump_state(cb); - const struct devlink_cmd *cmd; struct devlink *devlink; int err = 0; - cmd = devl_cmds[info->op.cmd]; - while ((devlink = devlinks_xa_find_get(sock_net(msg->sk), &state->instance))) { devl_lock(devlink); if (devl_is_registered(devlink)) - err = cmd->dump_one(msg, devlink, cb); + err = dump_one(msg, devlink, cb, flags); else err = 0; @@ -233,6 +225,21 @@ int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, return msg->len; } +int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, + devlink_nl_dump_one_func_t *dump_one) +{ + const struct genl_info *info = genl_info_dump(cb); + struct nlattr **attrs = info->attrs; + int flags = NLM_F_MULTI; + + if (attrs && + (attrs[DEVLINK_ATTR_BUS_NAME] || attrs[DEVLINK_ATTR_DEV_NAME])) + return devlink_nl_inst_single_dumpit(msg, cb, flags, dump_one, + attrs); + else + return devlink_nl_inst_iter_dumpit(msg, cb, flags, dump_one); +} + struct genl_family devlink_nl_family __ro_after_init = { .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, @@ -243,8 +250,10 @@ struct genl_family devlink_nl_family __ro_after_init = { .pre_doit = devlink_nl_pre_doit, .post_doit = devlink_nl_post_doit, .module = THIS_MODULE, - .small_ops = devlink_nl_ops, - .n_small_ops = ARRAY_SIZE(devlink_nl_ops), + .small_ops = devlink_nl_small_ops, + .n_small_ops = ARRAY_SIZE(devlink_nl_small_ops), + .split_ops = devlink_nl_ops, + .n_split_ops = ARRAY_SIZE(devlink_nl_ops), .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, .mcgrps = devlink_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c new file mode 100644 index 000000000000..467b7a431de1 --- /dev/null +++ b/net/devlink/netlink_gen.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/devlink.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netlink_gen.h" + +#include <uapi/linux/devlink.h> + +/* DEVLINK_CMD_GET - do */ +static const struct nla_policy devlink_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_PORT_GET - do */ +static const struct nla_policy 
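A note on devlink_nl_dumpit() above: it picks between two strategies. If the request names an instance via DEVLINK_ATTR_BUS_NAME/DEVLINK_ATTR_DEV_NAME, it locks just that devlink and adds NLM_F_DUMP_FILTERED; otherwise it iterates every registered instance. A small model of that dispatch, with the attribute test reduced to a string pair:

#include <stdio.h>
#include <string.h>

struct devlink { const char *bus, *dev; };

static struct devlink instances[] = {
    { "pci", "0000:01:00.0" },
    { "pci", "0000:02:00.0" },
};

static void dump_one(const struct devlink *d, int filtered)
{
    printf("%s/%s%s\n", d->bus, d->dev, filtered ? " (filtered)" : "");
}

/* If the caller named an instance, dump only it; otherwise walk them all,
 * mirroring devlink_nl_inst_single_dumpit() vs devlink_nl_inst_iter_dumpit(). */
static void dumpit(const char *bus, const char *dev)
{
    size_t n = sizeof(instances) / sizeof(instances[0]);

    for (size_t i = 0; i < n; i++) {
        if (bus && dev) {
            if (!strcmp(instances[i].bus, bus) &&
                !strcmp(instances[i].dev, dev))
                dump_one(&instances[i], 1);
        } else {
            dump_one(&instances[i], 0);
        }
    }
}

int main(void)
{
    dumpit(NULL, NULL);            /* unfiltered: all instances */
    dumpit("pci", "0000:02:00.0"); /* filtered: one instance */
    return 0;
}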
devlink_port_get_do_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_PORT_GET - dump */ +static const struct nla_policy devlink_port_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_SB_GET - do */ +static const struct nla_policy devlink_sb_get_do_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_SB_GET - dump */ +static const struct nla_policy devlink_sb_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_SB_POOL_GET - do */ +static const struct nla_policy devlink_sb_pool_get_do_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, }, +}; + +/* DEVLINK_CMD_SB_POOL_GET - dump */ +static const struct nla_policy devlink_sb_pool_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_SB_PORT_POOL_GET - do */ +static const struct nla_policy devlink_sb_port_pool_get_do_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, }, +}; + +/* DEVLINK_CMD_SB_PORT_POOL_GET - dump */ +static const struct nla_policy devlink_sb_port_pool_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - do */ +static const struct nla_policy devlink_sb_tc_pool_bind_get_do_nl_policy[DEVLINK_ATTR_SB_TC_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_POOL_TYPE] = NLA_POLICY_MAX(NLA_U8, 1), + [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16, }, +}; + +/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - dump */ +static const struct nla_policy devlink_sb_tc_pool_bind_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_PARAM_GET - do */ +static const struct nla_policy devlink_param_get_do_nl_policy[DEVLINK_ATTR_PARAM_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_PARAM_GET - dump */ +static const struct nla_policy devlink_param_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] 
= { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_REGION_GET - do */ +static const struct nla_policy devlink_region_get_do_nl_policy[DEVLINK_ATTR_REGION_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_REGION_GET - dump */ +static const struct nla_policy devlink_region_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_INFO_GET - do */ +static const struct nla_policy devlink_info_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_HEALTH_REPORTER_GET - do */ +static const struct nla_policy devlink_health_reporter_get_do_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_HEALTH_REPORTER_GET - dump */ +static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_TRAP_GET - do */ +static const struct nla_policy devlink_trap_get_do_nl_policy[DEVLINK_ATTR_TRAP_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_TRAP_GET - dump */ +static const struct nla_policy devlink_trap_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_TRAP_GROUP_GET - do */ +static const struct nla_policy devlink_trap_group_get_do_nl_policy[DEVLINK_ATTR_TRAP_GROUP_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_TRAP_GROUP_GET - dump */ +static const struct nla_policy devlink_trap_group_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_TRAP_POLICER_GET - do */ +static const struct nla_policy devlink_trap_policer_get_do_nl_policy[DEVLINK_ATTR_TRAP_POLICER_ID + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_TRAP_POLICER_GET - dump */ +static const struct nla_policy devlink_trap_policer_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_RATE_GET - do */ +static const struct nla_policy devlink_rate_get_do_nl_policy[DEVLINK_ATTR_RATE_NODE_NAME + 1] = { + 
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_RATE_GET - dump */ +static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_LINECARD_GET - do */ +static const struct nla_policy devlink_linecard_get_do_nl_policy[DEVLINK_ATTR_LINECARD_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_LINECARD_GET - dump */ +static const struct nla_policy devlink_linecard_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_SELFTESTS_GET - do */ +static const struct nla_policy devlink_selftests_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* Ops table for devlink */ +const struct genl_split_ops devlink_nl_ops[32] = { + { + .cmd = DEVLINK_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_get_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_PORT_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port, + .doit = devlink_nl_port_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_port_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_PORT_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_PORT_GET, + .dumpit = devlink_nl_port_get_dumpit, + .policy = devlink_port_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SB_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_sb_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_sb_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_SB_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SB_GET, + .dumpit = devlink_nl_sb_get_dumpit, + .policy = devlink_sb_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SB_POOL_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_sb_pool_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_sb_pool_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_SB_POOL_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SB_POOL_GET, + .dumpit = devlink_nl_sb_pool_get_dumpit, + .policy = devlink_sb_pool_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port, + .doit = devlink_nl_sb_port_pool_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_sb_port_pool_get_do_nl_policy, + .maxattr 
= DEVLINK_ATTR_SB_POOL_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, + .dumpit = devlink_nl_sb_port_pool_get_dumpit, + .policy = devlink_sb_port_pool_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port, + .doit = devlink_nl_sb_tc_pool_bind_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_sb_tc_pool_bind_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_SB_TC_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, + .dumpit = devlink_nl_sb_tc_pool_bind_get_dumpit, + .policy = devlink_sb_tc_pool_bind_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_PARAM_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_param_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_param_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_PARAM_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_PARAM_GET, + .dumpit = devlink_nl_param_get_dumpit, + .policy = devlink_param_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_REGION_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port_optional, + .doit = devlink_nl_region_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_region_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_REGION_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_REGION_GET, + .dumpit = devlink_nl_region_get_dumpit, + .policy = devlink_region_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_INFO_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_info_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_info_get_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_INFO_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_info_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port_optional, + .doit = devlink_nl_health_reporter_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_health_reporter_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, + .dumpit = devlink_nl_health_reporter_get_dumpit, + .policy = devlink_health_reporter_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_PORT_INDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_TRAP_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_trap_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_trap_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_TRAP_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_TRAP_GET, + .dumpit = devlink_nl_trap_get_dumpit, + .policy = devlink_trap_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_trap_group_get_doit, + .post_doit = 
devlink_nl_post_doit, + .policy = devlink_trap_group_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_TRAP_GROUP_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_GET, + .dumpit = devlink_nl_trap_group_get_dumpit, + .policy = devlink_trap_group_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_trap_policer_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_trap_policer_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_TRAP_POLICER_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_GET, + .dumpit = devlink_nl_trap_policer_get_dumpit, + .policy = devlink_trap_policer_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_RATE_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_rate_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_rate_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_RATE_NODE_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_RATE_GET, + .dumpit = devlink_nl_rate_get_dumpit, + .policy = devlink_rate_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_LINECARD_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_linecard_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_linecard_get_do_nl_policy, + .maxattr = DEVLINK_ATTR_LINECARD_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_LINECARD_GET, + .dumpit = devlink_nl_linecard_get_dumpit, + .policy = devlink_linecard_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SELFTESTS_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_selftests_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_selftests_get_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SELFTESTS_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_selftests_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +}; diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h new file mode 100644 index 000000000000..f8bbc93e39be --- /dev/null +++ b/net/devlink/netlink_gen.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/devlink.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_DEVLINK_GEN_H +#define _LINUX_DEVLINK_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/devlink.h> + +/* Ops table for devlink */ +extern const struct genl_split_ops devlink_nl_ops[32]; + +int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); +void +devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); +int 
devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_port_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_param_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_param_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_region_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_info_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_rate_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_selftests_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); + +#endif /* _LINUX_DEVLINK_GEN_H */ diff --git a/net/dsa/port.c b/net/dsa/port.c index 2f6195d7b741..37ab238e8304 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1568,27 +1568,6 @@ static void dsa_port_phylink_validate(struct phylink_config *config, phylink_generic_validate(config, supported, state); } -static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state) -{ - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - int err; - - /* Only called for inband modes */ - if (!ds->ops->phylink_mac_link_state) { - state->link = 0; - return; - } - - err = ds->ops->phylink_mac_link_state(ds, dp->index, state); - if (err < 0) { - dev_err(ds->dev, "p%d: phylink_mac_link_state() failed: %d\n", - dp->index, err); - state->link = 0; - } -} - static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, 
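The dsa/port.c hunk above begins removing the legacy pre-March-2020 phylink hooks (phylink_mac_link_state and phylink_mac_an_restart). On the driver side, the modern replacement is to declare capabilities up front through phylink_get_caps; a minimal hedged sketch, with a hypothetical function name and capability set:

	#include <linux/phylink.h>
	#include <net/dsa.h>

	static void example_phylink_get_caps(struct dsa_switch *ds, int port,
					     struct phylink_config *config)
	{
		/* advertise pause plus 10/100/1000 up front instead of
		 * reporting link state through the removed legacy hooks
		 */
		config->mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE |
					   MAC_10 | MAC_100 | MAC_1000FD;
		__set_bit(PHY_INTERFACE_MODE_RGMII,
			  config->supported_interfaces);
	}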
phy_interface_t interface) @@ -1646,17 +1625,6 @@ static int dsa_port_phylink_mac_finish(struct phylink_config *config, return err; } -static void dsa_port_phylink_mac_an_restart(struct phylink_config *config) -{ - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_an_restart) - return; - - ds->ops->phylink_mac_an_restart(ds, dp->index); -} - static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) @@ -1700,11 +1668,9 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, .mac_select_pcs = dsa_port_phylink_mac_select_pcs, - .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, .mac_prepare = dsa_port_phylink_mac_prepare, .mac_config = dsa_port_phylink_mac_config, .mac_finish = dsa_port_phylink_mac_finish, - .mac_an_restart = dsa_port_phylink_mac_an_restart, .mac_link_down = dsa_port_phylink_mac_link_down, .mac_link_up = dsa_port_phylink_mac_link_up, }; @@ -1720,21 +1686,18 @@ int dsa_port_phylink_create(struct dsa_port *dp) if (err) mode = PHY_INTERFACE_MODE_NA; - /* Presence of phylink_mac_link_state or phylink_mac_an_restart is - * an indicator of a legacy phylink driver. - */ - if (ds->ops->phylink_mac_link_state || - ds->ops->phylink_mac_an_restart) - dp->pl_config.legacy_pre_march2020 = true; - if (ds->ops->phylink_get_caps) { ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config); } else { /* For legacy drivers */ - __set_bit(PHY_INTERFACE_MODE_INTERNAL, - dp->pl_config.supported_interfaces); - __set_bit(PHY_INTERFACE_MODE_GMII, - dp->pl_config.supported_interfaces); + if (mode != PHY_INTERFACE_MODE_NA) { + __set_bit(mode, dp->pl_config.supported_interfaces); + } else { + __set_bit(PHY_INTERFACE_MODE_INTERNAL, + dp->pl_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_GMII, + dp->pl_config.supported_interfaces); + } } pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 527b1d576460..48db91b33390 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -21,6 +21,7 @@ #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> +#include <linux/string.h> #include "dsa.h" #include "port.h" @@ -1056,10 +1057,10 @@ static void dsa_slave_get_strings(struct net_device *dev, if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; - strncpy(data, "tx_packets", len); - strncpy(data + len, "tx_bytes", len); - strncpy(data + 2 * len, "rx_packets", len); - strncpy(data + 3 * len, "rx_bytes", len); + strscpy_pad(data, "tx_packets", len); + strscpy_pad(data + len, "tx_bytes", len); + strscpy_pad(data + 2 * len, "rx_packets", len); + strscpy_pad(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data + 4 * len); diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index e757c8de06f1..e5ff7c34e577 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -75,10 +75,6 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) return NULL; } - /* Remove QCA tag and recalculate checksum */ - skb_pull_rcsum(skb, QCA_HDR_LEN); - dsa_strip_etype_header(skb, QCA_HDR_LEN); - /* Get source port information */ port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr); @@ -86,6 +82,10 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) if (!skb->dev) 
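On the strncpy() to strscpy_pad() switch in dsa_slave_get_strings() above: each ETH_GSTRING_LEN slot is copied to userspace verbatim, so it must be NUL-terminated with no uninitialized tail bytes. A minimal sketch of the property being relied on (the helper name is illustrative):

	#include <linux/ethtool.h>
	#include <linux/string.h>

	static void example_fill_slot(char *data)
	{
		/* strscpy_pad() always NUL-terminates and zeroes the rest of
		 * the slot; strncpy() zero-pads a short source but does not
		 * terminate one that fills the buffer exactly.
		 */
		strscpy_pad(data, "tx_packets", ETH_GSTRING_LEN);
	}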
return NULL; + /* Remove QCA tag and recalculate checksum */ + skb_pull_rcsum(skb, QCA_HDR_LEN); + dsa_strip_etype_header(skb, QCA_HDR_LEN); + return skb; } diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 61c40e889a4d..7b4bbd674bae 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -24,7 +24,7 @@ const struct nla_policy ethnl_channels_get_policy[] = { static int channels_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct channels_reply_data *data = CHANNELS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 01a59ce211c8..83112c1a71ae 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -59,10 +59,9 @@ const struct nla_policy ethnl_coalesce_get_policy[] = { static int coalesce_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); - struct netlink_ext_ack *extack = info ? info->extack : NULL; struct net_device *dev = reply_base->dev; int ret; @@ -73,7 +72,8 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce, - &data->kernel_coalesce, extack); + &data->kernel_coalesce, + info->extack); ethnl_ops_complete(dev); return ret; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 5fb19050991e..f5598c5f50de 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -665,9 +665,8 @@ const struct ethtool_phy_ops *ethtool_phy_ops; void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) { - rtnl_lock(); + ASSERT_RTNL(); ethtool_phy_ops = ops; - rtnl_unlock(); } EXPORT_SYMBOL_GPL(ethtool_set_ethtool_phy_ops); diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index e4369769817e..0b2dea56d461 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -23,7 +23,7 @@ const struct nla_policy ethnl_debug_get_policy[] = { static int debug_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct debug_reply_data *data = DEBUG_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c index 42104bcb0e47..2853394d06a8 100644 --- a/net/ethtool/eee.c +++ b/net/ethtool/eee.c @@ -26,7 +26,7 @@ const struct nla_policy ethnl_eee_get_policy[] = { static int eee_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct eee_reply_data *data = EEE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 49c0a2a77f02..6209c3a9c8f7 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -51,8 +51,7 @@ static int fallback_set_params(struct eeprom_req_info *request, } static int eeprom_fallback(struct eeprom_req_info *request, - struct eeprom_reply_data *reply, - struct genl_info *info) + struct eeprom_reply_data *reply) { struct net_device *dev = reply->base.dev; struct ethtool_modinfo modinfo = {0}; @@ -103,7 +102,7 @@ static int get_module_eeprom_by_page(struct net_device *dev, static int eeprom_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data 
*reply_base, - struct genl_info *info) + const struct genl_info *info) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); @@ -124,7 +123,7 @@ static int eeprom_prepare_data(const struct ethnl_req_info *req_base, if (ret) goto err_free; - ret = get_module_eeprom_by_page(dev, &page_data, info ? info->extack : NULL); + ret = get_module_eeprom_by_page(dev, &page_data, info->extack); if (ret < 0) goto err_ops; @@ -140,7 +139,7 @@ err_free: kfree(page_data.data); if (ret == -EOPNOTSUPP) - return eeprom_fallback(request, reply, info); + return eeprom_fallback(request, reply); return ret; } diff --git a/net/ethtool/features.c b/net/ethtool/features.c index 55d449a2d3fc..a79af8c25a07 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -35,7 +35,7 @@ static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src) static int features_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct features_reply_data *data = FEATURES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index 0d9a3d153170..e7d3f2c352a3 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -92,7 +92,7 @@ fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats) static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {}; struct fec_reply_data *data = FEC_REPDATA(reply_base); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 4a51e0ec295c..0b0ce4f81c01 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -907,6 +907,38 @@ static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, return 0; } +static int ethtool_rxnfc_copy_struct(u32 cmd, struct ethtool_rxnfc *info, + size_t *info_size, void __user *useraddr) +{ + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. + */ + if (cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) + *info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info->data)); + + if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) + return -EFAULT; + + if ((cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) && info->flow_type & FLOW_RSS) { + *info_size = sizeof(*info); + if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) + return -EFAULT; + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. + */ + if (!(info->flow_type & FLOW_RSS)) + return -EINVAL; + } + + if (info->cmd != cmd) + return -EINVAL; + + return 0; +} + static int ethtool_rxnfc_copy_to_user(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) @@ -944,16 +976,9 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. 
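The new ethtool_rxnfc_copy_struct() helper above consolidates the legacy-layout handling that was previously duplicated between the get and set paths. For reference, this is the truncated layout old ETHTOOL_{G,S}RXFH callers may still pass (the struct name is illustrative, not uapi):

	#include <linux/types.h>

	/* Legacy userspace may supply only this prefix, i.e.
	 * offsetof(struct ethtool_rxnfc, data) + sizeof(data) bytes;
	 * the kernel re-reads the full struct only when FLOW_RSS marks
	 * the caller as using the modern definition.
	 */
	struct example_legacy_rxfh {
		__u32	cmd;		/* ETHTOOL_GRXFH or ETHTOOL_SRXFH */
		__u32	flow_type;
		__u64	data;
	};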
*/ - if (cmd == ETHTOOL_SRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; + rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (rc) + return rc; rc = dev->ethtool_ops->set_rxnfc(dev, &info); if (rc) @@ -978,33 +1003,9 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (!ops->get_rxnfc) return -EOPNOTSUPP; - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_GRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - - /* If FLOW_RSS was requested then user-space must be using the - * new definition, as FLOW_RSS is newer. - */ - if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { - info_size = sizeof(info); - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - /* Since malicious users may modify the original data, - * we need to check whether FLOW_RSS is still requested. - */ - if (!(info.flow_type & FLOW_RSS)) - return -EINVAL; - } - - if (info.cmd != cmd) - return -EINVAL; + ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (ret) + return ret; if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { @@ -3207,7 +3208,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v4_m_spec->ip4src || v4_m_spec->ip4dst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv4); } @@ -3222,7 +3223,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v4_m_spec->psrc || v4_m_spec->pdst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_PORTS); + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } @@ -3259,7 +3260,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) || !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv6); } @@ -3274,7 +3275,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v6_m_spec->psrc || v6_m_spec->pdst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_PORTS); + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } @@ -3282,7 +3283,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) match->key.ip.tos = v6_spec->tclass; match->mask.ip.tos = v6_m_spec->tclass; match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IP); + BIT_ULL(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } @@ -3306,7 +3307,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) break; } - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.used_keys |= 
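The BIT() to BIT_ULL() conversions running through ethtool_rx_flow_rule_create() track the flow dissector's used_keys field growing to 64 bits: BIT(nr) expands to 1UL << nr, which is only 32 bits wide on 32-bit architectures, so key ids above bit 31 would be truncated. A sketch of the pattern:

	#include <linux/bits.h>
	#include <net/flow_dissector.h>

	static u64 example_used_keys(void)
	{
		u64 used_keys = 0;

		/* 1ULL << nr stays correct for key ids past bit 31 */
		used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC);
		return used_keys;
	}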
BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = offsetof(struct ethtool_rx_flow_key, basic); @@ -3339,7 +3340,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (ext_m_spec->vlan_etype || ext_m_spec->vlan_tci) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_VLAN); + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct ethtool_rx_flow_key, vlan); } @@ -3354,7 +3355,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) ETH_ALEN); match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = offsetof(struct ethtool_rx_flow_key, eth_addrs); } diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 310dfe63292a..5c317d23787b 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -23,7 +23,7 @@ const struct nla_policy ethnl_linkinfo_get_policy[] = { static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 20165e07ef90..b2591db49f7d 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -27,7 +27,7 @@ const struct nla_policy ethnl_linkmodes_get_policy[] = { static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 2158c17a0b32..b2de2108b356 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -81,7 +81,7 @@ static int linkstate_get_link_ext_state(struct net_device *dev, static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c index 4058a557b5a4..2816bb23c3ad 100644 --- a/net/ethtool/mm.c +++ b/net/ethtool/mm.c @@ -27,7 +27,7 @@ const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1] = { static int mm_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct mm_reply_data *data = MM_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/module.c b/net/ethtool/module.c index e0d539b21423..ceb575efc290 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -38,10 +38,9 @@ static int module_get_power_mode(struct net_device *dev, static int module_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct module_reply_data *data = MODULE_REPDATA(reply_base); - struct netlink_ext_ack *extack = info ? 
info->extack : NULL; struct net_device *dev = reply_base->dev; int ret; @@ -49,7 +48,7 @@ static int module_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; - ret = module_get_power_mode(dev, data, extack); + ret = module_get_power_mode(dev, data, info->extack); if (ret < 0) goto out_complete; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 39a459b0111b..3bbd5afb7b31 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -252,8 +252,7 @@ int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) * @ops: request ops of currently processed message type * @req_info: parsed request header of processed request * @reply_data: data needed to compose the reply - * @pos_hash: saved iteration position - hashbucket - * @pos_idx: saved iteration position - index + * @pos_ifindex: saved iteration position - ifindex * * These parameters are kept in struct netlink_callback as context preserved * between iterations. They are initialized by ethnl_default_start() and used @@ -263,8 +262,7 @@ struct ethnl_dump_ctx { const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct ethnl_reply_data *reply_data; - int pos_hash; - int pos_idx; + unsigned long pos_ifindex; }; static const struct ethnl_request_ops * @@ -318,10 +316,8 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into - * @tb: parsed attributes - * @net: request netns + * @info: genl_info from the request * @request_ops: struct request_ops for request type - * @extack: netlink extack for error reporting * @require_dev: fail if no device identified in header * * Parse universal request header and call request specific ->parse_request() @@ -330,19 +326,21 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) * Return: 0 on success or negative error code */ static int ethnl_default_parse(struct ethnl_req_info *req_info, - struct nlattr **tb, struct net *net, + const struct genl_info *info, const struct ethnl_request_ops *request_ops, - struct netlink_ext_ack *extack, bool require_dev) + bool require_dev) { + struct nlattr **tb = info->attrs; int ret; ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr], - net, extack, require_dev); + genl_info_net(info), info->extack, + require_dev); if (ret < 0) return ret; if (request_ops->parse_request) { - ret = request_ops->parse_request(req_info, tb, extack); + ret = request_ops->parse_request(req_info, tb, info->extack); if (ret < 0) return ret; } @@ -395,8 +393,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; } - ret = ethnl_default_parse(req_info, info->attrs, genl_info_net(info), - ops, info->extack, !ops->allow_nodev_do); + ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do); if (ret < 0) goto err_dev; ethnl_init_reply_data(reply_data, ops, req_info->dev); @@ -447,12 +444,12 @@ err_dev: static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, const struct ethnl_dump_ctx *ctx, - struct netlink_callback *cb) + const struct genl_info *info) { void *ehdr; int ret; - ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ethtool_genl_family, NLM_F_MULTI, ctx->ops->reply_cmd); if (!ehdr) @@ -460,7 +457,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
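Two things happen in the netlink.c hunks here: ethnl_default_dump_one() now receives a genl_info (obtained with genl_info_dump(cb) in the dumpit conversion below) instead of digging portid and seq out of the callback, and the dump context collapses to a single ifindex cursor for for_each_netdev_dump(). A minimal hedged sketch of that iteration pattern (example_fill_one is hypothetical):

	#include <linux/netdevice.h>
	#include <linux/rtnetlink.h>
	#include <net/sock.h>

	static int example_fill_one(struct sk_buff *skb,
				    struct net_device *dev); /* hypothetical */

	static int example_dumpit(struct sk_buff *skb,
				  struct netlink_callback *cb)
	{
		unsigned long *pos_ifindex = (unsigned long *)cb->ctx;
		struct net *net = sock_net(skb->sk);
		struct net_device *dev;
		int ret = 0;

		rtnl_lock();
		/* the cursor makes restarts resume after the last ifindex */
		for_each_netdev_dump(net, dev, *pos_ifindex) {
			ret = example_fill_one(skb, dev);
			if (ret < 0)
				break;
		}
		rtnl_unlock();
		return ret;
	}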
ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); rtnl_lock(); - ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, NULL); + ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); rtnl_unlock(); if (ret < 0) goto out; @@ -490,55 +487,27 @@ static int ethnl_default_dumpit(struct sk_buff *skb, { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); - int s_idx = ctx->pos_idx; - int h, idx = 0; + struct net_device *dev; int ret = 0; rtnl_lock(); - for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - struct net_device *dev; - unsigned int seq; - - head = &net->dev_index_head[h]; - -restart_chain: - seq = net->dev_base_seq; - cb->seq = seq; - idx = 0; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - dev_hold(dev); - rtnl_unlock(); - - ret = ethnl_default_dump_one(skb, dev, ctx, cb); - dev_put(dev); - if (ret < 0) { - if (ret == -EOPNOTSUPP) - goto lock_and_cont; - if (likely(skb->len)) - ret = skb->len; - goto out; - } -lock_and_cont: - rtnl_lock(); - if (net->dev_base_seq != seq) { - s_idx = idx + 1; - goto restart_chain; - } -cont: - idx++; - } + for_each_netdev_dump(net, dev, ctx->pos_ifindex) { + dev_hold(dev); + rtnl_unlock(); + + ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); + rtnl_lock(); + dev_put(dev); + + if (ret < 0 && ret != -EOPNOTSUPP) { + if (likely(skb->len)) + ret = skb->len; + break; + } } rtnl_unlock(); -out: - ctx->pos_hash = h; - ctx->pos_idx = idx; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - return ret; } @@ -568,8 +537,7 @@ static int ethnl_default_start(struct netlink_callback *cb) goto free_req_info; } - ret = ethnl_default_parse(req_info, info->attrs, sock_net(cb->skb->sk), - ops, cb->extack, false); + ret = ethnl_default_parse(req_info, &info->info, ops, false); if (req_info->dev) { /* We ignore device specification in dump requests but as the * same parser as for non-dump (doit) requests is used, it @@ -584,8 +552,7 @@ static int ethnl_default_start(struct netlink_callback *cb) ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; - ctx->pos_hash = 0; - ctx->pos_idx = 0; + ctx->pos_ifindex = 0; return 0; @@ -680,11 +647,14 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; + struct genl_info info; struct sk_buff *skb; void *reply_payload; int reply_len; int ret; + genl_info_init_ntf(&info, ðtool_genl_family, cmd); + if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || !ethnl_default_notify_ops[cmd], "unexpected notification type %u\n", cmd)) @@ -703,7 +673,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; ethnl_init_reply_data(reply_data, ops, dev); - ret = ops->prepare_data(req_info, reply_data, NULL); + ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) goto err_cleanup; ret = ops->reply_size(req_info, reply_data); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 79424b34b553..9a333a8d04c1 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -355,7 +355,7 @@ struct ethnl_request_ops { struct netlink_ext_ack *extack); int (*prepare_data)(const struct ethnl_req_info *req_info, struct ethnl_reply_data *reply_data, - struct genl_info *info); + const struct genl_info *info); int (*reply_size)(const struct ethnl_req_info *req_info, const struct 
ethnl_reply_data *reply_data); int (*fill_reply)(struct sk_buff *skb, diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 6657d0b888d8..f7c847aeb1a2 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -51,10 +51,9 @@ static int pause_parse_request(struct ethnl_req_info *req_base, static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); - struct netlink_ext_ack *extack = info ? info->extack : NULL; struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; @@ -74,7 +73,7 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { - NL_SET_ERR_MSG_MOD(extack, + NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c index 637b2f5297d5..cadaabed60bd 100644 --- a/net/ethtool/phc_vclocks.c +++ b/net/ethtool/phc_vclocks.c @@ -24,7 +24,7 @@ const struct nla_policy ethnl_phc_vclocks_get_policy[] = { static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c index 5a8cab4df0c9..b238a1afe9ae 100644 --- a/net/ethtool/plca.c +++ b/net/ethtool/plca.c @@ -40,7 +40,7 @@ const struct nla_policy ethnl_plca_get_cfg_policy[] = { static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; @@ -183,7 +183,7 @@ const struct nla_policy ethnl_plca_get_status_policy[] = { static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c index 23264a1ebf12..297be6a13ab9 100644 --- a/net/ethtool/privflags.c +++ b/net/ethtool/privflags.c @@ -57,7 +57,7 @@ static int ethnl_get_priv_flags_info(struct net_device *dev, static int privflags_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 530b8b99e6df..cc478af77111 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -53,8 +53,8 @@ static int pse_get_pse_attributes(struct net_device *dev, } static int pse_prepare_data(const struct ethnl_req_info *req_base, - struct ethnl_reply_data *reply_base, - struct genl_info *info) + struct ethnl_reply_data *reply_base, + const struct genl_info *info) { struct pse_reply_data *data = PSE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; @@ -64,7 +64,7 @@ static 
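Every ->prepare_data() in this series now takes a const genl_info that is guaranteed non-NULL: notification paths construct one with genl_info_init_ntf() (see the ethnl_default_notify() hunk earlier), so the repeated "info ? info->extack : NULL" dance disappears. A hedged sketch of the resulting handler shape (ethnl_ops_begin/ethnl_ops_complete are the real helpers from net/ethtool/netlink.h; example_get is hypothetical):

	static int example_get(struct net_device *dev,
			       struct netlink_ext_ack *extack); /* hypothetical */

	static int example_prepare_data(const struct ethnl_req_info *req_base,
					struct ethnl_reply_data *reply_base,
					const struct genl_info *info)
	{
		struct net_device *dev = reply_base->dev;
		int ret;

		ret = ethnl_ops_begin(dev);
		if (ret < 0)
			return ret;
		/* info->extack is now safe to use unconditionally */
		ret = example_get(dev, info->extack);
		ethnl_ops_complete(dev);
		return ret;
	}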
int pse_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; - ret = pse_get_pse_attributes(dev, info ? info->extack : NULL, data); + ret = pse_get_pse_attributes(dev, info->extack, data); ethnl_ops_complete(dev); diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 1c4972526142..fb09f774ea01 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -24,10 +24,9 @@ const struct nla_policy ethnl_rings_get_policy[] = { static int rings_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); - struct netlink_ext_ack *extack = info ? info->extack : NULL; struct net_device *dev = reply_base->dev; int ret; @@ -39,7 +38,7 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; dev->ethtool_ops->get_ringparam(dev, &data->ringparam, - &data->kernel_ringparam, extack); + &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); return 0; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index be260ab34e58..5764202e6cb6 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -42,7 +42,8 @@ rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, static int rss_prepare_data(const struct ethnl_req_info *req_base, - struct ethnl_reply_data *reply_base, struct genl_info *info) + struct ethnl_reply_data *reply_base, + const struct genl_info *info) { struct rss_reply_data *data = RSS_REPDATA(reply_base); struct rss_req_info *request = RSS_REQINFO(req_base); diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index 010ed19ccc99..912f0c4fff2f 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -114,10 +114,9 @@ static int stats_parse_request(struct ethnl_req_info *req_base, static int stats_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); - struct netlink_ext_ack *extack = info ? 
info->extack : NULL; struct stats_reply_data *data = STATS_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; @@ -130,7 +129,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { - NL_SET_ERR_MSG_MOD(extack, + NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 3f7de54d85fb..c678b484a079 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -274,7 +274,7 @@ static int strset_prepare_set(struct strset_info *info, struct net_device *dev, static int strset_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); struct strset_reply_data *data = STRSET_REPDATA(reply_base); diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 63b5814bd460..9daed0aab162 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -25,7 +25,7 @@ const struct nla_policy ethnl_tsinfo_get_policy[] = { static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c index 67fb414ca859..b4ce47dd2aa6 100644 --- a/net/ethtool/tunnels.c +++ b/net/ethtool/tunnels.c @@ -212,15 +212,14 @@ err_unlock_rtnl: struct ethnl_tunnel_info_dump_ctx { struct ethnl_req_info req_info; - int pos_hash; - int pos_idx; + unsigned long ifindex; }; int ethnl_tunnel_info_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; - struct nlattr **tb = info->attrs; + struct nlattr **tb = info->info.attrs; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); @@ -243,57 +242,39 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - int s_idx = ctx->pos_idx; - int h, idx = 0; + struct net_device *dev; int ret = 0; void *ehdr; rtnl_lock(); - cb->seq = net->dev_base_seq; - for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - struct net_device *dev; - - head = &net->dev_index_head[h]; - idx = 0; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - - ehdr = ethnl_dump_put(skb, cb, - ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); - if (!ehdr) { - ret = -EMSGSIZE; - goto out; - } - - ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - goto out; - } - - ctx->req_info.dev = dev; - ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); - ctx->req_info.dev = NULL; - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - if (ret == -EOPNOTSUPP) - goto cont; - goto out; - } - genlmsg_end(skb, ehdr); -cont: - idx++; + for_each_netdev_dump(net, dev, ctx->ifindex) { + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); + if (!ehdr) { + ret = -EMSGSIZE; + break; + } + + ret = ethnl_fill_reply_header(skb, dev, + ETHTOOL_A_TUNNEL_INFO_HEADER); + if 
(ret < 0) { + genlmsg_cancel(skb, ehdr); + break; } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + continue; + break; + } + genlmsg_end(skb, ehdr); } -out: rtnl_unlock(); - ctx->pos_hash = h; - ctx->pos_idx = idx; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - if (ret == -EMSGSIZE && skb->len) return skb->len; return ret; diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index a4a43d9e6e9d..0ed56c9ac1bc 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ -24,7 +24,7 @@ const struct nla_policy ethnl_wol_get_policy[] = { static int wol_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct wol_reply_data *data = WOL_REPDATA(reply_base); struct net_device *dev = reply_base->dev; @@ -39,7 +39,8 @@ static int wol_prepare_data(const struct ethnl_req_info *req_base, dev->ethtool_ops->get_wol(dev, &data->wol); ethnl_ops_complete(dev); /* do not include password in notifications */ - data->show_sopass = info && (data->wol.supported & WAKE_MAGICSECURE); + data->show_sopass = !genl_info_is_ntf(info) && + (data->wol.supported & WAKE_MAGICSECURE); return 0; } diff --git a/net/handshake/Makefile b/net/handshake/Makefile index 247d73c6ff6e..ef4d9a2112bd 100644 --- a/net/handshake/Makefile +++ b/net/handshake/Makefile @@ -8,6 +8,6 @@ # obj-y += handshake.o -handshake-y := genl.o netlink.o request.o tlshd.o trace.o +handshake-y := alert.o genl.o netlink.o request.o tlshd.o trace.o obj-$(CONFIG_NET_HANDSHAKE_KUNIT_TEST) += handshake-test.o diff --git a/net/handshake/alert.c b/net/handshake/alert.c new file mode 100644 index 000000000000..329d91984683 --- /dev/null +++ b/net/handshake/alert.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Handle the TLS Alert protocol + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2023, Oracle and/or its affiliates. + */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet.h> + +#include <net/sock.h> +#include <net/handshake.h> +#include <net/tls.h> +#include <net/tls_prot.h> + +#include "handshake.h" + +#include <trace/events/handshake.h> + +/** + * tls_alert_send - send a TLS Alert on a kTLS socket + * @sock: open kTLS socket to send on + * @level: TLS Alert level + * @description: TLS Alert description + * + * Returns zero on success or a negative errno. + */ +int tls_alert_send(struct socket *sock, u8 level, u8 description) +{ + u8 record_type = TLS_RECORD_TYPE_ALERT; + u8 buf[CMSG_SPACE(sizeof(record_type))]; + struct msghdr msg = { 0 }; + struct cmsghdr *cmsg; + struct kvec iov; + u8 alert[2]; + int ret; + + trace_tls_alert_send(sock->sk, level, description); + + alert[0] = level; + alert[1] = description; + iov.iov_base = alert; + iov.iov_len = sizeof(alert); + + memset(buf, 0, sizeof(buf)); + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + msg.msg_flags = MSG_DONTWAIT; + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_TLS; + cmsg->cmsg_type = TLS_SET_RECORD_TYPE; + cmsg->cmsg_len = CMSG_LEN(sizeof(record_type)); + memcpy(CMSG_DATA(cmsg), &record_type, sizeof(record_type)); + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, iov.iov_len); + ret = sock_sendmsg(sock, &msg); + return ret < 0 ? 
ret : 0; +} + +/** + * tls_get_record_type - Look for TLS RECORD_TYPE information + * @sk: socket (for IP address information) + * @cmsg: incoming message to be parsed + * + * Returns zero or a TLS_RECORD_TYPE value. + */ +u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *cmsg) +{ + u8 record_type; + + if (cmsg->cmsg_level != SOL_TLS) + return 0; + if (cmsg->cmsg_type != TLS_GET_RECORD_TYPE) + return 0; + + record_type = *((u8 *)CMSG_DATA(cmsg)); + trace_tls_contenttype(sk, record_type); + return record_type; +} +EXPORT_SYMBOL(tls_get_record_type); + +/** + * tls_alert_recv - Parse TLS Alert messages + * @sk: socket (for IP address information) + * @msg: incoming message to be parsed + * @level: OUT - TLS AlertLevel value + * @description: OUT - TLS AlertDescription value + * + */ +void tls_alert_recv(const struct sock *sk, const struct msghdr *msg, + u8 *level, u8 *description) +{ + const struct kvec *iov; + u8 *data; + + iov = msg->msg_iter.kvec; + data = iov->iov_base; + *level = data[0]; + *description = data[1]; + + trace_tls_alert_recv(sk, *level, *description); +} +EXPORT_SYMBOL(tls_alert_recv); diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h index 4dac965c99df..a48163765a7a 100644 --- a/net/handshake/handshake.h +++ b/net/handshake/handshake.h @@ -41,8 +41,11 @@ struct handshake_req { enum hr_flags_bits { HANDSHAKE_F_REQ_COMPLETED, + HANDSHAKE_F_REQ_SESSION, }; +struct genl_info; + /* Invariants for all handshake requests for one transport layer * security protocol */ @@ -63,6 +66,9 @@ enum hp_flags_bits { HANDSHAKE_F_PROTO_NOTIFY, }; +/* alert.c */ +int tls_alert_send(struct socket *sock, u8 level, u8 description); + /* netlink.c */ int handshake_genl_notify(struct net *net, const struct handshake_proto *proto, gfp_t flags); diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index b735f5cced2f..bbfb4095ddd6 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -18,6 +18,7 @@ #include <net/sock.h> #include <net/handshake.h> #include <net/genetlink.h> +#include <net/tls_prot.h> #include <uapi/linux/keyctl.h> #include <uapi/linux/handshake.h> @@ -100,6 +101,9 @@ static void tls_handshake_done(struct handshake_req *req, if (info) tls_handshake_remote_peerids(treq, info); + if (!status) + set_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags); + treq->th_consumer_done(treq->th_consumer_data, -status, treq->th_peerid[0]); } @@ -424,3 +428,22 @@ bool tls_handshake_cancel(struct sock *sk) return handshake_req_cancel(sk); } EXPORT_SYMBOL(tls_handshake_cancel); + +/** + * tls_handshake_close - send a Closure alert + * @sock: an open socket + * + */ +void tls_handshake_close(struct socket *sock) +{ + struct handshake_req *req; + + req = handshake_req_hash_lookup(sock->sk); + if (!req) + return; + if (!test_and_clear_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags)) + return; + tls_alert_send(sock, TLS_ALERT_LEVEL_WARNING, + TLS_ALERT_DESC_CLOSE_NOTIFY); +} +EXPORT_SYMBOL(tls_handshake_close); diff --git a/net/handshake/trace.c b/net/handshake/trace.c index 1c4d8e27e17a..44432d0857b9 100644 --- a/net/handshake/trace.c +++ b/net/handshake/trace.c @@ -8,8 +8,10 @@ */ #include <linux/types.h> +#include <linux/ipv6.h> #include <net/sock.h> +#include <net/inet_sock.h> #include <net/netlink.h> #include <net/genetlink.h> diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 501552d9753b..8c99e64e1cea 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -23,7 +23,5 @@ void __exit hsr_netlink_exit(void); void 
hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN], struct hsr_port *port); void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]); -void hsr_nl_framedrop(int dropcount, int dev_idx); -void hsr_nl_linkdown(int dev_idx); #endif /* __HSR_NETLINK_H */ diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index d610c1886160..1a265a421308 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -262,7 +262,7 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, if (!cb->args[0]) { *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), - info->attrs); + info->info.attrs); if (IS_ERR(*wpan_dev)) { err = PTR_ERR(*wpan_dev); goto out_unlock; @@ -570,7 +570,7 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct nl802154_dump_wpan_phy_state *state) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - struct nlattr **tb = info->attrs; + struct nlattr **tb = info->info.attrs; if (tb[NL802154_ATTR_WPAN_PHY]) state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 02736b83c303..3d2e30e20473 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -187,24 +187,13 @@ static int inet_autobind(struct sock *sk) return 0; } -/* - * Move a socket into listening state. - */ -int inet_listen(struct socket *sock, int backlog) +int __inet_listen_sk(struct sock *sk, int backlog) { - struct sock *sk = sock->sk; - unsigned char old_state; + unsigned char old_state = sk->sk_state; int err, tcp_fastopen; - lock_sock(sk); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) - goto out; - - old_state = sk->sk_state; if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) - goto out; + return -EINVAL; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state @@ -227,10 +216,27 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk); if (err) - goto out; + return err; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } - err = 0; + return 0; +} + +/* + * Move a socket into listening state. 
+ */ +int inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = -EINVAL; + + lock_sock(sk); + + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) + goto out; + + err = __inet_listen_sk(sk, backlog); out: release_sock(sk); @@ -325,14 +331,14 @@ lookup_protocol: sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); - inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); - inet->nodefrag = 0; + inet_clear_bit(NODEFRAG, sk); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) - inet->hdrincl = 1; + inet_set_bit(HDRINCL, sk); } if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) @@ -350,9 +356,9 @@ lookup_protocol: sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; - inet->mc_loop = 1; + inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; - inet->mc_all = 1; + inet_set_bit(MC_ALL, sk); inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; @@ -431,9 +437,8 @@ int inet_release(struct socket *sock) } EXPORT_SYMBOL(inet_release); -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sock *sk = sock->sk; u32 flags = BIND_WITH_LOCK; int err; @@ -454,6 +459,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return __inet_bind(sk, uaddr, addr_len, flags); } + +int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + return inet_bind_sk(sock->sk, uaddr, addr_len); +} EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, @@ -519,7 +529,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if (snum || !(inet->bind_address_no_port || + if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { @@ -646,7 +656,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -EISCONN; goto out; case SS_CONNECTING: - if (inet_sk(sk)->defer_connect) + if (inet_test_bit(DEFER_CONNECT, sk)) err = is_sendmsg ? 
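/* A minimal sketch (not part of this patch) of the intended use of the
 * new sk-based helper: a caller that has already validated sock->state
 * drives __inet_listen_sk() under the socket lock itself. */
static int demo_listen_locked(struct sock *sk, int backlog)
{
        int err;

        lock_sock(sk);
        err = __inet_listen_sk(sk, backlog);
        release_sock(sk);
        return err;
}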
-EINPROGRESS : -EISCONN; else err = -EALREADY; @@ -669,7 +679,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTING; - if (!err && inet_sk(sk)->defer_connect) + if (!err && inet_test_bit(DEFER_CONNECT, sk)) goto out; /* Just entered SS_CONNECTING state; the only diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 4406d796cc2f..39dcccf0f174 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -51,8 +51,6 @@ static bool is_unsupported(u32 member_offset) return false; } -extern struct btf *btf_vmlinux; - static bool bpf_tcp_ca_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 79ae7204e8ed..d048aa833293 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1881,7 +1881,7 @@ int cipso_v4_sock_setattr(struct sock *sk, old = rcu_dereference_protected(sk_inet->inet_opt, lockdep_sock_is_held(sk)); - if (sk_inet->is_icsk) { + if (inet_test_bit(IS_ICSK, sk)) { sk_conn = inet_csk(sk); if (old) sk_conn->icsk_ext_hdr_len -= old->opt.optlen; @@ -2051,7 +2051,7 @@ void cipso_v4_sock_delattr(struct sock *sk) sk_inet = inet_sk(sk); hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); - if (sk_inet->is_icsk && hdr_delta > 0) { + if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5deac0517ef7..c3658b8755bc 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -509,6 +509,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } @@ -664,6 +665,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { + NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto errout; } @@ -688,6 +690,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } + NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; errout: return err; @@ -839,13 +842,23 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); err = -EINVAL; - if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL]) + + if (ifm->ifa_prefixlen > 32) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); + goto errout; + } + + if (!tb[IFA_LOCAL]) { + NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); goto errout; + } dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; - if (!dev) + if (!dev) { + NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; + } in_dev = __in_dev_get_rtnl(dev); err = -ENOBUFS; @@ -897,6 +910,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { + NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); err = -EINVAL; goto errout_free; } @@ -954,6 +968,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, int ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { + NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); return ret; } @@ -967,8 +982,10 @@ static int 
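/* The conversions in this and the surrounding files follow one pattern:
 * former inet_sock bitfields become atomic flag bits, so readers and
 * writers can skip lock_sock(). A condensed sketch of the accessor trio
 * as the series uses it (the demo function itself is hypothetical): */
static void demo_inet_flags(struct sock *sk, int val)
{
        inet_assign_bit(RECVERR, sk, val);      /* set or clear from an int */
        if (inet_test_bit(RECVERR, sk))         /* lockless test */
                inet_clear_bit(RECVERR, sk);    /* atomic clear */
}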
inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || - !(nlh->nlmsg_flags & NLM_F_REPLACE)) + !(nlh->nlmsg_flags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); return -EEXIST; + } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 48ff5f13e797..0c9e768e5628 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2658,7 +2658,7 @@ int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr, (sdif && pmc->multi.imr_ifindex == sdif))) break; } - ret = inet->mc_all; + ret = inet_test_bit(MC_ALL, sk); if (!pmc) goto unlock; psl = rcu_dereference(pmc->sflist); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f7426926a104..e13a84433413 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -182,17 +182,17 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, r->idiag_inode = sock_i_ino(sk); memset(&inet_sockopt, 0, sizeof(inet_sockopt)); - inet_sockopt.recverr = inet->recverr; - inet_sockopt.is_icsk = inet->is_icsk; - inet_sockopt.freebind = inet->freebind; - inet_sockopt.hdrincl = inet->hdrincl; - inet_sockopt.mc_loop = inet->mc_loop; - inet_sockopt.transparent = inet->transparent; - inet_sockopt.mc_all = inet->mc_all; - inet_sockopt.nodefrag = inet->nodefrag; - inet_sockopt.bind_address_no_port = inet->bind_address_no_port; - inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884; - inet_sockopt.defer_connect = inet->defer_connect; + inet_sockopt.recverr = inet_test_bit(RECVERR, sk); + inet_sockopt.is_icsk = inet_test_bit(IS_ICSK, sk); + inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); + inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk); + inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); + inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk); + inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk); + inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk); + inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); + inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); + inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk); if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), &inet_sockopt)) goto errout; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0819d6001b9a..7876b7d703cb 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -28,9 +28,9 @@ #include <net/tcp.h> #include <net/sock_reuseport.h> -static u32 inet_ehashfn(const struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +u32 inet_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 inet_ehash_secret __read_mostly; @@ -39,6 +39,7 @@ static u32 inet_ehashfn(const struct net *net, const __be32 laddr, return __inet_ehashfn(laddr, lport, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL_GPL(inet_ehashfn); /* This function handles inet_sock, but also timewait and request sockets * for IPv4/IPv6. @@ -332,20 +333,38 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) +/** + * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. + * @net: network namespace. 
+ * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. + * @skb: context for a potential SK_REUSEPORT program. + * @doff: header offset. + * @saddr: source address. + * @sport: source port. + * @daddr: destination address. + * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. + * + * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to + * the selected sock or an error. + */ +struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum, + inet_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } +EXPORT_SYMBOL_GPL(inet_lookup_reuseport); /* * Here are some nice properties to exploit here. The BSD API @@ -369,8 +388,8 @@ static struct sock *inet_lhash2_lookup(struct net *net, sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { - result = lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + result = inet_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; @@ -382,24 +401,23 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } -static inline struct sock *inet_lookup_run_bpf(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, u16 hnum, const int dif) +struct sock *inet_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum, const int dif, + inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != net->ipv4.tcp_death_row.hashinfo) - return NULL; /* only TCP is supported */ - - no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport, + no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, + ehashfn); if (reuse_sk) sk = reuse_sk; return sk; @@ -417,9 +435,11 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - result = inet_lookup_run_bpf(net, hashinfo, skb, doff, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + hashinfo == net->ipv4.tcp_death_row.hashinfo) { + result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, + saddr, sport, daddr, hnum, dif, + inet_ehashfn); if (result) goto done; } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2c1b245dba8e..dd37a5bf6881 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -203,7 +203,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_reuseport = sk->sk_reuseport; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; - tw->tw_transparent = inet->transparent; + tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_prot 
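/* With the hash function now a parameter, other protocols can reuse the
 * reuseport path. Sketch of a UDP-side caller (the wrapper name is
 * illustrative; udp_ehashfn is the hash referenced by INDIRECT_CALL_2
 * above): */
static struct sock *demo_udp_reuseport(struct net *net, struct sock *sk,
                                       struct sk_buff *skb, int doff,
                                       __be32 saddr, __be16 sport,
                                       __be32 daddr, unsigned short hnum)
{
        return inet_lookup_reuseport(net, sk, skb, doff, saddr, sport,
                                     daddr, hnum, udp_ehashfn);
}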
= sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6ba1a0fafbaa..ce6257860a40 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(const struct inet_sock *inet, const struct dst_entry *dst) { - int ttl = inet->uc_ttl; + int ttl = READ_ONCE(inet->uc_ttl); if (ttl < 0) ttl = ip4_dst_hoplimit(dst); @@ -236,7 +236,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); - return -EINVAL; + return PTR_ERR(neigh); } static int ip_finish_output_gso(struct net *net, struct sock *sk, @@ -1039,7 +1039,7 @@ static int __ip_append_data(struct sock *sk, } } } else if ((flags & MSG_SPLICE_PAGES) && length) { - if (inet->hdrincl) + if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) @@ -1467,7 +1467,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * so icmphdr does not in skb linear region and can not get icmp_type * by icmp_hdr(skb)->type. */ - if (sk->sk_type == SOCK_RAW && !inet_sk(sk)->hdrincl) + if (sk->sk_type == SOCK_RAW && + !inet_test_bit(HDRINCL, sk)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d41bce8927b2..54ad0f0d5c2d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -171,8 +171,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset) { - struct inet_sock *inet = inet_sk(sk); - unsigned int flags = inet->cmsg_flags; + unsigned long flags = inet_cmsg_flags(inet_sk(sk)); + + if (!flags) + return; /* Ordered by supposed usage frequency */ if (flags & IP_CMSG_PKTINFO) { @@ -431,7 +433,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; if (skb_pull(skb, payload - skb->data)) { - if (inet_sk(sk)->recverr_rfc4884) + if (inet_test_bit(RECVERR_RFC4884, sk)) ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); @@ -444,12 +446,11 @@ EXPORT_SYMBOL_GPL(ip_icmp_error); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) { - struct inet_sock *inet = inet_sk(sk); struct sock_exterr_skb *serr; struct iphdr *iph; struct sk_buff *skb; - if (!inet->recverr) + if (!inet_test_bit(RECVERR, sk)) return; skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); @@ -568,7 +569,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; - if (inet_sk(sk)->cmsg_flags) + if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } @@ -607,17 +608,13 @@ EXPORT_SYMBOL(ip_sock_set_tos); void ip_sock_set_freebind(struct sock *sk) { - lock_sock(sk); - inet_sk(sk)->freebind = true; - release_sock(sk); + inet_set_bit(FREEBIND, sk); } EXPORT_SYMBOL(ip_sock_set_freebind); void ip_sock_set_recverr(struct sock *sk) { - lock_sock(sk); - inet_sk(sk)->recverr = true; - release_sock(sk); + inet_set_bit(RECVERR, sk); } EXPORT_SYMBOL(ip_sock_set_recverr); @@ -634,9 +631,7 @@ 
EXPORT_SYMBOL(ip_sock_set_mtu_discover); void ip_sock_set_pktinfo(struct sock *sk) { - lock_sock(sk); - inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO; - release_sock(sk); + inet_set_bit(PKTINFO, sk); } EXPORT_SYMBOL(ip_sock_set_pktinfo); @@ -950,6 +945,104 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); + /* Handle options that can be set without locking the socket. */ + switch (optname) { + case IP_PKTINFO: + inet_assign_bit(PKTINFO, sk, val); + return 0; + case IP_RECVTTL: + inet_assign_bit(TTL, sk, val); + return 0; + case IP_RECVTOS: + inet_assign_bit(TOS, sk, val); + return 0; + case IP_RECVOPTS: + inet_assign_bit(RECVOPTS, sk, val); + return 0; + case IP_RETOPTS: + inet_assign_bit(RETOPTS, sk, val); + return 0; + case IP_PASSSEC: + inet_assign_bit(PASSSEC, sk, val); + return 0; + case IP_RECVORIGDSTADDR: + inet_assign_bit(ORIGDSTADDR, sk, val); + return 0; + case IP_RECVFRAGSIZE: + if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) + return -EINVAL; + inet_assign_bit(RECVFRAGSIZE, sk, val); + return 0; + case IP_RECVERR: + inet_assign_bit(RECVERR, sk, val); + if (!val) + skb_errqueue_purge(&sk->sk_error_queue); + return 0; + case IP_RECVERR_RFC4884: + if (val < 0 || val > 1) + return -EINVAL; + inet_assign_bit(RECVERR_RFC4884, sk, val); + return 0; + case IP_FREEBIND: + if (optlen < 1) + return -EINVAL; + inet_assign_bit(FREEBIND, sk, val); + return 0; + case IP_HDRINCL: + if (sk->sk_type != SOCK_RAW) + return -ENOPROTOOPT; + inet_assign_bit(HDRINCL, sk, val); + return 0; + case IP_MULTICAST_LOOP: + if (optlen < 1) + return -EINVAL; + inet_assign_bit(MC_LOOP, sk, val); + return 0; + case IP_MULTICAST_ALL: + if (optlen < 1) + return -EINVAL; + if (val != 0 && val != 1) + return -EINVAL; + inet_assign_bit(MC_ALL, sk, val); + return 0; + case IP_TRANSPARENT: + if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + if (optlen < 1) + goto e_inval; + inet_assign_bit(TRANSPARENT, sk, val); + return 0; + case IP_NODEFRAG: + if (sk->sk_type != SOCK_RAW) + return -ENOPROTOOPT; + inet_assign_bit(NODEFRAG, sk, val); + return 0; + case IP_BIND_ADDRESS_NO_PORT: + inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val); + return 0; + case IP_TTL: + if (optlen < 1) + return -EINVAL; + if (val != -1 && (val < 1 || val > 255)) + return -EINVAL; + WRITE_ONCE(inet->uc_ttl, val); + return 0; + case IP_MINTTL: + if (optlen < 1) + return -EINVAL; + if (val < 0 || val > 255) + return -EINVAL; + + if (val) + static_branch_enable(&ip4_min_ttl); + + WRITE_ONCE(inet->min_ttl, val); + return 0; + } + err = 0; if (needs_rtnl) rtnl_lock(); @@ -967,7 +1060,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, break; old = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); - if (inet->is_icsk) { + if (inet_test_bit(IS_ICSK, sk)) { struct inet_connection_sock *icsk = inet_csk(sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET || @@ -989,111 +1082,27 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, kfree_rcu(old, rcu); break; } - case IP_PKTINFO: - if (val) - inet->cmsg_flags |= IP_CMSG_PKTINFO; - else - inet->cmsg_flags &= ~IP_CMSG_PKTINFO; - break; - case IP_RECVTTL: - if (val) - inet->cmsg_flags |= IP_CMSG_TTL; - else - inet->cmsg_flags &= ~IP_CMSG_TTL; - break; - case IP_RECVTOS: - if (val) - inet->cmsg_flags |= IP_CMSG_TOS; - else - inet->cmsg_flags 
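/* Nothing changes at the API level for the options moved into the
 * lockless switch above; the win is purely kernel-side (no lock_sock()
 * round trip). Illustrative userspace use of one such option, standard
 * sockets API only: */
#include <netinet/in.h>
#include <sys/socket.h>

static int demo_enable_pktinfo(int fd)
{
        int one = 1;

        return setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one));
}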
&= ~IP_CMSG_TOS; - break; - case IP_RECVOPTS: - if (val) - inet->cmsg_flags |= IP_CMSG_RECVOPTS; - else - inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; - break; - case IP_RETOPTS: - if (val) - inet->cmsg_flags |= IP_CMSG_RETOPTS; - else - inet->cmsg_flags &= ~IP_CMSG_RETOPTS; - break; - case IP_PASSSEC: - if (val) - inet->cmsg_flags |= IP_CMSG_PASSSEC; - else - inet->cmsg_flags &= ~IP_CMSG_PASSSEC; - break; - case IP_RECVORIGDSTADDR: - if (val) - inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR; - else - inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR; - break; case IP_CHECKSUM: if (val) { - if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) { + if (!(inet_test_bit(CHECKSUM, sk))) { inet_inc_convert_csum(sk); - inet->cmsg_flags |= IP_CMSG_CHECKSUM; + inet_set_bit(CHECKSUM, sk); } } else { - if (inet->cmsg_flags & IP_CMSG_CHECKSUM) { + if (inet_test_bit(CHECKSUM, sk)) { inet_dec_convert_csum(sk); - inet->cmsg_flags &= ~IP_CMSG_CHECKSUM; + inet_clear_bit(CHECKSUM, sk); } } break; - case IP_RECVFRAGSIZE: - if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) - goto e_inval; - if (val) - inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE; - else - inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; - break; case IP_TOS: /* This sets both TOS and Precedence */ __ip_sock_set_tos(sk, val); break; - case IP_TTL: - if (optlen < 1) - goto e_inval; - if (val != -1 && (val < 1 || val > 255)) - goto e_inval; - inet->uc_ttl = val; - break; - case IP_HDRINCL: - if (sk->sk_type != SOCK_RAW) { - err = -ENOPROTOOPT; - break; - } - inet->hdrincl = val ? 1 : 0; - break; - case IP_NODEFRAG: - if (sk->sk_type != SOCK_RAW) { - err = -ENOPROTOOPT; - break; - } - inet->nodefrag = val ? 1 : 0; - break; - case IP_BIND_ADDRESS_NO_PORT: - inet->bind_address_no_port = val ? 1 : 0; - break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; inet->pmtudisc = val; break; - case IP_RECVERR: - inet->recverr = !!val; - if (!val) - skb_queue_purge(&sk->sk_error_queue); - break; - case IP_RECVERR_RFC4884: - if (val < 0 || val > 1) - goto e_inval; - inet->recverr_rfc4884 = !!val; - break; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -1105,11 +1114,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; inet->mc_ttl = val; break; - case IP_MULTICAST_LOOP: - if (optlen < 1) - goto e_inval; - inet->mc_loop = !!val; - break; case IP_UNICAST_IF: { struct net_device *dev = NULL; @@ -1214,7 +1218,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, struct ip_mreqn mreq; err = -EPROTO; - if (inet_sk(sk)->is_icsk) + if (inet_test_bit(IS_ICSK, sk)) break; if (optlen < sizeof(struct ip_mreq)) @@ -1325,20 +1329,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, else err = ip_set_mcast_msfilter(sk, optval, optlen); break; - case IP_MULTICAST_ALL: - if (optlen < 1) - goto e_inval; - if (val != 0 && val != 1) - goto e_inval; - inet->mc_all = val; - break; - - case IP_FREEBIND: - if (optlen < 1) - goto e_inval; - inet->freebind = !!val; - break; - case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; @@ -1347,32 +1337,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, err = xfrm_user_policy(sk, optname, optval, optlen); break; - case IP_TRANSPARENT: - if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - err = -EPERM; - break; - } - if (optlen < 1) - goto e_inval; - inet->transparent = !!val; - break; - - case IP_MINTTL: - if (optlen < 1) - goto e_inval; - if (val < 0 
|| val > 255) - goto e_inval; - - if (val) - static_branch_enable(&ip4_min_ttl); - - /* tcp_v4_err() and tcp_v4_rcv() might read min_ttl - * while we are changint it. - */ - WRITE_ONCE(inet->min_ttl, val); - break; - case IP_LOCAL_PORT_RANGE: { const __u16 lo = val; @@ -1415,7 +1379,7 @@ e_inval: void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || + bool prepare = inet_test_bit(PKTINFO, sk) || ipv6_sk_rxinfo(sk); if (prepare && skb_rtable(skb)) { @@ -1566,6 +1530,72 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, if (len < 0) return -EINVAL; + /* Handle options that can be read without locking the socket. */ + switch (optname) { + case IP_PKTINFO: + val = inet_test_bit(PKTINFO, sk); + goto copyval; + case IP_RECVTTL: + val = inet_test_bit(TTL, sk); + goto copyval; + case IP_RECVTOS: + val = inet_test_bit(TOS, sk); + goto copyval; + case IP_RECVOPTS: + val = inet_test_bit(RECVOPTS, sk); + goto copyval; + case IP_RETOPTS: + val = inet_test_bit(RETOPTS, sk); + goto copyval; + case IP_PASSSEC: + val = inet_test_bit(PASSSEC, sk); + goto copyval; + case IP_RECVORIGDSTADDR: + val = inet_test_bit(ORIGDSTADDR, sk); + goto copyval; + case IP_CHECKSUM: + val = inet_test_bit(CHECKSUM, sk); + goto copyval; + case IP_RECVFRAGSIZE: + val = inet_test_bit(RECVFRAGSIZE, sk); + goto copyval; + case IP_RECVERR: + val = inet_test_bit(RECVERR, sk); + goto copyval; + case IP_RECVERR_RFC4884: + val = inet_test_bit(RECVERR_RFC4884, sk); + goto copyval; + case IP_FREEBIND: + val = inet_test_bit(FREEBIND, sk); + goto copyval; + case IP_HDRINCL: + val = inet_test_bit(HDRINCL, sk); + goto copyval; + case IP_MULTICAST_LOOP: + val = inet_test_bit(MC_LOOP, sk); + goto copyval; + case IP_MULTICAST_ALL: + val = inet_test_bit(MC_ALL, sk); + goto copyval; + case IP_TRANSPARENT: + val = inet_test_bit(TRANSPARENT, sk); + goto copyval; + case IP_NODEFRAG: + val = inet_test_bit(NODEFRAG, sk); + goto copyval; + case IP_BIND_ADDRESS_NO_PORT: + val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); + goto copyval; + case IP_TTL: + val = READ_ONCE(inet->uc_ttl); + if (val < 0) + val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl); + goto copyval; + case IP_MINTTL: + val = READ_ONCE(inet->min_ttl); + goto copyval; + } + if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); @@ -1600,53 +1630,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } - case IP_PKTINFO: - val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; - break; - case IP_RECVTTL: - val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; - break; - case IP_RECVTOS: - val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; - break; - case IP_RECVOPTS: - val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; - break; - case IP_RETOPTS: - val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; - break; - case IP_PASSSEC: - val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; - break; - case IP_RECVORIGDSTADDR: - val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0; - break; - case IP_CHECKSUM: - val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0; - break; - case IP_RECVFRAGSIZE: - val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0; - break; case IP_TOS: val = inet->tos; break; - case IP_TTL: - { - struct net *net = sock_net(sk); - val = (inet->uc_ttl == -1 ? 
- READ_ONCE(net->ipv4.sysctl_ip_default_ttl) : - inet->uc_ttl); - break; - } - case IP_HDRINCL: - val = inet->hdrincl; - break; - case IP_NODEFRAG: - val = inet->nodefrag; - break; - case IP_BIND_ADDRESS_NO_PORT: - val = inet->bind_address_no_port; - break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; @@ -1665,18 +1651,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, } break; } - case IP_RECVERR: - val = inet->recverr; - break; - case IP_RECVERR_RFC4884: - val = inet->recverr_rfc4884; - break; case IP_MULTICAST_TTL: val = inet->mc_ttl; break; - case IP_MULTICAST_LOOP: - val = inet->mc_loop; - break; case IP_UNICAST_IF: val = (__force int)htonl((__u32) inet->uc_index); break; @@ -1715,9 +1692,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, else err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; - case IP_MULTICAST_ALL: - val = inet->mc_all; - break; case IP_PKTOPTIONS: { struct msghdr msg; @@ -1737,7 +1711,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, msg.msg_controllen = len; msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; - if (inet->cmsg_flags & IP_CMSG_PKTINFO) { + if (inet_test_bit(PKTINFO, sk)) { struct in_pktinfo info; info.ipi_addr.s_addr = inet->inet_rcv_saddr; @@ -1745,26 +1719,17 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, info.ipi_ifindex = inet->mc_index; put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } - if (inet->cmsg_flags & IP_CMSG_TTL) { + if (inet_test_bit(TTL, sk)) { int hlim = inet->mc_ttl; put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); } - if (inet->cmsg_flags & IP_CMSG_TOS) { + if (inet_test_bit(TOS, sk)) { int tos = inet->rcv_tos; put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } - case IP_FREEBIND: - val = inet->freebind; - break; - case IP_TRANSPARENT: - val = inet->transparent; - break; - case IP_MINTTL: - val = inet->min_ttl; - break; case IP_LOCAL_PORT_RANGE: val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; break; @@ -1776,7 +1741,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } sockopt_release_sock(sk); - +copyval: if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index e61ea428ea18..265b39bc435b 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -7,6 +7,7 @@ #include <linux/ip.h> #include <linux/netfilter.h> #include <linux/module.h> +#include <linux/rcupdate.h> #include <linux/skbuff.h> #include <net/netns/generic.h> #include <net/route.h> @@ -65,7 +66,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv, struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) && - inet_sk(sk)->nodefrag) + inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -113,17 +114,31 @@ static void __net_exit defrag4_net_exit(struct net *net) } } +static const struct nf_defrag_hook defrag_hook = { + .owner = THIS_MODULE, + .enable = nf_defrag_ipv4_enable, + .disable = nf_defrag_ipv4_disable, +}; + static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, }; static int __init nf_defrag_init(void) { - return register_pernet_subsys(&defrag4_net_ops); + int err; + + err = register_pernet_subsys(&defrag4_net_ops); + if (err) + return err; + + 
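/* The IP_LOCAL_PORT_RANGE read visible above packs both bounds into one
 * u32. Illustrative encode helper mirroring that layout (not from the
 * patch): */
static unsigned int demo_encode_port_range(unsigned short lo,
                                           unsigned short hi)
{
        return ((unsigned int)hi << 16) | lo;   /* hi: upper 16 bits */
}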
rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook); + return err; } static void __exit nf_defrag_fini(void) { + rcu_assign_pointer(nf_defrag_v4_hook, NULL); unregister_pernet_subsys(&defrag4_net_ops); } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index be5498f5dd31..bbff68b5b5d4 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1152,41 +1152,64 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) +static bool nexthop_is_good_nh(const struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference(nh->nh_info); + + switch (nhi->family) { + case AF_INET: + return ipv4_good_nh(&nhi->fib_nh); + case AF_INET6: + return ipv6_good_nh(&nhi->fib6_nh); + } + + return false; +} + +static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { - struct nexthop *rc = NULL; int i; - for (i = 0; i < nhg->num_nh; ++i) { + for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - struct nh_info *nhi; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; - nhi = rcu_dereference(nhge->nh->nh_info); - if (nhi->fdb_nh) - return nhge->nh; + return nhge->nh; + } + + WARN_ON_ONCE(1); + return NULL; +} + +static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) +{ + struct nexthop *rc = NULL; + int i; + + if (nhg->fdb_nh) + return nexthop_select_path_fdb(nhg, hash); + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ - switch (nhi->family) { - case AF_INET: - if (ipv4_good_nh(&nhi->fib_nh)) - return nhge->nh; - break; - case AF_INET6: - if (ipv6_good_nh(&nhi->fib6_nh)) - return nhge->nh; - break; - } + if (!nexthop_is_good_nh(nhge->nh)) + continue; if (!rc) rc = nhge->nh; + + if (hash > atomic_read(&nhge->hthr.upper_bound)) + continue; + + return nhge->nh; } - return rc; + return rc ? : nhg->nh_entries[0].nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) @@ -3186,7 +3209,6 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb, return err; } - ctx->idx++; return 0; } @@ -3314,7 +3336,6 @@ static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; - u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */ }; static struct rtm_dump_res_bucket_ctx * @@ -3343,9 +3364,6 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, u16 bucket_index; int err; - if (dd->ctx->nh.idx < dd->ctx->done_nh_idx) - return 0; - nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; @@ -3372,7 +3390,6 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, return err; } - dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; dd->ctx->bucket_index = 0; return 0; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 25dd78cee179..75e0aee35eb7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -580,7 +580,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. 
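/* Standalone model (not kernel code) of the hash-threshold walk after
 * the nexthop refactor above: return the first "good" nexthop whose
 * upper bound covers the hash, falling back to the first good entry,
 * then to entry 0. */
static int demo_hthr_select(const u32 *upper_bound, const bool *good,
                            int num_nh, u32 hash)
{
        int i, fallback = -1;

        for (i = 0; i < num_nh; i++) {
                if (!good[i])
                        continue;
                if (fallback < 0)
                        fallback = i;
                if (hash > upper_bound[i])
                        continue;
                return i;
        }
        return fallback >= 0 ? fallback : 0;
}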
*/ - if ((family == AF_INET && !inet_sock->recverr) || + if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) || (family == AF_INET6 && !inet6_sk(sk)->recverr)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; @@ -894,7 +894,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, *addr_len = sizeof(*sin); } - if (isk->cmsg_flags) + if (inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #if IS_ENABLED(CONFIG_IPV6) @@ -921,7 +921,8 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, if (skb->protocol == htons(ETH_P_IPV6) && inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); - else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags) + else if (skb->protocol == htons(ETH_P_IP) && + inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #endif } else { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index cb381f5aa464..4b5db5d1edc2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -203,8 +203,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - int err = 0; int harderr = 0; + bool recverr; + int err = 0; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); @@ -218,7 +219,8 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) 2. Socket is connected (otherwise the error indication is useless without ip_recverr and error is hard. */ - if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) + recverr = inet_test_bit(RECVERR, sk); + if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; switch (type) { @@ -245,16 +247,16 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) } } - if (inet->recverr) { + if (recverr) { const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); - if (inet->hdrincl) + if (inet_test_bit(HDRINCL, sk)) payload = skb->data; ip_icmp_error(sk, skb, err, 0, info, payload); } - if (inet->recverr || harderr) { + if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } @@ -413,7 +415,7 @@ error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); - if (err == -ENOBUFS && !inet->recverr) + if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; return err; } @@ -489,12 +491,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (len > 0xFFFF) goto out; - /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields. - * Doing this indirectly yields the same result. - */ - hdrincl = inet->hdrincl; - hdrincl = READ_ONCE(hdrincl); + hdrincl = inet_test_bit(HDRINCL, sk); + /* * Check the flags. 
*/ @@ -645,7 +643,7 @@ back_from_confirm: ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk, &fl4); - if (err == -ENOBUFS && !inet->recverr) + if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; } release_sock(sk); @@ -767,7 +765,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 92fede388d52..a4e153dd615b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -515,13 +515,12 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, __u8 scope = RT_SCOPE_UNIVERSE; if (sk) { - const struct inet_sock *inet = inet_sk(sk); - oif = sk->sk_bound_dev_if; mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); - prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; + prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : + sk->sk_protocol; } flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope, @@ -555,7 +554,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk) & IPTOS_RT_MASK, ip_sock_rt_scope(sk), - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_test_bit(HDRINCL, sk) ? + IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8ed52e1e3c99..cee1e548660c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); @@ -582,7 +583,8 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (urg_data & TCP_URG_VALID) mask |= EPOLLPRI; - } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + } else if (state == TCP_SYN_SENT && + inet_test_bit(DEFER_CONNECT, sk)) { /* Active TCP fastopen socket with defer_connect * Return EPOLLOUT so application can call write() * in order for kernel to generate SYN+data @@ -1006,7 +1008,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, tp->fastopen_req->size = size; tp->fastopen_req->uarg = uarg; - if (inet->defer_connect) { + if (inet_test_bit(DEFER_CONNECT, sk)) { err = tcp_connect(sk); /* Same failure procedure as in tcp_v4/6_connect */ if (err) { @@ -1024,7 +1026,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, if (tp->fastopen_req) { *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); - inet->defer_connect = 0; + inet_clear_bit(DEFER_CONNECT, sk); } return err; } @@ -1065,7 +1067,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_SPLICE_PAGES; } - if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && + if (unlikely(flags & MSG_FASTOPEN || + inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) @@ -1700,7 +1703,7 @@ EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy 
SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { - int cap; + int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; @@ -1715,10 +1718,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - val <<= 1; - if (val > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, val); - tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + space = tcp_space_from_win(sk, val); + if (space > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, space); + tcp_sk(sk)->window_clamp = val; } return 0; } @@ -2864,7 +2867,7 @@ adjudge_to_death: if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->linger2 < 0) { + if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), @@ -3087,7 +3090,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); - inet->defer_connect = 0; + inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); @@ -3290,18 +3293,21 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; - lock_sock(sk); WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); -void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +int tcp_sock_set_user_timeout(struct sock *sk, int val) { - lock_sock(sk); + /* Cap the max time in ms TCP will retry or probe the window + * before giving up and aborting (ETIMEDOUT) a connection. + */ + if (val < 0) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); - release_sock(sk); + return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); @@ -3344,9 +3350,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; - lock_sock(sk); WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); @@ -3356,10 +3360,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; - lock_sock(sk); /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); @@ -3461,6 +3463,32 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; + /* Handle options that can be set without locking the socket. 
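/* The sockopt setters above drop lock_sock() in favour of store/load
 * annotations. The contract, condensed (field per the patch; the demo
 * helpers are illustrative): writers use WRITE_ONCE(), every fast-path
 * reader pairs with READ_ONCE(). */
static void demo_set_user_timeout(struct sock *sk, u32 ms)
{
        WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, ms);
}

static u32 demo_get_user_timeout(const struct sock *sk)
{
        return READ_ONCE(inet_csk(sk)->icsk_user_timeout);
}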
*/ + switch (optname) { + case TCP_SYNCNT: + return tcp_sock_set_syncnt(sk, val); + case TCP_USER_TIMEOUT: + return tcp_sock_set_user_timeout(sk, val); + case TCP_KEEPINTVL: + return tcp_sock_set_keepintvl(sk, val); + case TCP_KEEPCNT: + return tcp_sock_set_keepcnt(sk, val); + case TCP_LINGER2: + if (val < 0) + WRITE_ONCE(tp->linger2, -1); + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); + else + WRITE_ONCE(tp->linger2, val * HZ); + return 0; + case TCP_DEFER_ACCEPT: + /* Translate value in seconds to number of retransmits */ + WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ)); + return 0; + } + sockopt_lock_sock(sk); switch (optname) { @@ -3556,25 +3584,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) - err = -EINVAL; - else - WRITE_ONCE(tp->keepalive_intvl, val * HZ); - break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - err = -EINVAL; - else - WRITE_ONCE(tp->keepalive_probes, val); - break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - err = -EINVAL; - else - WRITE_ONCE(icsk->icsk_syn_retries, val); - break; - case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) @@ -3583,22 +3592,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, tp->save_syn = val; break; - case TCP_LINGER2: - if (val < 0) - WRITE_ONCE(tp->linger2, -1); - else if (val > TCP_FIN_TIMEOUT_MAX / HZ) - WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); - else - WRITE_ONCE(tp->linger2, val * HZ); - break; - - case TCP_DEFER_ACCEPT: - /* Translate value in seconds to number of retransmits */ - WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, - TCP_RTO_MAX / HZ)); - break; - case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; @@ -3613,16 +3606,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif - case TCP_USER_TIMEOUT: - /* Cap the max time in ms TCP will retry or probe the window - * before giving up and aborting (ETIMEDOUT) a connection. - */ - if (val < 0) - err = -EINVAL; - else - WRITE_ONCE(icsk->icsk_user_timeout, val); - break; - case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 85e4953f1182..8ed54e7334a9 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -451,7 +451,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) if (tp->fastopen_connect && !tp->fastopen_req) { if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { - inet_sk(sk)->defer_connect = 1; + inet_set_bit(DEFER_CONNECT, sk); return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57c8af1859c1..06fe1cf645d5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { + /* Note: divides are still a bit expensive. + * For the moment, only adjust scaling_ratio + * when we update icsk_ack.rcv_mss. 
+ */ + if (unlikely(len != icsk->icsk_ack.rcv_mss)) { + u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; + + do_div(val, skb->truesize); + tcp_sk(sk)->scaling_ratio = val ? val : 1; + } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ @@ -287,7 +297,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -295,7 +305,6 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } -EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -727,8 +736,8 @@ void tcp_rcv_space_adjust(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. @@ -740,12 +749,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(grow, tp->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < tp->advmss) - rcvmem += 128; - - do_div(rcvwin, tp->advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); @@ -3521,7 +3525,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, { return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || - (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); + (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin)); } /* If we update tp->snd_una, also update tp->bytes_acked */ @@ -4122,9 +4126,8 @@ void tcp_parse_options(const struct net *net, break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: - /* - * The MD5 Hash has already been - * checked (see tcp_v{4,6}_do_rcv()). + /* The MD5 Hash has already been + * checked (see tcp_v{4,6}_rcv()). */ break; #endif @@ -4308,10 +4311,16 @@ static inline bool tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, + u32 seq, u32 end_seq) { - return !before(end_seq, tp->rcv_wup) && - !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); + if (before(end_seq, tp->rcv_wup)) + return SKB_DROP_REASON_TCP_OLD_SEQUENCE; + + if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) + return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + + return SKB_NOT_DROPPED_YET; } /* When we get a reset we do this. */ @@ -5050,13 +5059,19 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* Ok. In sequence. In window. */ queue_and_out: - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { - reason = SKB_DROP_REASON_PROTO_MEM; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + /* TODO: maybe ratelimit these WIN 0 ACK ? 
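/* Sketch of the new ratio introduced above: scaling_ratio approximates
 * payload/truesize scaled by 2^TCP_RMEM_TO_WIN_SCALE, which lets
 * tcp_space_from_win()/tcp_win_from_space() convert between rcvbuf
 * bytes and advertised window without the old per-MSS loop. The demo
 * function is illustrative: */
static u8 demo_scaling_ratio(u32 len, u32 truesize)
{
        u64 val = (u64)len << TCP_RMEM_TO_WIN_SCALE;

        do_div(val, truesize);
        return val ? val : 1;   /* never use a zero ratio */
}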
*/ + inet_csk(sk)->icsk_ack.pending |= + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); + inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); - goto drop; + + if (skb_queue_len(&sk->sk_receive_queue)) { + reason = SKB_DROP_REASON_PROTO_MEM; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + goto drop; + } + sk_forced_mem_schedule(sk, skb->truesize); } eaten = tcp_queue_rcv(sk, skb, &fragstolen); @@ -5734,7 +5749,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } /* Step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." * And page 69: "If an incoming segment is not acceptable, @@ -5751,7 +5767,6 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } else if (tcp_reset_check(sk, skb)) { goto reset; } - SKB_DR_SET(reason, TCP_INVALID_SEQUENCE); goto discard; } @@ -6315,7 +6330,7 @@ consume: if (fastopen_fail) return -1; if (sk->sk_write_pending || - icsk->icsk_accept_queue.rskq_defer_accept || + READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) || inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. @@ -6615,7 +6630,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0) { + if (READ_ONCE(tp->linger2) < 0) { tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; @@ -6985,7 +7000,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); - inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent; + inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2dbdc26da86e..27140e5cdc06 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -57,6 +57,7 @@ #include <linux/init.h> #include <linux/times.h> #include <linux/slab.h> +#include <linux/sched.h> #include <net/net_namespace.h> #include <net/icmp.h> @@ -476,7 +477,6 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; - struct inet_sock *inet; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct sock *sk; @@ -624,8 +624,8 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) * --ANK (980905) */ - inet = inet_sk(sk); - if (!sock_owned_by_user(sk) && inet->recverr) { + if (!sock_owned_by_user(sk) && + inet_test_bit(RECVERR, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { /* Only an error on timeout */ @@ -2448,6 +2448,8 @@ static void *established_get_first(struct seq_file *seq) struct hlist_nulls_node *node; spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); + cond_resched(); + /* Lockless fast path for the common case of empty buckets */ if (empty_bucket(hinfo, st)) continue; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 99ac5efe244d..c196759f1d3b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -990,7 +990,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .resv_start_op = 
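/* How the two halves of the zero-window change fit together (condensed
 * sketch of logic already in this series, not new code): on rmem
 * pressure the receive path schedules an immediate ACK flagged
 * ICSK_ACK_NOMEM; tcp_select_window() further below sees the flag and
 * advertises a temporary zero window instead of storing one. */
static void demo_signal_rmem_pressure(struct sock *sk)
{
        inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOMEM | ICSK_ACK_NOW;
        inet_csk_schedule_ack(sk);
        sk->sk_data_ready(sk);
}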
TCP_METRICS_CMD_DEL + 1, }; -static unsigned int tcpmhash_entries; +static unsigned int tcpmhash_entries __initdata; static int __init set_tcpmhash_entries(char *str) { ssize_t ret; @@ -1006,15 +1006,11 @@ static int __init set_tcpmhash_entries(char *str) } __setup("tcpmhash_entries=", set_tcpmhash_entries); -static int __net_init tcp_net_metrics_init(struct net *net) +static void __init tcp_metrics_hash_alloc(void) { + unsigned int slots = tcpmhash_entries; size_t size; - unsigned int slots; - if (!net_eq(net, &init_net)) - return 0; - - slots = tcpmhash_entries; if (!slots) { if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; @@ -1027,9 +1023,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) - return -ENOMEM; - - return 0; + panic("Could not allocate the tcp_metrics hash table\n"); } static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) @@ -1038,7 +1032,6 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, .exit_batch = tcp_net_metrics_exit_batch, }; @@ -1046,9 +1039,11 @@ void __init tcp_metrics_init(void) { int ret; + tcp_metrics_hash_alloc(); + ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) - panic("Could not allocate the tcp_metrics hash table\n"); + panic("Could not register tcp_net_metrics_ops\n"); ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c8f2aa003387..b98d476f1594 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -289,9 +289,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - struct inet_sock *inet = inet_sk(sk); - tw->tw_transparent = inet->transparent; + tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_mark = sk->sk_mark; tw->tw_priority = sk->sk_priority; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; @@ -570,8 +569,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ - if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req))) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; @@ -794,7 +791,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. 
*/ - if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 51d8638d4b4c..e6b4fbd642f7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -257,11 +257,19 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 old_win = tp->rcv_wnd; - u32 cur_win = tcp_receive_window(tp); - u32 new_win = __tcp_select_window(sk); struct net *net = sock_net(sk); + u32 old_win = tp->rcv_wnd; + u32 cur_win, new_win; + + /* Make the window 0 if we failed to queue the data because we + * are out of memory. The window is temporary, so we don't store + * it on the socket. + */ + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) + return 0; + cur_win = tcp_receive_window(tp); + new_win = __tcp_select_window(sk); if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else @@ -1293,14 +1301,21 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - /* if no packet is in qdisc/device queue, then allow XPS to select - * another queue. We can be called from tcp_tsq_handler() - * which holds one reference to sk. - * - * TODO: Ideally, in-flight pure ACK packets should not matter here. - * One way to get this would be to set skb->truesize = 2 on them. + /* We set skb->ooo_okay to one if this packet can select + * a different TX queue than prior packets of this flow, + * to avoid self inflicted reorders. + * The 'other' queue decision is based on current cpu number + * if XPS is enabled, or sk->sk_txhash otherwise. + * We can switch to another (and better) queue if: + * 1) No packet with payload is in qdisc/device queues. + * Delays in TX completion can defeat the test + * even if packets were already sent. + * 2) Or rtx queue is empty. + * This mitigates above case if ACK packets for + * all prior packets were already processed. 
- skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); + skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) || + tcp_rtx_queue_empty(sk); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : @@ -3741,11 +3756,6 @@ static void tcp_connect_init(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; -#ifdef CONFIG_TCP_MD5SIG - if (tp->af_specific->md5_lookup(sk, sk)) - tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; -#endif - /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 206418b6d7c4..984ab4a0421e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -26,14 +26,15 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - u32 elapsed, start_ts; + u32 elapsed, start_ts, user_timeout; s32 remaining; start_ts = tcp_sk(sk)->retrans_stamp; - if (!icsk->icsk_user_timeout) + user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (!user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; - remaining = icsk->icsk_user_timeout - elapsed; + remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ @@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { struct inet_connection_sock *icsk = inet_csk(sk); - u32 remaining; + u32 remaining, user_timeout; s32 elapsed; - if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp) + user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (!user_timeout || !icsk->icsk_probes_tstamp) return when; elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp; if (unlikely(elapsed < 0)) elapsed = 0; - remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed; + remaining = msecs_to_jiffies(user_timeout) - elapsed; remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN); return min_t(u32, remaining, when); @@ -239,7 +241,8 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) __dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? : + /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */ + retry_until = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); max_retransmits = retry_until; @@ -269,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk) } if (!expired) expired = retransmits_timed_out(sk, retry_until, - icsk->icsk_user_timeout); + READ_ONCE(icsk->icsk_user_timeout)); tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) @@ -383,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement a similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */
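icsk_user_timeout, which the two clamp helpers above fold into the RTO and probe0 timers, is configured with the TCP_USER_TIMEOUT socket option: an unsigned int in milliseconds bounding how long transmitted data may remain unacknowledged before the kernel aborts the connection. A hedged userspace sketch, assuming fd is a connected TCP socket:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	/* Abort if data stays unacked for ~30s; 0 restores the default behaviour. */
	static int cap_unacked_time(int fd)
	{
		unsigned int ms = 30 * 1000;	/* milliseconds */

		return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &ms, sizeof(ms));
	}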
- if (!icsk->icsk_probes_tstamp) + if (!icsk->icsk_probes_tstamp) { icsk->icsk_probes_tstamp = tcp_jiffies32; - else if (icsk->icsk_user_timeout && - (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= - msecs_to_jiffies(icsk->icsk_user_timeout)) - goto abort; + } else { + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (user_timeout && + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= + msecs_to_jiffies(user_timeout)) + goto abort; + } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -421,8 +427,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) req->rsk_ops->syn_ack_timeout(req); - /* add one more retry for fastopen */ - max_retries = icsk->icsk_syn_retries ? : + /* Add one more retry for fastopen. + * Paired with WRITE_ONCE() in tcp_sock_set_syncnt() + */ + max_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1; if (req->num_timeout >= max_retries) { @@ -446,6 +454,22 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) req->timeout << req->num_timeout, TCP_RTO_MAX); } +static bool tcp_rtx_probe0_timed_out(const struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const int timeout = TCP_RTO_MAX * 2; + u32 rcv_delta, rtx_delta; + + rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; + if (rcv_delta <= timeout) + return false; + + rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) - + (tp->retrans_stamp ?: tcp_skb_timestamp(skb))); + + return rtx_delta > timeout; +} /** * tcp_retransmit_timer() - The TCP retransmit timeout handler @@ -495,23 +519,26 @@ void tcp_retransmit_timer(struct sock *sk) * we cannot allow such beasts to hang infinitely.
*/ struct inet_sock *inet = inet_sk(sk); + u32 rtx_delta; + + rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb)); if (sk->sk_family == AF_INET) { - net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", - &inet->inet_daddr, - ntohs(inet->inet_dport), - inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", + &inet->inet_daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt, + jiffies_to_msecs(jiffies - tp->rcv_tstamp), + rtx_delta); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", - &sk->sk_v6_daddr, - ntohs(inet->inet_dport), - inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", + &sk->sk_v6_daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt, + jiffies_to_msecs(jiffies - tp->rcv_tstamp), + rtx_delta); } #endif - if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { + if (tcp_rtx_probe0_timed_out(sk, skb)) { tcp_write_err(sk); goto out; } @@ -708,7 +735,7 @@ static void tcp_keepalive_timer (struct timer_list *t) tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { - if (tp->linger2 >= 0) { + if (READ_ONCE(tp->linger2) >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { @@ -733,13 +760,15 @@ static void tcp_keepalive_timer (struct timer_list *t) elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); + /* If the TCP_USER_TIMEOUT option is enabled, use that * to determine when to time out instead.
*/ - if ((icsk->icsk_user_timeout != 0 && - elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) && + if ((user_timeout != 0 && + elapsed >= msecs_to_jiffies(user_timeout) && icsk->icsk_probes_out > 0) || - (icsk->icsk_user_timeout == 0 && + (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index abfa860367aa..0794a2c46a56 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -407,9 +407,9 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static u32 udp_ehashfn(const struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport) { static u32 udp_ehash_secret __read_mostly; @@ -419,22 +419,6 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) -{ - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - } - return reuse_sk; -} - /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, @@ -452,42 +436,36 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); + badness = score; + + if (sk->sk_state == TCP_ESTABLISHED) { + result = sk; + continue; + } + + result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp_ehashfn); + if (!result) { + result = sk; + continue; + } + /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) + if (!reuseport_has_conns(sk)) return result; - result = result ? : sk; - badness = score; + /* Reuseport logic returned an error, keep original score. */ + if (IS_ERR(result)) + continue; + + badness = compute_score(result, net, saddr, sport, + daddr, hnum, dif, sdif); + } } return result; } -static struct sock *udp4_lookup_run_bpf(struct net *net, - struct udp_table *udptable, - struct sk_buff *skb, - __be32 saddr, __be16 sport, - __be32 daddr, u16 hnum, const int dif) -{ - struct sock *sk, *reuse_sk; - bool no_reuseport; - - if (udptable != net->ipv4.udp_table) - return NULL; /* only UDP is supported */ - - no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport, - daddr, hnum, dif, &sk); - if (no_reuseport || IS_ERR_OR_NULL(sk)) - return sk; - - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); - if (reuse_sk) - sk = reuse_sk; - return sk; -} - /* UDP is nearly always wildcarded out the wazoo; it makes no sense to try * harder than this. -DaveM */
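The rewrite above folds the old lookup_reuseport() into a select-then-rescore loop: once a group member wins on score, reuseport selection picks the final socket, and a group that contains connected sockets forces the selected candidate to be re-scored rather than returned blindly. For reference, a hedged sketch of the userspace setup that creates such a group; port 5353 is illustrative, and calling this twice in one process yields a two-member group:

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <sys/socket.h>

	static int bind_group_member(void)
	{
		struct sockaddr_in addr = {
			.sin_family = AF_INET,
			.sin_port = htons(5353),
			.sin_addr.s_addr = htonl(INADDR_ANY),
		};
		int one = 1;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0)
			return -1;
		/* must be set on every member before bind() */
		setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
		return bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ? -1 : fd;
	}

Connecting one member (connect() on a UDP socket) is what makes reuseport_has_conns() true and triggers the fallback rescoring seen in the loop.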
@@ -512,9 +490,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto done; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - sk = udp4_lookup_run_bpf(net, udptable, skb, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + udptable == net->ipv4.udp_table) { + sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, dif, + udp_ehashfn); if (sk) { result = sk; goto done; } @@ -799,7 +779,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) (u8 *)(uh+1)); goto out; } - if (!inet->recverr) { + if (!inet_test_bit(RECVERR, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else @@ -982,7 +962,8 @@ csum_partial: send: err = ip_send_skb(sock_net(sk), skb); if (err) { - if (err == -ENOBUFS && !inet->recverr) { + if (err == -ENOBUFS && + !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; @@ -1557,7 +1538,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) spin_unlock(&list->lock); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); busylock_release(busy); return 0; @@ -1890,7 +1871,7 @@ try_again: if (udp_sk(sk)->gro_enabled) udp_cmsg_recv(msg, sk, skb); - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); err = copied; @@ -2412,7 +2393,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - sk = skb_steal_sock(skb, &refcounted); + sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, + &refcounted, udp_ehashfn); + if (IS_ERR(sk)) + goto no_sk; + if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -2433,7 +2418,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); - +no_sk: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; nf_reset_ct(skb); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 5f8104cf082d..9b18f371af0d 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -63,7 +63,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct sock *sk = sock->sk; /* Disable multicast loopback */ - inet_sk(sk)->mc_loop = 0; + inet_clear_bit(MC_LOOP, sk); /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ inet_inc_convert_csum(sk); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 9403bbaf1b61..cdcc0f6b4f0a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -124,22 +124,13 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) xfrm_dst_destroy(xdst); } -static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) -{ - if (!unregister) - return; - - xfrm_dst_ifdown(dst, dev); -} - static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, - .ifdown = xfrm4_dst_ifdown, + .ifdown = xfrm_dst_ifdown, .local_out = __ip_local_out, .gc_thresh = 32768, }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 94cec2075eee..47d1dd8501b7 100644 --- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c @@ -202,6 +202,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -262,6 +263,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -1061,20 +1063,28 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, struct fib6_info *f6i = NULL; int err = 0; - if (addr_type == IPV6_ADDR_ANY || - (addr_type & IPV6_ADDR_MULTICAST && - !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) || - (!(idev->dev->flags & IFF_LOOPBACK) && - !netif_is_l3_master(idev->dev) && - addr_type & IPV6_ADDR_LOOPBACK)) + if (addr_type == IPV6_ADDR_ANY) { + NL_SET_ERR_MSG_MOD(extack, "Invalid address"); return ERR_PTR(-EADDRNOTAVAIL); + } else if (addr_type & IPV6_ADDR_MULTICAST && + !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) { + NL_SET_ERR_MSG_MOD(extack, "Cannot assign multicast address without \"IFA_F_MCAUTOJOIN\" flag"); + return ERR_PTR(-EADDRNOTAVAIL); + } else if (!(idev->dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(idev->dev) && + addr_type & IPV6_ADDR_LOOPBACK) { + NL_SET_ERR_MSG_MOD(extack, "Cannot assign loopback address on this device"); + return ERR_PTR(-EADDRNOTAVAIL); + } if (idev->dead) { - err = -ENODEV; /*XXX*/ + NL_SET_ERR_MSG_MOD(extack, "device is going away"); + err = -ENODEV; goto out; } if (idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); err = -EACCES; goto out; } @@ -1101,7 +1111,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags); + f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags, extack); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); f6i = NULL; @@ -2731,6 +2741,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) return; } + if (valid_lft != 0 && valid_lft < in6_dev->cnf.accept_ra_min_lft) + goto put; + /* * Two things going on here: * 1) Add routes for on-link prefixes @@ -2925,30 +2938,40 @@ static int inet6_addr_add(struct net *net, int ifindex, ASSERT_RTNL(); - if (cfg->plen > 128) + if (cfg->plen > 128) { + NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; + } /* check the lifetime */ - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) + if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) { + NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); return -EINVAL; + } - if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) + if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) { + NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64"); return -EINVAL; + } dev = __dev_get_by_index(net, ifindex); if (!dev) return -ENODEV; idev = addrconf_add_dev(dev); - if (IS_ERR(idev)) + if (IS_ERR(idev)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return PTR_ERR(idev); + } if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, true, cfg->pfx, ifindex); - if (ret < 0) + if (ret < 0) { + NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed"); return ret; + } } cfg->scope = ipv6_addr_scope(cfg->pfx); @@ -3005,22 +3028,29 @@ static int 
inet6_addr_add(struct net *net, int ifindex, } static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, - const struct in6_addr *pfx, unsigned int plen) + const struct in6_addr *pfx, unsigned int plen, + struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; - if (plen > 128) + if (plen > 128) { + NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; + } dev = __dev_get_by_index(net, ifindex); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); return -ENODEV; + } idev = __in6_dev_get(dev); - if (!idev) + if (!idev) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return -ENXIO; + } read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { @@ -3043,6 +3073,8 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, } } read_unlock_bh(&idev->lock); + + NL_SET_ERR_MSG_MOD(extack, "address not found"); return -EADDRNOTAVAIL; } @@ -3085,7 +3117,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) rtnl_lock(); err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, - ireq.ifr6_prefixlen); + ireq.ifr6_prefixlen, NULL); rtnl_unlock(); return err; } @@ -3488,7 +3520,7 @@ static int fixup_permanent_addr(struct net *net, struct fib6_info *f6i, *prev; f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, - GFP_ATOMIC); + GFP_ATOMIC, NULL); if (IS_ERR(f6i)) return PTR_ERR(f6i); @@ -4698,7 +4730,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa_flags &= IFA_F_MANAGETEMPADDR; return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, - ifm->ifa_prefixlen); + ifm->ifa_prefixlen, extack); } static int modify_prefix_route(struct inet6_ifaddr *ifp, @@ -4903,8 +4935,10 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, } dev = __dev_get_by_index(net, ifm->ifa_index); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); return -ENODEV; + } if (tb[IFA_FLAGS]) cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]); @@ -4939,10 +4973,12 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, } if (nlh->nlmsg_flags & NLM_F_EXCL || - !(nlh->nlmsg_flags & NLM_F_REPLACE)) + !(nlh->nlmsg_flags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "address already assigned"); err = -EEXIST; - else + } else { err = inet6_addr_modify(net, ifa, &cfg); + } in6_ifa_put(ifa); @@ -5602,6 +5638,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na; + array[DEVCONF_ACCEPT_RA_MIN_LFT] = cnf->accept_ra_min_lft; } static inline size_t inet6_ifla6_size(void) @@ -6796,6 +6833,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "accept_ra_min_lft", + .data = &ipv6_devconf.accept_ra_min_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, .maxlen = sizeof(int), diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5d593ddc0347..368824fe9719 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -102,9 +102,9 @@ bool ipv6_mod_enabled(void) } EXPORT_SYMBOL_GPL(ipv6_mod_enabled); -static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { - const int 
offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + const int offset = sk->sk_prot->ipv6_pinfo_offset; return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } @@ -200,12 +200,12 @@ lookup_protocol: sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); - inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) - inet->hdrincl = 1; + inet_set_bit(HDRINCL, sk); } sk->sk_destruct = inet6_sock_destruct; @@ -229,7 +229,7 @@ lookup_protocol: */ inet->uc_ttl = -1; - inet->mc_loop = 1; + inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet->mc_index = 0; RCU_INIT_POINTER(inet->mc_list, NULL); @@ -399,7 +399,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if (snum || !(inet->bind_address_no_port || + if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { @@ -435,10 +435,8 @@ out_unlock: goto out; } -/* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sock *sk = sock->sk; u32 flags = BIND_WITH_LOCK; const struct proto *prot; int err = 0; @@ -462,6 +460,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return __inet6_bind(sk, uaddr, addr_len, flags); } + +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + return inet6_bind_sk(sock->sk, uaddr, addr_len); +} EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index dacdea7fcb62..bb17f484ee2c 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -305,7 +305,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC); + f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); goto out; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9b6818453afe..41ebc4e57473 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -38,10 +38,11 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } -static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) +static void ip6_datagram_flow_key_init(struct flowi6 *fl6, + const struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); @@ -523,7 +524,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) } else { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); - if (inet_sk(sk)->cmsg_flags) + if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 202fc3aaa83c..4952ae792450 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -612,8 +612,6 @@ looped_back: kfree(buf); - skb_dst_drop(skb); - ip6_route_input(skb); if (skb_dst(skb)->error) { @@ -650,7 +648,6 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) struct inet6_dev *idev = 
__in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; - struct in6_addr daddr; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; @@ -798,9 +795,7 @@ looped_back: return -1; } - daddr = *addr; - *addr = ipv6_hdr(skb)->daddr; - ipv6_hdr(skb)->daddr = daddr; + swap(*addr, ipv6_hdr(skb)->daddr); ip6_route_input(skb); if (skb_dst(skb)->error) { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 65fa5014bc85..6d88f5248c1f 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1034,11 +1034,9 @@ drop_no_count: return 0; } -void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, - u8 type, +void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, - const struct in6_addr *daddr, - int oif) + const struct in6_addr *daddr, int oif) { memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 3faf62530d6a..69caed07315f 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <net/genetlink.h> -#include <net/ila.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index bee45dfeb187..67e8c9440977 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -5,7 +5,6 @@ #include <linux/rhashtable.h> #include <linux/vmalloc.h> #include <net/genetlink.h> -#include <net/ila.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b64b49012655..b0e8d278e8a9 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -39,6 +39,7 @@ u32 inet6_ehashfn(const struct net *net, return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so @@ -111,22 +112,40 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - unsigned short hnum) +/** + * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. + * @net: network namespace. + * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. + * @skb: context for a potential SK_REUSEPORT program. + * @doff: header offset. + * @saddr: source address. + * @sport: source port. + * @daddr: destination address. + * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. + * + * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to + * the selected sock or an error. 
+ */ +struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum, + inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } +EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(struct net *net, @@ -143,8 +162,8 @@ static struct sock *inet6_lhash2_lookup(struct net *net, sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { - result = lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + result = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; @@ -156,30 +175,30 @@ static struct sock *inet6_lhash2_lookup(struct net *net, return result; } -static inline struct sock *inet6_lookup_run_bpf(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, const int dif) +struct sock *inet6_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, const int dif, + inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != net->ipv4.tcp_death_row.hashinfo) - return NULL; /* only TCP is supported */ - - no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, saddr, sport, + no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } +EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, @@ -193,9 +212,11 @@ struct sock *inet6_lookup_listener(struct net *net, unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - result = inet6_lookup_run_bpf(net, hashinfo, skb, doff, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + hashinfo == net->ipv4.tcp_death_row.hashinfo) { + result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, + saddr, sport, daddr, hnum, dif, + inet6_ehashfn); if (result) goto done; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bac768d36cc1..28b01a068412 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -160,6 +160,8 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); + INIT_HLIST_NODE(&f6i->gc_link); + return f6i; } @@ -246,6 +248,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); + INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; @@ 
-1057,6 +1060,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, lockdep_is_held(&table->tb6_lock)); } } + + fib6_clean_expires_locked(rt); } /* @@ -1118,9 +1123,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) - fib6_clean_expires(iter); + fib6_clean_expires_locked(iter); else - fib6_set_expires(iter, rt->expires); + fib6_set_expires_locked(iter, + rt->expires); if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, @@ -1479,6 +1485,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); + + if (fib6_has_expires(rt)) + hlist_add_head(&rt->gc_link, &table->tb6_gc_hlist); + fib6_start_gc(info->nl_net, rt); } @@ -2285,9 +2295,8 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct fib6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { - struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -2295,7 +2304,7 @@ static int fib6_age(struct fib6_info *rt, void *arg) * Routes are expired even if they are in use. */ - if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { + if (fib6_has_expires(rt) && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; @@ -2312,6 +2321,40 @@ static int fib6_age(struct fib6_info *rt, void *arg) return 0; } +static void fib6_gc_table(struct net *net, + struct fib6_table *tb6, + struct fib6_gc_args *gc_args) +{ + struct fib6_info *rt; + struct hlist_node *n; + struct nl_info info = { + .nl_net = net, + .skip_notify = false, + }; + + hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) + if (fib6_age(rt, gc_args) == -1) + fib6_del(rt, &info); +} + +static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) +{ + struct fib6_table *table; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { + spin_lock_bh(&table->tb6_lock); + fib6_gc_table(net, table, gc_args); + spin_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} + void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; @@ -2327,7 +2370,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; - fib6_clean_all(net, fib6_age, &gc_args); + fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1e8c90e97608..f8a1f6bb3f87 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1591,7 +1591,7 @@ emsgsize: } } } else if ((flags & MSG_SPLICE_PAGES) && length) { - if (inet_sk(sk)->hdrincl) + if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) @@ -1693,7 +1693,10 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; copy = datalen - transhdrlen - fraggap - pagedlen; - if (copy < 0) { + /* [!] NOTE: copy may be negative if pagedlen>0 + * because then the equation may reduce to -fraggap. + */
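MSG_SPLICE_PAGES, which the hunks here teach the IPv6 append path to handle (including the negative-copy bookkeeping the note above describes), is an in-kernel sendmsg flag: it is masked out of userspace sendmsg(), and kernel callers use it to donate page fragments to a socket without copying. A hedged sketch of such a caller; sock, page, offset and len are assumptions supplied by surrounding code:

	#include <linux/bvec.h>
	#include <linux/socket.h>
	#include <linux/uio.h>
	#include <net/sock.h>

	static int demo_splice_page(struct socket *sock, struct page *page,
				    unsigned int offset, unsigned int len)
	{
		struct bio_vec bvec;
		struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES };

		bvec_set_page(&bvec, page, len, offset);	/* no data copy */
		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
		return sock_sendmsg(sock, &msg);
	}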
+ if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { err = -EINVAL; goto error; } @@ -1744,6 +1747,8 @@ alloc_new_skb: err = -EFAULT; kfree_skb(skb); goto error; + } else if (flags & MSG_SPLICE_PAGES) { + copy = 0; } offset += copy; @@ -1791,6 +1796,10 @@ alloc_new_skb: } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; + err = -EIO; + if (WARN_ON_ONCE(copy > msg->msg_iter.count)) + goto error; + err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) goto error; @@ -1986,7 +1995,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); u8 icmp6_type; - if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl) + if (sk->sk_socket->type == SOCK_RAW && + !inet_test_bit(HDRINCL, sk)) icmp6_type = fl6->fl6_icmp_type; else icmp6_type = icmp6_hdr(skb)->icmp6_type; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index ae818ff46224..0e2a0847b387 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -102,7 +102,7 @@ int ip6_ra_control(struct sock *sk, int sel) struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { - if (inet_sk(sk)->is_icsk) { + if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { @@ -474,8 +474,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); - sk->sk_socket->ops = &inet_stream_ops; - sk->sk_family = PF_INET; + WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); + WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; @@ -488,8 +488,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); - sk->sk_socket->ops = &inet_dgram_ops; - sk->sk_family = PF_INET; + WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); + WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, @@ -633,7 +633,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ - inet_sk(sk)->transparent = valbool; + inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; @@ -641,7 +641,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ - inet_sk(sk)->freebind = valbool; + inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; @@ -831,7 +831,7 @@ done: goto e_inval; retv = -EPROTO; - if (inet_sk(sk)->is_icsk) + if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; @@ -923,7 +923,7 @@ done: goto e_inval; np->recverr = valbool; if (!val) - skb_queue_purge(&sk->sk_error_queue); + skb_errqueue_purge(&sk->sk_error_queue); retv = 0; break; case IPV6_FLOWINFO_SEND: @@ -1330,11 +1330,11 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } case IPV6_TRANSPARENT: - val = inet_sk(sk)->transparent; + val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: - val = inet_sk(sk)->freebind; + val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 714cdc9e2b8e..5ce25bcb9974 100644 ---
a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1699,11 +1699,9 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } -static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - int proto, int len) +static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, + struct net_device *dev, const struct in6_addr *saddr, + const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a42be96ae209..553c8664e0a7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1267,10 +1267,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) } #endif - /* - * set the RA_RECV flag in the interface - */ - in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", @@ -1328,6 +1324,14 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_lft) { + ND_PRINTK(2, info, + "RA: router lifetime (%ds) is too short: %s\n", + lifetime, skb->dev->name); + goto skip_defrtr; + } + /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ @@ -1340,8 +1344,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } - lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); - #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ @@ -1517,6 +1519,9 @@ skip_linkparms: if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; + if (ri->lifetime != 0 && + ntohl(ri->lifetime) < in6_dev->cnf.accept_ra_min_lft) + continue; if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index cb4eb1d2c620..d59b296b4f51 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> +#include <linux/rcupdate.h> #include <linux/sysctl.h> #include <net/ipv6_frag.h> @@ -96,6 +97,12 @@ static void __net_exit defrag6_net_exit(struct net *net) } } +static const struct nf_defrag_hook defrag_hook = { + .owner = THIS_MODULE, + .enable = nf_defrag_ipv6_enable, + .disable = nf_defrag_ipv6_disable, +}; + static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, }; @@ -114,6 +121,9 @@ static int __init nf_defrag_init(void) pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } + + rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook); + return ret; cleanup_frag6: @@ -124,6 +134,7 @@ cleanup_frag6: static void __exit nf_defrag_fini(void) { + rcu_assign_pointer(nf_defrag_v6_hook, NULL); unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index c2c291827a2c..1b2772834972 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -215,6 +215,7 @@ struct proto pingv6_prot = { .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), }; EXPORT_SYMBOL_GPL(pingv6_prot); diff --git 
a/net/ipv6/raw.c b/net/ipv6/raw.c index 49381f35b623..0eae7661a85c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -291,7 +291,6 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); int err; int harderr; @@ -315,7 +314,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, } if (np->recverr) { u8 *payload = skb->data; - if (!inet->hdrincl) + if (!inet_test_bit(HDRINCL, sk)) payload += offset; ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); } @@ -406,7 +405,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) skb->len, inet->inet_num, 0)); - if (inet->hdrincl) { + if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); @@ -762,12 +761,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; - /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields. - * Doing this indirectly yields the same result. - */ - hdrincl = inet->hdrincl; - hdrincl = READ_ONCE(hdrincl); + hdrincl = inet_test_bit(HDRINCL, sk); /* * Get and verify the address. @@ -1000,7 +994,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_HDRINCL: if (sk->sk_type != SOCK_RAW) return -EINVAL; - inet_sk(sk)->hdrincl = !!val; + inet_assign_bit(HDRINCL, sk, val); return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && @@ -1068,7 +1062,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case IPV6_HDRINCL: - val = inet_sk(sk)->hdrincl; + val = inet_test_bit(HDRINCL, sk); break; case IPV6_CHECKSUM: /* @@ -1216,6 +1210,7 @@ struct proto rawv6_prot = { .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 56a55585eb79..846aec8e0093 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -90,7 +90,7 @@ unsigned int ip6_mtu(const struct dst_entry *dst); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, - struct net_device *dev, int how); + struct net_device *dev); static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); @@ -371,8 +371,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) fib6_info_release(from); } -static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) +static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; @@ -3761,10 +3760,10 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->dst_nocount = true; if (cfg->fc_flags & RTF_EXPIRES) - fib6_set_expires(rt, jiffies + - clock_t_to_jiffies(cfg->fc_expires)); + fib6_set_expires_locked(rt, jiffies + + clock_t_to_jiffies(cfg->fc_expires)); else - fib6_clean_expires(rt); + fib6_clean_expires_locked(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -4544,7 +4543,8 @@ static int 
ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, - bool anycast, gfp_t gfp_flags) + bool anycast, gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, @@ -4566,7 +4566,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, cfg.fc_flags |= RTF_LOCAL; } - f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); + f6i = ip6_route_info_create(&cfg, gfp_flags, extack); if (!IS_ERR(f6i)) { f6i->dst_nocount = true; @@ -4581,21 +4581,19 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { - struct net_device *dev; struct net *net; struct in6_addr *addr; }; static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { - struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (!rt->nh && - ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && - ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { + ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && + !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; @@ -4608,7 +4606,6 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); struct arg_dev_net_ip adni = { - .dev = ifp->idev->dev, .net = net, .addr = &ifp->addr, }; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index b1c028df686e..a013b92cbb86 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -272,8 +272,6 @@ static int rpl_input(struct sk_buff *skb) dst = dst_cache_get(&rlwt->cache); preempt_enable(); - skb_dst_drop(skb); - if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); @@ -284,6 +282,7 @@ static int rpl_input(struct sk_buff *skb) preempt_enable(); } } else { + skb_dst_drop(skb); skb_dst_set(skb, dst); } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index dd433cc265c8..24e2b4b494cb 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -109,15 +109,19 @@ struct bpf_lwt_prog { #define next_csid_chk_lcnode_fn_bits(flen) \ next_csid_chk_lcblock_bits(flen) +/* flag indicating that flavors are set up for a given End* behavior */ +#define SEG6_F_LOCAL_FLAVORS SEG6_F_ATTR(SEG6_LOCAL_FLAVORS) + #define SEG6_F_LOCAL_FLV_OP(flvname) BIT(SEG6_LOCAL_FLV_OP_##flvname) +#define SEG6_F_LOCAL_FLV_NEXT_CSID SEG6_F_LOCAL_FLV_OP(NEXT_CSID) #define SEG6_F_LOCAL_FLV_PSP SEG6_F_LOCAL_FLV_OP(PSP) /* Supported RFC8986 Flavor operations are reported in this bitmask */ #define SEG6_LOCAL_FLV8986_SUPP_OPS SEG6_F_LOCAL_FLV_PSP -/* Supported Flavor operations are reported in this bitmask */ -#define SEG6_LOCAL_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_OP(NEXT_CSID) | \ +#define SEG6_LOCAL_END_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_NEXT_CSID | \ SEG6_LOCAL_FLV8986_SUPP_OPS) +#define SEG6_LOCAL_END_X_FLV_SUPP_OPS SEG6_F_LOCAL_FLV_NEXT_CSID struct seg6_flavors_info { /* Flavor operations */ @@ -411,9 +415,72 @@ static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) return input_action_end_finish(skb, slwt); } +static int input_action_end_x_finish(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + seg6_lookup_nexthop(skb, &slwt->nh6, 0); + + return dst_input(skb); +} + +static int 
input_action_end_x_core(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + return input_action_end_x_finish(skb, slwt); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int end_x_next_csid_core(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + + if (seg6_next_csid_is_arg_zero(daddr, finfo)) + return input_action_end_x_core(skb, slwt); + + /* update DA */ + seg6_next_csid_advance_arg(daddr, finfo); + + return input_action_end_x_finish(skb, slwt); +} + static bool seg6_next_csid_enabled(__u32 fops) { - return fops & BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID); + return fops & SEG6_F_LOCAL_FLV_NEXT_CSID; +} + +/* Processing of SRv6 End, End.X, and End.T behaviors can be extended through + * the flavors framework. These behaviors must report the subset of (flavor) + * operations they currently implement. In this way, if a user specifies a + * flavor combination that is not supported by a given End* behavior, the + * kernel refuses to instantiate the tunnel, reporting the error. + */ +static int seg6_flv_supp_ops_by_action(int action, __u32 *fops) +{ + switch (action) { + case SEG6_LOCAL_ACTION_END: + *fops = SEG6_LOCAL_END_FLV_SUPP_OPS; + break; + case SEG6_LOCAL_ACTION_END_X: + *fops = SEG6_LOCAL_END_X_FLV_SUPP_OPS; + break; + default: + return -EOPNOTSUPP; + } + + return 0; } /* We describe the packet state in relation to the absence/presence of the SRH @@ -746,21 +813,14 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) /* regular endpoint, and forward to specified nexthop */ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - struct ipv6_sr_hdr *srh; - - srh = get_and_validate_srh(skb); - if (!srh) - goto drop; - - advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - - seg6_lookup_nexthop(skb, &slwt->nh6, 0); + const struct seg6_flavors_info *finfo = &slwt->flv_info; + __u32 fops = finfo->flv_ops; - return dst_input(skb); + /* check for the presence of NEXT-C-SID since it applies first */ + if (seg6_next_csid_enabled(fops)) + return end_x_next_csid_core(skb, slwt); -drop: - kfree_skb(skb); - return -EINVAL; + return input_action_end_x_core(skb, slwt); } static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt) @@ -1404,13 +1464,14 @@ static struct seg6_action_desc seg6_action_table[] = { .action = SEG6_LOCAL_ACTION_END, .attrs = 0, .optattrs = SEG6_F_LOCAL_COUNTERS | - SEG6_F_ATTR(SEG6_LOCAL_FLAVORS), + SEG6_F_LOCAL_FLAVORS, .input = input_action_end, }, { .action = SEG6_LOCAL_ACTION_END_X, .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6), - .optattrs = SEG6_F_LOCAL_COUNTERS, + .optattrs = SEG6_F_LOCAL_COUNTERS | + SEG6_F_LOCAL_FLAVORS, .input = input_action_end_x, }, { @@ -2070,7 +2131,8 @@ static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt, { struct seg6_flavors_info *finfo = &slwt->flv_info; struct nlattr *tb[SEG6_LOCAL_FLV_MAX + 1]; - unsigned long fops; + int action = slwt->action; + __u32 fops, supp_fops; int rc; rc = nla_parse_nested_deprecated(tb, SEG6_LOCAL_FLV_MAX, @@ -2086,7 +2148,8 @@ static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt, return -EINVAL; fops = nla_get_u32(tb[SEG6_LOCAL_FLV_OPERATION]); - if (fops & ~SEG6_LOCAL_FLV_SUPP_OPS) { + rc = seg6_flv_supp_ops_by_action(action, &supp_fops); + if
(rc < 0 || (fops & ~supp_fops)) { NL_SET_ERR_MSG(extack, "Unsupported Flavor operation(s)"); return -EOPNOTSUPP; } @@ -2618,6 +2681,11 @@ int __init seg6_local_init(void) */ BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long)); + /* Check whether the number of defined flavors exceeds the maximum + * allowed value. + */ + BUILD_BUG_ON(SEG6_LOCAL_FLV_OP_MAX + 1 > BITS_PER_TYPE(__u32)); + /* If the default NEXT-C-SID Locator-Block/Node Function lengths (in * bits) have been changed with invalid values, kernel build stops * here. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6e86721e1cdb..3a88545a265d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2176,6 +2176,7 @@ struct proto tcpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f787e6b8424c..ebc6ae47cfea 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -72,11 +72,12 @@ int udpv6_init_sock(struct sock *sk) return 0; } -static u32 udp6_ehashfn(const struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp6_ehashfn(const struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { static u32 udp6_ehash_secret __read_mostly; static u32 udp_ipv6_hash_secret __read_mostly; @@ -161,24 +162,6 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - unsigned int hnum) -{ - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - } - return reuse_sk; -} - /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -195,44 +178,35 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); + badness = score; + + if (sk->sk_state == TCP_ESTABLISHED) { + result = sk; + continue; + } + + result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp6_ehashfn); + if (!result) { + result = sk; + continue; + } + /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) + if (!reuseport_has_conns(sk)) return result; - result = result ? : sk; - badness = score; + /* Reuseport logic returned an error, keep original score. */
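Both lookup2 rewrites in this series lean on the kernel's encoded-error-pointer convention: inet_lookup_reuseport()/inet6_lookup_reuseport() may return NULL (no reuseport selection), a valid socket, or an ERR_PTR() from an SK_REUSEPORT program, and the caller separates the three with IS_ERR(). A standalone sketch of the tri-state pattern, with hypothetical names:

	#include <linux/err.h>
	#include <linux/types.h>

	struct demo_obj;			/* hypothetical */
	struct demo_obj *demo_find(int key);	/* hypothetical backing lookup, may return NULL */

	/* NULL = no match, ERR_PTR(-E...) = failure, anything else = a hit */
	static struct demo_obj *demo_lookup(int key)
	{
		if (key < 0)
			return ERR_PTR(-EINVAL);	/* errno encoded in the pointer */
		return demo_find(key);
	}

	static bool demo_hit(struct demo_obj **out, int key)
	{
		struct demo_obj *obj = demo_lookup(key);

		if (!obj || IS_ERR(obj))	/* miss or error: keep prior state */
			return false;
		*out = obj;
		return true;
	}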
*/ + if (IS_ERR(result)) + continue; + + badness = compute_score(sk, net, saddr, sport, + daddr, hnum, dif, sdif); } } return result; } -static inline struct sock *udp6_lookup_run_bpf(struct net *net, - struct udp_table *udptable, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - u16 hnum, const int dif) -{ - struct sock *sk, *reuse_sk; - bool no_reuseport; - - if (udptable != net->ipv4.udp_table) - return NULL; /* only UDP is supported */ - - no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport, - daddr, hnum, dif, &sk); - if (no_reuseport || IS_ERR_OR_NULL(sk)) - return sk; - - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); - if (reuse_sk) - sk = reuse_sk; - return sk; -} - /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -257,9 +231,11 @@ struct sock *__udp6_lib_lookup(struct net *net, goto done; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - sk = udp6_lookup_run_bpf(net, udptable, skb, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + udptable == net->ipv4.udp_table) { + sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, dif, + udp6_ehashfn); if (sk) { result = sk; goto done; @@ -444,7 +420,7 @@ try_again: ip6_datagram_recv_common_ctl(sk, msg, skb); if (is_udp4) { - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); } else { @@ -992,7 +968,11 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; /* Check if the socket is already available, e.g. 
due to early demux */ - sk = skb_steal_sock(skb, &refcounted); + sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, + &refcounted, udp6_ehashfn); + if (IS_ERR(sk)) + goto no_sk; + if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -1026,7 +1006,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto report_csum_error; return udp6_unicast_rcv_skb(sk, skb, uh); } - +no_sk: reason = SKB_DROP_REASON_NO_SOCKET; if (!uh->check) @@ -1802,6 +1782,7 @@ struct proto udpv6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = NULL, .diag_destroy = udp_abort, }; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 8e010d07917a..267d491e9707 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -67,6 +67,7 @@ struct proto udplitev6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = &udplite_table, }; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index eecc5e59da17..188224a76685 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -124,14 +124,10 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) xfrm_dst_destroy(xdst); } -static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) +static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct xfrm_dst *xdst; - if (!unregister) - return; - xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = diff --git a/net/key/af_key.c b/net/key/af_key.c index b4ea4cf9fad4..d68d01804dc7 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1281,7 +1281,6 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } - memset(&natt->encap_oa, 0, sizeof(natt->encap_oa)); } err = xfrm_init_state(x); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index f9073bc7281f..9a2a9ed3ba47 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -552,7 +552,7 @@ static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index ff78217f0cb1..ed8ebb6f5909 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -36,9 +36,6 @@ struct l2tp_ip6_sock { u32 conn_id; u32 peer_conn_id; - /* ipv6_pinfo has to be the last member of l2tp_ip6_sock, see - * inet6_sk_generic - */ struct ipv6_pinfo inet6; }; @@ -730,6 +727,7 @@ static struct proto l2tp_ip6_prot = { .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), + .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6), }; static const struct proto_ops l2tp_ip6_ops = { diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index d037009ee10f..0a3f5e0bec00 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -14,14 +14,15 @@ #include <linux/init.h> #include <linux/slab.h> -#include <net/llc_sap.h> 
-#include <net/llc_conn.h> -#include <net/sock.h> -#include <net/tcp_states.h> -#include <net/llc_c_ev.h> +#include <net/llc.h> #include <net/llc_c_ac.h> +#include <net/llc_c_ev.h> #include <net/llc_c_st.h> +#include <net/llc_conn.h> #include <net/llc_pdu.h> +#include <net/llc_sap.h> +#include <net/sock.h> +#include <net/tcp_states.h> #if 0 #define dprintk(args...) printk(KERN_DEBUG args) diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index a3829ce548f9..84e531f86b82 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o + mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c index 5a0a84ad94af..8a16672b94e2 100644 --- a/net/mptcp/bpf.c +++ b/net/mptcp/bpf.c @@ -19,3 +19,18 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) return NULL; } + +BTF_SET8_START(bpf_mptcp_fmodret_ids) +BTF_ID_FLAGS(func, update_socket_protocol) +BTF_SET8_END(bpf_mptcp_fmodret_ids) + +static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = { + .owner = THIS_MODULE, + .set = &bpf_mptcp_fmodret_ids, +}; + +static int __init bpf_mptcp_kfunc_init(void) +{ + return register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); +} +late_initcall(bpf_mptcp_kfunc_init); diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index ae20b7d92e28..c46c22a84d23 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -32,6 +32,7 @@ struct mptcp_pernet { u8 checksum_enabled; u8 allow_join_initial_addr_port; u8 pm_type; + char scheduler[MPTCP_SCHED_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) @@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net) return mptcp_get_pernet(net)->pm_type; } +const char *mptcp_get_scheduler(const struct net *net) +{ + return mptcp_get_pernet(net)->scheduler; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; @@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; + strcpy(pernet->scheduler, "default"); } #ifdef CONFIG_SYSCTL @@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, + { + .procname = "scheduler", + .maxlen = MPTCP_SCHED_NAME_MAX, + .mode = 0644, + .proc_handler = proc_dostring, + }, {} }; @@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; table[5].data = &pernet->pm_type; + table[6].data = &pernet->scheduler; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 7dbbad1e4f55..d8da5374d9e1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); msk = mptcp_sk(sk); - if (subflow->backup != bkup) { + if (subflow->backup != bkup) subflow->backup = bkup; - mptcp_data_lock(sk); - if (!sock_owned_by_user(sk)) - msk->last_snd = NULL; - else - __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags); - mptcp_data_unlock(sk); - } 
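The new net/mptcp/bpf.c hunk above whitelists update_socket_protocol(), the hook that may rewrite the protocol argument of socket(2), for fmod_ret BPF programs. A program in the style of the matching BPF selftest can then transparently upgrade plain TCP sockets to MPTCP (sketch; constants spelled out for self-containment):

// SPDX-License-Identifier: GPL-2.0
/* Illustrative fmod_ret program: force TCP stream sockets to MPTCP by
 * rewriting the protocol value returned to __sys_socket().
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define AF_INET		2
#define AF_INET6	10
#define SOCK_STREAM	1
#define IPPROTO_TCP	6
#define IPPROTO_MPTCP	262

SEC("fmod_ret/update_socket_protocol")
int BPF_PROG(mptcpify, int family, int type, int protocol)
{
	if ((family == AF_INET || family == AF_INET6) &&
	    type == SOCK_STREAM &&
	    (!protocol || protocol == IPPROTO_TCP))
		return IPPROTO_MPTCP;	/* upgrade to MPTCP */

	return protocol;		/* leave everything else alone */
}

char _license[] SEC("license") = "GPL";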
mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5692daf57a4d..9661f3812682 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -9,6 +9,7 @@ #include <linux/inet.h> #include <linux/kernel.h> #include <net/tcp.h> +#include <net/inet_common.h> #include <net/netns/generic.h> #include <net/mptcp.h> #include <net/genetlink.h> @@ -471,9 +472,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con slow = lock_sock_fast(ssk); if (prio) { - if (subflow->backup != backup) - msk->last_snd = NULL; - subflow->send_mp_prio = 1; subflow->backup = backup; subflow->request_bkup = backup; @@ -1005,8 +1003,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, bool is_ipv6 = sk->sk_family == AF_INET6; int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; - struct socket *ssock; - struct sock *newsk; + struct sock *newsk, *ssk; int backlog = 1024; int err; @@ -1032,28 +1029,32 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, &mptcp_keys[is_ipv6]); lock_sock(newsk); - ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); + ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); release_sock(newsk); - if (IS_ERR(ssock)) - return PTR_ERR(ssock); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (entry->addr.family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen); + if (ssk->sk_family == AF_INET) + err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ssk->sk_family == AF_INET6) + err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#endif if (err) return err; inet_sk_state_store(newsk, TCP_LISTEN); - err = kernel_listen(ssock, backlog); - if (err) - return err; - - mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); - - return 0; + lock_sock(ssk); + err = __inet_listen_sk(ssk, backlog); + if (!err) + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); + release_sock(ssk); + return err; } int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d80658547836..933b257eee02 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -67,11 +67,11 @@ static bool mptcp_is_tcpsk(struct sock *sk) * Hand the socket over to tcp so all further socket ops * bypass mptcp. 
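Together with the net.mptcp.scheduler sysctl added in ctrl.c above, the per-namespace default scheduler becomes runtime-selectable. A userspace sketch (error handling trimmed; the path follows MPTCP_SYSCTL_PATH):

/* Userspace sketch: select the per-namespace default MPTCP packet
 * scheduler, equivalent to `sysctl net.mptcp.scheduler=default`.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int set_mptcp_scheduler(const char *name)
{
	int fd = open("/proc/sys/net/mptcp/scheduler", O_WRONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, name, strlen(name)) == (ssize_t)strlen(name))
		ret = 0;
	close(fd);
	return ret;
}

int main(void)
{
	/* "default" is what mptcp_pernet_set_defaults() installs */
	return set_mptcp_scheduler("default") ? 1 : 0;
}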
*/ - sock->ops = &inet_stream_ops; + WRITE_ONCE(sock->ops, &inet_stream_ops); return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { - sock->ops = &inet6_stream_ops; + WRITE_ONCE(sock->ops, &inet6_stream_ops); return true; #endif } @@ -90,8 +90,8 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) if (err) return err; + msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); - WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); @@ -101,6 +101,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) /* This is the first subflow, always with id 0 */ subflow->local_id_valid = 1; mptcp_sock_graft(msk->first, sk->sk_socket); + iput(SOCK_INODE(ssock)); return 0; } @@ -108,7 +109,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) /* If the MPC handshake is not started, returns the first subflow, * eventually allocating it. */ -struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) +struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; int ret; @@ -116,10 +117,7 @@ struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) return ERR_PTR(-EINVAL); - if (!msk->subflow) { - if (msk->first) - return ERR_PTR(-EINVAL); - + if (!msk->first) { ret = __mptcp_socket_create(msk); if (ret) return ERR_PTR(ret); @@ -127,7 +125,7 @@ struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) mptcp_sockopt_sync(msk, msk->first); } - return msk->subflow; + return msk->first; } static void mptcp_drop(struct sock *sk, struct sk_buff *skb) @@ -1368,7 +1366,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ -static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; @@ -1379,23 +1377,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) u64 linger_time; long tout = 0; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) { - if (!msk->first) - return NULL; - return __tcp_can_send(msk->first) && - sk_stream_memory_free(msk->first) ? 
msk->first : NULL; - } - - /* re-use last subflow, if the burst allow that */ - if (msk->last_snd && msk->snd_burst > 0 && - sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { - mptcp_set_timeout(sk); - return msk->last_snd; - } - /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; @@ -1448,16 +1429,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); - if (!burst) { - msk->last_snd = NULL; + if (!burst) return ssk; - } subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); - msk->last_snd = ssk; msk->snd_burst = burst; return ssk; } @@ -1501,64 +1479,106 @@ void mptcp_check_and_set_pending(struct sock *sk) mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); } -void __mptcp_push_pending(struct sock *sk, unsigned int flags) +static int __subflow_push_pending(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) { - struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_sendmsg_info info = { - .flags = flags, - }; - bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len; + int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; + info->sent = dfrag->already_sent; + info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; - prev_ssk = ssk; - ssk = mptcp_subflow_get_send(msk); - - /* First check. If the ssk has changed since - * the last round, release prev_ssk - */ - if (ssk != prev_ssk && prev_ssk) - mptcp_push_release(prev_ssk, &info); - if (!ssk) - goto out; - - /* Need to lock the new subflow only if different - * from the previous one, otherwise we are still - * helding the relevant lock - */ - if (ssk != prev_ssk) - lock_sock(ssk); - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { - if (ret == -EAGAIN) - continue; - mptcp_push_release(ssk, &info); + err = copied ? : ret; goto out; } - do_check_data_fin = true; - info.sent += ret; + info->sent += ret; + copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + + if (msk->snd_burst <= 0 || + !sk_stream_memory_free(ssk) || + !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { + err = copied; + goto out; + } + mptcp_set_timeout(sk); + } + err = copied; + +out: + return err; +} + +void __mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + bool do_check_data_fin = false; + int push_count = 1; + + while (mptcp_send_head(sk) && (push_count > 0)) { + struct mptcp_subflow_context *subflow; + int ret = 0; + + if (mptcp_sched_get_send(msk)) + break; + + push_count = 0; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); + + prev_ssk = ssk; + ssk = mptcp_subflow_tcp_sock(subflow); + if (ssk != prev_ssk) { + /* First check. 
If the ssk has changed since + * the last round, release prev_ssk + */ + if (prev_ssk) + mptcp_push_release(prev_ssk, &info); + + /* Need to lock the new subflow only if different + * from the previous one, otherwise we are still + * helding the relevant lock + */ + lock_sock(ssk); + } + + push_count++; + + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) { + if (ret != -EAGAIN || + (1 << ssk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) + push_count--; + continue; + } + do_check_data_fin = true; + } + } } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); -out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); @@ -1572,42 +1592,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool struct mptcp_sendmsg_info info = { .data_lock_held = true, }; - struct mptcp_data_frag *dfrag; + bool keep_pushing = true; struct sock *xmit_ssk; - int len, copied = 0; + int copied = 0; info.flags = 0; - while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; - len = dfrag->data_len - dfrag->already_sent; - while (len > 0) { - int ret = 0; + while (mptcp_send_head(sk) && keep_pushing) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + int ret = 0; - /* check for a different subflow usage only after - * spooling the first chunk of data - */ - xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk); - if (!xmit_ssk) - goto out; - if (xmit_ssk != ssk) { - mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), - MPTCP_DELEGATE_SEND); - goto out; - } - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + /* check for a different subflow usage only after + * spooling the first chunk of data + */ + if (first) { + mptcp_subflow_set_scheduled(subflow, false); + ret = __subflow_push_pending(sk, ssk, &info); + first = false; if (ret <= 0) - goto out; + break; + copied += ret; + continue; + } + + if (mptcp_sched_get_send(msk)) + goto out; - info.sent += ret; + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) + keep_pushing = false; copied += ret; - len -= ret; - first = false; + } - mptcp_update_post_push(msk, dfrag, ret); + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + xmit_ssk = mptcp_subflow_tcp_sock(subflow); + if (xmit_ssk != ssk) { + mptcp_subflow_delegate(subflow, + MPTCP_DELEGATE_SEND); + keep_pushing = false; + } + } } - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } out: @@ -1642,7 +1669,6 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; struct sock *ssk; int ret; @@ -1653,9 +1679,9 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, * fastopen attempt, no need to check for additional subflow status. 
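The rewritten __mptcp_push_pending()/__mptcp_subflow_push_pending() above establish the new scheduler contract: mptcp_sched_get_send() asks the scheduler to mark one or more subflows, then the data path transmits on each marked subflow and clears the mark. Condensed to its essence (illustrative restatement; subflow socket locking, burst accounting and delegation elided):

static void push_on_scheduled_subflows(struct sock *sk,
				       struct mptcp_sendmsg_info *info)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;

	/* phase 1: the scheduler marks subflows via
	 * mptcp_subflow_set_scheduled(subflow, true)
	 */
	if (mptcp_sched_get_send(msk))
		return;		/* nothing usable right now */

	/* phase 2: the data path consumes and clears the marks */
	mptcp_for_each_subflow(msk, subflow) {
		if (!READ_ONCE(subflow->scheduled))
			continue;

		mptcp_subflow_set_scheduled(subflow, false);
		__subflow_push_pending(sk, mptcp_subflow_tcp_sock(subflow), info);
	}
}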
*/ if (msg->msg_flags & MSG_FASTOPEN) { - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) - return PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); } if (!msk->first) return -EINVAL; @@ -1689,7 +1715,7 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, if (!mptcp_disconnect(sk, 0)) sk->sk_socket->state = SS_UNCONNECTED; } - inet_sk(sk)->defer_connect = 0; + inet_clear_bit(DEFER_CONNECT, sk); return ret; } @@ -1707,7 +1733,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); - if (unlikely(inet_sk(sk)->defer_connect || msg->msg_flags & MSG_FASTOPEN)) { + if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || + msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); @@ -1881,6 +1908,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; + u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; @@ -1911,9 +1939,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); + scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; + msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; @@ -1922,8 +1952,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; @@ -1932,18 +1962,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < advmss) - rcvmem += 128; - - do_div(rcvwin, advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; - window_clamp = tcp_win_from_space(sk, rcvbuf); + window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we @@ -2202,17 +2227,12 @@ static void mptcp_timeout_timer(struct timer_list *t) * * A backup subflow is returned only if that is the only kind available. */ -static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) - return NULL; - mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -2243,14 +2263,6 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) return min_stale_count > 1 ? 
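mptcp_rcv_space_adjust() above now tracks the minimum tp->scaling_ratio across subflows and converts between receive window and buffer size with the shared TCP helpers, replacing the old rcvmem estimation loop. For reference, the helpers introduced by the TCP scaling_ratio work have roughly this shape (quoted from memory; include/net/tcp.h in the same tree is authoritative):

/* scaling_ratio is a fixed-point fraction with a 2^8 denominator:
 *   window = space * scaling_ratio / 256
 *   space  = window * 256 / scaling_ratio
 */
#define TCP_RMEM_TO_WIN_SCALE 8

static inline int __tcp_win_from_space(u8 scaling_ratio, int space)
{
	s64 scaled_space = (s64)space * scaling_ratio;

	return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
}

static inline int __tcp_space_from_win(u8 scaling_ratio, int win)
{
	u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;

	do_div(val, scaling_ratio);
	return val;
}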
backup : NULL; } -static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) -{ - if (msk->subflow) { - iput(SOCK_INODE(msk->subflow)); - WRITE_ONCE(msk->subflow, NULL); - } -} - bool __mptcp_retransmit_pending_data(struct sock *sk) { struct mptcp_data_frag *cur, *rtx_head; @@ -2329,7 +2341,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, goto out_release; } - dispose_it = !msk->subflow || ssk != msk->subflow->sk; + dispose_it = msk->free_first || ssk != msk->first; if (dispose_it) list_del(&subflow->node); @@ -2350,7 +2362,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, * disconnect should never fail */ WARN_ON_ONCE(tcp_disconnect(ssk, 0)); - msk->subflow->state = SS_UNCONNECTED; mptcp_subflow_ctx_reset(subflow); release_sock(ssk); @@ -2383,9 +2394,6 @@ out_release: WRITE_ONCE(msk->first, NULL); out: - if (ssk == msk->last_snd) - msk->last_snd = NULL; - if (need_push) __mptcp_push_pending(sk, 0); } @@ -2502,16 +2510,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) static void __mptcp_retrans(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; - size_t copied = 0; struct sock *ssk; - int ret; + int ret, err; + u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ - ssk = mptcp_subflow_get_retrans(msk); + err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { @@ -2530,32 +2539,45 @@ static void __mptcp_retrans(struct sock *sk) goto reset_timer; } - if (!ssk) + if (err) goto reset_timer; - lock_sock(ssk); + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + u16 copied = 0; - /* limit retransmission to the bytes already sent on some subflows */ - info.sent = 0; - info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; - while (info.sent < info.limit) { - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); - if (ret <= 0) - break; + mptcp_subflow_set_scheduled(subflow, false); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); - copied += ret; - info.sent += ret; - } - if (copied) { - dfrag->already_sent = max(dfrag->already_sent, info.sent); - msk->bytes_retrans += copied; - tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, - info.size_goal); - WRITE_ONCE(msk->allow_infinite_fallback, false); + ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = READ_ONCE(msk->csum_enabled) ? 
dfrag->data_len : + dfrag->already_sent; + while (info.sent < info.limit) { + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + break; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); + copied += ret; + info.sent += ret; + } + if (copied) { + len = max(copied, len); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); + } + + release_sock(ssk); + } } - release_sock(ssk); + msk->bytes_retrans += len; + dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); @@ -2663,7 +2685,7 @@ unlock: sock_put(sk); } -static int __mptcp_init_sock(struct sock *sk) +static void __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -2690,8 +2712,6 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); - - return 0; } static void mptcp_ca_reset(struct sock *sk) @@ -2711,9 +2731,7 @@ static int mptcp_init_sock(struct sock *sk) struct net *net = sock_net(sk); int ret; - ret = __mptcp_init_sock(sk); - if (ret) - return ret; + __mptcp_init_sock(sk); if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; @@ -2721,6 +2739,11 @@ static int mptcp_init_sock(struct sock *sk) if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; + ret = mptcp_init_sched(mptcp_sk(sk), + mptcp_sched_find(mptcp_get_scheduler(net))); + if (ret) + return ret; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will @@ -2866,6 +2889,7 @@ static void __mptcp_destroy_sock(struct sock *sk) mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; + mptcp_release_sched(msk); sk->sk_prot->destroy(sk); @@ -3055,7 +3079,6 @@ static int mptcp_disconnect(struct sock *sk, int flags) * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); - msk->last_snd = NULL; WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->push_pending = 0; @@ -3111,7 +3134,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk = mptcp_sk(nsk); msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; - WRITE_ONCE(msk->subflow, NULL); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) @@ -3122,6 +3144,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk->snd_una = msk->write_seq; msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; + mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; @@ -3175,25 +3198,17 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } -static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, +static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err, bool kern) { - struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *listener; struct sock *newsk; - listener = READ_ONCE(msk->subflow); - if (WARN_ON_ONCE(!listener)) { - *err = -EINVAL; - return NULL; - } - - pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); - newsk = inet_csk_accept(listener->sk, flags, err, kern); + pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); + newsk = 
inet_csk_accept(ssk, flags, err, kern); if (!newsk) return NULL; - pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); + pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -3210,9 +3225,9 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, } newsk = new_mptcp_sock; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { - MPTCP_INC_STATS(sock_net(sk), + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); } @@ -3253,10 +3268,8 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - /* clears msk->subflow, allowing the following to close - * even the initial subflow - */ - mptcp_dispose_initial_subflow(msk); + /* allow the following to close even the initial subflow */ + msk->free_first = 1; mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } @@ -3336,8 +3349,6 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); - if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) - msk->last_snd = NULL; } __mptcp_update_rmem(sk); @@ -3406,14 +3417,12 @@ static void mptcp_unhash(struct sock *sk) static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; - ssock = msk->subflow; - pr_debug("msk=%p, subflow=%p", msk, ssock); - if (WARN_ON_ONCE(!ssock)) + pr_debug("msk=%p, ssk=%p", msk, msk->first); + if (WARN_ON_ONCE(!msk->first)) return -EINVAL; - return inet_csk_get_port(ssock->sk, snum); + return inet_csk_get_port(msk->first, snum); } void mptcp_finish_connect(struct sock *ssk) @@ -3588,25 +3597,24 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; int err = -EINVAL; + struct sock *ssk; - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) - return PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); - mptcp_token_destroy(msk); inet_sk_state_store(sk, TCP_SYN_SENT); - subflow = mptcp_subflow_ctx(ssock->sk); + subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ - if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { - MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); + if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } if (likely(!__mptcp_check_fallback(msk))) @@ -3615,25 +3623,42 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. 
*/ - if (msk->fastopening) - err = __inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK, 1); - else - err = inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK); - inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; + if (!msk->fastopening) + lock_sock(ssk); + + /* the following mirrors closely a very small chunk of code from + * __inet_stream_connect() + */ + if (ssk->sk_state != TCP_CLOSE) + goto out; + + if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { + err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); + if (err) + goto out; + } + + err = ssk->sk_prot->connect(ssk, uaddr, addr_len); + if (err < 0) + goto out; + + inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); + +out: + if (!msk->fastopening) + release_sock(ssk); /* on successful connect, the msk state will be moved to established by * subflow_finish_connect() */ - if (unlikely(err && err != -EINPROGRESS)) { - inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + if (unlikely(err)) { + /* avoid leaving a dangling token in an unconnected socket */ + mptcp_token_destroy(msk); + inet_sk_state_store(sk, TCP_CLOSE); return err; } - mptcp_copy_inaddrs(sk, ssock->sk); - - /* silence EINPROGRESS and let the caller inet_stream_connect - * handle the connection in progress - */ + mptcp_copy_inaddrs(sk, ssk); return 0; } @@ -3674,22 +3699,27 @@ static struct proto mptcp_prot = { static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct socket *ssock; - int err; + struct sock *ssk, *sk = sock->sk; + int err = -EINVAL; - lock_sock(sock->sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + lock_sock(sk); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + err = PTR_ERR(ssk); goto unlock; } - err = ssock->ops->bind(ssock, uaddr, addr_len); + if (sk->sk_family == AF_INET) + err = inet_bind_sk(ssk, uaddr, addr_len); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (sk->sk_family == AF_INET6) + err = inet6_bind_sk(ssk, uaddr, addr_len); +#endif if (!err) - mptcp_copy_inaddrs(sock->sk, ssock->sk); + mptcp_copy_inaddrs(sk, ssk); unlock: - release_sock(sock->sk); + release_sock(sk); return err; } @@ -3697,7 +3727,7 @@ static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; - struct socket *ssock; + struct sock *ssk; int err; pr_debug("msk=%p", msk); @@ -3708,22 +3738,24 @@ static int mptcp_listen(struct socket *sock, int backlog) if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto unlock; - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + err = PTR_ERR(ssk); goto unlock; } - mptcp_token_destroy(msk); inet_sk_state_store(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); - err = ssock->ops->listen(ssock, backlog); - inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + lock_sock(ssk); + err = __inet_listen_sk(ssk, backlog); + release_sock(ssk); + inet_sk_state_store(sk, inet_sk_state_load(ssk)); + if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - mptcp_copy_inaddrs(sk, ssock->sk); - mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); + mptcp_copy_inaddrs(sk, ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); } unlock: @@ -3735,8 +3767,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct mptcp_sock *msk = 
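With mptcp_connect() above now invoking the first subflow's pre_connect()/connect() directly and mirroring DEFER_CONNECT back into the MPTCP socket, both TCP Fast Open flavours work through MPTCP sockets. A userspace sketch of the deferred-connect variant (minimal error handling; IPPROTO_MPTCP guard for older headers):

/* Userspace sketch: deferred-connect TCP Fast Open over MPTCP. The
 * connect() only arms DEFER_CONNECT; the first send() emits the SYN
 * carrying the data.
 */
#include <errno.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

static int mptcp_tfo_send(const struct sockaddr *dst, socklen_t dlen,
			  const void *buf, size_t len)
{
	int one = 1;
	int s = socket(dst->sa_family, SOCK_STREAM, IPPROTO_MPTCP);

	if (s < 0)
		return -1;

	setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one));

	if (connect(s, dst, dlen) < 0 && errno != EINPROGRESS)
		goto err;
	if (send(s, buf, len, 0) < 0)	/* triggers the actual SYN (+data) */
		goto err;
	return s;
err:
	close(s);
	return -1;
}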
mptcp_sk(sock->sk); - struct socket *ssock; - struct sock *newsk; + struct sock *ssk, *newsk; int err; pr_debug("msk=%p", msk); @@ -3744,11 +3775,11 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. */ - ssock = READ_ONCE(msk->subflow); - if (!ssock) + ssk = READ_ONCE(msk->first); + if (!ssk) return -EINVAL; - newsk = mptcp_accept(sock->sk, flags, &err, kern); + newsk = mptcp_accept(ssk, flags, &err, kern); if (!newsk) return err; @@ -3775,11 +3806,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ - if (msk->first && - unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { __mptcp_close_ssk(newsk, msk->first, mptcp_subflow_ctx(msk->first), 0); - if (unlikely(list_empty(&msk->conn_list))) + if (unlikely(list_is_singular(&msk->conn_list))) inet_sk_state_store(newsk, TCP_CLOSE); } } @@ -3818,12 +3848,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) { - struct socket *ssock = READ_ONCE(msk->subflow); + struct sock *ssk = READ_ONCE(msk->first); - if (WARN_ON_ONCE(!ssock || !ssock->sk)) + if (WARN_ON_ONCE(!ssk)) return 0; - return inet_csk_listen_poll(ssock->sk); + return inet_csk_listen_poll(ssk); } shutdown = READ_ONCE(sk->sk_shutdown); @@ -3838,7 +3868,8 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, mask |= EPOLLOUT | EPOLLWRNORM; else mask |= mptcp_check_writeable(msk); - } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + } else if (state == TCP_SYN_SENT && + inet_test_bit(DEFER_CONNECT, sk)) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } @@ -3934,6 +3965,7 @@ void __init mptcp_proto_init(void) mptcp_subflow_init(); mptcp_pm_init(); + mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) @@ -3987,6 +4019,7 @@ int __init mptcp_proto_v6_init(void) strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); + mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ba2a873a4d2e..7254b3562575 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -123,7 +123,6 @@ #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_CONNECTED 6 -#define MPTCP_RESET_SCHEDULER 7 struct mptcp_skb_cb { u64 map_seq; @@ -269,7 +268,6 @@ struct mptcp_sock { u64 rcv_data_fin_seq; u64 bytes_retrans; int rmem_fwd_alloc; - struct sock *last_snd; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; @@ -299,7 +297,8 @@ struct mptcp_sock { cork:1, nodelay:1, fastopening:1, - in_accept_queue:1; + in_accept_queue:1, + free_first:1; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -308,19 +307,19 @@ struct mptcp_sock { struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; - struct socket *subflow; /* outgoing connect/listener/!mp_capable - * The mptcp ops can safely dereference, using suitable 
- * ONCE annotation, the subflow outside the socket - * lock as such sock is freed after close(). - */ - struct sock *first; + struct sock *first; /* The mptcp ops can safely dereference, using suitable + * ONCE annotation, the subflow outside the socket + * lock as such sock is freed after close(). + */ struct mptcp_pm_data pm; + struct mptcp_sched_ops *sched; struct { u32 space; /* bytes copied in last measurement window */ u32 copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; + u8 scaling_ratio; u32 subflow_id; u32 setsockopt_seq; @@ -350,9 +349,14 @@ static inline int __mptcp_rmem(const struct sock *sk) return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); } +static inline int mptcp_win_from_space(const struct sock *sk, int space) +{ + return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); +} + static inline int __mptcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) @@ -487,6 +491,7 @@ struct mptcp_subflow_context { is_mptfo : 1, /* subflow is doing TFO */ __unused : 9; enum mptcp_data_avail data_avail; + bool scheduled; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -620,6 +625,7 @@ int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); +const char *mptcp_get_scheduler(const struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); @@ -634,7 +640,7 @@ void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); -struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk); +struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); @@ -652,6 +658,19 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); +struct mptcp_sched_ops *mptcp_sched_find(const char *name); +int mptcp_register_scheduler(struct mptcp_sched_ops *sched); +void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); +void mptcp_sched_init(void); +int mptcp_init_sched(struct mptcp_sock *msk, + struct mptcp_sched_ops *sched); +void mptcp_release_sched(struct mptcp_sock *msk); +void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, + bool scheduled); +struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk); +struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk); +int mptcp_sched_get_send(struct mptcp_sock *msk); +int mptcp_sched_get_retrans(struct mptcp_sock *msk); static inline bool __tcp_can_send(const struct sock *ssk) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c new file mode 100644 index 000000000000..4ab0693c069c --- /dev/null +++ b/net/mptcp/sched.c @@ -0,0 +1,173 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2022, SUSE. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> +#include "protocol.h" + +static DEFINE_SPINLOCK(mptcp_sched_list_lock); +static LIST_HEAD(mptcp_sched_list); + +static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + struct sock *ssk; + + ssk = data->reinject ? mptcp_subflow_get_retrans(msk) : + mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; +} + +static struct mptcp_sched_ops mptcp_sched_default = { + .get_subflow = mptcp_sched_default_get_subflow, + .name = "default", + .owner = THIS_MODULE, +}; + +/* Must be called with rcu read lock held */ +struct mptcp_sched_ops *mptcp_sched_find(const char *name) +{ + struct mptcp_sched_ops *sched, *ret = NULL; + + list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { + if (!strcmp(sched->name, name)) { + ret = sched; + break; + } + } + + return ret; +} + +int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +{ + if (!sched->get_subflow) + return -EINVAL; + + spin_lock(&mptcp_sched_list_lock); + if (mptcp_sched_find(sched->name)) { + spin_unlock(&mptcp_sched_list_lock); + return -EEXIST; + } + list_add_tail_rcu(&sched->list, &mptcp_sched_list); + spin_unlock(&mptcp_sched_list_lock); + + pr_debug("%s registered", sched->name); + return 0; +} + +void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) +{ + if (sched == &mptcp_sched_default) + return; + + spin_lock(&mptcp_sched_list_lock); + list_del_rcu(&sched->list); + spin_unlock(&mptcp_sched_list_lock); +} + +void mptcp_sched_init(void) +{ + mptcp_register_scheduler(&mptcp_sched_default); +} + +int mptcp_init_sched(struct mptcp_sock *msk, + struct mptcp_sched_ops *sched) +{ + if (!sched) + sched = &mptcp_sched_default; + + if (!bpf_try_module_get(sched, sched->owner)) + return -EBUSY; + + msk->sched = sched; + if (msk->sched->init) + msk->sched->init(msk); + + pr_debug("sched=%s", msk->sched->name); + + return 0; +} + +void mptcp_release_sched(struct mptcp_sock *msk) +{ + struct mptcp_sched_ops *sched = msk->sched; + + if (!sched) + return; + + msk->sched = NULL; + if (sched->release) + sched->release(msk); + + bpf_module_put(sched, sched->owner); +} + +void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, + bool scheduled) +{ + WRITE_ONCE(subflow->scheduled, scheduled); +} + +int mptcp_sched_get_send(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sched_data data; + + msk_owned_by_me(msk); + + /* the following check is moved out of mptcp_subflow_get_send */ + if (__mptcp_check_fallback(msk)) { + if (msk->first && + __tcp_can_send(msk->first) && + sk_stream_memory_free(msk->first)) { + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true); + return 0; + } + return -EINVAL; + } + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) + return 0; + } + + data.reinject = false; + if (msk->sched == &mptcp_sched_default || !msk->sched) + return mptcp_sched_default_get_subflow(msk, &data); + return msk->sched->get_subflow(msk, &data); +} + +int mptcp_sched_get_retrans(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sched_data data; + + msk_owned_by_me(msk); + + /* the following check is moved out of 
mptcp_subflow_get_retrans */ + if (__mptcp_check_fallback(msk)) + return -EINVAL; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) + return 0; + } + + data.reinject = true; + if (msk->sched == &mptcp_sched_default || !msk->sched) + return mptcp_sched_default_get_subflow(msk, &data); + return msk->sched->get_subflow(msk, &data); +} diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a3f1fe810cc9..8260202c0066 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -292,7 +292,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; - struct socket *ssock; + struct sock *ssk; int ret; switch (optname) { @@ -301,22 +301,22 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_BINDTODEVICE: case SO_BINDTOIFINDEX: lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { release_sock(sk); - return PTR_ERR(ssock); + return PTR_ERR(ssk); } - ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen); if (ret == 0) { if (optname == SO_REUSEPORT) - sk->sk_reuseport = ssock->sk->sk_reuseport; + sk->sk_reuseport = ssk->sk_reuseport; else if (optname == SO_REUSEADDR) - sk->sk_reuse = ssock->sk->sk_reuse; + sk->sk_reuse = ssk->sk_reuse; else if (optname == SO_BINDTODEVICE) - sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + sk->sk_bound_dev_if = ssk->sk_bound_dev_if; else if (optname == SO_BINDTOIFINDEX) - sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + sk->sk_bound_dev_if = ssk->sk_bound_dev_if; } release_sock(sk); return ret; @@ -390,20 +390,20 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, { struct sock *sk = (struct sock *)msk; int ret = -EOPNOTSUPP; - struct socket *ssock; + struct sock *ssk; switch (optname) { case IPV6_V6ONLY: case IPV6_TRANSPARENT: case IPV6_FREEBIND: lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { release_sock(sk); - return PTR_ERR(ssock); + return PTR_ERR(ssk); } - ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen); if (ret != 0) { release_sock(sk); return ret; @@ -413,13 +413,15 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, switch (optname) { case IPV6_V6ONLY: - sk->sk_ipv6only = ssock->sk->sk_ipv6only; + sk->sk_ipv6only = ssk->sk_ipv6only; break; case IPV6_TRANSPARENT: - inet_sk(sk)->transparent = inet_sk(ssock->sk)->transparent; + inet_assign_bit(TRANSPARENT, sk, + inet_test_bit(TRANSPARENT, ssk)); break; case IPV6_FREEBIND: - inet_sk(sk)->freebind = inet_sk(ssock->sk)->freebind; + inet_assign_bit(FREEBIND, sk, + inet_test_bit(FREEBIND, ssk)); break; } @@ -684,8 +686,7 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; - struct inet_sock *issk; - struct socket *ssock; + struct sock *ssk; int err; err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); @@ -694,20 +695,19 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { release_sock(sk); - return PTR_ERR(ssock); + return 
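Given mptcp_register_scheduler() and the default implementation above, an alternative scheduler stays small. An illustrative in-tree sketch, assuming the struct mptcp_sched_ops layout from include/net/mptcp.h (not shown in this diff); reinject handling intentionally omitted:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative scheduler: mark the first subflow that can currently
 * transmit. Built next to net/mptcp so protocol.h internals apply.
 */
#include <linux/errno.h>
#include <linux/module.h>
#include <net/mptcp.h>
#include "protocol.h"

static int first_avail_get_subflow(struct mptcp_sock *msk,
				   struct mptcp_sched_data *data)
{
	struct mptcp_subflow_context *subflow;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!__tcp_can_send(ssk) || !sk_stream_memory_free(ssk))
			continue;

		mptcp_subflow_set_scheduled(subflow, true);
		return 0;
	}
	return -EINVAL;	/* nothing usable, caller backs off */
}

static struct mptcp_sched_ops first_avail_sched = {
	.get_subflow	= first_avail_get_subflow,
	.name		= "first_avail",
	.owner		= THIS_MODULE,
};

static int __init first_avail_init(void)
{
	return mptcp_register_scheduler(&first_avail_sched);
}
module_init(first_avail_init);

static void __exit first_avail_exit(void)
{
	mptcp_unregister_scheduler(&first_avail_sched);
}
module_exit(first_avail_exit);

MODULE_LICENSE("GPL");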
PTR_ERR(ssk); } - issk = inet_sk(ssock->sk); - switch (optname) { case IP_FREEBIND: - issk->freebind = inet_sk(sk)->freebind; + inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); break; case IP_TRANSPARENT: - issk->transparent = inet_sk(sk)->transparent; + inet_assign_bit(TRANSPARENT, ssk, + inet_test_bit(TRANSPARENT, sk)); break; default: release_sock(sk); @@ -763,18 +763,18 @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; - struct socket *sock; + struct sock *ssk; int ret; /* Limit to first subflow, before the connection establishment */ lock_sock(sk); - sock = __mptcp_nmpc_socket(msk); - if (IS_ERR(sock)) { - ret = PTR_ERR(sock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + ret = PTR_ERR(ssk); goto unlock; } - ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); + ret = tcp_setsockopt(ssk, level, optname, optval, optlen); unlock: release_sock(sk); @@ -864,9 +864,8 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int char __user *optval, int __user *optlen) { struct sock *sk = (struct sock *)msk; - struct socket *ssock; - int ret; struct sock *ssk; + int ret; lock_sock(sk); ssk = msk->first; @@ -875,13 +874,13 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int goto out; } - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { - ret = PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + ret = PTR_ERR(ssk); goto out; } - ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen); + ret = tcp_getsockopt(ssk, level, optname, optval, optlen); out: release_sock(sk); @@ -1441,8 +1440,8 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) __tcp_sock_set_cork(ssk, !!msk->cork); __tcp_sock_set_nodelay(ssk, !!msk->nodelay); - inet_sk(ssk)->transparent = inet_sk(sk)->transparent; - inet_sk(ssk)->freebind = inet_sk(sk)->freebind; + inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); + inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); } static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 94ae7dd01c65..9bf3c7bc1762 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); - *full_space = tcp_full_space(sk); + *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } void __mptcp_error_report(struct sock *sk) diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index d27f4eccce6d..a3a6753a1db7 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -563,7 +563,7 @@ int ncsi_send_netlink_timeout(struct ncsi_request *nr, int ncsi_send_netlink_err(struct net_device *dev, u32 snd_seq, u32 snd_portid, - struct nlmsghdr *nlhdr, + const struct nlmsghdr *nlhdr, int err) { struct nlmsghdr *nlh; diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h index 39a1a9d7bf77..747767ea0aae 100644 --- a/net/ncsi/ncsi-netlink.h +++ b/net/ncsi/ncsi-netlink.h @@ -19,7 +19,7 @@ int ncsi_send_netlink_timeout(struct ncsi_request *nr, int ncsi_send_netlink_err(struct net_device *dev, u32 snd_seq, u32 snd_portid, - struct nlmsghdr *nlhdr, + const struct nlmsghdr *nlhdr, int err); #endif /* __NCSI_NETLINK_H__ */ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 
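The inet_test_bit()/inet_assign_bit() conversions in this section (FREEBIND and TRANSPARENT here; DEFER_CONNECT, NODEFRAG, MC_LOOP and the cmsg flags elsewhere in the series) move former struct inet_sock bitfields into an atomic inet_flags word, so the flags can be read and written without holding the socket lock. Abridged shape of the accessors (see include/net/inet_sock.h in the same tree for the authoritative definitions and the full flag enum):

enum {
	INET_FLAGS_FREEBIND,	/* bit positions per inet_sock.h */
	INET_FLAGS_TRANSPARENT,
	INET_FLAGS_DEFER_CONNECT,
	/* ... one bit per former struct inet_sock bitfield ... */
};

#define inet_test_bit(nr, sk)			\
	test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_set_bit(nr, sk)			\
	set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_clear_bit(nr, sk)			\
	clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_assign_bit(nr, sk, val)		\
	assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)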
5f76ae86a656..ef4e76e5aef9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -680,6 +680,12 @@ EXPORT_SYMBOL_GPL(nfnl_ct_hook); const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); +const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v4_hook); + +const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v6_hook); + #if IS_ENABLED(CONFIG_NF_CONNTRACK) u8 nf_ctnetlink_has_listener; EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 0b68e2e2824e..e564b5174261 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -872,7 +872,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) BUG_ON(!set); read_lock_bh(&ip_set_ref_lock); - strncpy(name, set->name, IPSET_MAXNAMELEN); + strscpy_pad(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -1326,7 +1326,7 @@ static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, goto out; } } - strncpy(set->name, name2, IPSET_MAXNAMELEN); + strscpy_pad(set->name, name2, IPSET_MAXNAMELEN); out: write_unlock_bh(&ip_set_ref_lock); @@ -1380,9 +1380,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, return -EBUSY; } - strncpy(from_name, from->name, IPSET_MAXNAMELEN); - strncpy(from->name, to->name, IPSET_MAXNAMELEN); - strncpy(to->name, from_name, IPSET_MAXNAMELEN); + strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN); + strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN); + strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN); swap(from->ref, to->ref); ip_set(inst, from_id) = to; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index cb83ca506c5c..3230506ae3ff 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1346,7 +1346,7 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } @@ -1946,7 +1946,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 264f2f87a437..da5af28ff57b 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1297,11 +1297,9 @@ static void set_sock_size(struct sock *sk, int mode, int val) */ static void set_mcast_loop(struct sock *sk, u_char loop) { - struct inet_sock *inet = inet_sk(sk); - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ lock_sock(sk); - inet->mc_loop = loop ? 
1 : 0; + inet_assign_bit(MC_LOOP, sk, loop); #ifdef CONFIG_IP_VS_IPV6 if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index c36da56d756f..e502ec00b2fe 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/kmod.h> +#include <linux/module.h> #include <linux/netfilter.h> #include <net/netfilter/nf_bpf_link.h> @@ -23,8 +25,90 @@ struct bpf_nf_link { struct nf_hook_ops hook_ops; struct net *net; u32 dead; + const struct nf_defrag_hook *defrag_hook; }; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) +static const struct nf_defrag_hook * +get_proto_defrag_hook(struct bpf_nf_link *link, + const struct nf_defrag_hook __rcu *global_hook, + const char *mod) +{ + const struct nf_defrag_hook *hook; + int err; + + /* RCU protects us from races against module unloading */ + rcu_read_lock(); + hook = rcu_dereference(global_hook); + if (!hook) { + rcu_read_unlock(); + err = request_module(mod); + if (err) + return ERR_PTR(err < 0 ? err : -EINVAL); + + rcu_read_lock(); + hook = rcu_dereference(global_hook); + } + + if (hook && try_module_get(hook->owner)) { + /* Once we have a refcnt on the module, we no longer need RCU */ + hook = rcu_pointer_handoff(hook); + } else { + WARN_ONCE(!hook, "%s has bad registration", mod); + hook = ERR_PTR(-ENOENT); + } + rcu_read_unlock(); + + if (!IS_ERR(hook)) { + err = hook->enable(link->net); + if (err) { + module_put(hook->owner); + hook = ERR_PTR(err); + } + } + + return hook; +} +#endif + +static int bpf_nf_enable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook __maybe_unused *hook; + + switch (link->hook_ops.pf) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + case NFPROTO_IPV4: + hook = get_proto_defrag_hook(link, nf_defrag_v4_hook, "nf_defrag_ipv4"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + case NFPROTO_IPV6: + hook = get_proto_defrag_hook(link, nf_defrag_v6_hook, "nf_defrag_ipv6"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif + default: + return -EAFNOSUPPORT; + } +} + +static void bpf_nf_disable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook *hook = link->defrag_hook; + + if (!hook) + return; + hook->disable(link->net); + module_put(hook->owner); +} + static void bpf_nf_link_release(struct bpf_link *link) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); @@ -32,11 +116,11 @@ static void bpf_nf_link_release(struct bpf_link *link) if (nf_link->dead) return; - /* prevent hook-not-found warning splat from netfilter core when - * .detach was already called - */ - if (!cmpxchg(&nf_link->dead, 0, 1)) + /* do not double release in case .detach was already called */ + if (!cmpxchg(&nf_link->dead, 0, 1)) { nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); + bpf_nf_disable_defrag(nf_link); + } } static void bpf_nf_link_dealloc(struct bpf_link *link) @@ -92,6 +176,8 @@ static const struct bpf_link_ops bpf_nf_link_lops = { static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) { + int prio; + switch (attr->link_create.netfilter.pf) { case NFPROTO_IPV4: case NFPROTO_IPV6: @@ -102,19 +188,18 @@ static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) return -EAFNOSUPPORT; } - if 
(attr->link_create.netfilter.flags) + if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG) return -EOPNOTSUPP; - /* make sure conntrack confirm is always last. - * - * In the future, if userspace can e.g. request defrag, then - * "defrag_requested && prio before NF_IP_PRI_CONNTRACK_DEFRAG" - * should fail. - */ - switch (attr->link_create.netfilter.priority) { - case NF_IP_PRI_FIRST: return -ERANGE; /* sabotage_in and other warts */ - case NF_IP_PRI_LAST: return -ERANGE; /* e.g. conntrack confirm */ - } + /* make sure conntrack confirm is always last */ + prio = attr->link_create.netfilter.priority; + if (prio == NF_IP_PRI_FIRST) + return -ERANGE; /* sabotage_in and other warts */ + else if (prio == NF_IP_PRI_LAST) + return -ERANGE; /* e.g. conntrack confirm */ + else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) && + prio <= NF_IP_PRI_CONNTRACK_DEFRAG) + return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */ return 0; } @@ -149,6 +234,7 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->net = net; link->dead = false; + link->defrag_hook = NULL; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -156,8 +242,17 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) { + err = bpf_nf_enable_defrag(link); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + } + err = nf_register_net_hook(net, &link->hook_ops); if (err) { + bpf_nf_disable_defrag(link); bpf_link_cleanup(&link_primer); return err; } diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 0d36d7285e3f..c7a6114091ae 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/btf_ids.h> #include <linux/net_namespace.h> +#include <net/xdp.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 992393102d5f..9f6f2e643575 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1756,7 +1756,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock_bh(&nf_conntrack_expect_lock); - exp = nf_ct_find_expectation(net, zone, tuple); + exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { /* Welcome, Mr. Bond. We've been expecting you... 
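The nf_bpf_link changes above let a netfilter BPF link request IP defragmentation: BPF_F_NETFILTER_IP_DEFRAG grabs the nf_defrag_ipv4/nf_defrag_ipv6 hook (loading the module on demand), and the reworked priority checks reject programs that would run at or before NF_IP_PRI_CONNTRACK_DEFRAG. Userspace attach sketch (assumes a libbpf recent enough to provide bpf_program__attach_netfilter() and uapi headers carrying the new flag):

/* Sketch: attach a netfilter BPF program that only ever sees
 * reassembled IPv4 packets.
 */
#include <bpf/libbpf.h>
#include <linux/bpf.h>
#include <linux/netfilter.h>

static struct bpf_link *attach_with_defrag(struct bpf_program *prog)
{
	LIBBPF_OPTS(bpf_netfilter_opts, opts,
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_PRE_ROUTING,
		/* must run after NF_IP_PRI_CONNTRACK_DEFRAG (-400) */
		.priority = -300,
		.flags    = BPF_F_NETFILTER_IP_DEFRAG,
	);

	return bpf_program__attach_netfilter(prog, &opts);
}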
*/ __set_bit(IPS_EXPECTED_BIT, &ct->status); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 96948e98ec53..81ca348915c9 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; @@ -211,7 +211,7 @@ nf_ct_find_expectation(struct net *net, !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; - if (exp->flags & NF_CT_EXPECT_PERMANENT) { + if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 69c8c8c7e9b8..334db22199c1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1321,15 +1321,11 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; - ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, NULL, NULL); + ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, + cta_ip_nla_policy, NULL); if (ret < 0) return ret; - ret = nla_validate_nested_deprecated(attr, CTA_IP_MAX, - cta_ip_nla_policy, NULL); - if (ret) - return ret; - switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_nlattr_to_tuple(tb, tuple, flags); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index d4fd626d2b8c..e2db1f4ec2df 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -69,6 +69,7 @@ #define DCCP_MSL (2 * 60 * HZ) +#ifdef CONFIG_NF_CONNTRACK_PROCFS static const char * const dccp_state_names[] = { [CT_DCCP_NONE] = "NONE", [CT_DCCP_REQUEST] = "REQUEST", @@ -81,6 +82,7 @@ static const char * const dccp_state_names[] = { [CT_DCCP_IGNORE] = "IGNORE", [CT_DCCP_INVALID] = "INVALID", }; +#endif #define sNO CT_DCCP_NONE #define sRQ CT_DCCP_REQUEST diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 1c26f03fc661..a010b25076ca 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -34,7 +34,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; - unsigned int enc_keys; + unsigned long long enc_keys; if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) return; @@ -43,8 +43,8 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); mask->enc_key_id.keyid = 0xffffffff; - enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL); + enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL); if (ip_tunnel_info_af(tun_info) == AF_INET) { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, @@ -55,7 +55,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, mask->enc_ipv4.src = 0xffffffff; if (key->enc_ipv4.dst) mask->enc_ipv4.dst = 0xffffffff; - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + enc_keys |= 
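/*
 * Context for the BIT() -> BIT_ULL() churn here and below: the flow
 * dissector's used_keys field grew to 64 bits once the number of
 * FLOW_DISSECTOR_KEY_* values passed 32. BIT(n) expands to (1UL << n),
 * which is only 32 bits wide on 32-bit targets, so shifting by a key
 * index >= 32 would be undefined; BIT_ULL(n) expands to (1ULL << n)
 * and stays well-defined. In isolation:
 */
unsigned long long used_keys = 0;

used_keys |= BIT_ULL(40);	/* fine: 64-bit shift */
/* used_keys |= BIT(40);	   broken on 32-bit: 1UL is 32 bits wide */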
BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else { memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst, @@ -70,7 +70,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, sizeof(struct in6_addr))) memset(&mask->enc_ipv6.dst, 0xff, sizeof(struct in6_addr)); - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); + enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -163,14 +163,14 @@ static int nf_flow_rule_match(struct nf_flow_match *match, return -EOPNOTSUPP; } mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(key->control.addr_type); + match->dissector.used_keys |= BIT_ULL(key->control.addr_type); mask->basic.n_proto = 0xffff; switch (tuple->l4proto) { case IPPROTO_TCP: key->tcp.flags = 0; mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP); break; case IPPROTO_UDP: case IPPROTO_GRE: @@ -182,9 +182,9 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->basic.ip_proto = tuple->l4proto; mask->basic.ip_proto = 0xff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); switch (tuple->l4proto) { case IPPROTO_TCP: @@ -194,7 +194,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->tp.dst = tuple->dst_port; mask->tp.dst = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_PORTS); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); break; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index eb8b1167dced..41b826dff6f5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3675,6 +3675,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) return -EMLINK; list_for_each_entry(rule, &chain->rules, list) { + if (fatal_signal_pending(current)) + return -EINTR; + if (!nft_is_active_next(ctx->net, rule)) continue; @@ -10485,6 +10488,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, if (ctx->chain == chain) return -ELOOP; + if (fatal_signal_pending(current)) + return -EINTR; + list_for_each_entry(rule, &chain->rules, list) { nft_rule_for_each_expr(expr, last, rule) { struct nft_immediate_expr *priv; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 910ef881c3b8..12ab78fa5d84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -35,12 +35,12 @@ void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } @@ -59,7 +59,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, .mask = match->mask.basic.n_proto, 
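/*
 * The fatal_signal_pending() checks added to nft_chain_validate() and
 * nf_tables_check_loops() bound how long a doomed task can spin in
 * these walks: both are O(rules) per chain and run with the transaction
 * locked, so on a pathologically large ruleset a SIGKILL now aborts the
 * netlink request with -EINTR instead of waiting out the whole pass.
 * Only fatal signals count -- fatal_signal_pending() tests for a queued
 * SIGKILL, not arbitrary signals, so ordinary interruptions do not
 * abort half-validated transactions.
 */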
}; - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) && + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; @@ -70,8 +70,9 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); - } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) && + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); + } else if (match->dissector.used_keys & + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; @@ -80,7 +81,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index e57eb168ee13..53c9e76473ba 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -470,7 +470,6 @@ __build_packet_message(struct nfnl_log_net *log, sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; - ktime_t tstamp; nlh = nfnl_msg_put(inst->skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), @@ -599,10 +598,9 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - tstamp = skb_tstamp_cond(skb, false); - if (hooknum <= NF_INET_FORWARD && tstamp) { + if (hooknum <= NF_INET_FORWARD) { + struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true)); struct nfulnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 6eb21a4f5698..cd4652259095 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -162,7 +162,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, memcpy(key + reg->offset, data, reg->len); memcpy(mask + reg->offset, datamask, reg->len); - flow->match.dissector.used_keys |= BIT(reg->key); + flow->match.dissector.used_keys |= BIT_ULL(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; if (reg->key == FLOW_DISSECTOR_KEY_META && diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 38958e067aa8..86bb9d7797d9 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -108,7 +108,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, helper = rcu_dereference(help->helper); if (helper == NULL) goto err; - strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); + strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { @@ -262,6 +262,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; return; } + __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); @@ -368,6 +369,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; 
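/*
 * On the strncpy() -> strscpy_pad() conversions in this series: both
 * zero-fill the tail when the source is short, but strncpy() does not
 * NUL-terminate when the source fills the buffer. strscpy_pad() always
 * terminates *and* pads, which matters here because these destinations
 * (nft registers, xt names) are read back by user space, so
 * uninitialised tail bytes would be an info leak. Quick sketch with a
 * hypothetical 8-byte buffer:
 */
char dest[8];

strscpy_pad(dest, "eth0", sizeof(dest));	 /* "eth0\0\0\0\0" */
strscpy_pad(dest, "verylongname", sizeof(dest)); /* "verylon\0", truncated */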
} + __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 6e049fd48760..04b51f285332 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -14,17 +14,18 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> +#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ + NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ + NFTA_FIB_F_PRESENT) + const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { [NFTA_FIB_DREG] = { .type = NLA_U32 }, [NFTA_FIB_RESULT] = { .type = NLA_U32 }, - [NFTA_FIB_FLAGS] = { .type = NLA_U32 }, + [NFTA_FIB_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFTA_FIB_F_ALL), }; EXPORT_SYMBOL(nft_fib_policy); -#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ - NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ - NFTA_FIB_F_PRESENT) - int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { @@ -77,7 +78,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS])); - if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL)) + if (priv->flags == 0) return -EINVAL; if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == @@ -150,7 +151,7 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv, if (priv->flags & NFTA_FIB_F_PRESENT) *dreg = !!dev; else - strncpy(reg, dev ? dev->name : "", IFNAMSIZ); + strscpy_pad(reg, dev ? dev->name : "", IFNAMSIZ); break; default: WARN_ON_ONCE(1); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 29ac48cdd6db..870e5b113d13 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -90,7 +90,8 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, - [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 }, + [NFTA_LOOKUP_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV), }; static int nft_lookup_init(const struct nft_ctx *ctx, @@ -120,9 +121,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (tb[NFTA_LOOKUP_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); - if (flags & ~NFT_LOOKUP_F_INV) - return -EINVAL; - if (flags & NFT_LOOKUP_F_INV) priv->invert = true; } diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index b115d77fbbc7..8a14aaca93bb 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -20,7 +20,8 @@ struct nft_masq { }; static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { - [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 }, }; @@ -47,11 +48,8 @@ static int nft_masq_init(const struct nft_ctx *ctx, struct nft_masq *priv = nft_expr_priv(expr); int err; - if (tb[NFTA_MASQ_FLAGS]) { + if (tb[NFTA_MASQ_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } if (tb[NFTA_MASQ_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN], diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8fdc7318c03c..f7da7c43333b 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -185,12 +185,12 @@ static noinline bool 
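/*
 * The NLA_POLICY_MASK(NLA_BE32, ...) entries for nft_fib and nft_lookup
 * above (and nft_masq/nft_nat/nft_redir nearby) replace the open-coded
 * pattern
 *
 *	flags = ntohl(nla_get_be32(tb[...]));
 *	if (flags & ~VALID_MASK)
 *		return -EINVAL;
 *
 * with a declarative policy entry, so unknown flag bits are rejected
 * during attribute parsing itself and the extack can point at the exact
 * offending attribute rather than returning a bare errno. Shape of such
 * an entry (ATTR_* and VALID_FLAG_MASK are placeholders, not patch
 * names):
 */
static const struct nla_policy example_policy[ATTR_MAX + 1] = {
	[ATTR_FLAGS] = NLA_POLICY_MASK(NLA_BE32, VALID_FLAG_MASK),
};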
nft_meta_get_eval_kind(enum nft_meta_keys key, case NFT_META_IIFKIND: if (!in || !in->rtnl_link_ops) return false; - strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + strscpy_pad((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); break; case NFT_META_OIFKIND: if (!out || !out->rtnl_link_ops) return false; - strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + strscpy_pad((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; default: return false; @@ -206,7 +206,7 @@ static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) { - strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ); + strscpy_pad((char *)dest, dev ? dev->name : "", IFNAMSIZ); } static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 5c29915ab028..583885ce7232 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -132,7 +132,8 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, @@ -246,11 +247,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_NAT_FLAGS]) { + if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EOPNOTSUPP; - } return nf_ct_netns_get(ctx->net, family); } diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 70820c66b591..7f61506e5b44 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -23,7 +23,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; struct sk_buff *skb = pkt->skb; - char os_match[NFT_OSF_MAXGENRELEN + 1]; + char os_match[NFT_OSF_MAXGENRELEN]; const struct tcphdr *tcp; struct nf_osf_data data; struct tcphdr _tcph; @@ -45,7 +45,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, } if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { - strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); + strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); } else { if (priv->flags & NFT_OSF_F_VERSION) snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", @@ -53,7 +53,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, else strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); - strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN); + strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN); } } diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index a70196ffcb1e..a58bd8d291ff 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -22,7 +22,8 @@ struct nft_redir { static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_REDIR_FLAGS] = { .type = NLA_U32 }, + [NFTA_REDIR_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_redir_validate(const struct nft_ctx *ctx, @@ -68,11 +69,8 @@ static int nft_redir_init(const struct nft_ctx 
*ctx, priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_REDIR_FLAGS]) { + if (tb[NFTA_REDIR_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } return nf_ct_netns_get(ctx->net, ctx->family); } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 470282cf3fae..21624d68314f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -768,7 +768,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, m->u.user.match_size = msize; strscpy(name, match->name, sizeof(name)); module_put(match->me); - strncpy(m->u.user.name, name, sizeof(m->u.user.name)); + strscpy_pad(m->u.user.name, name, sizeof(m->u.user.name)); *size += off; *dstptr += msize; @@ -1148,7 +1148,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, t->u.user.target_size = tsize; strscpy(name, target->name, sizeof(name)); module_put(target->me); - strncpy(t->u.user.name, name, sizeof(t->u.user.name)); + strscpy_pad(t->u.user.name, name, sizeof(t->u.user.name)); *size += off; *dstptr += tsize; @@ -2014,4 +2014,3 @@ static void __exit xt_fini(void) module_init(xt_init); module_exit(xt_fini); - diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 68ccbe50bb1e..5d1fb7018dba 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -29,7 +29,7 @@ if (tbl == NULL) \ return NULL; \ term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ - strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + strscpy_pad(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h index 85d7ecb05728..9518ab56ec98 100644 --- a/net/netlabel/netlabel_cipso_v4.h +++ b/net/netlabel/netlabel_cipso_v4.h @@ -149,7 +149,4 @@ enum { /* NetLabel protocol functions */ int netlbl_cipsov4_genl_init(void); -/* Free the memory associated with a CIPSOv4 DOI definition */ -void netlbl_cipsov4_doi_free(struct rcu_head *entry); - #endif diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 383631873748..642b9d382fb4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -84,7 +84,7 @@ struct listeners { static inline int netlink_is_kernel(struct sock *sk) { - return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; + return nlk_test_bit(KERNEL_SOCKET, sk); } struct netlink_table *nl_table __read_mostly; @@ -349,9 +349,7 @@ static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, static void netlink_overrun(struct sock *sk) { - struct netlink_sock *nlk = nlk_sk(sk); - - if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) { + if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; @@ -677,6 +675,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; @@ -704,6 +703,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; + release = nl_table[protocol].release; 
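/*
 * The new "release" callback threaded through netlink_kernel_cfg,
 * nl_table and netlink_sock above gives a kernel-side protocol a hook
 * at socket teardown, invoked with the socket's multicast group bitmap
 * before unbind notifications run. Hedged sketch of how a protocol
 * would opt in (the callback body is hypothetical):
 */
static void my_proto_release(struct sock *sk, unsigned long *groups)
{
	/* drop any per-socket state keyed on sk or its subscribed groups */
}

static struct netlink_kernel_cfg cfg = {
	.groups  = 32,
	.release = my_proto_release,
};
/* ... then passed to __netlink_kernel_create() as usual */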
netlink_unlock_table(); if (err < 0) @@ -719,6 +719,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; + nlk->netlink_release = release; out: return err; @@ -763,6 +764,8 @@ static int netlink_release(struct socket *sock) * OK. Socket is unlinked, any packets that arrive now * will be purged. */ + if (nlk->netlink_release) + nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock @@ -1402,9 +1405,7 @@ EXPORT_SYMBOL_GPL(netlink_has_listeners); bool netlink_strict_get_check(struct sk_buff *skb) { - const struct netlink_sock *nlk = nlk_sk(NETLINK_CB(skb).sk); - - return nlk->flags & NETLINK_F_STRICT_CHK; + return nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); } EXPORT_SYMBOL_GPL(netlink_strict_get_check); @@ -1432,6 +1433,8 @@ struct netlink_broadcast_data { int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; + int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); + void *tx_data; }; static void do_one_broadcast(struct sock *sk, @@ -1448,7 +1451,7 @@ static void do_one_broadcast(struct sock *sk, return; if (!net_eq(sock_net(sk), p->net)) { - if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + if (!nlk_test_bit(LISTEN_ALL_NSID, sk)) return; if (!peernet_has_id(sock_net(sk), p->net)) @@ -1481,10 +1484,17 @@ static void do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; - if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) + if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; goto out; } + + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + kfree_skb(p->skb2); + p->skb2 = NULL; + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; @@ -1496,7 +1506,7 @@ static void do_one_broadcast(struct sock *sk, val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); - if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) + if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; } else { p->congested |= val; @@ -1507,8 +1517,12 @@ out: sock_put(sk); } -int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, - u32 group, gfp_t allocation) +int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, + u32 portid, + u32 group, gfp_t allocation, + int (*filter)(struct sock *dsk, + struct sk_buff *skb, void *data), + void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; @@ -1527,6 +1541,8 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, info.allocation = allocation; info.skb = skb; info.skb2 = NULL; + info.tx_filter = filter; + info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ @@ -1552,6 +1568,14 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, } return -ESRCH; } +EXPORT_SYMBOL(netlink_broadcast_filtered); + +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, + u32 group, gfp_t allocation) +{ + return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, + NULL, NULL); +} EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { @@ -1576,7 +1600,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) !test_bit(p->group - 1, nlk->groups)) goto out; - if (p->code == ENOBUFS && nlk->flags & 
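/*
 * netlink_broadcast() becomes a thin wrapper around
 * netlink_broadcast_filtered(), which lets the sender veto delivery per
 * destination socket: a non-zero return from the filter drops the clone
 * for that subscriber only. Hedged usage sketch (the filter policy and
 * MY_GROUP are illustrative, not from the patch):
 */
static int my_filter(struct sock *dsk, struct sk_buff *skb, void *data)
{
	/* non-zero: skip this subscriber */
	return !ns_capable(sock_net(dsk)->user_ns, CAP_NET_ADMIN);
}

netlink_broadcast_filtered(ssk, skb, 0, MY_GROUP, GFP_KERNEL,
			   my_filter, NULL);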
NETLINK_F_RECV_NO_ENOBUFS) { + if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) { ret = 1; goto out; } @@ -1629,10 +1653,7 @@ static void netlink_update_socket_mc(struct netlink_sock *nlk, old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; - if (new) - __set_bit(group - 1, nlk->groups); - else - __clear_bit(group - 1, nlk->groups); + __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } @@ -1643,7 +1664,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; - int err; + int nr = -1; if (level != SOL_NETLINK) return -ENOPROTOOPT; @@ -1654,14 +1675,12 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: - if (val) - nlk->flags |= NETLINK_F_RECV_PKTINFO; - else - nlk->flags &= ~NETLINK_F_RECV_PKTINFO; - err = 0; + nr = NETLINK_F_RECV_PKTINFO; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { + int err; + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); @@ -1681,61 +1700,38 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); - err = 0; break; } case NETLINK_BROADCAST_ERROR: - if (val) - nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR; - else - nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR; - err = 0; + nr = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: + assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val); if (val) { - nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS; clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); - } else { - nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS; } - err = 0; break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; - - if (val) - nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; - else - nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; - err = 0; + nr = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: - if (val) - nlk->flags |= NETLINK_F_CAP_ACK; - else - nlk->flags &= ~NETLINK_F_CAP_ACK; - err = 0; + nr = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: - if (val) - nlk->flags |= NETLINK_F_EXT_ACK; - else - nlk->flags &= ~NETLINK_F_EXT_ACK; - err = 0; + nr = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: - if (val) - nlk->flags |= NETLINK_F_STRICT_CHK; - else - nlk->flags &= ~NETLINK_F_STRICT_CHK; - err = 0; + nr = NETLINK_F_STRICT_CHK; break; default: - err = -ENOPROTOOPT; + return -ENOPROTOOPT; } - return err; + if (nr >= 0) + assign_bit(nr, &nlk->flags, val); + return 0; } static int netlink_getsockopt(struct socket *sock, int level, int optname, @@ -1802,7 +1798,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; len = sizeof(int); - val = nlk->flags & flag ? 
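/*
 * The wider netlink flag rework in one picture: NETLINK_F_* values
 * change from mask constants (0x1, 0x2, ...) to bit numbers (0, 1, ...)
 * stored in an unsigned long, so readers and writers can use atomic
 * bitops instead of racy read-modify-write on a u32 shared between
 * setsockopt and datapath code:
 *
 *	nlk->flags |= NETLINK_F_CAP_ACK;		  (before: RMW)
 *	assign_bit(NETLINK_F_CAP_ACK, &nlk->flags, val);  (after: atomic)
 *
 * assign_bit(nr, addr, value) is set_bit() when value is true and
 * clear_bit() otherwise, which is why the setsockopt switch can now
 * collapse every boolean option into a single "nr" assignment.
 */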
1 : 0; + val = test_bit(flag, &nlk->flags); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) @@ -1979,9 +1975,9 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*addr); } - if (nlk->flags & NETLINK_F_RECV_PKTINFO) + if (nlk_test_bit(RECV_PKTINFO, sk)) netlink_cmsg_recv_pktinfo(msg, skb); - if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + if (nlk_test_bit(LISTEN_ALL_NSID, sk)) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); @@ -2058,7 +2054,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, goto out_sock_release; nlk = nlk_sk(sk); - nlk->flags |= NETLINK_F_KERNEL_SOCKET; + set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags); netlink_table_grab(); if (!nl_table[unit].registered) { @@ -2069,6 +2065,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; + nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; @@ -2192,7 +2189,7 @@ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); - if (extack->_msg && nlk->flags & NETLINK_F_EXT_ACK) { + if (extack->_msg && test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) nlmsg_end(skb, nlh); @@ -2321,8 +2318,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { - struct netlink_sock *nlk, *nlk2; struct netlink_callback *cb; + struct netlink_sock *nlk; struct sock *sk; int ret; @@ -2357,8 +2354,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; - nlk2 = nlk_sk(NETLINK_CB(skb).sk); - cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK); + cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); if (control->start) { cb->extack = control->extack; @@ -2402,7 +2398,7 @@ netlink_ack_tlv_len(struct netlink_sock *nlk, int err, { size_t tlvlen; - if (!extack || !(nlk->flags & NETLINK_F_EXT_ACK)) + if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) return 0; tlvlen = 0; @@ -2474,7 +2470,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, * requests to cap the error message, and get extra error data if * requested. 
*/ - if (err && !(nlk->flags & NETLINK_F_CAP_ACK)) + if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 90a3198a9b7f..2145979b9986 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -8,14 +8,16 @@ #include <net/sock.h> /* flags */ -#define NETLINK_F_KERNEL_SOCKET 0x1 -#define NETLINK_F_RECV_PKTINFO 0x2 -#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_F_RECV_NO_ENOBUFS 0x8 -#define NETLINK_F_LISTEN_ALL_NSID 0x10 -#define NETLINK_F_CAP_ACK 0x20 -#define NETLINK_F_EXT_ACK 0x40 -#define NETLINK_F_STRICT_CHK 0x80 +enum { + NETLINK_F_KERNEL_SOCKET, + NETLINK_F_RECV_PKTINFO, + NETLINK_F_BROADCAST_SEND_ERROR, + NETLINK_F_RECV_NO_ENOBUFS, + NETLINK_F_LISTEN_ALL_NSID, + NETLINK_F_CAP_ACK, + NETLINK_F_EXT_ACK, + NETLINK_F_STRICT_CHK, +}; #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) @@ -23,10 +25,10 @@ struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; + unsigned long flags; u32 portid; u32 dst_portid; u32 dst_group; - u32 flags; u32 subscriptions; u32 ngroups; unsigned long *groups; @@ -42,6 +44,8 @@ struct netlink_sock { void (*netlink_rcv)(struct sk_buff *skb); int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); + void (*netlink_release)(struct sock *sk, + unsigned long *groups); struct module *module; struct rhash_head node; @@ -54,6 +58,8 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } +#define nlk_test_bit(nr, sk) test_bit(NETLINK_F_##nr, &nlk_sk(sk)->flags) + struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; @@ -64,6 +70,8 @@ struct netlink_table { struct module *module; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release)(struct sock *sk, + unsigned long *groups); int registered; }; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index e4f21b1067bc..9c4f231be275 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -27,15 +27,15 @@ static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb) if (nlk->cb_running) flags |= NDIAG_FLAG_CB_RUNNING; - if (nlk->flags & NETLINK_F_RECV_PKTINFO) + if (nlk_test_bit(RECV_PKTINFO, sk)) flags |= NDIAG_FLAG_PKTINFO; - if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) + if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) flags |= NDIAG_FLAG_BROADCAST_ERROR; - if (nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) + if (nlk_test_bit(RECV_NO_ENOBUFS, sk)) flags |= NDIAG_FLAG_NO_ENOBUFS; - if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + if (nlk_test_bit(LISTEN_ALL_NSID, sk)) flags |= NDIAG_FLAG_LISTEN_ALL_NSID; - if (nlk->flags & NETLINK_F_CAP_ACK) + if (nlk_test_bit(CAP_ACK, sk)) flags |= NDIAG_FLAG_CAP_ACK; return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index a157247a1e45..8315d31b53db 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -52,6 +52,18 @@ static void genl_unlock_all(void) up_write(&cb_lock); } +static void genl_op_lock(const struct genl_family *family) +{ + if (!family->parallel_ops) + genl_lock(); +} + +static void genl_op_unlock(const struct genl_family *family) +{ + if (!family->parallel_ops) + genl_unlock(); +} + static DEFINE_IDR(genl_fam_idr); /* @@ -593,8 +605,12 @@ static int 
genl_validate_ops(const struct genl_family *family) return -EINVAL; /* Check sort order */ - if (a->cmd < b->cmd) + if (a->cmd < b->cmd) { continue; + } else if (a->cmd > b->cmd) { + WARN_ON(1); + return -EINVAL; + } if (a->internal_flags != b->internal_flags || ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | @@ -828,64 +844,63 @@ static int genl_start(struct netlink_callback *cb) genl_family_rcv_msg_attrs_free(attrs); return -ENOMEM; } - info->family = ctx->family; info->op = *ops; - info->attrs = attrs; + info->info.family = ctx->family; + info->info.snd_seq = cb->nlh->nlmsg_seq; + info->info.snd_portid = NETLINK_CB(cb->skb).portid; + info->info.nlhdr = cb->nlh; + info->info.genlhdr = nlmsg_data(cb->nlh); + info->info.attrs = attrs; + genl_info_net_set(&info->info, sock_net(cb->skb->sk)); + info->info.extack = cb->extack; + memset(&info->info.user_ptr, 0, sizeof(info->info.user_ptr)); cb->data = info; if (ops->start) { - if (!ctx->family->parallel_ops) - genl_lock(); + genl_op_lock(ctx->family); rc = ops->start(cb); - if (!ctx->family->parallel_ops) - genl_unlock(); + genl_op_unlock(ctx->family); } if (rc) { - genl_family_rcv_msg_attrs_free(info->attrs); + genl_family_rcv_msg_attrs_free(info->info.attrs); genl_dumpit_info_free(info); cb->data = NULL; } return rc; } -static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - const struct genl_split_ops *ops = &genl_dumpit_info(cb)->op; + struct genl_dumpit_info *dump_info = cb->data; + const struct genl_split_ops *ops = &dump_info->op; + struct genl_info *info = &dump_info->info; int rc; - genl_lock(); + info->extack = cb->extack; + + genl_op_lock(info->family); rc = ops->dumpit(skb, cb); - genl_unlock(); + genl_op_unlock(info->family); return rc; } -static int genl_lock_done(struct netlink_callback *cb) +static int genl_done(struct netlink_callback *cb) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); - const struct genl_split_ops *ops = &info->op; + struct genl_dumpit_info *dump_info = cb->data; + const struct genl_split_ops *ops = &dump_info->op; + struct genl_info *info = &dump_info->info; int rc = 0; + info->extack = cb->extack; + if (ops->done) { - genl_lock(); + genl_op_lock(info->family); rc = ops->done(cb); - genl_unlock(); + genl_op_unlock(info->family); } genl_family_rcv_msg_attrs_free(info->attrs); - genl_dumpit_info_free(info); - return rc; -} - -static int genl_parallel_done(struct netlink_callback *cb) -{ - const struct genl_dumpit_info *info = genl_dumpit_info(cb); - const struct genl_split_ops *ops = &info->op; - int rc = 0; - - if (ops->done) - rc = ops->done(cb); - genl_family_rcv_msg_attrs_free(info->attrs); - genl_dumpit_info_free(info); + genl_dumpit_info_free(dump_info); return rc; } @@ -897,6 +912,14 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, int hdrlen, struct net *net) { struct genl_start_context ctx; + struct netlink_dump_control c = { + .module = family->module, + .data = &ctx, + .start = genl_start, + .dump = genl_dumpit, + .done = genl_done, + .extack = extack, + }; int err; ctx.family = family; @@ -905,31 +928,9 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, ctx.ops = ops; ctx.hdrlen = hdrlen; - if (!family->parallel_ops) { - struct netlink_dump_control c = { - .module = family->module, - .data = &ctx, - .start = genl_start, - .dump = genl_lock_dumpit, - .done = genl_lock_done, - .extack = extack, - }; - - genl_unlock(); - err = 
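/*
 * With genl_op_lock()/genl_op_unlock() absorbing the
 * "if (!family->parallel_ops) genl_lock()" boilerplate, the dump path
 * collapses from two netlink_dump_control variants (locked and
 * parallel) into the single genl_start/genl_dumpit/genl_done triple
 * above, and the dumpit info now embeds a fully populated genl_info.
 * A dump callback written against this layout can do, for instance:
 *
 *	const struct genl_dumpit_info *di = genl_dumpit_info(cb);
 *	struct nlattr **attrs = di->info.attrs;	  (formerly di->attrs)
 *
 * which is exactly the adjustment visible in net/nfc/netlink.c below.
 */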
__netlink_dump_start(net->genl_sock, skb, nlh, &c); - genl_lock(); - } else { - struct netlink_dump_control c = { - .module = family->module, - .data = &ctx, - .start = genl_start, - .dump = ops->dumpit, - .done = genl_parallel_done, - .extack = extack, - }; - - err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - } + genl_op_unlock(family); + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_op_lock(family); return err; } @@ -953,9 +954,9 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; + info.family = family; info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); - info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; info.extack = extack; genl_info_net_set(&info, net); @@ -1061,13 +1062,9 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (family == NULL) return -ENOENT; - if (!family->parallel_ops) - genl_lock(); - + genl_op_lock(family); err = genl_family_rcv_msg(family, skb, nlh, extack); - - if (!family->parallel_ops) - genl_unlock(); + genl_op_unlock(family); return err; } @@ -1392,7 +1389,7 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; - struct nlattr **tb = info->attrs; + struct nlattr **tb = info->info.attrs; const struct genl_family *rt; struct genl_op_iter i; int err; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index e9ac6a6f934e..aa1dbf654c3e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -110,10 +110,10 @@ static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) struct nfc_dev *dev; u32 idx; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + if (!info->info.attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); - idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + idx = nla_get_u32(info->info.attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index cab1e02b63e0..fd66014d8a76 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -27,6 +27,7 @@ #include <net/sctp/checksum.h> #include "datapath.h" +#include "drop.h" #include "flow.h" #include "conntrack.h" #include "vport.h" @@ -781,7 +782,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct vport *vport = data->vport; if (skb_cow_head(skb, data->l2_len) < 0) { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); return -ENOMEM; } @@ -852,6 +853,7 @@ static void ovs_fragment(struct net *net, struct vport *vport, struct sk_buff *skb, u16 mru, struct sw_flow_key *key) { + enum ovs_drop_reason reason; u16 orig_network_offset = 0; if (eth_p_mpls(skb->protocol)) { @@ -861,6 +863,7 @@ static void ovs_fragment(struct net *net, struct vport *vport, if (skb_network_offset(skb) > MAX_L2_LEN) { OVS_NLERR(1, "L2 header too long to fragment"); + reason = OVS_DROP_FRAG_L2_TOO_LONG; goto err; } @@ -901,12 +904,13 @@ static void ovs_fragment(struct net *net, struct vport *vport, WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", ovs_vport_name(vport), ntohs(key->eth.type), mru, vport->dev->mtu); + reason = OVS_DROP_FRAG_INVALID_PROTO; goto err; } return; err: - kfree_skb(skb); + ovs_kfree_skb_reason(skb, reason); } static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, @@ -933,10 +937,10 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int 
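/*
 * struct genl_info also loses its cached userhdr pointer in this
 * series; callers switch to an accessor that recomputes the family
 * header from the genl header. Sketch of that accessor as implied by
 * the removed "info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN" line (the
 * exact helper lives in include/net/genetlink.h):
 *
 *	static inline void *genl_info_userhdr(const struct genl_info *info)
 *	{
 *		return (u8 *)info->genlhdr + GENL_HDRLEN;
 *	}
 *
 * Every "ovs_header = info->userhdr" in openvswitch below becomes
 * genl_info_userhdr(info) accordingly.
 */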
out_port, ovs_fragment(net, vport, skb, mru, key); } else { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); } } else { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY); } } @@ -1010,7 +1014,7 @@ static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, return clone_execute(dp, skb, key, 0, nla_data(actions), nla_len(actions), true, false); - consume_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_IP_TTL); return 0; } @@ -1036,7 +1040,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, if ((arg->probability != U32_MAX) && (!arg->probability || get_random_u32() > arg->probability)) { if (last) - consume_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_LAST_ACTION); return 0; } @@ -1297,6 +1301,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, if (trace_ovs_do_execute_action_enabled()) trace_ovs_do_execute_action(dp, skb, key, a, rem); + /* Actions that rightfully have to consume the skb should do it + * and return directly. + */ switch (nla_type(a)) { case OVS_ACTION_ATTR_OUTPUT: { int port = nla_get_u32(a); @@ -1332,6 +1339,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, output_userspace(dp, skb, key, a, attr, len, OVS_CB(skb)->cutlen); OVS_CB(skb)->cutlen = 0; + if (nla_is_last(a, rem)) { + consume_skb(skb); + return 0; + } break; case OVS_ACTION_ATTR_HASH: @@ -1446,7 +1457,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_METER: if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) { - consume_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_METER); return 0; } break; @@ -1477,15 +1488,24 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return dec_ttl_exception_handler(dp, skb, key, a); break; + + case OVS_ACTION_ATTR_DROP: { + enum ovs_drop_reason reason = nla_get_u32(a) + ? OVS_DROP_EXPLICIT_WITH_ERROR + : OVS_DROP_EXPLICIT; + + ovs_kfree_skb_reason(skb, reason); + return 0; + } } if (unlikely(err)) { - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_ACTION_ERROR); return err; } } - consume_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_LAST_ACTION); return 0; } @@ -1547,7 +1567,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, /* Out of per CPU action FIFO space. Drop the 'skb' and * log an error. 
*/ - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_DEFERRED_LIMIT); if (net_ratelimit()) { if (actions) { /* Sample action */ @@ -1599,7 +1619,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_RECURSION_LIMIT); err = -ENETDOWN; goto out; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 331730fd3580..0b9a785dea45 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -29,6 +29,7 @@ #include <net/netfilter/nf_conntrack_act_ct.h> #include "datapath.h" +#include "drop.h" #include "conntrack.h" #include "flow.h" #include "flow_netlink.h" @@ -455,45 +456,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key, return 0; } -static struct nf_conntrack_expect * -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, - u16 proto, const struct sk_buff *skb) -{ - struct nf_conntrack_tuple tuple; - struct nf_conntrack_expect *exp; - - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) - return NULL; - - exp = __nf_ct_expect_find(net, zone, &tuple); - if (exp) { - struct nf_conntrack_tuple_hash *h; - - /* Delete existing conntrack entry, if it clashes with the - * expectation. This can happen since conntrack ALGs do not - * check for clashes between (new) expectations and existing - * conntrack entries. nf_conntrack_in() will check the - * expectations only if a conntrack entry can not be found, - * which can lead to OVS finding the expectation (here) in the - * init direction, but which will not be removed by the - * nf_conntrack_in() call, if a matching conntrack entry is - * found instead. In this case all init direction packets - * would be reported as new related packets, while reply - * direction packets would be reported as un-related - * established packets. - */ - h = nf_conntrack_find_get(net, zone, &tuple); - if (h) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - nf_ct_delete(ct, 0, 0); - nf_ct_put(ct); - } - } - - return exp; -} - /* This replicates logic from nf_conntrack_core.c that is not exported. */ static enum ip_conntrack_info ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) @@ -852,36 +814,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { - struct nf_conntrack_expect *exp; - - /* If we pass an expected packet through nf_conntrack_in() the - * expectation is typically removed, but the packet could still be - * lost in upcall processing. To prevent this from happening we - * perform an explicit expectation lookup. Expected connections are - * always new, and will be passed through conntrack only when they are - * committed, as it is OK to remove the expectation at that time. - */ - exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); - if (exp) { - u8 state; - - /* NOTE: New connections are NATted and Helped only when - * committed, so we are not calling into NAT here. 
- */ - state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; - __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - struct nf_conn *ct; - int err; + struct nf_conn *ct; + int err; - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; - ct = (struct nf_conn *)skb_nfct(skb); - if (ct) - nf_ct_deliver_cached_events(ct); - } + ct = (struct nf_conn *)skb_nfct(skb); + if (ct) + nf_ct_deliver_cached_events(ct); return 0; } @@ -1094,7 +1036,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, skb_push_rcsum(skb, nh_ofs); if (err) - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_CONNTRACK); return err; } @@ -1460,7 +1402,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + if (ct_info.commit) + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); @@ -1662,7 +1605,7 @@ static struct sk_buff * ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd, struct ovs_header **ovs_reply_header) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *skb; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 3d7a91e64c88..11c69415c605 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -41,6 +41,7 @@ #include <net/pkt_cls.h> #include "datapath.h" +#include "drop.h" #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" @@ -589,7 +590,7 @@ out: static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; @@ -966,7 +967,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow *flow = NULL, *new_flow; struct sw_flow_mask mask; struct sk_buff *reply; @@ -1213,7 +1214,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow_key key; struct sw_flow *flow; struct sk_buff *reply = NULL; @@ -1314,7 +1315,7 @@ error: static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; @@ -1373,7 +1374,7 @@ unlock: static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; @@ -1641,7 +1642,7 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, { struct datapath *dp; - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, + dp = 
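/*
 * Cross-reference for the ovs_ct_expect_find() removal above: it is
 * enabled by the nf_ct_find_expectation() "unlink" argument added
 * earlier in this series. OVS conntrack templates are now marked
 * IPS_CONFIRMED only when the action commits, so a lookup through an
 * unconfirmed template no longer steals (unlinks) the expectation, and
 * OVS can drop both its private pre-lookup and the clash-removal
 * workaround that came with it.
 */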
lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); if (IS_ERR(dp)) return; @@ -1934,7 +1935,8 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; @@ -1967,7 +1969,8 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; @@ -2002,7 +2005,8 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); goto err_unlock_free; @@ -2245,7 +2249,7 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct vport_parms parms; struct sk_buff *reply; struct vport *vport; @@ -2347,7 +2351,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; @@ -2403,7 +2407,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; @@ -2446,7 +2450,7 @@ exit_unlock_free: static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *reply; struct vport *vport; int err; @@ -2702,6 +2706,17 @@ static struct pernet_operations ovs_net_ops = { .size = sizeof(struct ovs_net), }; +static const char * const ovs_drop_reasons[] = { +#define S(x) (#x), + OVS_DROP_REASONS(S) +#undef S +}; + +static struct drop_reason_list drop_reason_list_ovs = { + .reasons = ovs_drop_reasons, + .n_reasons = ARRAY_SIZE(ovs_drop_reasons), +}; + static int __init dp_init(void) { int err; @@ -2743,6 +2758,9 @@ static int __init dp_init(void) if (err < 0) goto error_unreg_netdev; + drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + &drop_reason_list_ovs); + return 0; error_unreg_netdev: @@ -2769,6 +2787,7 @@ static void dp_cleanup(void) ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); + drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH); rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); diff --git a/net/openvswitch/drop.h b/net/openvswitch/drop.h new file mode 100644 index 000000000000..cedf9b7b5796 --- /dev/null +++ b/net/openvswitch/drop.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ 
+/* + * OpenvSwitch drop reason list. + */ + +#ifndef OPENVSWITCH_DROP_H +#define OPENVSWITCH_DROP_H +#include <linux/skbuff.h> +#include <net/dropreason.h> + +#define OVS_DROP_REASONS(R) \ + R(OVS_DROP_LAST_ACTION) \ + R(OVS_DROP_ACTION_ERROR) \ + R(OVS_DROP_EXPLICIT) \ + R(OVS_DROP_EXPLICIT_WITH_ERROR) \ + R(OVS_DROP_METER) \ + R(OVS_DROP_RECURSION_LIMIT) \ + R(OVS_DROP_DEFERRED_LIMIT) \ + R(OVS_DROP_FRAG_L2_TOO_LONG) \ + R(OVS_DROP_FRAG_INVALID_PROTO) \ + R(OVS_DROP_CONNTRACK) \ + R(OVS_DROP_IP_TTL) \ + /* deliberate comment for trailing \ */ + +enum ovs_drop_reason { + __OVS_DROP_REASON = SKB_DROP_REASON_SUBSYS_OPENVSWITCH << + SKB_DROP_REASON_SUBSYS_SHIFT, +#define ENUM(x) x, + OVS_DROP_REASONS(ENUM) +#undef ENUM + + OVS_DROP_MAX, +}; + +static inline void +ovs_kfree_skb_reason(struct sk_buff *skb, enum ovs_drop_reason reason) +{ + kfree_skb_reason(skb, (u32)reason); +} + +#endif /* OPENVSWITCH_DROP_H */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 41116361433d..88965e2068ac 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -38,6 +38,7 @@ #include <net/tun_proto.h> #include <net/erspan.h> +#include "drop.h" #include "flow_netlink.h" struct ovs_len_tbl { @@ -61,6 +62,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_RECIRC: case OVS_ACTION_ATTR_TRUNC: case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_DROP: break; case OVS_ACTION_ATTR_CT: @@ -2394,7 +2396,7 @@ static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len) /* Whenever new actions are added, the need to update this * function should be considered. */ - BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 23); + BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 24); if (!actions) return; @@ -3182,6 +3184,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls), [OVS_ACTION_ATTR_DEC_TTL] = (u32)-1, + [OVS_ACTION_ATTR_DROP] = sizeof(u32), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3453,6 +3456,11 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_DROP: + if (!nla_is_last(a, rem)) + return -EINVAL; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index c4ebf810e4b1..cc08e0403909 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -211,7 +211,7 @@ ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, struct ovs_header **ovs_reply_header) { struct sk_buff *skb; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) @@ -272,7 +272,7 @@ error: static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; struct sk_buff *reply; @@ -409,7 +409,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) struct dp_meter *meter, *old_meter; struct sk_buff *reply; struct ovs_header *ovs_reply_header; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct dp_meter_table *meter_tbl; struct datapath *dp; 
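/*
 * OVS_DROP_REASONS(R) above is a classic X-macro: one list is expanded
 * once with ENUM to build enum ovs_drop_reason and once with S in
 * datapath.c to build the string table handed to
 * drop_reasons_register_subsys(). The enum is offset by
 * SKB_DROP_REASON_SUBSYS_OPENVSWITCH << SKB_DROP_REASON_SUBSYS_SHIFT so
 * the core can route a reason value back to the owning subsystem's
 * strings. Self-contained illustration of the X-macro technique,
 * outside the kernel:
 */
#include <stdio.h>

#define COLORS(X) X(RED) X(GREEN) X(BLUE)

enum color {
#define AS_ENUM(x) COLOR_##x,
	COLORS(AS_ENUM)
#undef AS_ENUM
};

static const char * const color_names[] = {
#define AS_STRING(x) #x,
	COLORS(AS_STRING)
#undef AS_STRING
};

int main(void)
{
	printf("%s\n", color_names[COLOR_GREEN]);	/* prints GREEN */
	return 0;
}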
int err; @@ -482,7 +482,7 @@ exit_free_meter: static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *meter; @@ -535,7 +535,7 @@ exit_unlock: static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *old_meter; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a2935bd18ed9..8f97648d652f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2931,8 +2931,10 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, if (prepad + len < PAGE_SIZE || !linear) linear = len; + if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - err, 0); + err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 78beb74146e7..41ece61eb57a 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -23,6 +23,8 @@ #define QRTR_EPH_PORT_RANGE \ XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) +#define QRTR_PORT_CTRL_LEGACY 0xffff + /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version @@ -495,6 +497,9 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) goto err; } + if (cb->dst_port == QRTR_PORT_CTRL_LEGACY) + cb->dst_port = QRTR_PORT_CTRL; + if (!size || len != ALIGN(size, 4) + hdrlen) goto err; diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 0f7a729f1a1f..b1db0b519179 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -16,7 +16,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/qrtr.h> -static RADIX_TREE(nodes, GFP_KERNEL); +static DEFINE_XARRAY(nodes); static struct { struct socket *sock; @@ -66,14 +66,14 @@ struct qrtr_server { struct qrtr_node { unsigned int id; - struct radix_tree_root servers; + struct xarray servers; }; static struct qrtr_node *node_get(unsigned int node_id) { struct qrtr_node *node; - node = radix_tree_lookup(&nodes, node_id); + node = xa_load(&nodes, node_id); if (node) return node; @@ -83,8 +83,9 @@ static struct qrtr_node *node_get(unsigned int node_id) return NULL; node->id = node_id; + xa_init(&node->servers); - if (radix_tree_insert(&nodes, node_id, node)) { + if (xa_store(&nodes, node_id, node, GFP_KERNEL)) { kfree(node); return NULL; } @@ -193,40 +194,23 @@ static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv, static int announce_servers(struct sockaddr_qrtr *sq) { - struct radix_tree_iter iter; struct qrtr_server *srv; struct qrtr_node *node; - void __rcu **slot; + unsigned long index; int ret; node = node_get(qrtr_ns.local_node); if (!node) return 0; - rcu_read_lock(); /* Announce the list of servers registered in this node */ - radix_tree_for_each_slot(slot, &node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&node->servers, index, srv) { ret = service_announce_new(sq, srv); if (ret < 0) 
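Note on the packet_alloc_skb() hunk above: the linear part is grown just enough that the paged remainder always fits in MAX_SKB_FRAGS fragments of order PAGE_ALLOC_COSTLY_ORDER. A standalone sketch of the clamp, assuming the common x86-64 defaults (PAGE_SIZE 4096, costly order 3, 17 frags):

```c
/* Standalone sketch of the linear-size clamp above: keep the paged
 * part within MAX_SKB_FRAGS frags of order-PAGE_ALLOC_COSTLY_ORDER.
 * Constants are common x86-64 defaults, stated as assumptions.
 */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE               4096UL
#define PAGE_ALLOC_COSTLY_ORDER 3
#define MAX_SKB_FRAGS           17UL

static size_t clamp_linear(size_t len, size_t linear)
{
	size_t max_paged = MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);

	if (len - linear > max_paged)
		linear = len - max_paged;  /* grow linear so frags suffice */
	return linear;
}

int main(void)
{
	printf("%zu\n", clamp_linear(1024UL * 1024, 4096)); /* 491520 */
	return 0;
}
```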
{ pr_err("failed to announce new service\n"); return ret; } - - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -256,14 +240,17 @@ static struct qrtr_server *server_add(unsigned int service, goto err; /* Delete the old server on the same port */ - old = radix_tree_lookup(&node->servers, port); + old = xa_store(&node->servers, port, srv, GFP_KERNEL); if (old) { - radix_tree_delete(&node->servers, port); - kfree(old); + if (xa_is_err(old)) { + pr_err("failed to add server [0x%x:0x%x] ret:%d\n", + srv->service, srv->instance, xa_err(old)); + goto err; + } else { + kfree(old); + } } - radix_tree_insert(&node->servers, port, srv); - trace_qrtr_ns_server_add(srv->service, srv->instance, srv->node, srv->port); @@ -280,11 +267,11 @@ static int server_del(struct qrtr_node *node, unsigned int port, bool bcast) struct qrtr_server *srv; struct list_head *li; - srv = radix_tree_lookup(&node->servers, port); + srv = xa_load(&node->servers, port); if (!srv) return -ENOENT; - radix_tree_delete(&node->servers, port); + xa_erase(&node->servers, port); /* Broadcast the removal of local servers */ if (srv->node == qrtr_ns.local_node && bcast) @@ -344,13 +331,12 @@ static int ctrl_cmd_hello(struct sockaddr_qrtr *sq) static int ctrl_cmd_bye(struct sockaddr_qrtr *from) { struct qrtr_node *local_node; - struct radix_tree_iter iter; struct qrtr_ctrl_pkt pkt; struct qrtr_server *srv; struct sockaddr_qrtr sq; struct msghdr msg = { }; struct qrtr_node *node; - void __rcu **slot; + unsigned long index; struct kvec iv; int ret; @@ -361,22 +347,9 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) if (!node) return 0; - rcu_read_lock(); /* Advertise removal of this client to all servers of remote node */ - radix_tree_for_each_slot(slot, &node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); + xa_for_each(&node->servers, index, srv) server_del(node, srv->port, true); - rcu_read_lock(); - } - rcu_read_unlock(); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); @@ -387,18 +360,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE); pkt.client.node = cpu_to_le32(from->sq_node); - rcu_read_lock(); - radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; @@ -411,11 +373,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) pr_err("failed to send bye cmd\n"); return ret; } - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -423,7 +381,6 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, unsigned int node_id, unsigned int port) { struct qrtr_node *local_node; - struct radix_tree_iter iter; struct qrtr_lookup *lookup; struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; @@ -432,7 +389,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, struct qrtr_node *node; struct list_head *tmp; struct list_head *li; - void __rcu **slot; + unsigned long index; struct kvec iv; int ret; @@ -477,18 +434,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr 
*from, pkt.client.node = cpu_to_le32(node_id); pkt.client.port = cpu_to_le32(port); - rcu_read_lock(); - radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; @@ -501,11 +447,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, pr_err("failed to send del client cmd\n"); return ret; } - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -576,13 +518,12 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance) { - struct radix_tree_iter node_iter; struct qrtr_server_filter filter; - struct radix_tree_iter srv_iter; struct qrtr_lookup *lookup; + struct qrtr_server *srv; struct qrtr_node *node; - void __rcu **node_slot; - void __rcu **srv_slot; + unsigned long node_idx; + unsigned long srv_idx; /* Accept only local observers */ if (from->sq_node != qrtr_ns.local_node) @@ -601,40 +542,14 @@ static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, filter.service = service; filter.instance = instance; - rcu_read_lock(); - radix_tree_for_each_slot(node_slot, &nodes, &node_iter, 0) { - node = radix_tree_deref_slot(node_slot); - if (!node) - continue; - if (radix_tree_deref_retry(node)) { - node_slot = radix_tree_iter_retry(&node_iter); - continue; - } - node_slot = radix_tree_iter_resume(node_slot, &node_iter); - - radix_tree_for_each_slot(srv_slot, &node->servers, - &srv_iter, 0) { - struct qrtr_server *srv; - - srv = radix_tree_deref_slot(srv_slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - srv_slot = radix_tree_iter_retry(&srv_iter); - continue; - } - + xa_for_each(&nodes, node_idx, node) { + xa_for_each(&node->servers, srv_idx, srv) { if (!server_match(srv, &filter)) continue; - srv_slot = radix_tree_iter_resume(srv_slot, &srv_iter); - - rcu_read_unlock(); lookup_notify(from, srv, true); - rcu_read_lock(); } } - rcu_read_unlock(); /* Empty notification, to indicate end of listing */ lookup_notify(from, NULL, true); diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index ca4c3a667091..d2fdb1529585 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -17,7 +17,6 @@ */ #define RDS_RDMA_REJ_INCOMPAT 1 -int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, diff --git a/net/rds/rds.h b/net/rds/rds.h index d35d1fc39807..dc360252c515 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -863,7 +863,6 @@ int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); -void rds_message_inc_free(struct rds_incoming *inc); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); void rds_message_wait(struct rds_message *rm); @@ -1013,7 +1012,5 @@ void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); 
struct rds_transport *rds_trans_get(int t_type); -int rds_trans_init(void); -void rds_trans_exit(void); #endif diff --git a/net/rds/tcp.h b/net/rds/tcp.h index f8b5930d7b34..053aa7da87ef 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -56,7 +56,6 @@ void rds_tcp_restore_callbacks(struct socket *sock, struct rds_tcp_connection *tc); u32 rds_tcp_write_seq(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); -u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); extern struct rds_transport rds_tcp_transport; void rds_tcp_accept_work(struct sock *sk); int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 4b95cb1ac435..470c70deffe2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -347,8 +347,7 @@ config NET_SCH_FQ_PIE config NET_SCH_INGRESS tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT - select NET_INGRESS - select NET_EGRESS + select NET_XGRESS help Say Y here if you want to use classifiers for incoming and/or outgoing packets. This qdisc doesn't do anything else besides running classifiers, @@ -679,6 +678,7 @@ config NET_EMATCH_IPT config NET_CLS_ACT bool "Actions" select NET_CLS + select NET_XGRESS help Say Y here if you want to use traffic control actions. Actions get attached to classifiers and are invoked after a successful diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index abc71a06d634..7c652d14528b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1238,7 +1238,8 @@ static int tcf_ct_fill_params(struct net *net, } } - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + if (p->ct_action & TCA_CT_ACT_COMMIT) + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; err: nf_ct_put(p->tmpl); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9f0711da9c95..e5314a31f75a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -72,6 +72,7 @@ struct fl_flow_key { struct flow_dissector_key_num_of_vlans num_of_vlans; struct flow_dissector_key_pppoe pppoe; struct flow_dissector_key_l2tpv3 l2tpv3; + struct flow_dissector_key_ipsec ipsec; struct flow_dissector_key_cfm cfm; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ @@ -726,6 +727,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_SPI] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_SPI_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), [TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED }, }; @@ -796,6 +799,24 @@ static void fl_set_key_val(struct nlattr **tb, nla_memcpy(mask, tb[mask_type], len); } +static int fl_set_key_spi(struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + if (key->basic.ip_proto != IPPROTO_ESP && + key->basic.ip_proto != IPPROTO_AH) { + NL_SET_ERR_MSG(extack, + "Protocol must be either ESP or AH"); + return -EINVAL; + } + + fl_set_key_val(tb, &key->ipsec.spi, + TCA_FLOWER_KEY_SPI, + &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK, + sizeof(key->ipsec.spi)); + return 0; +} + static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1895,6 +1916,12 @@ static int fl_set_key(struct net *net, struct nlattr **tb, return ret; } + if (tb[TCA_FLOWER_KEY_SPI]) { + ret = fl_set_key_spi(tb, key, mask, extack); + if (ret) + return ret; + } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -2068,6 +2095,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3); FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPSEC, ipsec); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CFM, cfm); skb_flow_dissector_init(dissector, keys, cnt); @@ -3365,6 +3394,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->l2tpv3.session_id))) goto nla_put_failure; + if (key->ipsec.spi && + fl_dump_key_val(skb, &key->ipsec.spi, TCA_FLOWER_KEY_SPI, + &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK, + sizeof(key->ipsec.spi))) + goto nla_put_failure; + if ((key->basic.ip_proto == IPPROTO_TCP || key->basic.ip_proto == IPPROTO_UDP || key->basic.ip_proto == IPPROTO_SCTP) && diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 6fdba069f6bf..da34fd4c9269 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -502,7 +502,7 @@ META_COLLECTOR(int_sk_lingertime) *err = -1; return; } - dst->value = sk->sk_lingertime / HZ; + dst->value = READ_ONCE(sk->sk_lingertime) / HZ; } META_COLLECTOR(int_sk_err_qlen) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index e35a4e90f4e6..19901e77cd3b 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -17,7 +17,6 @@ struct drr_class { struct Qdisc_class_common common; - unsigned int filter_cnt; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; @@ -150,8 +149,10 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; - if (cl->filter_cnt > 0) + if (qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG(extack, "DRR class is in use"); return -EBUSY; + } sch_tree_lock(sch); @@ -187,8 +188,8 @@ static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, { struct drr_class *cl = drr_find_class(sch, classid); - if (cl != NULL) - cl->filter_cnt++; + if (cl) + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -197,7 +198,7 @@ static void 
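Note on fl_set_key_spi() above: the new IPsec SPI field is copied as a value/mask pair like every other flower key, and classification later reduces to a masked compare. A standalone sketch of that comparison (the kernel stores keys pre-masked; masking both sides here is for clarity, and values are illustrative):

```c
/* Standalone sketch of the masked-key match flower performs for the
 * new SPI field: a packet matches when (value & mask) == (key & mask).
 */
#include <stdio.h>
#include <stdint.h>

struct spi_key { uint32_t spi; };

static int spi_match(struct spi_key key, struct spi_key mask, uint32_t pkt_spi)
{
	return (pkt_spi & mask.spi) == (key.spi & mask.spi);
}

int main(void)
{
	struct spi_key key  = { .spi = 0x00001000 };
	struct spi_key mask = { .spi = 0xffffffff };

	printf("exact:  %d\n", spi_match(key, mask, 0x1000)); /* 1 */
	mask.spi = 0xffff0000;                                /* partial mask */
	printf("masked: %d\n", spi_match(key, mask, 0x1234)); /* 1 */
	return 0;
}
```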
drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static int drr_graft_class(struct Qdisc *sch, unsigned long arg, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 70b0c5873d32..98805303218d 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,7 +116,6 @@ struct hfsc_class { struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ struct tcf_block *block; - unsigned int filter_cnt; /* filter count */ unsigned int level; /* class level in hierarchy */ struct hfsc_sched *sched; /* scheduler data */ @@ -1094,8 +1093,11 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg, struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; - if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root) + if (cl->level > 0 || qdisc_class_in_use(&cl->cl_common) || + cl == &q->root) { + NL_SET_ERR_MSG(extack, "HFSC class in use"); return -EBUSY; + } sch_tree_lock(sch); @@ -1223,7 +1225,7 @@ hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) if (cl != NULL) { if (p != NULL && p->level <= cl->level) return 0; - cl->filter_cnt++; + qdisc_class_get(&cl->cl_common); } return (unsigned long)cl; @@ -1234,7 +1236,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct hfsc_class *cl = (struct hfsc_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->cl_common); } static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 325c29041c7d..0d947414e616 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -102,7 +102,6 @@ struct htb_class { struct tcf_proto __rcu *filter_list; /* class attached filters */ struct tcf_block *block; - int filter_cnt; int level; /* our level (see above) */ unsigned int children; @@ -1710,8 +1709,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, * tc subsys guarantee us that in htb_destroy it holds no class * refs so that we can remove children safely there ? 
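Note on the drr/hfsc/htb/qfq hunks in this area: the per-qdisc filter_cnt fields give way to shared helpers on Qdisc_class_common, so "class still bound by a filter, refuse deletion" becomes one idiom with a proper extack message. A standalone sketch of the logic only (the real helpers live in include/net/sch_generic.h and differ in detail, e.g. overflow handling):

```c
/* Standalone sketch of the consolidated filter-binding counter that
 * replaces the ad-hoc filter_cnt fields above (logic only).
 */
#include <stdio.h>
#include <limits.h>

struct class_common { unsigned int filter_cnt; };

static void class_get(struct class_common *cl)
{
	if (cl->filter_cnt < UINT_MAX)	/* saturate rather than wrap */
		cl->filter_cnt++;
}

static void class_put(struct class_common *cl)
{
	if (cl->filter_cnt)
		cl->filter_cnt--;
}

static int class_in_use(const struct class_common *cl)
{
	return cl->filter_cnt > 0;
}

int main(void)
{
	struct class_common cl = { 0 };

	class_get(&cl);
	printf("in use: %d\n", class_in_use(&cl)); /* 1: delete -> -EBUSY */
	class_put(&cl);
	printf("in use: %d\n", class_in_use(&cl)); /* 0: delete allowed */
	return 0;
}
```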
*/ - if (cl->children || cl->filter_cnt) + if (cl->children || qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG(extack, "HTB class in use"); return -EBUSY; + } if (!cl->level && htb_parent_last_child(cl)) last_child = 1; @@ -1810,10 +1811,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter"); goto failure; } - if (hopt->quantum) { - NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter"); - goto failure; - } } /* Keeping backward compatible with rate_table based iproute2 tc */ @@ -1910,6 +1907,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -1931,6 +1929,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -2017,6 +2016,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -2108,7 +2108,7 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, * be broken by class during destroy IIUC. */ if (cl) - cl->filter_cnt++; + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -2116,8 +2116,7 @@ static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) { struct htb_class *cl = (struct htb_class *)arg; - if (cl) - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index e43a45499372..a463a63192c3 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -13,6 +13,7 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <net/tcx.h> struct ingress_sched_data { struct tcf_block *block; @@ -78,6 +79,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry; + bool created; int err; if (sch->parent != TC_H_INGRESS) @@ -85,7 +88,13 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, net_inc_ingress_queue(); - mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); + entry = tcx_entry_fetch_or_create(dev, true, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, true); q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; @@ -103,11 +112,22 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress); if (sch->parent != TC_H_INGRESS) return; tcf_block_put_ext(q->block, sch, &q->block_info); + + if (entry) { + tcx_miniq_set_active(entry, false); + if (!tcx_entry_is_active(entry)) { + 
tcx_entry_update(dev, NULL, true); + tcx_entry_free(entry); + } + } + net_dec_ingress_queue(); } @@ -223,6 +243,8 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, { struct clsact_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry; + bool created; int err; if (sch->parent != TC_H_CLSACT) @@ -231,7 +253,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, net_inc_ingress_queue(); net_inc_egress_queue(); - mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); + entry = tcx_entry_fetch_or_create(dev, true, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, true); q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; @@ -244,7 +272,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block); - mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); + entry = tcx_entry_fetch_or_create(dev, false, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, false); q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; @@ -256,12 +290,31 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress); + struct bpf_mprog_entry *egress_entry = rtnl_dereference(dev->tcx_egress); if (sch->parent != TC_H_CLSACT) return; - tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); + + if (ingress_entry) { + tcx_miniq_set_active(ingress_entry, false); + if (!tcx_entry_is_active(ingress_entry)) { + tcx_entry_update(dev, NULL, true); + tcx_entry_free(ingress_entry); + } + } + + if (egress_entry) { + tcx_miniq_set_active(egress_entry, false); + if (!tcx_entry_is_active(egress_entry)) { + tcx_entry_update(dev, NULL, false); + tcx_entry_free(egress_entry); + } + } net_dec_ingress_queue(); net_dec_egress_queue(); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 38d9aa0cd30e..4ad39a4a3cf5 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -105,6 +105,11 @@ struct netem_sched_data { u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; + struct prng { + u64 seed; + struct rnd_state prng_state; + } prng; + struct disttable *delay_dist; enum { @@ -179,15 +184,16 @@ static void init_crandom(struct crndstate *state, unsigned long rho) * Next number depends on last value. * rho is scaled to avoid floating point. 
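Note on the sch_ingress/clsact rework above: the miniq moves into a shared tcx bpf_mprog entry that is created on demand and freed only once neither the qdisc nor tcx BPF programs use it. A kernel-context sketch of the attach half, mirroring ingress_init() above (not standalone; calls are the ones visible in the hunks):

```c
/* In-kernel sketch of the tcx entry lifecycle the ingress qdisc now
 * follows: fetch-or-create the shared entry, mark the miniq active,
 * and publish the entry only if this call created it.
 */
#include <net/tcx.h>
#include <net/sch_generic.h>

static int miniq_attach(struct net_device *dev, struct mini_Qdisc_pair *pair,
			struct Qdisc *sch)
{
	struct bpf_mprog_entry *entry;
	bool created;

	entry = tcx_entry_fetch_or_create(dev, true /* ingress */, &created);
	if (!entry)
		return -ENOMEM;
	tcx_miniq_set_active(entry, true);
	mini_qdisc_pair_init(pair, sch, &tcx_entry(entry)->miniq);
	if (created)
		tcx_entry_update(dev, entry, true);
	return 0;
}
```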
*/ -static u32 get_crandom(struct crndstate *state) +static u32 get_crandom(struct crndstate *state, struct prng *p) { u64 value, rho; unsigned long answer; + struct rnd_state *s = &p->prng_state; if (!state || state->rho == 0) /* no correlation */ - return get_random_u32(); + return prandom_u32_state(s); - value = get_random_u32(); + value = prandom_u32_state(s); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; @@ -201,7 +207,7 @@ static u32 get_crandom(struct crndstate *state) static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; - u32 rnd = get_random_u32(); + u32 rnd = prandom_u32_state(&q->prng.prng_state); /* * Makes a comparison between rnd and the transition @@ -266,18 +272,19 @@ static bool loss_4state(struct netem_sched_data *q) static bool loss_gilb_ell(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; + struct rnd_state *s = &q->prng.prng_state; switch (clg->state) { case GOOD_STATE: - if (get_random_u32() < clg->a1) + if (prandom_u32_state(s) < clg->a1) clg->state = BAD_STATE; - if (get_random_u32() < clg->a4) + if (prandom_u32_state(s) < clg->a4) return true; break; case BAD_STATE: - if (get_random_u32() < clg->a2) + if (prandom_u32_state(s) < clg->a2) clg->state = GOOD_STATE; - if (get_random_u32() > clg->a3) + if (prandom_u32_state(s) > clg->a3) return true; } @@ -289,7 +296,7 @@ static bool loss_event(struct netem_sched_data *q) switch (q->loss_model) { case CLG_RANDOM: /* Random packet drop 0 => none, ~0 => all */ - return q->loss && q->loss >= get_crandom(&q->loss_cor); + return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng); case CLG_4_STATES: /* 4state loss model algorithm (used also for GI model) @@ -318,6 +325,7 @@ static bool loss_event(struct netem_sched_data *q) */ static s64 tabledist(s64 mu, s32 sigma, struct crndstate *state, + struct prng *prng, const struct disttable *dist) { s64 x; @@ -327,7 +335,7 @@ static s64 tabledist(s64 mu, s32 sigma, if (sigma == 0) return mu; - rnd = get_crandom(state); + rnd = get_crandom(state, prng); /* default uniform distribution */ if (dist == NULL) @@ -449,7 +457,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb->prev = NULL; /* Random duplication */ - if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) + if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) ++count; /* Drop packet? */ @@ -492,7 +500,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * If packet is going to be hardware checksummed, then * do it now in software before we mangle it. 
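Note on get_crandom() above: it now draws from a private prandom state instead of get_random_u32(), so a fixed TCA_NETEM_PRNG_SEED reproduces the entire correlated loss/delay sequence across runs. A standalone sketch of the weighted-average recurrence, with rand() standing in for prandom_u32_state() and rho scaled to 2^32 as in the kernel:

```c
/* Standalone sketch of netem's correlated generator: each output mixes
 * fresh pseudo-random bits with the previous value, weighted by rho,
 * so seeding the generator makes the whole sequence reproducible.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static uint32_t last;

static uint32_t get_crandom(uint32_t rho32)	/* correlation, scaled to 2^32 */
{
	uint64_t value = (uint32_t)rand();	/* stand-in for prandom_u32_state() */
	uint64_t rho = (uint64_t)rho32 + 1;
	uint32_t answer = (value * ((1ULL << 32) - rho) +
			   (uint64_t)last * rho) >> 32;

	last = answer;
	return answer;
}

int main(void)
{
	srand(42);	/* analogous to TCA_NETEM_PRNG_SEED */
	for (int i = 0; i < 3; i++)
		printf("%u\n", get_crandom(0x80000000u)); /* ~50% correlation */
	return 0;
}
```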
*/ - if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) { if (skb_is_gso(skb)) { skb = netem_segment(skb, sch, to_free); if (!skb) @@ -530,12 +538,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ - q->reorder < get_crandom(&q->reorder_cor)) { + q->reorder < get_crandom(&q->reorder_cor, &q->prng)) { u64 now; s64 delay; delay = tabledist(q->latency, q->jitter, - &q->delay_cor, q->delay_dist); + &q->delay_cor, &q->prng, q->delay_dist); now = ktime_get_ns(); @@ -639,7 +647,7 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) else next_delay = tabledist(q->slot_config.dist_delay, (s32)(q->slot_config.dist_jitter), - NULL, q->slot_dist); + NULL, &q->prng, q->slot_dist); q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; @@ -922,6 +930,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, + [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -1040,6 +1049,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); + if (tb[TCA_NETEM_PRNG_SEED]) + q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); + else + q->prng.seed = get_random_u64(); + prandom_seed_state(&q->prng.prng_state, q->prng.seed); + unlock: sch_tree_unlock(sch); @@ -1203,6 +1218,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; } + if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed, + TCA_NETEM_PAD)) + goto nla_put_failure; + return nla_nest_end(skb, nla); nla_put_failure: diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index befaf74b33ca..1a25752f1a9a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -130,8 +130,6 @@ struct qfq_aggregate; struct qfq_class { struct Qdisc_class_common common; - unsigned int filter_cnt; - struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; @@ -545,8 +543,10 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; - if (cl->filter_cnt > 0) + if (qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG_MOD(extack, "QFQ class in use"); return -EBUSY; + } sch_tree_lock(sch); @@ -580,8 +580,8 @@ static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, { struct qfq_class *cl = qfq_find_class(sch, classid); - if (cl != NULL) - cl->filter_cnt++; + if (cl) + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -590,7 +590,7 @@ static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct qfq_class *cl = (struct qfq_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 8c9cfff7fd05..1cb5e41c0ec7 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2099,11 +2099,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, return -EOPNOTSUPP; } - /* 
pre-allocate qdisc, attachment can't fail */ - q->qdiscs = kcalloc(dev->num_tx_queues, - sizeof(q->qdiscs[0]), + q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); - if (!q->qdiscs) return -ENOMEM; @@ -2145,25 +2142,32 @@ static void taprio_attach(struct Qdisc *sch) /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { - struct Qdisc *qdisc = q->qdiscs[ntx]; - struct Qdisc *old; + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); + struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + struct Qdisc *qdisc = q->qdiscs[ntx]; + + /* In offload mode, the root taprio qdisc is bypassed + * and the netdev TX queues see the children directly + */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; - old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + dev_queue_qdisc = qdisc; } else { - old = dev_graft_qdisc(qdisc->dev_queue, sch); - qdisc_refcount_inc(sch); + /* In software mode, attach the root taprio qdisc + * to all netdev TX queues, so that dev_qdisc_enqueue() + * goes through taprio_enqueue(). + */ + dev_queue_qdisc = sch; } + old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); + /* The qdisc's refcount requires to be elevated once + * for each netdev TX queue it is grafted onto + */ + qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } - - /* access to the child qdiscs is not needed in offload mode */ - if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { - kfree(q->qdiscs); - q->qdiscs = NULL; - } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, @@ -2192,13 +2196,23 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, if (dev->flags & IFF_UP) dev_deactivate(dev); + /* In offload mode, the child Qdisc is directly attached to the netdev + * TX queue, and thus, we need to keep its refcount elevated in order + * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. + * However, save the reference to the new qdisc in the private array in + * both software and offload cases, to have an up-to-date reference to + * our children. 
+ */ + *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { - *old = dev_graft_qdisc(dev_queue, new); - } else { - *old = q->qdiscs[cl - 1]; - q->qdiscs[cl - 1] = new; + WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); + if (new) + qdisc_refcount_inc(new); + if (*old) + qdisc_put(*old); } + q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; @@ -2436,12 +2450,14 @@ start_error: static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx = cl - 1; - if (!dev_queue) + if (ntx >= dev->num_tx_queues) return NULL; - return rtnl_dereference(dev_queue->qdisc_sleeping); + return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) @@ -2456,11 +2472,11 @@ static unsigned long taprio_find(struct Qdisc *sch, u32 classid) static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); - tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; + tcm->tcm_info = child->handle; return 0; } @@ -2470,16 +2486,14 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, __releases(d->lock) __acquires(d->lock) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; - struct Qdisc *child; - child = rtnl_dereference(dev_queue->qdisc_sleeping); if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; diff --git a/net/sctp/input.c b/net/sctp/input.c index 2613c4d74b16..17fcaa9b0df9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -581,7 +581,7 @@ static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, default: return; } - if (!sock_owned_by_user(sk) && inet_sk(sk)->recverr) { + if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 274d07bd774f..2185f44198de 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -360,7 +360,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) ret = inet_addr_type_table(net, addr->v4.sin_addr.s_addr, tb_id); if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && - !sp->inet.freebind && + !inet_test_bit(FREEBIND, sk) && !READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind)) return 0; @@ -435,7 +435,8 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); + fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 76f1bce49a8e..fd0631e70d46 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9482,7 +9482,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, atomic_set(&newinet->inet_id, 
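Note on the taprio refcounting above: the rule is that whichever qdisc a netdev TX queue ends up pointing at carries one reference per queue it is grafted onto — the root taprio itself in software mode, a child qdisc in offload mode — and q->qdiscs now stays valid in both modes. A kernel-context sketch of one grafting step, mirroring taprio_attach() (not standalone):

```c
/* In-kernel sketch of one TX-queue graft as done above: take a
 * per-queue reference on the qdisc handed to the queue, and release
 * the reference held by whatever it displaced.
 */
#include <net/sch_generic.h>

static void attach_one(struct netdev_queue *dev_queue, struct Qdisc *qdisc)
{
	struct Qdisc *old;

	old = dev_graft_qdisc(dev_queue, qdisc);
	qdisc_refcount_inc(qdisc);	/* one ref per TX queue grafted */
	if (old)
		qdisc_put(old);		/* drop the displaced qdisc's ref */
}
```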
get_random_u16()); newinet->uc_ttl = inet->uc_ttl; - newinet->mc_loop = 1; + inet_set_bit(MC_LOOP, newsk); newinet->mc_ttl = 1; newinet->mc_index = 0; newinet->mc_list = NULL; @@ -9732,6 +9732,7 @@ struct proto sctpv6_prot = { .unhash = sctp_unhash, .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), + .ipv6_pinfo_offset = offsetof(struct sctp6_sock, inet6), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - offsetof(struct sctp6_sock, sctp.subscribe) + diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f5834af5fad5..bacdd971615e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -641,20 +641,22 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* optional 2nd link, receive ADD LINK request from server */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, - SMC_LLC_ADD_LINK); - if (!qentry) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - if (rc == -EAGAIN) - rc = 0; /* no DECLINE received, go with one link */ - return rc; + if (link->lgr->max_links > 1) { + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; + } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); } - smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - smc_llc_cli_add_link(link, qentry); return 0; } @@ -1144,7 +1146,7 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, #define SMC_CLC_MAX_ACCEPT_LEN \ (sizeof(struct smc_clc_msg_accept_confirm_v2) + \ - sizeof(struct smc_clc_first_contact_ext) + \ + sizeof(struct smc_clc_first_contact_ext_v2x) + \ sizeof(struct smc_clc_msg_trail)) /* CLC handshake during connect */ @@ -1198,8 +1200,8 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, struct smc_clc_msg_accept_confirm_v2 *clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)aclc; struct smc_clc_first_contact_ext *fce = - (struct smc_clc_first_contact_ext *) - (((u8 *)clc_v2) + sizeof(*clc_v2)); + smc_get_clc_first_contact_ext(clc_v2, false); + int rc; if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) return 0; @@ -1218,6 +1220,12 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, return SMC_CLC_DECL_NOINDIRECT; } } + + ini->release_nr = fce->release; + rc = smc_clc_clnt_v2x_features_validate(fce, ini); + if (rc) + return rc; + return 0; } @@ -1236,6 +1244,8 @@ static int smc_connect_rdma(struct smc_sock *smc, memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); + ini->max_conns = SMC_CONN_PER_LGR_MAX; + ini->max_links = SMC_LINKS_ADD_LNK_MAX; reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); if (reason_code) @@ -1386,6 +1396,16 @@ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm_v2 *aclc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)aclc; + if (ini->first_contact_peer) { + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(aclc_v2, true); + + ini->release_nr = fce->release; + rc = 
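Note on the SCTP hunks above: they are part of the wider conversion of boolean inet_sock fields (recverr, freebind, mc_loop) into bits of a single flags word accessed through helpers, which keeps lockless readers safe. A standalone, deliberately non-atomic sketch of the accessor shape (the kernel helpers use atomic set_bit()/test_bit() on sk->inet_flags):

```c
/* Standalone sketch of the inet_set_bit()/inet_test_bit() conversion:
 * former one-byte struct fields become bits in one flags word.
 * Toy version, non-atomic on purpose.
 */
#include <stdio.h>

enum { INET_FLAG_RECVERR, INET_FLAG_FREEBIND, INET_FLAG_MC_LOOP };

struct inet_flags { unsigned long flags; };

static void inet_set_bit(int nr, struct inet_flags *sk)
{
	sk->flags |= 1UL << nr;
}

static int inet_test_bit(int nr, const struct inet_flags *sk)
{
	return !!(sk->flags & (1UL << nr));
}

int main(void)
{
	struct inet_flags sk = { 0 };

	inet_set_bit(INET_FLAG_MC_LOOP, &sk);
	printf("MC_LOOP=%d RECVERR=%d\n",
	       inet_test_bit(INET_FLAG_MC_LOOP, &sk),
	       inet_test_bit(INET_FLAG_RECVERR, &sk)); /* 1 0 */
	return 0;
}
```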
smc_clc_clnt_v2x_features_validate(fce, ini); + if (rc) + return rc; + } + rc = smc_v2_determine_accepted_chid(aclc_v2, ini); if (rc) return rc; @@ -1420,7 +1440,7 @@ static int smc_connect_ism(struct smc_sock *smc, } rc = smc_clc_send_confirm(smc, ini->first_contact_local, - aclc->hdr.version, eid, NULL); + aclc->hdr.version, eid, ini); if (rc) goto connect_abort; mutex_unlock(&smc_server_lgr_pending); @@ -1820,7 +1840,7 @@ void smc_close_non_accepted(struct sock *sk) lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ - sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; + WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT); __smc_release(smc); release_sock(sk); sock_put(sk); /* sock_hold above */ @@ -1870,10 +1890,12 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - down_write(&link->lgr->llc_conf_mutex); - /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link, NULL); - up_write(&link->lgr->llc_conf_mutex); + if (link->lgr->max_links > 1) { + down_write(&link->lgr->llc_conf_mutex); + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); + up_write(&link->lgr->llc_conf_mutex); + } return 0; } @@ -1996,6 +2018,10 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, } } + ini->release_nr = pclc_v2_ext->hdr.flag.release; + if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE) + ini->release_nr = SMC_RELEASE; + out: if (!ini->smcd_version && !ini->smcr_version) return rc; @@ -2430,6 +2456,10 @@ static void smc_listen_work(struct work_struct *work) if (rc) goto out_decl; + rc = smc_clc_srv_v2x_features_validate(pclc, ini); + if (rc) + goto out_decl; + mutex_lock(&smc_server_lgr_pending); smc_close_init(new_smc); smc_rx_init(new_smc); @@ -2443,7 +2473,7 @@ static void smc_listen_work(struct work_struct *work) /* send SMC Accept CLC message */ accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, - accept_version, ini->negotiated_eid); + accept_version, ini->negotiated_eid, ini); if (rc) goto out_unlock; @@ -2462,6 +2492,18 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } + rc = smc_clc_v2x_features_confirm_check(cclc, ini); + if (rc) { + if (!ini->is_smcd) + goto out_unlock; + goto out_decl; + } + + /* fce smc release version is needed in smc_listen_rdma_finish, + * so save fce info here. 
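Note on the WRITE_ONCE() on sk_lingertime above: it pairs with the READ_ONCE() added to em_meta earlier in this series — the field is written under the socket lock but read locklessly, so both sides need the annotation. A standalone sketch using volatile accesses as rough stand-ins for the kernel macros (HZ of 100 is assumed for the example):

```c
/* Standalone sketch of the READ_ONCE()/WRITE_ONCE() pairing above.
 * Volatile casts approximate the kernel macros for illustration.
 */
#include <stdio.h>

#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile const __typeof__(x) *)&(x))

static unsigned long sk_lingertime;

int main(void)
{
	WRITE_ONCE(sk_lingertime, 30 * 100);	/* writer; HZ=100 assumed */
	printf("linger=%lus\n", READ_ONCE(sk_lingertime) / 100); /* reader */
	return 0;
}
```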
+ */ + smc_conn_save_peer_info_fce(new_smc, cclc); + /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, diff --git a/net/smc/smc.h b/net/smc/smc.h index 1f2b912c43d1..24745fde4ac2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,7 +21,10 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ -#define SMC_RELEASE 0 + +#define SMC_RELEASE_0 0 +#define SMC_RELEASE_1 1 +#define SMC_RELEASE SMC_RELEASE_1 /* the latest release version */ #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index c90d9e5dda54..8deb46c28f1d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -391,9 +391,7 @@ smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) return false; } else { if (hdr->typev1 == SMC_TYPE_D && - ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 && - (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 + - sizeof(struct smc_clc_first_contact_ext))) + ntohs(hdr->length) < SMCD_CLC_ACCEPT_CONFIRM_LEN_V2) return false; if (hdr->typev1 == SMC_TYPE_R && ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) @@ -420,13 +418,29 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) return true; } -static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len) +static int smc_clc_fill_fce(struct smc_clc_first_contact_ext_v2x *fce, + struct smc_init_info *ini) { + int ret = sizeof(*fce); + memset(fce, 0, sizeof(*fce)); - fce->os_type = SMC_CLC_OS_LINUX; - fce->release = SMC_RELEASE; - memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname)); - (*len) += sizeof(*fce); + fce->fce_v2_base.os_type = SMC_CLC_OS_LINUX; + fce->fce_v2_base.release = ini->release_nr; + memcpy(fce->fce_v2_base.hostname, smc_hostname, sizeof(smc_hostname)); + if (ini->is_smcd && ini->release_nr < SMC_RELEASE_1) { + ret = sizeof(struct smc_clc_first_contact_ext); + goto out; + } + + if (ini->release_nr >= SMC_RELEASE_1) { + if (!ini->is_smcd) { + fce->max_conns = ini->max_conns; + fce->max_links = ini->max_links; + } + } + +out: + return ret; } /* check if received message has a correct header length and contains valid @@ -927,8 +941,11 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) sizeof(struct smc_clc_smcd_gid_chid); } } - if (smcr_indicated(ini->smc_type_v2)) + if (smcr_indicated(ini->smc_type_v2)) { memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); + v2_ext->max_conns = SMC_CONN_PER_LGR_PREFER; + v2_ext->max_links = SMC_LINKS_PER_LGR_MAX_PREFER; + } pclc_base->hdr.length = htons(plen); memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -986,13 +1003,13 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, u8 *eid, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct smc_clc_first_contact_ext_v2x fce; struct smc_clc_msg_accept_confirm *clc; - struct smc_clc_first_contact_ext fce; struct smc_clc_fce_gid_ext gle; struct smc_clc_msg_trail trl; + int i, len, fce_len; struct kvec vec[5]; struct msghdr msg; - int i, len; /* send SMC Confirm CLC msg */ clc = (struct smc_clc_msg_accept_confirm *)clc_v2; @@ -1018,8 +1035,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, if (eid && eid[0]) memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; - if (first_contact) - smc_clc_fill_fce(&fce, &len); + if (first_contact) { + fce_len = smc_clc_fill_fce(&fce, ini); + len += fce_len; + } 
clc_v2->hdr.length = htons(len); } memcpy(trl.eyecatcher, SMCD_EYECATCHER, @@ -1063,15 +1082,14 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) { - smc_clc_fill_fce(&fce, &len); - fce.v2_direct = !link->lgr->uses_gateway; - memset(&gle, 0, sizeof(gle)); - if (ini && clc->hdr.type == SMC_CLC_CONFIRM) { + fce_len = smc_clc_fill_fce(&fce, ini); + len += fce_len; + fce.fce_v2_base.v2_direct = !link->lgr->uses_gateway; + if (clc->hdr.type == SMC_CLC_CONFIRM) { + memset(&gle, 0, sizeof(gle)); gle.gid_cnt = ini->smcrv2.gidlist.len; len += sizeof(gle); len += gle.gid_cnt * sizeof(gle.gid[0]); - } else { - len += sizeof(gle.reserved); } } clc_v2->hdr.length = htons(len); @@ -1094,7 +1112,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, sizeof(trl); if (version > SMC_V1 && first_contact) { vec[i].iov_base = &fce; - vec[i++].iov_len = sizeof(fce); + vec[i++].iov_len = fce_len; if (!conn->lgr->is_smcd) { if (clc->hdr.type == SMC_CLC_CONFIRM) { vec[i].iov_base = &gle; @@ -1102,9 +1120,6 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, vec[i].iov_base = &ini->smcrv2.gidlist.list; vec[i++].iov_len = gle.gid_cnt * sizeof(gle.gid[0]); - } else { - vec[i].iov_base = &gle.reserved; - vec[i++].iov_len = sizeof(gle.reserved); } } } @@ -1141,7 +1156,7 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, /* send CLC ACCEPT message across internal TCP socket */ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, - u8 version, u8 *negotiated_eid) + u8 version, u8 *negotiated_eid, struct smc_init_info *ini) { struct smc_clc_msg_accept_confirm_v2 aclc_v2; int len; @@ -1149,13 +1164,95 @@ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, memset(&aclc_v2, 0, sizeof(aclc_v2)); aclc_v2.hdr.type = SMC_CLC_ACCEPT; len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, - version, negotiated_eid, NULL); + version, negotiated_eid, ini); if (len < ntohs(aclc_v2.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; return len > 0 ? 
0 : len; } +int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_v2_extension *pclc_v2_ext; + + ini->max_conns = SMC_CONN_PER_LGR_MAX; + ini->max_links = SMC_LINKS_ADD_LNK_MAX; + + if ((!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) || + ini->release_nr < SMC_RELEASE_1) + return 0; + + pclc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!pclc_v2_ext) + return SMC_CLC_DECL_NOV2EXT; + + if (ini->smcr_version & SMC_V2) { + ini->max_conns = min_t(u8, pclc_v2_ext->max_conns, SMC_CONN_PER_LGR_PREFER); + if (ini->max_conns < SMC_CONN_PER_LGR_MIN) + return SMC_CLC_DECL_MAXCONNERR; + + ini->max_links = min_t(u8, pclc_v2_ext->max_links, SMC_LINKS_PER_LGR_MAX_PREFER); + if (ini->max_links < SMC_LINKS_ADD_LNK_MIN) + return SMC_CLC_DECL_MAXLINKERR; + } + + return 0; +} + +int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, + struct smc_init_info *ini) +{ + struct smc_clc_first_contact_ext_v2x *fce_v2x = + (struct smc_clc_first_contact_ext_v2x *)fce; + + if (ini->release_nr < SMC_RELEASE_1) + return 0; + + if (!ini->is_smcd) { + if (fce_v2x->max_conns < SMC_CONN_PER_LGR_MIN) + return SMC_CLC_DECL_MAXCONNERR; + ini->max_conns = fce_v2x->max_conns; + + if (fce_v2x->max_links > SMC_LINKS_ADD_LNK_MAX || + fce_v2x->max_links < SMC_LINKS_ADD_LNK_MIN) + return SMC_CLC_DECL_MAXLINKERR; + ini->max_links = fce_v2x->max_links; + } + + return 0; +} + +int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)cclc; + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(clc_v2, ini->is_smcd); + struct smc_clc_first_contact_ext_v2x *fce_v2x = + (struct smc_clc_first_contact_ext_v2x *)fce; + + if (cclc->hdr.version == SMC_V1 || + !(cclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return 0; + + if (ini->release_nr != fce->release) + return SMC_CLC_DECL_RELEASEERR; + + if (fce->release < SMC_RELEASE_1) + return 0; + + if (!ini->is_smcd) { + if (fce_v2x->max_conns != ini->max_conns) + return SMC_CLC_DECL_MAXCONNERR; + if (fce_v2x->max_links != ini->max_links) + return SMC_CLC_DECL_MAXLINKERR; + } + + return 0; +} + void smc_clc_get_hostname(u8 **host) { *host = &smc_hostname[0]; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fee545c9a10..c5c8e7db775a 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -45,6 +45,9 @@ #define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ #define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ #define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ +#define SMC_CLC_DECL_RELEASEERR 0x03030009 /* release version negotiate failed */ +#define SMC_CLC_DECL_MAXCONNERR 0x0303000a /* max connections negotiate failed */ +#define SMC_CLC_DECL_MAXLINKERR 0x0303000b /* max links negotiate failed */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -133,7 +136,9 @@ struct smc_clc_smcd_gid_chid { struct smc_clc_v2_extension { struct smc_clnt_opts_area_hdr hdr; u8 roce[16]; /* RoCEv2 GID */ - u8 reserved[16]; + u8 max_conns; + u8 max_links; + u8 reserved[14]; u8 user_eids[][SMC_MAX_EID_LEN]; }; @@ -147,7 +152,9 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ struct 
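Note on smc_clc_srv_v2x_features_validate() above: the server clamps the client's proposed limits to its own preference and declines anything below the protocol minimum. A standalone sketch of the max_conns half (the decline code is symbolic; the constants mirror SMC_CONN_PER_LGR_MIN/PREFER from smc_core.h):

```c
/* Standalone sketch of the SMC-R v2.1 limit negotiation above:
 * clamp the peer's proposal, reject values under the minimum.
 */
#include <stdio.h>
#include <stdint.h>

#define CONN_PER_LGR_MIN    16
#define CONN_PER_LGR_PREFER 255

static int negotiate_max_conns(uint8_t peer, uint8_t *out)
{
	uint8_t conns = peer < CONN_PER_LGR_PREFER ? peer : CONN_PER_LGR_PREFER;

	if (conns < CONN_PER_LGR_MIN)
		return -1;	/* SMC_CLC_DECL_MAXCONNERR in the kernel */
	*out = conns;
	return 0;
}

int main(void)
{
	uint8_t n;

	if (!negotiate_max_conns(100, &n))
		printf("negotiated max_conns=%u\n", n); /* 100 */
	return 0;
}
```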
smc_clc_msg_smcd { /* SMC-D GID information */ struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */ - u8 reserved[28]; + u8 vendor_oui[3]; /* vendor organizationally unique identifier */ + u8 vendor_exp_options[5]; + u8 reserved[20]; }; struct smc_clc_smcd_v2_extension { @@ -231,8 +238,19 @@ struct smc_clc_first_contact_ext { u8 hostname[SMC_MAX_HOSTNAME_LEN]; }; +struct smc_clc_first_contact_ext_v2x { + struct smc_clc_first_contact_ext fce_v2_base; + u8 max_conns; /* for SMC-R only */ + u8 max_links; /* for SMC-R only */ + u8 reserved3[2]; + __be32 vendor_exp_options; + u8 reserved4[8]; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 (Third Edition) + * (https://www.ibm.com/support/pages/node/7009315) + */ + struct smc_clc_fce_gid_ext { - u8 reserved[16]; u8 gid_cnt; u8 reserved2[3]; u8 gid[][SMC_GID_SIZE]; @@ -370,6 +388,27 @@ smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)); } +static inline struct smc_clc_first_contact_ext * +smc_get_clc_first_contact_ext(struct smc_clc_msg_accept_confirm_v2 *clc_v2, + bool is_smcd) +{ + int clc_v2_len; + + if (clc_v2->hdr.version == SMC_V1 || + !(clc_v2->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return NULL; + + if (is_smcd) + clc_v2_len = + offsetofend(struct smc_clc_msg_accept_confirm_v2, d1); + else + clc_v2_len = + offsetofend(struct smc_clc_msg_accept_confirm_v2, r1); + + return (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + + clc_v2_len); +} + struct smcd_dev; struct smc_init_info; @@ -382,7 +421,13 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini); int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, - u8 version, u8 *negotiated_eid); + u8 version, u8 *negotiated_eid, struct smc_init_info *ini); +int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini); +int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, + struct smc_init_info *ini); +int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, + struct smc_init_info *ini); void smc_clc_init(void) __init; void smc_clc_exit(void); void smc_clc_get_hostname(u8 **host); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 6b78075404d7..bd01dd31e4bd 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -319,6 +319,10 @@ static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_CONNS, lgr->max_conns)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_LINKS, lgr->max_links)) + goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; @@ -895,9 +899,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->uses_gateway = ini->smcrv2.uses_gateway; memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, ETH_ALEN); + lgr->max_conns = ini->max_conns; + lgr->max_links = ini->max_links; } else { ibdev = ini->ib_dev; ibport = ini->ib_port; + lgr->max_conns = SMC_CONN_PER_LGR_MAX; + lgr->max_links = SMC_LINKS_ADD_LNK_MAX; } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); @@ -1664,6 +1672,9 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) 
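Note on smc_get_clc_first_contact_ext() above: the trailing first-contact extension is located by skipping whichever fixed variant (SMC-D d1 or SMC-R r1) the accept/confirm message carries, i.e. an offsetofend() computation over the union. A standalone sketch with toy struct sizes (layout and sizes are illustrative, not the wire format):

```c
/* Standalone sketch of locating a trailing extension after a
 * variant-sized fixed part, as the CLC helper above does.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct accept_v2 {
	uint8_t hdr[8];
	union {
		uint8_t d1[48];	/* SMC-D variant, toy size */
		uint8_t r1[64];	/* SMC-R variant, toy size */
	};
};

static void *first_contact_ext(struct accept_v2 *clc, int is_smcd)
{
	size_t off = is_smcd ?
		offsetof(struct accept_v2, d1) + sizeof(clc->d1) :
		offsetof(struct accept_v2, r1) + sizeof(clc->r1);

	return (uint8_t *)clc + off;	/* extension starts right after */
}

int main(void)
{
	struct accept_v2 clc = { 0 };

	printf("SMC-D ext at +%zu, SMC-R ext at +%zu\n",
	       (size_t)((uint8_t *)first_contact_ext(&clc, 1) - (uint8_t *)&clc),
	       (size_t)((uint8_t *)first_contact_ext(&clc, 0) - (uint8_t *)&clc));
	return 0;
}
```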
!rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) + continue; + /* trigger local add link processing */ link = smc_llc_usable_link(lgr); if (link) @@ -1888,7 +1899,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + (lgr->conns_num < lgr->max_conns && !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1645fba0d2d3..120027d40469 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -22,6 +22,15 @@ #include "smc_ib.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ +#define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group, + * also is the default value for SMC-R v1 and v2.0 + */ +#define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for + * SMC-R v2.1 and later negotiation, vendors or + * distributions may modify it to a value between + * 16-255 as needed. + */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -164,6 +173,15 @@ struct smc_link { */ #define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 +#define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */ +#define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the + * default value for smc-r v1.0 and v2.0 + */ +#define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for + * SMC-R v2.1 and later negotiation, vendors or + * distributions may modify it to a value between + * 1-2 as needed.
+ */ /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { @@ -331,6 +349,10 @@ struct smc_link_group { __be32 saddr; /* net namespace */ struct net *net; + u8 max_conns; + /* max conn can be assigned to lgr */ + u8 max_links; + /* max links can be added in lgr */ }; struct { /* SMC-D */ u64 peer_gid; @@ -374,6 +396,9 @@ struct smc_init_info { u8 is_smcd; u8 smc_type_v1; u8 smc_type_v2; + u8 release_nr; + u8 max_conns; + u8 max_links; u8 first_contact_peer; u8 first_contact_local; unsigned short vlan_id; @@ -539,7 +564,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); -void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 034295676e88..4df5f8c8a0a1 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -96,7 +96,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk); int smc_ib_create_queue_pair(struct smc_link *lnk); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); -int smc_ib_modify_qp_reset(struct smc_link *lnk); int smc_ib_modify_qp_error(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 90f0b60b196a..018ce8133b02 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -52,14 +52,13 @@ struct smc_llc_msg_confirm_link { /* type 0x01 */ u8 link_num; u8 link_uid[SMC_LGR_ID_SIZE]; u8 max_links; - u8 reserved[9]; + u8 max_conns; + u8 reserved[8]; }; #define SMC_LLC_FLAG_ADD_LNK_REJ 0x40 #define SMC_LLC_REJ_RSN_NO_ALT_PATH 1 -#define SMC_LLC_ADD_LNK_MAX_LINKS 2 - struct smc_llc_msg_add_link { /* type 0x02 */ struct smc_llc_hdr hd; u8 sender_mac[ETH_ALEN]; @@ -471,7 +470,12 @@ int smc_llc_send_confirm_link(struct smc_link *link, hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; + confllc->max_links = SMC_LINKS_ADD_LNK_MAX; + if (link->lgr->smc_version == SMC_V2 && + link->lgr->peer_smc_release >= SMC_RELEASE_1) { + confllc->max_conns = link->lgr->max_conns; + confllc->max_links = link->lgr->max_links; + } /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: @@ -1041,6 +1045,11 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_reject; } + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) { + rc = 0; + goto out_reject; + } + ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { ini->check_smcrv2 = true; @@ -1165,6 +1174,9 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) + goto out; + ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) goto out; @@ -1410,6 +1422,11 @@ int smc_llc_srv_add_link(struct smc_link *link, goto out; } + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) { + rc = 0; + goto out; + } + /* ignore client add link recommendation, start new flow */ ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { diff --git a/net/socket.c b/net/socket.c index 2b0e54b2405c..848116d06b51 100644 --- a/net/socket.c +++ b/net/socket.c @@ -136,9 
+136,10 @@ static void sock_splice_eof(struct file *file); static void sock_show_fdinfo(struct seq_file *m, struct file *f) { struct socket *sock = f->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); - if (sock->ops->show_fdinfo) - sock->ops->show_fdinfo(m, sock); + if (ops->show_fdinfo) + ops->show_fdinfo(m, sock); } #else #define sock_show_fdinfo NULL @@ -646,12 +647,14 @@ EXPORT_SYMBOL(sock_alloc); static void __sock_release(struct socket *sock, struct inode *inode) { - if (sock->ops) { - struct module *owner = sock->ops->owner; + const struct proto_ops *ops = READ_ONCE(sock->ops); + + if (ops) { + struct module *owner = ops->owner; if (inode) inode_lock(inode); - sock->ops->release(sock); + ops->release(sock); sock->sk = NULL; if (inode) inode_unlock(inode); @@ -722,7 +725,7 @@ static noinline void call_trace_sock_send_length(struct sock *sk, int ret, static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, + int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); @@ -786,13 +789,14 @@ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { struct socket *sock = sk->sk_socket; + const struct proto_ops *ops = READ_ONCE(sock->ops); - if (!sock->ops->sendmsg_locked) + if (!ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); - return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg)); + return ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } EXPORT_SYMBOL(kernel_sendmsg_locked); @@ -1017,7 +1021,8 @@ static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int f static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg, + inet6_recvmsg, inet_recvmsg, sock, msg, msg_data_left(msg), flags); if (trace_sock_recv_length_enabled()) @@ -1072,19 +1077,23 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct socket *sock = file->private_data; + const struct proto_ops *ops; - if (unlikely(!sock->ops->splice_read)) + ops = READ_ONCE(sock->ops); + if (unlikely(!ops->splice_read)) return copy_splice_read(file, ppos, pipe, len, flags); - return sock->ops->splice_read(sock, ppos, pipe, len, flags); + return ops->splice_read(sock, ppos, pipe, len, flags); } static void sock_splice_eof(struct file *file) { struct socket *sock = file->private_data; + const struct proto_ops *ops; - if (sock->ops->splice_eof) - sock->ops->splice_eof(sock); + ops = READ_ONCE(sock->ops); + if (ops->splice_eof) + ops->splice_eof(sock); } static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -1181,13 +1190,14 @@ EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct ifreq ifr; bool need_copyout; int err; void __user *argp = (void __user *)arg; void __user *data; - err = sock->ops->ioctl(sock, cmd, arg); + err = ops->ioctl(sock, cmd, arg); /* * If this ioctl is unknown try to hand it down @@ -1216,6 +1226,7 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, static long sock_ioctl(struct file *file, unsigned cmd, 
unsigned long arg) { + const struct proto_ops *ops; struct socket *sock; struct sock *sk; void __user *argp = (void __user *)arg; @@ -1223,6 +1234,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct net *net; sock = file->private_data; + ops = READ_ONCE(sock->ops); sk = sock->sk; net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { @@ -1280,23 +1292,23 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: - if (!sock->ops->gettstamp) { + if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } - err = sock->ops->gettstamp(sock, argp, - cmd == SIOCGSTAMP_OLD, - !IS_ENABLED(CONFIG_64BIT)); + err = ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_OLD, + !IS_ENABLED(CONFIG_64BIT)); break; case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: - if (!sock->ops->gettstamp) { + if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } - err = sock->ops->gettstamp(sock, argp, - cmd == SIOCGSTAMP_NEW, - false); + err = ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_NEW, + false); break; case SIOCGIFCONF: @@ -1357,9 +1369,10 @@ EXPORT_SYMBOL(sock_create_lite); static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); __poll_t events = poll_requested_events(wait), flag = 0; - if (!sock->ops->poll) + if (!ops->poll) return 0; if (sk_can_busy_loop(sock->sk)) { @@ -1371,14 +1384,14 @@ static __poll_t sock_poll(struct file *file, poll_table *wait) flag = POLL_BUSY_LOOP; } - return sock->ops->poll(file, sock, wait) | flag; + return ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) { struct socket *sock = file->private_data; - return sock->ops->mmap(file, sock, vma); + return READ_ONCE(sock->ops)->mmap(file, sock, vma); } static int sock_close(struct inode *inode, struct file *filp) @@ -1644,12 +1657,36 @@ struct file *__sys_socket_file(int family, int type, int protocol) return sock_alloc_file(sock, flags, NULL); } +/* A hook for bpf progs to attach to and update socket protocol. + * + * A static noinline declaration here could cause the compiler to + * optimize away the function. A global noinline declaration will + * keep the definition, but may optimize away the callsite. + * Therefore, __weak is needed to ensure that the call is still + * emitted, by telling the compiler that we don't know what the + * function might eventually be. + * + * __diag_* below are needed to dismiss the missing prototype warning. 
+ */ + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "A fmod_ret entry point for BPF programs"); + +__weak noinline int update_socket_protocol(int family, int type, int protocol) +{ + return protocol; +} + +__diag_pop(); + int __sys_socket(int family, int type, int protocol) { struct socket *sock; int flags; - sock = __sys_socket_create(family, type, protocol); + sock = __sys_socket_create(family, type, + update_socket_protocol(family, type, protocol)); if (IS_ERR(sock)) return PTR_ERR(sock); @@ -1728,7 +1765,7 @@ int __sys_socketpair(int family, int type, int protocol, int __user *usockvec) goto out; } - err = sock1->ops->socketpair(sock1, sock2); + err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2); if (unlikely(err < 0)) { sock_release(sock2); sock_release(sock1); @@ -1789,7 +1826,7 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) (struct sockaddr *)&address, addrlen); if (!err) - err = sock->ops->bind(sock, + err = READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *) &address, addrlen); } @@ -1823,7 +1860,7 @@ int __sys_listen(int fd, int backlog) err = security_socket_listen(sock, backlog); if (!err) - err = sock->ops->listen(sock, backlog); + err = READ_ONCE(sock->ops)->listen(sock, backlog); fput_light(sock->file, fput_needed); } @@ -1843,6 +1880,7 @@ struct file *do_accept(struct file *file, unsigned file_flags, struct file *newfile; int err, len; struct sockaddr_storage address; + const struct proto_ops *ops; sock = sock_from_file(file); if (!sock) @@ -1851,15 +1889,16 @@ struct file *do_accept(struct file *file, unsigned file_flags, newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); + ops = READ_ONCE(sock->ops); newsock->type = sock->type; - newsock->ops = sock->ops; + newsock->ops = ops; /* * We don't need try_module_get here, as the listening socket (sock) * has the protocol module (sock->ops->owner) held. 
*/ - __module_get(newsock->ops->owner); + __module_get(ops->owner); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) @@ -1869,14 +1908,13 @@ struct file *do_accept(struct file *file, unsigned file_flags, if (err) goto out_fd; - err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags, + err = ops->accept(sock, newsock, sock->file->f_flags | file_flags, false); if (err < 0) goto out_fd; if (upeer_sockaddr) { - len = newsock->ops->getname(newsock, - (struct sockaddr *)&address, 2); + len = ops->getname(newsock, (struct sockaddr *)&address, 2); if (len < 0) { err = -ECONNABORTED; goto out_fd; @@ -1989,8 +2027,8 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, if (err) goto out; - err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen, - sock->file->f_flags | file_flags); + err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address, + addrlen, sock->file->f_flags | file_flags); out: return err; } @@ -2039,7 +2077,7 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, if (err) goto out_put; - err = sock->ops->getname(sock, (struct sockaddr *)&address, 0); + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); if (err < 0) goto out_put; /* "err" is actually length in this case */ @@ -2071,13 +2109,15 @@ int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { + const struct proto_ops *ops = READ_ONCE(sock->ops); + err = security_socket_getpeername(sock); if (err) { fput_light(sock->file, fput_needed); return err; } - err = sock->ops->getname(sock, (struct sockaddr *)&address, 1); + err = ops->getname(sock, (struct sockaddr *)&address, 1); if (err >= 0) /* "err" is actually length in this case */ err = move_addr_to_user(&address, err, usockaddr, @@ -2227,6 +2267,7 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { sockptr_t optval = USER_SOCKPTR(user_optval); + const struct proto_ops *ops; char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2255,12 +2296,13 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, if (kernel_optval) optval = KERNEL_SOCKPTR(kernel_optval); + ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) err = sock_setsockopt(sock, level, optname, optval, optlen); - else if (unlikely(!sock->ops->setsockopt)) + else if (unlikely(!ops->setsockopt)) err = -EOPNOTSUPP; else - err = sock->ops->setsockopt(sock, level, optname, optval, + err = ops->setsockopt(sock, level, optname, optval, optlen); kfree(kernel_optval); out_put: @@ -2285,6 +2327,7 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { int max_optlen __maybe_unused; + const struct proto_ops *ops; int err, fput_needed; struct socket *sock; @@ -2299,12 +2342,13 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, if (!in_compat_syscall()) max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, optlen); - else if (unlikely(!sock->ops->getsockopt)) + else if (unlikely(!ops->getsockopt)) err = -EOPNOTSUPP; else - err = sock->ops->getsockopt(sock, level, optname, optval, + err = ops->getsockopt(sock, level, optname, optval, optlen); if (!in_compat_syscall()) @@ -2332,7 +2376,7 @@ int 
__sys_shutdown_sock(struct socket *sock, int how) err = security_socket_shutdown(sock, how); if (!err) - err = sock->ops->shutdown(sock, how); + err = READ_ONCE(sock->ops)->shutdown(sock, how); return err; } @@ -3324,6 +3368,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; struct net *net = sock_net(sk); + const struct proto_ops *ops; if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) return sock_ioctl(file, cmd, (unsigned long)argp); @@ -3333,10 +3378,11 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, return compat_siocwandev(net, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: - if (!sock->ops->gettstamp) + ops = READ_ONCE(sock->ops); + if (!ops->gettstamp) return -ENOIOCTLCMD; - return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, - !COMPAT_USE_64BIT_TIME); + return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, + !COMPAT_USE_64BIT_TIME); case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: @@ -3417,6 +3463,7 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = file->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); int ret = -ENOIOCTLCMD; struct sock *sk; struct net *net; @@ -3424,8 +3471,8 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, sk = sock->sk; net = sock_net(sk); - if (sock->ops->compat_ioctl) - ret = sock->ops->compat_ioctl(sock, cmd, arg); + if (ops->compat_ioctl) + ret = ops->compat_ioctl(sock, cmd, arg); if (ret == -ENOIOCTLCMD && (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) @@ -3449,7 +3496,7 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { - return sock->ops->bind(sock, addr, addrlen); + return READ_ONCE(sock->ops)->bind(sock, addr, addrlen); } EXPORT_SYMBOL(kernel_bind); @@ -3463,7 +3510,7 @@ EXPORT_SYMBOL(kernel_bind); int kernel_listen(struct socket *sock, int backlog) { - return sock->ops->listen(sock, backlog); + return READ_ONCE(sock->ops)->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen); @@ -3481,6 +3528,7 @@ EXPORT_SYMBOL(kernel_listen); int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; + const struct proto_ops *ops = READ_ONCE(sock->ops); int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, @@ -3488,15 +3536,15 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) if (err < 0) goto done; - err = sock->ops->accept(sock, *newsock, flags, true); + err = ops->accept(sock, *newsock, flags, true); if (err < 0) { sock_release(*newsock); *newsock = NULL; goto done; } - (*newsock)->ops = sock->ops; - __module_get((*newsock)->ops->owner); + (*newsock)->ops = ops; + __module_get(ops->owner); done: return err; @@ -3519,7 +3567,12 @@ EXPORT_SYMBOL(kernel_accept); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { - return sock->ops->connect(sock, addr, addrlen, flags); + struct sockaddr_storage address; + + memcpy(&address, addr, addrlen); + + return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, + addrlen, flags); } EXPORT_SYMBOL(kernel_connect); @@ -3534,7 +3587,7 @@ EXPORT_SYMBOL(kernel_connect); int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, 0); + return READ_ONCE(sock->ops)->getname(sock, addr, 0); } 
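The sock->ops conversions in this file all follow one pattern; here is a minimal sketch of it (the wrapper below is illustrative, not part of the patch). sock->ops can be rewritten while the socket is in use, for example when IPV6_ADDRFORM switches an IPv6 socket over to the IPv4 proto_ops, so each function now loads the pointer once with READ_ONCE() and dereferences only the local copy:

static int example_sock_shutdown(struct socket *sock, int how)
{
	/* Single load: a concurrent WRITE_ONCE(sock->ops, ...) cannot be
	 * observed halfway through the function, and the access is
	 * annotated as an intentionally lockless read.
	 */
	const struct proto_ops *ops = READ_ONCE(sock->ops);

	if (!ops->shutdown)
		return -EOPNOTSUPP;

	return ops->shutdown(sock, how);
}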
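The __weak update_socket_protocol() hook added to __sys_socket() above is an fmod_ret attach point: a BPF program can rewrite the protocol argument before __sys_socket_create() runs. A sketch of such a program, modeled on the MPTCP use case the hook was designed around (the program name and the hand-rolled UAPI constants are assumptions made to keep the example self-contained):

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* UAPI values, spelled out so the sketch is self-contained */
#define AF_INET		2
#define AF_INET6	10
#define SOCK_STREAM	1
#define IPPROTO_TCP	6
#define IPPROTO_MPTCP	262

char _license[] SEC("license") = "GPL";

/* fmod_ret: a non-zero return skips the hooked function and is used
 * as its return value, which __sys_socket() then feeds straight into
 * __sys_socket_create().
 */
SEC("fmod_ret/update_socket_protocol")
int BPF_PROG(mptcpify, int family, int type, int protocol)
{
	/* transparently upgrade plain TCP stream sockets to MPTCP */
	if ((family == AF_INET || family == AF_INET6) &&
	    type == SOCK_STREAM &&
	    (!protocol || protocol == IPPROTO_TCP))
		return IPPROTO_MPTCP;

	return protocol;
}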
EXPORT_SYMBOL(kernel_getsockname); @@ -3549,7 +3602,7 @@ EXPORT_SYMBOL(kernel_getsockname); int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, 1); + return READ_ONCE(sock->ops)->getname(sock, addr, 1); } EXPORT_SYMBOL(kernel_getpeername); @@ -3563,7 +3616,7 @@ EXPORT_SYMBOL(kernel_getpeername); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { - return sock->ops->shutdown(sock, how); + return READ_ONCE(sock->ops)->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2eb8df44f894..8c9a8ee76aa0 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -43,7 +43,7 @@ #include <net/udp.h> #include <net/tcp.h> #include <net/tcp_states.h> -#include <net/tls.h> +#include <net/tls_prot.h> #include <net/handshake.h> #include <linux/uaccess.h> #include <linux/highmem.h> @@ -226,27 +226,30 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) } static int -svc_tcp_sock_process_cmsg(struct svc_sock *svsk, struct msghdr *msg, +svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg, struct cmsghdr *cmsg, int ret) { - if (cmsg->cmsg_level == SOL_TLS && - cmsg->cmsg_type == TLS_GET_RECORD_TYPE) { - u8 content_type = *((u8 *)CMSG_DATA(cmsg)); - - switch (content_type) { - case TLS_RECORD_TYPE_DATA: - /* TLS sets EOR at the end of each application data - * record, even though there might be more frames - * waiting to be decrypted. - */ - msg->msg_flags &= ~MSG_EOR; - break; - case TLS_RECORD_TYPE_ALERT: - ret = -ENOTCONN; - break; - default: - ret = -EAGAIN; - } + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? 
+ -ENOTCONN : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; } return ret; } @@ -258,13 +261,14 @@ svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg) struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; + struct socket *sock = svsk->sk_sock; int ret; msg->msg_control = &u; msg->msg_controllen = sizeof(u); - ret = sock_recvmsg(svsk->sk_sock, msg, MSG_DONTWAIT); + ret = sock_recvmsg(sock, msg, MSG_DONTWAIT); if (unlikely(msg->msg_controllen != sizeof(u))) - ret = svc_tcp_sock_process_cmsg(svsk, msg, &u.cmsg, ret); + ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret); return ret; } @@ -1624,6 +1628,8 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + tls_handshake_close(svsk->sk_sock); + svc_sock_detach(xprt); if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9f010369100a..268a2cc61acd 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -47,7 +47,7 @@ #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> -#include <net/tls.h> +#include <net/tls_prot.h> #include <net/handshake.h> #include <linux/bvec.h> @@ -360,24 +360,27 @@ static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, struct cmsghdr *cmsg, int ret) { - if (cmsg->cmsg_level == SOL_TLS && - cmsg->cmsg_type == TLS_GET_RECORD_TYPE) { - u8 content_type = *((u8 *)CMSG_DATA(cmsg)); - - switch (content_type) { - case TLS_RECORD_TYPE_DATA: - /* TLS sets EOR at the end of each application data - * record, even though there might be more frames - * waiting to be decrypted. - */ - msg->msg_flags &= ~MSG_EOR; - break; - case TLS_RECORD_TYPE_ALERT: - ret = -ENOTCONN; - break; - default: - ret = -EAGAIN; - } + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? 
+ -EACCES : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; } return ret; } @@ -777,6 +780,8 @@ static void xs_stream_data_receive(struct sock_xprt *transport) } if (ret == -ESHUTDOWN) kernel_sock_shutdown(transport->sock, SHUT_RDWR); + else if (ret == -EACCES) + xprt_wake_pending_tasks(&transport->xprt, -EACCES); else xs_poll_check_readable(transport); out: @@ -1292,6 +1297,8 @@ static void xs_close(struct rpc_xprt *xprt) dprintk("RPC: xs_close xprt %p\n", xprt); + if (transport->sock) + tls_handshake_close(transport->sock); xs_reset_transport(transport); xprt->reestablish_timeout = 0; } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8cc42aea19c7..5b045284849e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -862,3 +862,28 @@ void switchdev_bridge_port_unoffload(struct net_device *brport_dev, NULL); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload); + +int switchdev_bridge_port_replay(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_brport_info brport_info = { + .brport = { + .dev = dev, + .ctx = ctx, + .atomic_nb = atomic_nb, + .blocking_nb = blocking_nb, + }, + }; + int err; + + ASSERT_RTNL(); + + err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_REPLAY, + brport_dev, &brport_info.info, + extack); + return notifier_to_errno(err); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_replay); diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 0772cfadaa0d..93f82398283d 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -131,6 +131,5 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); void tipc_set_node_id(struct net *net, u8 *id); void tipc_set_node_addr(struct net *net, u32 addr); char *tipc_nodeid2string(char *str, u8 *id); -u32 tipc_node_id2hash(u8 *id128); #endif diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 1ee60649bd17..41eac1ee0c09 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -214,8 +214,6 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); -int tipc_media_set_priority(const char *name, u32 new_value); -int tipc_media_set_window(const char *name, u32 new_value); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); diff --git a/net/tipc/link.h b/net/tipc/link.h index a16f401fdabd..d80f5649b395 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -148,8 +148,6 @@ int tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq, struct sk_buff_head *retrq); -void tipc_link_build_bc_sync_msg(struct tipc_link *l, - struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq); diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index e231e6964d61..c677f6f082df 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -67,7 +67,6 @@ struct distr_item { __be32 key; }; -void tipc_named_bcast(struct net *net, struct sk_buff *skb); struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ); struct sk_buff 
*tipc_named_withdraw(struct net *net, struct publication *publ); void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities); diff --git a/net/tipc/net.h b/net/tipc/net.h index d0c91d2df20a..1cb1e43cf34a 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -43,7 +43,6 @@ extern const struct nla_policy tipc_nl_net_policy[]; int tipc_net_init(struct net *net, u8 *node_id, u32 addr); void tipc_net_finalize_work(struct work_struct *work); -void tipc_sched_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 9b47c8409231..5bc076f2fa74 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -208,7 +208,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, goto err_out; } - info.attrs = attrbuf; + info.info.attrs = attrbuf; if (nlmsg_len(cb.nlh) > 0) { err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, @@ -1294,7 +1294,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) struct tipc_nl_compat_msg msg; struct nlmsghdr *req_nlh; struct nlmsghdr *rep_nlh; - struct tipc_genlmsghdr *req_userhdr = info->userhdr; + struct tipc_genlmsghdr *req_userhdr = genl_info_userhdr(info); memset(&msg, 0, sizeof(msg)); diff --git a/net/tipc/node.c b/net/tipc/node.c index a9c5b6594889..3105abe97bb9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2662,7 +2662,7 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; @@ -2870,7 +2870,7 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, int err; if (!prev_node) { - struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ef8e5139a873..bb1118d02f95 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3791,7 +3791,7 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_sock *tsk; if (!tsk_portid) { - struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; if (!attrs[TIPC_NLA_SOCK]) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 926232557e77..f892b0903dba 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -465,7 +465,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) int i; if (!bid && !skip_cnt) { - struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct net *net = sock_net(skb->sk); struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; char *bname; diff --git a/net/tls/tls.h b/net/tls/tls.h index 86cef1c68e03..164d6a955e26 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -39,6 +39,7 @@ #include <linux/types.h> #include <linux/skmsg.h> #include <net/tls.h> +#include <net/tls_prot.h> #define TLS_PAGE_ORDER (min_t(unsigned int, 
PAGE_ALLOC_COSTLY_ORDER, \ TLS_MAX_PAYLOAD_SIZE >> PAGE_SHIFT)) @@ -86,10 +87,6 @@ void tls_ctx_free(struct sock *sk, struct tls_context *ctx); void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); -int tls_sk_query(struct sock *sk, int optname, char __user *optval, - int __user *optlen); -int tls_sk_attach(struct sock *sk, int optname, char __user *optval, - unsigned int optlen); void tls_err_abort(struct sock *sk, int err); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); @@ -110,6 +107,8 @@ bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t read_actor); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_device_splice_eof(struct socket *sock); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 529101eb20bd..2392d06845aa 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -440,9 +440,13 @@ static int tls_push_data(struct sock *sk, long timeo; if (flags & - ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES)) + ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SPLICE_PAGES | MSG_EOR)) return -EOPNOTSUPP; + if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR)) + return -EINVAL; + if (unlikely(sk->sk_err)) return -sk->sk_err; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 4a8ee2f6badb..f550c84f3408 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -959,10 +959,12 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG] ops[TLS_BASE][TLS_SW ] = ops[TLS_BASE][TLS_BASE]; ops[TLS_BASE][TLS_SW ].splice_read = tls_sw_splice_read; ops[TLS_BASE][TLS_SW ].poll = tls_sk_poll; + ops[TLS_BASE][TLS_SW ].read_sock = tls_sw_read_sock; ops[TLS_SW ][TLS_SW ] = ops[TLS_SW ][TLS_BASE]; ops[TLS_SW ][TLS_SW ].splice_read = tls_sw_splice_read; ops[TLS_SW ][TLS_SW ].poll = tls_sk_poll; + ops[TLS_SW ][TLS_SW ].read_sock = tls_sw_read_sock; #ifdef CONFIG_TLS_DEVICE ops[TLS_HW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index f37f4a0fcd3c..ca1e0e198ceb 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -369,7 +369,6 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, static int tls_strp_read_copyin(struct tls_strparser *strp) { - struct socket *sock = strp->sk->sk_socket; read_descriptor_t desc; desc.arg.data = strp; @@ -377,7 +376,7 @@ static int tls_strp_read_copyin(struct tls_strparser *strp) desc.count = 1; /* give more than one skb per call */ /* sk should be locked here, so okay to do read_sock */ - sock->ops->read_sock(strp->sk, &desc, tls_strp_copyin); + tcp_read_sock(strp->sk, &desc, tls_strp_copyin); return desc.error; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 53f944e6d8ef..5c122d7bb784 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -984,6 +984,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, int ret = 0; int pending; + if (!eor && (msg->msg_flags & MSG_EOR)) + return -EINVAL; + if (unlikely(msg->msg_controllen)) { ret = tls_process_cmsg(sk, msg, &record_type); if (ret) { @@ -1193,7 +1196,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int ret; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_CMSG_COMPAT | MSG_SPLICE_PAGES 
| + MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | MSG_EOR | MSG_SENDPAGE_NOPOLICY)) return -EOPNOTSUPP; @@ -1845,13 +1848,10 @@ tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot, return sk_flush_backlog(sk); } -static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, - bool nonblock) +static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, + bool nonblock) { long timeo; - int err; - - lock_sock(sk); timeo = sock_rcvtimeo(sk, nonblock); @@ -1865,26 +1865,30 @@ static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, !READ_ONCE(ctx->reader_present), &wait); remove_wait_queue(&ctx->wq, &wait); - if (timeo <= 0) { - err = -EAGAIN; - goto err_unlock; - } - if (signal_pending(current)) { - err = sock_intr_errno(timeo); - goto err_unlock; - } + if (timeo <= 0) + return -EAGAIN; + if (signal_pending(current)) + return sock_intr_errno(timeo); } WRITE_ONCE(ctx->reader_present, 1); return 0; +} -err_unlock: - release_sock(sk); +static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, + bool nonblock) +{ + int err; + + lock_sock(sk); + err = tls_rx_reader_acquire(sk, ctx, nonblock); + if (err) + release_sock(sk); return err; } -static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) +static void tls_rx_reader_release(struct sock *sk, struct tls_sw_context_rx *ctx) { if (unlikely(ctx->reader_contended)) { if (wq_has_sleeper(&ctx->wq)) @@ -1896,6 +1900,11 @@ static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) } WRITE_ONCE(ctx->reader_present, 0); +} + +static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) +{ + tls_rx_reader_release(sk, ctx); release_sock(sk); } @@ -2193,6 +2202,102 @@ splice_requeue: goto splice_read_end; } +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t read_actor) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct tls_prot_info *prot = &tls_ctx->prot_info; + struct strp_msg *rxm = NULL; + struct sk_buff *skb = NULL; + struct sk_psock *psock; + size_t flushed_at = 0; + bool released = true; + struct tls_msg *tlm; + ssize_t copied = 0; + ssize_t decrypted; + int err, used; + + psock = sk_psock_get(sk); + if (psock) { + sk_psock_put(sk, psock); + return -EINVAL; + } + err = tls_rx_reader_acquire(sk, ctx, true); + if (err < 0) + return err; + + /* If crypto failed the connection is broken */ + err = ctx->async_wait.err; + if (err) + goto read_sock_end; + + decrypted = 0; + do { + if (!skb_queue_empty(&ctx->rx_list)) { + skb = __skb_dequeue(&ctx->rx_list); + rxm = strp_msg(skb); + tlm = tls_msg(skb); + } else { + struct tls_decrypt_arg darg; + + err = tls_rx_rec_wait(sk, NULL, true, released); + if (err <= 0) + goto read_sock_end; + + memset(&darg.inargs, 0, sizeof(darg.inargs)); + + err = tls_rx_one_record(sk, NULL, &darg); + if (err < 0) { + tls_err_abort(sk, -EBADMSG); + goto read_sock_end; + } + + released = tls_read_flush_backlog(sk, prot, INT_MAX, + 0, decrypted, + &flushed_at); + skb = darg.skb; + rxm = strp_msg(skb); + tlm = tls_msg(skb); + decrypted += rxm->full_len; + + tls_rx_rec_done(ctx); + } + + /* read_sock does not support reading control messages */ + if (tlm->control != TLS_RECORD_TYPE_DATA) { + err = -EINVAL; + goto read_sock_requeue; + } + + used = read_actor(desc, skb, rxm->offset, rxm->full_len); + if (used <= 0) { + if (!copied) + err = used; + goto read_sock_requeue; + } + copied += used; + 
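+		/* The actor may take only part of this record: remember the
+		 * remaining offset/length, and once the caller has had enough
+		 * (!desc->count), put the remainder back on rx_list.
+		 */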
if (used < rxm->full_len) { + rxm->offset += used; + rxm->full_len -= used; + if (!desc->count) + goto read_sock_requeue; + } else { + consume_skb(skb); + if (!desc->count) + skb = NULL; + } + } while (skb); + +read_sock_end: + tls_rx_reader_release(sk, ctx); + return copied ? : err; + +read_sock_requeue: + __skb_queue_head(&ctx->rx_list, skb); + goto read_sock_end; +} + bool tls_sw_sock_is_readable(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); diff --git a/net/unix/scm.c b/net/unix/scm.c index f9152881d77f..e9dde7176c8a 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -29,10 +29,11 @@ struct sock *unix_get_socket(struct file *filp) /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops = READ_ONCE(sock->ops); struct sock *s = sock->sk; /* PF_UNIX ? */ - if (s && sock->ops && sock->ops->family == PF_UNIX) + if (s && ops && ops->family == PF_UNIX) u_sock = s; } else { /* Could be an io_uring instance */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index b769fc258931..352d042b130b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -348,37 +348,34 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - size_t bytes, total = 0, off; - struct sk_buff *skb, *tmp; - int err = -EFAULT; + struct sk_buff *skb; + size_t total = 0; + int err; spin_lock_bh(&vvs->rx_lock); - skb_queue_walk_safe(&vvs->rx_queue, skb, tmp) { - off = 0; + skb_queue_walk(&vvs->rx_queue, skb) { + size_t bytes; - if (total == len) - break; + bytes = len - total; + if (bytes > skb->len) + bytes = skb->len; - while (total < len && off < skb->len) { - bytes = len - total; - if (bytes > skb->len - off) - bytes = skb->len - off; + spin_unlock_bh(&vvs->rx_lock); - /* sk_lock is held by caller so no one else can dequeue. - * Unlock rx_lock since memcpy_to_msg() may sleep. - */ - spin_unlock_bh(&vvs->rx_lock); + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + goto out; - err = memcpy_to_msg(msg, skb->data + off, bytes); - if (err) - goto out; + total += bytes; - spin_lock_bh(&vvs->rx_lock); + spin_lock_bh(&vvs->rx_lock); - total += bytes; - off += bytes; - } + if (total == len) + break; } spin_unlock_bh(&vvs->rx_lock); @@ -463,6 +460,63 @@ out: return err; } +static ssize_t +virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk, + struct msghdr *msg) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct sk_buff *skb; + size_t total, len; + + spin_lock_bh(&vvs->rx_lock); + + if (!vvs->msg_count) { + spin_unlock_bh(&vvs->rx_lock); + return 0; + } + + total = 0; + len = msg_data_left(msg); + + skb_queue_walk(&vvs->rx_queue, skb) { + struct virtio_vsock_hdr *hdr; + + if (total < len) { + size_t bytes; + int err; + + bytes = len - total; + if (bytes > skb->len) + bytes = skb->len; + + spin_unlock_bh(&vvs->rx_lock); + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. 
+ */ + err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + return err; + + spin_lock_bh(&vvs->rx_lock); + } + + total += skb->len; + hdr = virtio_vsock_hdr(skb); + + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) + msg->msg_flags |= MSG_EOR; + + break; + } + } + + spin_unlock_bh(&vvs->rx_lock); + + return total; +} + static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, int flags) @@ -557,9 +611,9 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, int flags) { if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); + return virtio_transport_seqpacket_do_peek(vsk, msg); + else + return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index b7b072194282..dbda3ababa14 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -116,9 +116,6 @@ struct vmci_transport { spinlock_t lock; /* protects sk. */ }; -int vmci_transport_register(void); -void vmci_transport_unregister(void); - int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst, struct sockaddr_vm *src); int vmci_transport_send_read_bh(struct sockaddr_vm *dst, diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 10ea85c03147..fcfc8472f73d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -25,6 +25,7 @@ #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include <net/busy_poll.h> +#include <net/netdev_rx_queue.h> #include <net/xdp.h> #include "xsk_queue.h" @@ -135,14 +136,14 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, + u32 flags) { - struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u64 addr; int err; addr = xp_get_handle(xskb); - err = xskq_prod_reserve_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; @@ -152,48 +153,138 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; } -static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) +static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *from_buf, *to_buf; - u32 metalen; + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u32 frags = xdp_buff_has_frags(xdp); + struct xdp_buff_xsk *pos, *tmp; + struct list_head *xskb_list; + u32 contd = 0; + int err; - if (unlikely(xdp_data_meta_unsupported(from))) { - from_buf = from->data; - to_buf = to->data; - metalen = 0; - } else { - from_buf = from->data_meta; - metalen = from->data - from->data_meta; - to_buf = to->data - metalen; + if (frags) + contd = XDP_PKT_CONTD; + + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err || likely(!frags)) + goto out; + + xskb_list = &xskb->pool->xskb_list; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + if (list_is_singular(xskb_list)) + contd = 0; + len = pos->xdp.data_end - pos->xdp.data; + err = __xsk_rcv_zc(xs, pos, len, contd); + if (err) + return err; + list_del(&pos->xskb_list_node); } - memcpy(to_buf, from_buf, len + metalen); +out: + return err; } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static void 
*xsk_copy_xdp_start(struct xdp_buff *from) { + if (unlikely(xdp_data_meta_unsupported(from))) + return from->data; + else + return from->data_meta; +} + +static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, + u32 *from_len, skb_frag_t **frag, u32 rem) +{ + u32 copied = 0; + + while (1) { + u32 copy_len = min_t(u32, *from_len, to_len); + + memcpy(to, *from, copy_len); + copied += copy_len; + if (rem == copied) + return copied; + + if (*from_len == copy_len) { + *from = skb_frag_address(*frag); + *from_len = skb_frag_size((*frag)++); + } else { + *from += copy_len; + *from_len -= copy_len; + } + if (to_len == copy_len) + return copied; + + to_len -= copy_len; + to += copy_len; + } +} + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; + u32 from_len, meta_len, rem, num_desc; + struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; - int err; - u32 len; + skb_frag_t *frag; - len = xdp->data_end - xdp->data; - if (len > xsk_pool_get_rx_frame_size(xs->pool)) { - xs->rx_dropped++; - return -ENOSPC; + from_len = xdp->data_end - copy_from; + meta_len = xdp->data - copy_from; + rem = len + meta_len; + + if (len <= frame_size && !xdp_buff_has_frags(xdp)) { + int err; + + xsk_xdp = xsk_buff_alloc(xs->pool); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOMEM; + } + memcpy(xsk_xdp->data - meta_len, copy_from, rem); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = __xsk_rcv_zc(xs, xskb, len, 0); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + + return 0; } - xsk_xdp = xsk_buff_alloc(xs->pool); - if (!xsk_xdp) { + num_desc = (len - 1) / frame_size + 1; + + if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + return -ENOBUFS; + } - xsk_copy_xdp(xsk_xdp, xdp, len); - err = __xsk_rcv_zc(xs, xsk_xdp, len); - if (err) { - xsk_buff_free(xsk_xdp); - return err; + if (xdp_buff_has_frags(xdp)) { + struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(xdp); + frag = &sinfo->frags[0]; } + + do { + u32 to_len = frame_size + meta_len; + u32 copied; + + xsk_xdp = xsk_buff_alloc(xs->pool); + copy_to = xsk_xdp->data - meta_len; + + copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem); + rem -= copied; + + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ?
XDP_PKT_CONTD : 0); + meta_len = 0; + } while (rem); + return 0; } @@ -215,7 +306,7 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; @@ -223,6 +314,11 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + xs->rx_dropped++; + return -ENOSPC; + } + sk_mark_napi_id_once_xdp(&xs->sk, xdp); return 0; } @@ -236,12 +332,13 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; spin_lock_bh(&xs->rx_lock); - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (!err) { - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); } spin_unlock_bh(&xs->rx_lock); @@ -250,19 +347,19 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; - u32 len; - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (err) return err; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; - return __xsk_rcv_zc(xs, xdp, len); + return xsk_rcv_zc(xs, xdp, len); } - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; @@ -321,7 +418,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) + xskq_cons_release(xs->tx); continue; } @@ -408,37 +506,91 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static void xsk_destruct_skb(struct sk_buff *skb) +static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(xs->pool->cq, addr); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + + return ret; +} + +static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) { - u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; - struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_submit_addr(xs->pool->cq, addr); + xskq_prod_submit_n(xs->pool->cq, n); spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static u32 xsk_get_num_desc(struct sk_buff *skb) +{ + return skb ? 
(long)skb_shinfo(skb)->destructor_arg : 0; +} + +static void xsk_destruct_skb(struct sk_buff *skb) +{ + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); sock_wfree(skb); } +static void xsk_set_destructor_arg(struct sk_buff *skb) +{ + long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; + + skb_shinfo(skb)->destructor_arg = (void *)num; +} + +static void xsk_consume_skb(struct sk_buff *skb) +{ + struct xdp_sock *xs = xdp_sk(skb->sk); + + skb->destructor = sock_wfree; + xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + /* Free skb without triggering the perf drop trace */ + consume_skb(skb); + xs->skb = NULL; +} + +static void xsk_drop_skb(struct sk_buff *skb) +{ + xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); + xsk_consume_skb(skb); +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); - skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); - skb_reserve(skb, hr); + skb_reserve(skb, hr); + } addr = desc->addr; len = desc->len; @@ -448,7 +600,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, offset = offset_in_page(buffer); addr = buffer - pool->addrs; - for (copied = 0, i = 0; copied < len; i++) { + for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { + if (unlikely(i >= MAX_SKB_FRAGS)) + return ERR_PTR(-EFAULT); + page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); @@ -473,43 +628,77 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct net_device *dev = xs->dev; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; + int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); - if (IS_ERR(skb)) - return skb; + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + goto free_err; + } } else { u32 hr, tr, len; void *buffer; - int err; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); - tr = dev->needed_tailroom; + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; - skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + goto free_err; - skb_reserve(skb, hr); - skb_put(skb, len); + skb_reserve(skb, hr); + skb_put(skb, len); - buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); - err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { - kfree_skb(skb); - return ERR_PTR(err); + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) + goto free_err; + } else { + int nr_frags = skb_shinfo(skb)->nr_frags; + struct page *page; + u8 *vaddr; + + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { + err = -EFAULT; + goto free_err; + } + + page = alloc_page(xs->sk.sk_allocation); + if (unlikely(!page)) { + err = -EAGAIN; + goto free_err; + } + + vaddr = kmap_local_page(page); + memcpy(vaddr, buffer, len); + kunmap_local(vaddr); + + 
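+			/* hang the newly copied page off the skb as an extra frag */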
skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); } } skb->dev = dev; skb->priority = xs->sk.sk_priority; skb->mark = READ_ONCE(xs->sk.sk_mark); - skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; skb->destructor = xsk_destruct_skb; + xsk_set_destructor_arg(skb); return skb; + +free_err: + if (err == -EAGAIN) { + xsk_cq_cancel_locked(xs, 1); + } else { + xsk_set_destructor_arg(skb); + xsk_drop_skb(skb); + xskq_cons_release(xs->tx); + } + + return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) @@ -519,7 +708,6 @@ static int __xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; - unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -544,47 +732,51 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - spin_lock_irqsave(&xs->pool->cq_lock, flags); - if (xskq_prod_reserve(xs->pool->cq)) { - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + if (xsk_cq_reserve_addr_locked(xs, desc.addr)) goto out; - } - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - goto out; + if (err == -EAGAIN) + goto out; + err = 0; + continue; + } + + xskq_cons_release(xs->tx); + + if (xp_mb_desc(&desc)) { + xs->skb = skb; + continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ - skb->destructor = sock_wfree; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - /* Free skb without triggering the perf drop trace */ - consume_skb(skb); + xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); + xsk_consume_skb(skb); err = -EAGAIN; goto out; } - xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; + xs->skb = NULL; goto out; } sent_frame = true; + xs->skb = NULL; } - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) { + if (xs->skb) + xsk_drop_skb(xs->skb); + xskq_cons_release(xs->tx); + } out: if (sent_frame) @@ -834,6 +1026,9 @@ static int xsk_release(struct socket *sock) net = sock_net(sk); + if (xs->skb) + xsk_drop_skb(xs->skb); + mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); @@ -897,7 +1092,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | - XDP_USE_NEED_WAKEUP)) + XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); @@ -929,7 +1124,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || - (flags & XDP_USE_NEED_WAKEUP)) { + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. 
*/ err = -EINVAL; goto out_unlock; @@ -1029,6 +1224,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->dev = dev; xs->zc = xs->umem->zc; + xs->sg = !!(flags & XDP_USE_SG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 26f6d304451e..b3f7b310811e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -86,6 +86,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->umem = umem; pool->addrs = umem->addrs; INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); spin_lock_init(&pool->cq_lock); @@ -99,6 +100,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -187,6 +189,11 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_pool; } + if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + bpf.command = XDP_SETUP_XSK_POOL; bpf.xsk.pool = pool; bpf.xsk.queue_id = queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6d40a77fccbe..13354a1e4280 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -48,6 +48,11 @@ struct xsk_queue { size_t ring_vmalloc_size; }; +struct parsed_desc { + u32 mb; + u32 valid; +}; + /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. For the Rx and @@ -130,18 +135,26 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +static inline bool xp_unused_options_set(u32 options) +{ + return options & ~XDP_PKT_CONTD; +} + static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 offset = desc->addr & (pool->chunk_size - 1); + if (!desc->len) + return false; + if (offset + desc->len > pool->chunk_size) return false; if (desc->addr >= pool->addrs_cnt) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -151,6 +164,9 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + if (!desc->len) + return false; + if (desc->len > pool->chunk_size) return false; @@ -158,7 +174,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -170,6 +186,11 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool, xp_aligned_validate_desc(pool, desc); } +static inline bool xskq_has_descs(struct xsk_queue *q) +{ + return q->cached_cons != q->cached_prod; +} + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) @@ -185,17 +206,15 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { - while (q->cached_cons != q->cached_prod) { + if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; - if (xskq_cons_is_valid_desc(q, 
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 6d40a77fccbe..13354a1e4280 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -48,6 +48,11 @@ struct xsk_queue {
 	size_t ring_vmalloc_size;
 };
 
+struct parsed_desc {
+	u32 mb;
+	u32 valid;
+};
+
 /* The structure of the shared state of the rings are a simple
  * circular buffer, as outlined in
  * Documentation/core-api/circular-buffers.rst. For the Rx and
@@ -130,18 +135,26 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
 	return false;
 }
 
+static inline bool xp_unused_options_set(u32 options)
+{
+	return options & ~XDP_PKT_CONTD;
+}
+
 static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
 					    struct xdp_desc *desc)
 {
 	u64 offset = desc->addr & (pool->chunk_size - 1);
 
+	if (!desc->len)
+		return false;
+
 	if (offset + desc->len > pool->chunk_size)
 		return false;
 
 	if (desc->addr >= pool->addrs_cnt)
 		return false;
 
-	if (desc->options)
+	if (xp_unused_options_set(desc->options))
 		return false;
 	return true;
 }
@@ -151,6 +164,9 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
 {
 	u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
 
+	if (!desc->len)
+		return false;
+
 	if (desc->len > pool->chunk_size)
 		return false;
 
@@ -158,7 +174,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
 	    xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
 		return false;
 
-	if (desc->options)
+	if (xp_unused_options_set(desc->options))
 		return false;
 	return true;
 }
@@ -170,6 +186,11 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
 		xp_aligned_validate_desc(pool, desc);
 }
 
+static inline bool xskq_has_descs(struct xsk_queue *q)
+{
+	return q->cached_cons != q->cached_prod;
+}
+
 static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
 					   struct xdp_desc *d,
 					   struct xsk_buff_pool *pool)
@@ -185,17 +206,15 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
 				       struct xdp_desc *desc,
 				       struct xsk_buff_pool *pool)
 {
-	while (q->cached_cons != q->cached_prod) {
+	if (q->cached_cons != q->cached_prod) {
 		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 		u32 idx = q->cached_cons & q->ring_mask;
 
 		*desc = ring->desc[idx];
-		if (xskq_cons_is_valid_desc(q, desc, pool))
-			return true;
-
-		q->cached_cons++;
+		return xskq_cons_is_valid_desc(q, desc, pool);
 	}
 
+	q->queue_empty_descs++;
 	return false;
 }
 
@@ -204,30 +223,52 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
 {
 	q->cached_cons += cnt;
 }
 
-static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
-					    u32 max)
+static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			      struct xdp_desc *desc, struct parsed_desc *parsed)
+{
+	parsed->valid = xskq_cons_is_valid_desc(q, desc, pool);
+	parsed->mb = xp_mb_desc(desc);
+}
+
+static inline
+u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			      u32 max)
 {
 	u32 cached_cons = q->cached_cons, nb_entries = 0;
 	struct xdp_desc *descs = pool->tx_descs;
+	u32 total_descs = 0, nr_frags = 0;
 
+	/* track first entry, if stumble upon *any* invalid descriptor, rewind
+	 * current packet that consists of frags and stop the processing
+	 */
 	while (cached_cons != q->cached_prod && nb_entries < max) {
 		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 		u32 idx = cached_cons & q->ring_mask;
+		struct parsed_desc parsed;
 
 		descs[nb_entries] = ring->desc[idx];
-		if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) {
-			/* Skip the entry */
-			cached_cons++;
-			continue;
+		cached_cons++;
+		parse_desc(q, pool, &descs[nb_entries], &parsed);
+		if (unlikely(!parsed.valid))
+			break;
+
+		if (likely(!parsed.mb)) {
+			total_descs += (nr_frags + 1);
+			nr_frags = 0;
+		} else {
+			nr_frags++;
+			if (nr_frags == pool->netdev->xdp_zc_max_segs) {
+				nr_frags = 0;
+				break;
+			}
 		}
-
 		nb_entries++;
-		cached_cons++;
 	}
 
+	cached_cons -= nr_frags;
 	/* Release valid plus any invalid entries */
 	xskq_cons_release_n(q, cached_cons - q->cached_cons);
-	return nb_entries;
+	return total_descs;
 }
 
 /* Functions for consumers */
@@ -292,6 +333,11 @@ static inline void xskq_cons_release(struct xsk_queue *q)
 	q->cached_cons++;
 }
 
+static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt)
+{
+	q->cached_cons -= cnt;
+}
+
 static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
 {
 	/* No barriers needed since data is not accessed */
@@ -319,9 +365,9 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q)
 	return xskq_prod_nb_free(q, 1) ? false : true;
 }
 
-static inline void xskq_prod_cancel(struct xsk_queue *q)
+static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt)
 {
-	q->cached_prod--;
+	q->cached_prod -= cnt;
 }
 
 static inline int xskq_prod_reserve(struct xsk_queue *q)
@@ -360,7 +406,7 @@ static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_de
 }
 
 static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
-					 u64 addr, u32 len)
+					 u64 addr, u32 len, u32 flags)
 {
 	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 	u32 idx;
@@ -372,6 +418,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
 	idx = q->cached_prod++ & q->ring_mask;
 	ring->desc[idx].addr = addr;
 	ring->desc[idx].len = len;
+	ring->desc[idx].options = flags;
 
 	return 0;
 }
@@ -386,16 +433,6 @@ static inline void xskq_prod_submit(struct xsk_queue *q)
 	__xskq_prod_submit(q, q->cached_prod);
 }
 
-static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr)
-{
-	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-	u32 idx = q->ring->producer;
-
-	ring->desc[idx++ & q->ring_mask] = addr;
-
-	__xskq_prod_submit(q, idx);
-}
-
 static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries)
 {
 	__xskq_prod_submit(q, q->ring->producer + nb_entries);
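On the Tx side, the validation above hinges on XDP_PKT_CONTD: it is the
one options bit xp_unused_options_set() tolerates, xp_mb_desc() tests it
to spot a continued packet, and the batch reader rewinds nr_frags entries
when a packet's frags cannot all be consumed. From userspace, every frag
of a packet except the last carries the flag. A rough sketch follows (the
ring helpers are assumptions for illustration, not a real libxdp API):

    #include <linux/if_xdp.h>

    /* Sketch: post one two-frag packet to a mapped TX ring. 'ring' is the
     * descriptor array, 'mask' is ring_size - 1, and '*prod' is the cached
     * producer index; the caller must have reserved two free slots and
     * must publish *prod to the shared producer pointer afterwards.
     */
    static void tx_two_frags(struct xdp_desc *ring, __u32 mask, __u32 *prod,
                             __u64 addr0, __u32 len0, __u64 addr1, __u32 len1)
    {
        struct xdp_desc *d;

        d = &ring[(*prod)++ & mask];
        d->addr = addr0;
        d->len = len0;
        d->options = XDP_PKT_CONTD;   /* more frags of this packet follow */

        d = &ring[(*prod)++ & mask];
        d->addr = addr1;
        d->len = len1;
        d->options = 0;               /* last frag ends the packet */
    }

Note that a zero-length frag is now rejected outright, and any options
bit other than XDP_PKT_CONTD still fails validation.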
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 533697e2488f..3784534c9185 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -247,12 +247,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		return -EINVAL;
 	}
 
-	/* We don't yet support UDP encapsulation and TFC padding. */
-	if (x->encap || x->tfcpad) {
-		NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded");
-		return -EINVAL;
-	}
-
 	if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) {
 		NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
 		return -EINVAL;
@@ -260,6 +254,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	}
 
 	is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET;
+
+	/* We don't yet support UDP encapsulation and TFC padding. */
+	if ((!is_packet_offload && x->encap) || x->tfcpad) {
+		NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded");
+		return -EINVAL;
+	}
+
 	dev = dev_get_by_index(net, xuo->ifindex);
 	if (!dev) {
 		if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) {
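With the encap check moved below the is_packet_offload computation, an
ESP-in-UDP state may now request packet offload; crypto offload still
rejects x->encap, and TFC padding remains unsupported in both modes. In
userspace terms the request is the XFRMA_OFFLOAD_DEV attribute of an
XFRM_MSG_NEWSA message, roughly as below (the interface name and the
omitted netlink plumbing are assumptions for illustration):

    #include <net/if.h>
    #include <linux/xfrm.h>

    /* Sketch: build the offload attribute for a new SA. Attaching it to
     * the netlink message, plus the rest of the SA (keys, mode, and the
     * espinudp encap template), is omitted here.
     */
    static struct xfrm_user_offload build_pkt_offload(const char *ifname)
    {
        struct xfrm_user_offload xuo = {
            .ifindex = (int)if_nametoindex(ifname),
            /* XFRM_OFFLOAD_PACKET: with this change, x->encap no longer
             * makes xfrm_dev_state_add() return -EINVAL
             */
            .flags = XFRM_OFFLOAD_PACKET,
        };

        return xuo;
    }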