Diffstat (limited to 'net')
487 files changed, 18751 insertions, 17681 deletions
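Two API migrations recur throughout the diff below: callers of prandom_u32_max() move to get_random_u32_below(), and timers that are being torn down for good switch from del_timer_sync() to timer_shutdown_sync(). The sketch that follows is illustrative only and is not part of the patch; the function names (example_join_timer_arm, example_uninit) and parameters are hypothetical, modelled loosely on the net/802/garp.c timer-arming hunks that open the diff.

/* Minimal kernel-style sketch of the two recurring conversions.
 * Names are hypothetical; only the helper calls mirror the diff.
 */
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/timer.h>

static void example_join_timer_arm(struct timer_list *join_timer,
                                   unsigned int join_time_msecs)
{
        unsigned long delay;

        /* Was: delay = prandom_u32_max(msecs_to_jiffies(join_time_msecs));
         * get_random_u32_below(ceil) returns a uniform value in [0, ceil).
         */
        delay = get_random_u32_below(msecs_to_jiffies(join_time_msecs));
        mod_timer(join_timer, jiffies + delay);
}

static void example_uninit(struct timer_list *join_timer)
{
        /* Was: del_timer_sync(join_timer);
         * timer_shutdown_sync() also waits for a running handler, then marks
         * the timer shut down so later mod_timer() calls become no-ops.
         */
        timer_shutdown_sync(join_timer);
}

The shutdown variant is only appropriate where the patch uses it: on final teardown paths where the timer must never be re-armed, as in garp_uninit_applicant() and mrp_uninit_applicant() below.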
diff --git a/net/802/garp.c b/net/802/garp.c index fc9eb02a912f..ab24b21fbb49 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -407,7 +407,7 @@ static void garp_join_timer_arm(struct garp_applicant *app) { unsigned long delay; - delay = prandom_u32_max(msecs_to_jiffies(garp_join_time)); + delay = get_random_u32_below(msecs_to_jiffies(garp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } @@ -618,7 +618,7 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl /* Delete timer and generate a final TRANSMIT_PDU event to flush out * all pending messages before the applicant is gone. */ - del_timer_sync(&app->join_timer); + timer_shutdown_sync(&app->join_timer); spin_lock_bh(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); diff --git a/net/802/mrp.c b/net/802/mrp.c index 6c927d4b35f0..eafc21ecc287 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -592,7 +592,7 @@ static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; - delay = prandom_u32_max(msecs_to_jiffies(mrp_join_time)); + delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } @@ -911,8 +911,8 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ - del_timer_sync(&app->join_timer); - del_timer_sync(&app->periodic_timer); + timer_shutdown_sync(&app->join_timer); + timer_shutdown_sync(&app->periodic_timer); spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); diff --git a/net/9p/client.c b/net/9p/client.c index aaa37b07e30a..622ec6a586ee 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -297,6 +297,11 @@ p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size, p9pdu_reset(&req->rc); req->t_err = 0; req->status = REQ_STATUS_ALLOC; + /* refcount needs to be set to 0 before inserting into the idr + * so p9_tag_lookup does not accept a request that is not fully + * initialized. refcount_set to 2 below will mark request ready. + */ + refcount_set(&req->refcount, 0); init_waitqueue_head(&req->wq); INIT_LIST_HEAD(&req->req_list); @@ -438,7 +443,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) * the status change is visible to another thread */ smp_wmb(); - req->status = status; + WRITE_ONCE(req->status, status); wake_up(&req->wq); p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); @@ -514,10 +519,9 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) int ecode; err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); - if (req->rc.size >= c->msize) { - p9_debug(P9_DEBUG_ERROR, - "requested packet size too big: %d\n", - req->rc.size); + if (req->rc.size > req->rc.capacity && !req->rc.zc) { + pr_err("requested packet size too big: %d does not fit %zu (type=%d)\n", + req->rc.size, req->rc.capacity, req->rc.id); return -EIO; } /* dump the response from server @@ -600,7 +604,7 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) /* if we haven't received a response for oldreq, * remove it from the list */ - if (oldreq->status == REQ_STATUS_SENT) { + if (READ_ONCE(oldreq->status) == REQ_STATUS_SENT) { if (c->trans_mod->cancelled) c->trans_mod->cancelled(c, oldreq); } @@ -680,6 +684,9 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) 
if (IS_ERR(req)) return req; + req->tc.zc = false; + req->rc.zc = false; + if (signal_pending(current)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); @@ -697,7 +704,8 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) } again: /* Wait for the response */ - err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD); + err = wait_event_killable(req->wq, + READ_ONCE(req->status) >= REQ_STATUS_RCVD); /* Make sure our req is coherent with regard to updates in other * threads - echoes to wmb() in the callback @@ -711,7 +719,7 @@ again: goto again; } - if (req->status == REQ_STATUS_ERROR) { + if (READ_ONCE(req->status) == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } @@ -724,7 +732,7 @@ again: p9_client_flush(c, req); /* if we received the response anyway, don't signal error */ - if (req->status == REQ_STATUS_RCVD) + if (READ_ONCE(req->status) == REQ_STATUS_RCVD) err = 0; } recalc_sigpending: @@ -778,6 +786,9 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, if (IS_ERR(req)) return req; + req->tc.zc = true; + req->rc.zc = true; + if (signal_pending(current)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); @@ -793,7 +804,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, if (err != -ERESTARTSYS) goto recalc_sigpending; } - if (req->status == REQ_STATUS_ERROR) { + if (READ_ONCE(req->status) == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } @@ -806,7 +817,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, p9_client_flush(c, req); /* if we received the response anyway, don't signal error */ - if (req->status == REQ_STATUS_RCVD) + if (READ_ONCE(req->status) == REQ_STATUS_RCVD) err = 0; } recalc_sigpending: @@ -2043,7 +2054,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) struct kvec kv = {.iov_base = data, .iov_len = count}; struct iov_iter to; - iov_iter_kvec(&to, READ, &kv, 1, count); + iov_iter_kvec(&to, ITER_DEST, &kv, 1, count); p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", fid->fid, offset, count); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 07db2f436d44..00b684616e8d 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -20,7 +20,6 @@ #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> -#include <linux/idr.h> #include <linux/file.h> #include <linux/parser.h> #include <linux/slab.h> @@ -202,11 +201,11 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { list_move(&req->req_list, &cancel_list); - req->status = REQ_STATUS_ERROR; + WRITE_ONCE(req->status, REQ_STATUS_ERROR); } list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { list_move(&req->req_list, &cancel_list); - req->status = REQ_STATUS_ERROR; + WRITE_ONCE(req->status, REQ_STATUS_ERROR); } spin_unlock(&m->req_lock); @@ -467,7 +466,7 @@ static void p9_write_work(struct work_struct *work) req = list_entry(m->unsent_req_list.next, struct p9_req_t, req_list); - req->status = REQ_STATUS_SENT; + WRITE_ONCE(req->status, REQ_STATUS_SENT); p9_debug(P9_DEBUG_TRANS, "move req %p\n", req); list_move_tail(&req->req_list, &m->req_list); @@ -676,7 +675,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) return m->err; spin_lock(&m->req_lock); - req->status = REQ_STATUS_UNSENT; + WRITE_ONCE(req->status, 
REQ_STATUS_UNSENT); list_add_tail(&req->req_list, &m->unsent_req_list); spin_unlock(&m->req_lock); @@ -703,7 +702,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) if (req->status == REQ_STATUS_UNSENT) { list_del(&req->req_list); - req->status = REQ_STATUS_FLSHD; + WRITE_ONCE(req->status, REQ_STATUS_FLSHD); p9_req_put(client, req); ret = 0; } @@ -732,7 +731,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) * remove it from the list. */ list_del(&req->req_list); - req->status = REQ_STATUS_FLSHD; + WRITE_ONCE(req->status, REQ_STATUS_FLSHD); spin_unlock(&m->req_lock); p9_req_put(client, req); @@ -868,6 +867,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) } csocket->sk->sk_allocation = GFP_NOIO; + csocket->sk->sk_use_task_frag = false; file = sock_alloc_file(csocket, 0, NULL); if (IS_ERR(file)) { pr_err("%s (%d): failed to map fd\n", diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 6ff706760676..83f9100d46bf 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -21,7 +21,6 @@ #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> -#include <linux/idr.h> #include <linux/file.h> #include <linux/parser.h> #include <linux/semaphore.h> @@ -507,7 +506,7 @@ dont_need_post_recv: * because doing if after could erase the REQ_STATUS_RCVD * status in case of a very fast reply. */ - req->status = REQ_STATUS_SENT; + WRITE_ONCE(req->status, REQ_STATUS_SENT); err = ib_post_send(rdma->qp, &wr, NULL); if (err) goto send_error; @@ -517,7 +516,7 @@ dont_need_post_recv: /* Handle errors that happened during or while preparing the send: */ send_error: - req->status = REQ_STATUS_ERROR; + WRITE_ONCE(req->status, REQ_STATUS_ERROR); kfree(c); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index e757f0601304..3c27ffb781e3 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -22,7 +22,6 @@ #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> -#include <linux/idr.h> #include <linux/file.h> #include <linux/highmem.h> #include <linux/slab.h> @@ -263,7 +262,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); - req->status = REQ_STATUS_SENT; + WRITE_ONCE(req->status, REQ_STATUS_SENT); req_retry: spin_lock_irqsave(&chan->lock, flags); @@ -469,7 +468,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, inlen = n; } } - req->status = REQ_STATUS_SENT; + WRITE_ONCE(req->status, REQ_STATUS_SENT); req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); @@ -532,9 +531,10 @@ req_retry_pinned: spin_unlock_irqrestore(&chan->lock, flags); kicked = 1; p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); - err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD); + err = wait_event_killable(req->wq, + READ_ONCE(req->status) >= REQ_STATUS_RCVD); // RERROR needs reply (== error string) in static data - if (req->status == REQ_STATUS_RCVD && + if (READ_ONCE(req->status) == REQ_STATUS_RCVD && unlikely(req->rc.sdata[4] == P9_RERROR)) handle_rerror(req, in_hdr_len, offs, in_pages); diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index aaa5fd364691..82c7005ede65 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -157,7 +157,7 @@ again: &masked_prod, masked_cons, XEN_9PFS_RING_SIZE(ring)); - p9_req->status = REQ_STATUS_SENT; + WRITE_ONCE(p9_req->status, REQ_STATUS_SENT); virt_wmb(); /* write ring 
before updating pointer */ prod += size; ring->intf->out_prod = prod; @@ -212,11 +212,13 @@ static void p9_xen_response(struct work_struct *work) dev_warn(&priv->dev->dev, "requested packet size too big: %d for tag %d with capacity %zd\n", h.size, h.tag, req->rc.capacity); - req->status = REQ_STATUS_ERROR; + WRITE_ONCE(req->status, REQ_STATUS_ERROR); goto recv_error; } - memcpy(&req->rc, &h, sizeof(h)); + req->rc.size = h.size; + req->rc.id = h.id; + req->rc.tag = h.tag; req->rc.offset = 0; masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring)); @@ -303,13 +305,12 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) kfree(priv); } -static int xen_9pfs_front_remove(struct xenbus_device *dev) +static void xen_9pfs_front_remove(struct xenbus_device *dev) { struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); dev_set_drvdata(&dev->dev, NULL); xen_9pfs_front_free(priv); - return 0; } static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, diff --git a/net/Makefile b/net/Makefile index 6a62e5b27378..0914bea9c335 100644 --- a/net/Makefile +++ b/net/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ +obj-$(CONFIG_NET_DEVLINK) += devlink/ obj-$(CONFIG_NET_DSA) += dsa/ obj-$(CONFIG_ATALK) += appletalk/ obj-$(CONFIG_X25) += x25/ diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index 0fdbdfd19474..466353b3dde4 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -108,9 +108,9 @@ static struct device_attribute *atm_attrs[] = { }; -static int atm_uevent(struct device *cdev, struct kobj_uevent_env *env) +static int atm_uevent(const struct device *cdev, struct kobj_uevent_env *env) { - struct atm_dev *adev; + const struct atm_dev *adev; if (!cdev) return -ENODEV; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 7f6a7c96ac92..828fb393ee94 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -27,7 +27,6 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> -#include <linux/prandom.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> @@ -280,7 +279,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32_max(2 * BATADV_JITTER); + msecs += get_random_u32_below(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } @@ -288,7 +287,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { - return jiffies + msecs_to_jiffies(prandom_u32_max(BATADV_JITTER / 2)); + return jiffies + msecs_to_jiffies(get_random_u32_below(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index f1741fbfb617..acff565849ae 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -21,7 +21,6 @@ #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/nl80211.h> -#include <linux/prandom.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -51,7 +50,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs; msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER; - msecs += prandom_u32_max(2 * BATADV_JITTER); + msecs += 
get_random_u32_below(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq, msecs_to_jiffies(msecs)); diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 033639df96d8..e710e9afe78f 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -21,7 +21,6 @@ #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> -#include <linux/prandom.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -90,7 +89,7 @@ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ - msecs += prandom_u32_max(msecs / 5) - (msecs / 10); + msecs += get_random_u32_below(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } @@ -109,7 +108,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32_max(2 * BATADV_JITTER); + msecs += get_random_u32_below(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } @@ -800,8 +799,8 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, /* only unknown & newer OGMs contain TVLVs we are interested in */ if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) - batadv_tvlv_containers_process(bat_priv, true, orig_node, - NULL, NULL, + batadv_tvlv_containers_process(bat_priv, BATADV_OGM2, orig_node, + NULL, (unsigned char *)(ogm2 + 1), ntohs(ogm2->tvlv_len)); diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index fefb51a5f606..6968e55eb971 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -822,7 +822,7 @@ int batadv_dat_init(struct batadv_priv *bat_priv) batadv_dat_start_timer(bat_priv); batadv_tvlv_handler_register(bat_priv, batadv_dat_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_DAT, 1, + NULL, NULL, BATADV_TVLV_DAT, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_dat_tvlv_container_update(bat_priv); return 0; diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 9349c76f30c5..6a964a773f57 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -259,7 +259,7 @@ void batadv_gw_init(struct batadv_priv *bat_priv) atomic_set(&bat_priv->gw.sel_class, 1); batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_GW, 1, + NULL, NULL, BATADV_TVLV_GW, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); } diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index c48803b32bb0..156ed39eded1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2022.3" +#define BATADV_SOURCE_VERSION "2023.1" #endif /* B.A.T.M.A.N. 
parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index b238455913df..315394f12c55 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -26,7 +26,6 @@ #include <linux/ipv6.h> #include <linux/jiffies.h> #include <linux/kernel.h> -#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -1137,222 +1136,19 @@ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_tt_node_get() - get a multicast tt node - * @bat_priv: the bat priv with all the soft interface information - * @ethhdr: the ether header containing the multicast destination - * - * Return: an orig_node matching the multicast address provided by ethhdr - * via a translation table lookup. This increases the returned nodes refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr) -{ - return batadv_transtable_search(bat_priv, NULL, ethhdr->h_dest, - BATADV_NO_FLAGS); -} - -/** - * batadv_mcast_forw_ipv4_node_get() - get a node with an ipv4 flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and - * increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_ipv4_list, - mcast_want_all_ipv4_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_ipv6_node_get() - get a node with an ipv6 flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set - * and increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_ipv6_list, - mcast_want_all_ipv6_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_ip_node_get() - get a node with an ipv4/ipv6 flag - * @bat_priv: the bat priv with all the soft interface information - * @ethhdr: an ethernet header to determine the protocol family from - * - * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or - * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, sets and - * increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr) -{ - switch (ntohs(ethhdr->h_proto)) { - case ETH_P_IP: - return batadv_mcast_forw_ipv4_node_get(bat_priv); - case ETH_P_IPV6: - return batadv_mcast_forw_ipv6_node_get(bat_priv); - default: - /* we shouldn't be here... */ - return NULL; - } -} - -/** - * batadv_mcast_forw_unsnoop_node_get() - get a node with an unsnoopable flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag - * set and increases its refcount. 
- */ -static struct batadv_orig_node * -batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_unsnoopables_list, - mcast_want_all_unsnoopables_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_rtr4_node_get() - get a node with an ipv4 mcast router flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR4 flag unset and - * increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_rtr4_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_rtr4_list, - mcast_want_all_rtr4_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_rtr6_node_get() - get a node with an ipv6 mcast router flag - * @bat_priv: the bat priv with all the soft interface information - * - * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR6 flag unset - * and increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_rtr6_node_get(struct batadv_priv *bat_priv) -{ - struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_orig_node, - &bat_priv->mcast.want_all_rtr6_list, - mcast_want_all_rtr6_node) { - if (!kref_get_unless_zero(&tmp_orig_node->refcount)) - continue; - - orig_node = tmp_orig_node; - break; - } - rcu_read_unlock(); - - return orig_node; -} - -/** - * batadv_mcast_forw_rtr_node_get() - get a node with an ipv4/ipv6 router flag - * @bat_priv: the bat priv with all the soft interface information - * @ethhdr: an ethernet header to determine the protocol family from - * - * Return: an orig_node which has no BATADV_MCAST_WANT_NO_RTR4 or - * BATADV_MCAST_WANT_NO_RTR6 flag, depending on the provided ethhdr, set and - * increases its refcount. - */ -static struct batadv_orig_node * -batadv_mcast_forw_rtr_node_get(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr) -{ - switch (ntohs(ethhdr->h_proto)) { - case ETH_P_IP: - return batadv_mcast_forw_rtr4_node_get(bat_priv); - case ETH_P_IPV6: - return batadv_mcast_forw_rtr6_node_get(bat_priv); - default: - /* we shouldn't be here... */ - return NULL; - } -} - -/** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information - * @skb: The multicast packet to check - * @orig: an originator to be set to forward the skb to + * @skb: the multicast packet to check * @is_routable: stores whether the destination is routable * - * Return: the forwarding mode as enum batadv_forw_mode and in case of - * BATADV_FORW_SINGLE set the orig to the single originator the skb - * should be forwarded to. + * Return: The forwarding mode as enum batadv_forw_mode. 
*/ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_orig_node **orig, int *is_routable) + int *is_routable) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; - unsigned int mcast_fanout; struct ethhdr *ethhdr; int rtr_count = 0; @@ -1361,7 +1157,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) - return BATADV_FORW_ALL; + return BATADV_FORW_BCAST; ethhdr = eth_hdr(skb); @@ -1374,32 +1170,15 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, total_count = tt_count + ip_count + unsnoop_count + rtr_count; - switch (total_count) { - case 1: - if (tt_count) - *orig = batadv_mcast_forw_tt_node_get(bat_priv, ethhdr); - else if (ip_count) - *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr); - else if (unsnoop_count) - *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv); - else if (rtr_count) - *orig = batadv_mcast_forw_rtr_node_get(bat_priv, - ethhdr); - - if (*orig) - return BATADV_FORW_SINGLE; - - fallthrough; - case 0: + if (!total_count) return BATADV_FORW_NONE; - default: - mcast_fanout = atomic_read(&bat_priv->multicast_fanout); + else if (unsnoop_count) + return BATADV_FORW_BCAST; - if (!unsnoop_count && total_count <= mcast_fanout) - return BATADV_FORW_SOME; - } + if (total_count <= atomic_read(&bat_priv->multicast_fanout)) + return BATADV_FORW_UCASTS; - return BATADV_FORW_ALL; + return BATADV_FORW_BCAST; } /** @@ -1411,10 +1190,10 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ -int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, - struct sk_buff *skb, - unsigned short vid, - struct batadv_orig_node *orig_node) +static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned short vid, + struct batadv_orig_node *orig_node) { /* Avoid sending multicast-in-unicast packets to other BLA * gateways - they already got the frame from the LAN side @@ -2039,7 +1818,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, - NULL, BATADV_TVLV_MCAST, 2, + NULL, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update); diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 8aec818d0bf6..a9770d8d6d36 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -17,23 +17,16 @@ */ enum batadv_forw_mode { /** - * @BATADV_FORW_ALL: forward the packet to all nodes (currently via - * classic flooding) + * @BATADV_FORW_BCAST: forward the packet to all nodes via a batman-adv + * broadcast packet */ - BATADV_FORW_ALL, + BATADV_FORW_BCAST, /** - * @BATADV_FORW_SOME: forward the packet to some nodes (currently via - * a multicast-to-unicast conversion and the BATMAN unicast routing - * protocol) + * @BATADV_FORW_UCASTS: forward the packet to some nodes via one + * or more batman-adv unicast packets */ - BATADV_FORW_SOME, - - /** - * @BATADV_FORW_SINGLE: forward the packet to a single node (currently - * via the BATMAN unicast routing protocol) - */ - BATADV_FORW_SINGLE, + BATADV_FORW_UCASTS, /** @BATADV_FORW_NONE: don't forward, drop it */ BATADV_FORW_NONE, @@ -43,14 +36,8 @@ enum 
batadv_forw_mode { enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_orig_node **mcast_single_orig, int *is_routable); -int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, - struct sk_buff *skb, - unsigned short vid, - struct batadv_orig_node *orig_node); - int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable); @@ -69,20 +56,9 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); static inline enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_orig_node **mcast_single_orig, int *is_routable) { - return BATADV_FORW_ALL; -} - -static inline int -batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, - struct sk_buff *skb, - unsigned short vid, - struct batadv_orig_node *orig_node) -{ - kfree_skb(skb); - return NET_XMIT_DROP; + return BATADV_FORW_BCAST; } static inline int diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 5f4aeeb60dc4..71ebd0284f95 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -25,8 +25,8 @@ #include <linux/lockdep.h> #include <linux/net.h> #include <linux/netdevice.h> -#include <linux/prandom.h> #include <linux/printk.h> +#include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> @@ -160,7 +160,7 @@ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) batadv_nc_start_timer(bat_priv); batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_NC, 1, + NULL, NULL, BATADV_TVLV_NC, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_nc_tvlv_container_update(bat_priv); return 0; @@ -1009,7 +1009,7 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, static u8 batadv_nc_random_weight_tq(u8 tq) { /* randomize the estimated packet loss (max TQ - estimated TQ) */ - u8 rand_tq = prandom_u32_max(BATADV_TQ_MAX_VALUE + 1 - tq); + u8 rand_tq = get_random_u32_below(BATADV_TQ_MAX_VALUE + 1 - tq); /* convert to (randomized) estimated tq again */ return BATADV_TQ_MAX_VALUE - rand_tq; diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 83f31494ea4d..163cd43c4821 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1073,10 +1073,9 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, if (tvlv_buff_len > skb->len - hdr_size) goto free_skb; - ret = batadv_tvlv_containers_process(bat_priv, false, NULL, - unicast_tvlv_packet->src, - unicast_tvlv_packet->dst, - tvlv_buff, tvlv_buff_len); + ret = batadv_tvlv_containers_process(bat_priv, BATADV_UNICAST_TVLV, + NULL, skb, tvlv_buff, + tvlv_buff_len); if (ret != NET_RX_SUCCESS) { ret = batadv_route_unicast_packet(skb, recv_if); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0f5c0679b55a..125f4628687c 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -48,7 +48,6 @@ #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" -#include "originator.h" #include "send.h" #include "translation-table.h" @@ -196,8 +195,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, unsigned short vid; u32 seqno; int gw_mode; - enum batadv_forw_mode forw_mode = BATADV_FORW_SINGLE; - struct batadv_orig_node *mcast_single_orig = NULL; + enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 
proto; @@ -301,14 +299,18 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, - &mcast_single_orig, &mcast_is_routable); - if (forw_mode == BATADV_FORW_NONE) - goto dropped; - - if (forw_mode == BATADV_FORW_SINGLE || - forw_mode == BATADV_FORW_SOME) + switch (forw_mode) { + case BATADV_FORW_BCAST: + break; + case BATADV_FORW_UCASTS: do_bcast = false; + break; + case BATADV_FORW_NONE: + fallthrough; + default: + goto dropped; + } } } @@ -357,10 +359,7 @@ send: if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); - } else if (mcast_single_orig) { - ret = batadv_mcast_forw_send_orig(bat_priv, skb, vid, - mcast_single_orig); - } else if (forw_mode == BATADV_FORW_SOME) { + } else if (forw_mode == BATADV_FORW_UCASTS) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else { @@ -386,7 +385,6 @@ dropped: dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: - batadv_orig_node_put(mcast_single_orig); batadv_hardif_put(primary_if); return NETDEV_TX_OK; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 01d30c1e412c..36ca31252a73 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -4168,11 +4168,11 @@ int batadv_tt_init(struct batadv_priv *bat_priv) } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, - batadv_tt_tvlv_unicast_handler_v1, + batadv_tt_tvlv_unicast_handler_v1, NULL, BATADV_TVLV_TT, 1, BATADV_NO_FLAGS); batadv_tvlv_handler_register(bat_priv, NULL, - batadv_roam_tvlv_unicast_handler_v1, + batadv_roam_tvlv_unicast_handler_v1, NULL, BATADV_TVLV_ROAM, 1, BATADV_NO_FLAGS); INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge); diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 7ec2e2343884..2a583215d439 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -352,10 +352,9 @@ end: * appropriate handlers * @bat_priv: the bat priv with all the soft interface information * @tvlv_handler: tvlv callback function handling the tvlv content - * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet + * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet + * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * @@ -364,15 +363,20 @@ end: */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, - bool ogm_source, + u8 packet_type, struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_value, u16 tvlv_value_len) + struct sk_buff *skb, void *tvlv_value, + u16 tvlv_value_len) { + unsigned int tvlv_offset; + u8 *src, *dst; + if (!tvlv_handler) return NET_RX_SUCCESS; - if (ogm_source) { + switch (packet_type) { + case BATADV_IV_OGM: + case BATADV_OGM2: if (!tvlv_handler->ogm_handler) return NET_RX_SUCCESS; @@ -383,19 +387,32 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, BATADV_NO_FLAGS, tvlv_value, tvlv_value_len); tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; - } else { - if (!src) - return NET_RX_SUCCESS; - - if (!dst) + break; + case BATADV_UNICAST_TVLV: + if (!skb) return NET_RX_SUCCESS; if (!tvlv_handler->unicast_handler) return NET_RX_SUCCESS; 
+ src = ((struct batadv_unicast_tvlv_packet *)skb->data)->src; + dst = ((struct batadv_unicast_tvlv_packet *)skb->data)->dst; + return tvlv_handler->unicast_handler(bat_priv, src, dst, tvlv_value, tvlv_value_len); + case BATADV_MCAST: + if (!skb) + return NET_RX_SUCCESS; + + if (!tvlv_handler->mcast_handler) + return NET_RX_SUCCESS; + + tvlv_offset = (unsigned char *)tvlv_value - skb->data; + skb_set_network_header(skb, tvlv_offset); + skb_set_transport_header(skb, tvlv_offset + tvlv_value_len); + + return tvlv_handler->mcast_handler(bat_priv, skb); } return NET_RX_SUCCESS; @@ -405,10 +422,9 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the soft interface information - * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet + * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet + * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * @@ -416,10 +432,10 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, * handler callbacks. */ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, - bool ogm_source, + u8 packet_type, struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_value, u16 tvlv_value_len) + struct sk_buff *skb, void *tvlv_value, + u16 tvlv_value_len) { struct batadv_tvlv_handler *tvlv_handler; struct batadv_tvlv_hdr *tvlv_hdr; @@ -441,20 +457,24 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, tvlv_hdr->version); ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, - ogm_source, orig_node, - src, dst, tvlv_value, + packet_type, orig_node, skb, + tvlv_value, tvlv_value_cont_len); batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } - if (!ogm_source) + if (packet_type != BATADV_IV_OGM && + packet_type != BATADV_OGM2) return ret; rcu_read_lock(); hlist_for_each_entry_rcu(tvlv_handler, &bat_priv->tvlv.handler_list, list) { + if (!tvlv_handler->ogm_handler) + continue; + if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) tvlv_handler->ogm_handler(bat_priv, orig_node, @@ -490,7 +510,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, tvlv_value = batadv_ogm_packet + 1; - batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, + batadv_tvlv_containers_process(bat_priv, BATADV_IV_OGM, orig_node, NULL, tvlv_value, tvlv_value_len); } @@ -504,6 +524,10 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, * @uptr: unicast tvlv handler callback function. This function receives the * source & destination of the unicast packet as well as the tvlv content * to process. + * @mptr: multicast packet tvlv handler callback function. This function + * receives the full skb to process, with the skb network header pointing + * to the current tvlv and the skb transport header pointing to the first + * byte after the current tvlv. 
* @type: tvlv handler type to be registered * @version: tvlv handler version to be registered * @flags: flags to enable or disable TVLV API behavior @@ -518,6 +542,8 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len), + int (*mptr)(struct batadv_priv *bat_priv, + struct sk_buff *skb), u8 type, u8 version, u8 flags) { struct batadv_tvlv_handler *tvlv_handler; @@ -539,6 +565,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; + tvlv_handler->mcast_handler = mptr; tvlv_handler->type = type; tvlv_handler->version = version; tvlv_handler->flags = flags; diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 4cf8af00fc11..e5697230d991 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> @@ -34,14 +35,16 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len), + int (*mptr)(struct batadv_priv *bat_priv, + struct sk_buff *skb), u8 type, u8 version, u8 flags); void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, u8 type, u8 version); int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, - bool ogm_source, + u8 packet_type, struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_buff, u16 tvlv_buff_len); + struct sk_buff *skb, void *tvlv_buff, + u16 tvlv_buff_len); void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 758cd797a063..ca9449ec9836 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -2335,6 +2335,12 @@ struct batadv_tvlv_handler { u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len); + /** + * @mcast_handler: handler callback which is given the tvlv payload to + * process on incoming mcast packet + */ + int (*mcast_handler)(struct batadv_priv *bat_priv, struct sk_buff *skb); + /** @type: tvlv type this handler feels responsible for */ u8 type; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 215af9b3b589..4eb1b3ced0d2 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -441,7 +441,7 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, iv.iov_len = skb->len; memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE, &iv, 1, skb->len); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len); err = l2cap_chan_send(chan, &msg, skb->len); if (err > 0) { @@ -972,6 +972,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); hci_dev_unlock(hdev); + hci_dev_put(hdev); if (!hcon) return -ENOENT; diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index ae3bdc6dfc92..da7cac0a1b71 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -78,6 +78,17 @@ config BT_LE Bluetooth Low Energy includes support low-energy physical layer available with Bluetooth version 4.0 or later. +config BT_LE_L2CAP_ECRED + bool "Bluetooth L2CAP Enhanced Credit Flow Control" + depends on BT_LE + default y + help + Bluetooth Low Energy L2CAP Enhanced Credit Flow Control available with + Bluetooth version 5.2 or later. 
+ + This can be overridden by passing bluetooth.enable_ecred=[1|0] + on the kernel commandline. + config BT_6LOWPAN tristate "Bluetooth 6LoWPAN support" depends on BT_LE && 6LOWPAN diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 1fcc482397c3..e7adb8a98cf9 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -56,7 +56,7 @@ static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *dat memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE, &iv, 1, total_len); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, total_len); l2cap_chan_send(chan, &msg, total_len); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index dc65974f5adb..1c3c7ff5c3c6 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -737,7 +737,7 @@ static int __init bt_init(void) err = bt_sysfs_init(); if (err < 0) - return err; + goto cleanup_led; err = sock_register(&bt_sock_family_ops); if (err) @@ -773,6 +773,8 @@ unregister_socket: sock_unregister(PF_BLUETOOTH); cleanup_sysfs: bt_sysfs_cleanup(); +cleanup_led: + bt_leds_cleanup(); return err; } diff --git a/net/bluetooth/hci_codec.c b/net/bluetooth/hci_codec.c index 38201532f58e..3cc135bb1d30 100644 --- a/net/bluetooth/hci_codec.c +++ b/net/bluetooth/hci_codec.c @@ -72,9 +72,8 @@ static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport, continue; } - skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS, - sizeof(*cmd), cmd, - HCI_CMD_TIMEOUT); + skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS, + sizeof(*cmd), cmd, 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read codec capabilities (%ld)", PTR_ERR(skb)); @@ -127,8 +126,8 @@ void hci_read_supported_codecs(struct hci_dev *hdev) struct hci_op_read_local_codec_caps caps; __u8 i; - skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL, - HCI_CMD_TIMEOUT); + skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL, + 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", @@ -158,7 +157,8 @@ void hci_read_supported_codecs(struct hci_dev *hdev) for (i = 0; i < std_codecs->num; i++) { caps.id = std_codecs->codec[i]; caps.direction = 0x00; - hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK, &caps); + hci_read_codec_capabilities(hdev, + LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps); } skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) @@ -178,7 +178,8 @@ void hci_read_supported_codecs(struct hci_dev *hdev) caps.cid = vnd_codecs->codec[i].cid; caps.vid = vnd_codecs->codec[i].vid; caps.direction = 0x00; - hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK, &caps); + hci_read_codec_capabilities(hdev, + LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps); } error: @@ -194,8 +195,8 @@ void hci_read_supported_codecs_v2(struct hci_dev *hdev) struct hci_op_read_local_codec_caps caps; __u8 i; - skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL, - HCI_CMD_TIMEOUT); + skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL, + 0, HCI_CMD_TIMEOUT, NULL); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a6c12863a253..17b946f9ba31 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -821,19 +821,23 @@ static void terminate_big_destroy(struct hci_dev *hdev, void *data, int err) static int hci_le_terminate_big(struct hci_dev *hdev, u8 big, 
u8 bis) { struct iso_list_data *d; + int ret; bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", big, bis); - d = kmalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - memset(d, 0, sizeof(*d)); d->big = big; d->bis = bis; - return hci_cmd_sync_queue(hdev, terminate_big_sync, d, - terminate_big_destroy); + ret = hci_cmd_sync_queue(hdev, terminate_big_sync, d, + terminate_big_destroy); + if (ret) + kfree(d); + + return ret; } static int big_terminate_sync(struct hci_dev *hdev, void *data) @@ -858,19 +862,23 @@ static int big_terminate_sync(struct hci_dev *hdev, void *data) static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, u16 sync_handle) { struct iso_list_data *d; + int ret; bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, sync_handle); - d = kmalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - memset(d, 0, sizeof(*d)); d->big = big; d->sync_handle = sync_handle; - return hci_cmd_sync_queue(hdev, big_terminate_sync, d, - terminate_big_destroy); + ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, + terminate_big_destroy); + if (ret) + kfree(d); + + return ret; } /* Cleanup BIS connection @@ -1053,8 +1061,15 @@ int hci_conn_del(struct hci_conn *conn) if (conn->type == ACL_LINK) { struct hci_conn *sco = conn->link; - if (sco) + if (sco) { sco->link = NULL; + /* Due to race, SCO connection might be not established + * yet at this point. Delete it now, otherwise it is + * possible for it to be stuck and can't be deleted. + */ + if (sco->handle == HCI_CONN_HANDLE_UNSET) + hci_conn_del(sco); + } /* Unacked frames */ hdev->acl_cnt += conn->sent; @@ -1235,6 +1250,8 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) if (conn != hci_lookup_le_connect(hdev)) goto done; + /* Flush to make sure we send create conn cancel command if needed */ + flush_delayed_work(&conn->le_conn_timeout); hci_conn_failed(conn, bt_status(err)); done: @@ -1881,7 +1898,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data) continue; /* Check if all CIS(s) belonging to a CIG are ready */ - if (conn->link->state != BT_CONNECTED || + if (!conn->link || conn->link->state != BT_CONNECTED || conn->state != BT_CONNECT) { cmd.cp.num_cis = 0; break; @@ -1973,16 +1990,14 @@ static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, qos->latency = conn->le_conn_latency; } -static struct hci_conn *hci_bind_bis(struct hci_conn *conn, - struct bt_iso_qos *qos) +static void hci_bind_bis(struct hci_conn *conn, + struct bt_iso_qos *qos) { /* Update LINK PHYs according to QoS preference */ conn->le_tx_phy = qos->out.phy; conn->le_tx_phy = qos->out.phy; conn->iso_qos = *qos; conn->state = BT_BOUND; - - return conn; } static int create_big_sync(struct hci_dev *hdev, void *data) @@ -2046,19 +2061,12 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) return -EBUSY; - cp = kmalloc(sizeof(*cp), GFP_KERNEL); + cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) { hci_dev_clear_flag(hdev, HCI_PA_SYNC); return -ENOMEM; } - /* Convert from ISO socket address type to HCI address type */ - if (dst_type == BDADDR_LE_PUBLIC) - dst_type = ADDR_LE_DEV_PUBLIC; - else - dst_type = ADDR_LE_DEV_RANDOM; - - memset(cp, 0, sizeof(*cp)); cp->sid = sid; cp->addr_type = dst_type; bacpy(&cp->addr, dst); @@ -2118,11 +2126,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, if (IS_ERR(conn)) return conn; - conn = 
hci_bind_bis(conn, qos); - if (!conn) { - hci_conn_drop(conn); - return ERR_PTR(-ENOMEM); - } + hci_bind_bis(conn, qos); /* Add Basic Announcement into Peridic Adv Data if BASE is set */ if (base_len && base) { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0540555b3704..b65c3aabcd53 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2660,7 +2660,7 @@ int hci_register_dev(struct hci_dev *hdev) error = hci_register_suspend_notifier(hdev); if (error) - goto err_wqueue; + BT_WARN("register suspend notifier failed error:%d\n", error); queue_work(hdev->req_workqueue, &hdev->power_on); @@ -2764,7 +2764,8 @@ int hci_register_suspend_notifier(struct hci_dev *hdev) { int ret = 0; - if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { + if (!hdev->suspend_notifier.notifier_call && + !test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { hdev->suspend_notifier.notifier_call = hci_suspend_notifier; ret = register_pm_notifier(&hdev->suspend_notifier); } @@ -2776,8 +2777,11 @@ int hci_unregister_suspend_notifier(struct hci_dev *hdev) { int ret = 0; - if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) + if (hdev->suspend_notifier.notifier_call) { ret = unregister_pm_notifier(&hdev->suspend_notifier); + if (!ret) + hdev->suspend_notifier.notifier_call = NULL; + } return ret; } @@ -3981,7 +3985,7 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; else *req_complete = bt_cb(skb)->hci.req_complete; - kfree_skb(skb); + dev_kfree_skb_irq(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); } diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 3f401ec5bb0c..b7f682922a16 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -757,7 +757,7 @@ static ssize_t force_static_address_write(struct file *file, bool enable; int err; - if (test_bit(HCI_UP, &hdev->flags)) + if (hdev_is_powered(hdev)) return -EBUSY; err = kstrtobool_from_user(user_buf, count, &enable); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index faca701bce2a..ad92a4be5851 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -801,9 +801,6 @@ static u8 hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); - if (rp->status) - return rp->status; - sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO); if (!sent) return rp->status; @@ -811,9 +808,17 @@ static u8 hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); - if (conn) + if (!conn) { + rp->status = 0xff; + goto unlock; + } + + if (!rp->status) conn->auth_payload_timeout = get_unaligned_le16(sent + 2); + hci_encrypt_cfm(conn, 0); + +unlock: hci_dev_unlock(hdev); return rp->status; @@ -3680,8 +3685,13 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, cp.handle = cpu_to_le16(conn->handle); cp.timeout = cpu_to_le16(hdev->auth_payload_timeout); - hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO, - sizeof(cp), &cp); + if (hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO, + sizeof(cp), &cp)) { + bt_dev_err(hdev, "write auth payload timeout failed"); + goto notify; + } + + goto unlock; } notify: @@ -3838,8 +3848,11 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, conn->handle, conn->link); /* Create CIS if LE is already connected */ - if 
(conn->link && conn->link->state == BT_CONNECTED) + if (conn->link && conn->link->state == BT_CONNECTED) { + rcu_read_unlock(); hci_le_create_cis(conn->link); + rcu_read_lock(); + } if (i == rp->num_handles) break; @@ -6494,7 +6507,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, info->length)) break; - evt_type = __le16_to_cpu(info->type); + evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK; legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &info->bdaddr, diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 5a0296a4352e..f7e006a36382 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -269,7 +269,7 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, void hci_req_add(struct hci_request *req, u16 opcode, u32 plen, const void *param) { - bt_dev_err(req->hdev, "HCI_REQ-0x%4.4x", opcode); + bt_dev_dbg(req->hdev, "HCI_REQ-0x%4.4x", opcode); hci_req_add_ev(req, opcode, plen, param, 0); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 76c3107c9f91..117eedb6f709 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -12,6 +12,7 @@ #include <net/bluetooth/mgmt.h> #include "hci_request.h" +#include "hci_codec.h" #include "hci_debugfs.h" #include "smp.h" #include "eir.h" @@ -3054,6 +3055,7 @@ int hci_update_name_sync(struct hci_dev *hdev) * Enable Authentication * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class -> * Set Name -> Set EIR) + * HCI_FORCE_STATIC_ADDR | BDADDR_ANY && !HCI_BREDR_ENABLED (Set Static Address) */ int hci_powered_update_sync(struct hci_dev *hdev) { @@ -3093,6 +3095,23 @@ int hci_powered_update_sync(struct hci_dev *hdev) hci_update_eir_sync(hdev); } + /* If forcing static address is in use or there is no public + * address use the static address as random address (but skip + * the HCI command if the current random address is already the + * static one. + * + * In case BR/EDR has been disabled on a dual-mode controller + * and a static address has been configured, then use that + * address instead of the public BR/EDR address. 
+ */ + if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || + (!bacmp(&hdev->bdaddr, BDADDR_ANY) && + !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))) { + if (bacmp(&hdev->static_addr, BDADDR_ANY)) + return hci_set_random_addr_sync(hdev, + &hdev->static_addr); + } + return 0; } @@ -3553,7 +3572,7 @@ static const struct hci_init_stage hci_init2[] = { static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) { /* Use Read LE Buffer Size V2 if supported */ - if (hdev->commands[41] & 0x20) + if (iso_capable(hdev) && hdev->commands[41] & 0x20) return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE_V2, 0, NULL, HCI_CMD_TIMEOUT); @@ -3578,10 +3597,10 @@ static int hci_le_read_supported_states_sync(struct hci_dev *hdev) /* LE Controller init stage 2 command sequence */ static const struct hci_init_stage le_init2[] = { - /* HCI_OP_LE_READ_BUFFER_SIZE */ - HCI_INIT(hci_le_read_buffer_size_sync), /* HCI_OP_LE_READ_LOCAL_FEATURES */ HCI_INIT(hci_le_read_local_features_sync), + /* HCI_OP_LE_READ_BUFFER_SIZE */ + HCI_INIT(hci_le_read_buffer_size_sync), /* HCI_OP_LE_READ_SUPPORTED_STATES */ HCI_INIT(hci_le_read_supported_states_sync), {} @@ -3780,7 +3799,8 @@ static int hci_read_page_scan_activity_sync(struct hci_dev *hdev) static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev) { if (!(hdev->commands[18] & 0x04) || - !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING)) + !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || + test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING, @@ -4238,11 +4258,12 @@ static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev) /* Read local codec list if the HCI command is supported */ static int hci_read_local_codecs_sync(struct hci_dev *hdev) { - if (!(hdev->commands[29] & 0x20)) - return 0; + if (hdev->commands[45] & 0x04) + hci_read_supported_codecs_v2(hdev); + else if (hdev->commands[29] & 0x20) + hci_read_supported_codecs(hdev); - return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL, - HCI_CMD_TIMEOUT); + return 0; } /* Read local pairing options if the HCI command is supported */ @@ -4258,7 +4279,7 @@ static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev) /* Get MWS transport configuration if the HCI command is supported */ static int hci_get_mws_transport_config_sync(struct hci_dev *hdev) { - if (!(hdev->commands[30] & 0x08)) + if (!mws_transport_config_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_GET_MWS_TRANSPORT_CONFIG, @@ -4298,7 +4319,8 @@ static int hci_set_err_data_report_sync(struct hci_dev *hdev) bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); if (!(hdev->commands[18] & 0x08) || - !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING)) + !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || + test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) return 0; if (enabled == hdev->err_data_reporting) @@ -4457,6 +4479,9 @@ static const struct { HCI_QUIRK_BROKEN(STORED_LINK_KEY, "HCI Delete Stored Link Key command is advertised, " "but not supported."), + HCI_QUIRK_BROKEN(ERR_DATA_REPORTING, + "HCI Read Default Erroneous Data Reporting command is " + "advertised, but not supported."), HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER, "HCI Read Transmit Power Level command is advertised, " "but not supported."), @@ -4696,6 +4721,7 @@ int hci_dev_open_sync(struct hci_dev *hdev) hdev->flush(hdev); if (hdev->sent_cmd) { + cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); 
hdev->sent_cmd = NULL; } @@ -6161,20 +6187,13 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, static int _update_adv_data_sync(struct hci_dev *hdev, void *data) { - u8 instance = *(u8 *)data; - - kfree(data); + u8 instance = PTR_ERR(data); return hci_update_adv_data_sync(hdev, instance); } int hci_update_adv_data(struct hci_dev *hdev, u8 instance) { - u8 *inst_ptr = kmalloc(1, GFP_KERNEL); - - if (!inst_ptr) - return -ENOMEM; - - *inst_ptr = instance; - return hci_cmd_sync_queue(hdev, _update_adv_data_sync, inst_ptr, NULL); + return hci_cmd_sync_queue(hdev, _update_adv_data_sync, + ERR_PTR(instance), NULL); } diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index f825857db6d0..24444b502e58 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -261,36 +261,42 @@ static int iso_connect_bis(struct sock *sk) if (!bis_capable(hdev)) { err = -EOPNOTSUPP; - goto done; + goto unlock; } /* Fail if out PHYs are marked as disabled */ if (!iso_pi(sk)->qos.out.phy) { err = -EINVAL; - goto done; + goto unlock; } - hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->dst_type, + hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, + le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); - goto done; + goto unlock; } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; - goto done; + goto unlock; } - /* Update source addr of the socket */ - bacpy(&iso_pi(sk)->src, &hcon->src); + hci_dev_unlock(hdev); + hci_dev_put(hdev); err = iso_chan_add(conn, sk, NULL); if (err) - goto done; + return err; + + lock_sock(sk); + + /* Update source addr of the socket */ + bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); @@ -300,7 +306,10 @@ static int iso_connect_bis(struct sock *sk) iso_sock_set_timer(sk, sk->sk_sndtimeo); } -done: + release_sock(sk); + return err; + +unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -324,13 +333,13 @@ static int iso_connect_cis(struct sock *sk) if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; - goto done; + goto unlock; } /* Fail if either PHYs are marked as disabled */ if (!iso_pi(sk)->qos.in.phy && !iso_pi(sk)->qos.out.phy) { err = -EINVAL; - goto done; + goto unlock; } /* Just bind if DEFER_SETUP has been set */ @@ -340,7 +349,7 @@ static int iso_connect_cis(struct sock *sk) &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); - goto done; + goto unlock; } } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, @@ -348,7 +357,7 @@ static int iso_connect_cis(struct sock *sk) &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); - goto done; + goto unlock; } } @@ -356,15 +365,20 @@ static int iso_connect_cis(struct sock *sk) if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; - goto done; + goto unlock; } - /* Update source addr of the socket */ - bacpy(&iso_pi(sk)->src, &hcon->src); + hci_dev_unlock(hdev); + hci_dev_put(hdev); err = iso_chan_add(conn, sk, NULL); if (err) - goto done; + return err; + + lock_sock(sk); + + /* Update source addr of the socket */ + bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); @@ -377,7 +391,10 @@ static int iso_connect_cis(struct sock *sk) iso_sock_set_timer(sk, sk->sk_sndtimeo); } -done: + release_sock(sk); + return err; + +unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -831,20 +848,23 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr *addr, 
bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr); iso_pi(sk)->dst_type = sa->iso_bdaddr_type; + release_sock(sk); + if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); if (err) - goto done; + return err; + + lock_sock(sk); if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); } -done: release_sock(sk); return err; } @@ -873,12 +893,11 @@ static int iso_listen_bis(struct sock *sk) if (!hdev) return -EHOSTUNREACH; - hci_dev_lock(hdev); - - err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, iso_pi(sk)->dst_type, + err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, + le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid); - hci_dev_unlock(hdev); + hci_dev_put(hdev); return err; } @@ -1098,28 +1117,22 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iso_pinfo *pi = iso_pi(sk); - int err; BT_DBG("sk %p", sk); - lock_sock(sk); - if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { switch (sk->sk_state) { case BT_CONNECT2: + lock_sock(sk); iso_conn_defer_accept(pi->conn->hcon); sk->sk_state = BT_CONFIG; release_sock(sk); return 0; case BT_CONNECT: - err = iso_connect_cis(sk); - release_sock(sk); - return err; + return iso_connect_cis(sk); } } - release_sock(sk); - return bt_sock_recvmsg(sock, msg, len, flags); } @@ -1414,33 +1427,29 @@ static void iso_conn_ready(struct iso_conn *conn) struct sock *parent; struct sock *sk = conn->sk; struct hci_ev_le_big_sync_estabilished *ev; + struct hci_conn *hcon; BT_DBG("conn %p", conn); if (sk) { iso_sock_ready(conn->sk); } else { - iso_conn_lock(conn); - - if (!conn->hcon) { - iso_conn_unlock(conn); + hcon = conn->hcon; + if (!hcon) return; - } - ev = hci_recv_event_data(conn->hcon->hdev, + ev = hci_recv_event_data(hcon->hdev, HCI_EVT_LE_BIG_SYNC_ESTABILISHED); if (ev) - parent = iso_get_sock_listen(&conn->hcon->src, - &conn->hcon->dst, + parent = iso_get_sock_listen(&hcon->src, + &hcon->dst, iso_match_big, ev); else - parent = iso_get_sock_listen(&conn->hcon->src, + parent = iso_get_sock_listen(&hcon->src, BDADDR_ANY, NULL, NULL); - if (!parent) { - iso_conn_unlock(conn); + if (!parent) return; - } lock_sock(parent); @@ -1448,30 +1457,29 @@ static void iso_conn_ready(struct iso_conn *conn) BTPROTO_ISO, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); - iso_conn_unlock(conn); return; } iso_sock_init(sk, parent); - bacpy(&iso_pi(sk)->src, &conn->hcon->src); - iso_pi(sk)->src_type = conn->hcon->src_type; + bacpy(&iso_pi(sk)->src, &hcon->src); + iso_pi(sk)->src_type = hcon->src_type; /* If hcon has no destination address (BDADDR_ANY) it means it * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED so we need to * initialize using the parent socket destination address. 
*/ - if (!bacmp(&conn->hcon->dst, BDADDR_ANY)) { - bacpy(&conn->hcon->dst, &iso_pi(parent)->dst); - conn->hcon->dst_type = iso_pi(parent)->dst_type; - conn->hcon->sync_handle = iso_pi(parent)->sync_handle; + if (!bacmp(&hcon->dst, BDADDR_ANY)) { + bacpy(&hcon->dst, &iso_pi(parent)->dst); + hcon->dst_type = iso_pi(parent)->dst_type; + hcon->sync_handle = iso_pi(parent)->sync_handle; } - bacpy(&iso_pi(sk)->dst, &conn->hcon->dst); - iso_pi(sk)->dst_type = conn->hcon->dst_type; + bacpy(&iso_pi(sk)->dst, &hcon->dst); + iso_pi(sk)->dst_type = hcon->dst_type; - hci_conn_hold(conn->hcon); - __iso_chan_add(conn, sk, parent); + hci_conn_hold(hcon); + iso_chan_add(conn, sk, parent); if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; @@ -1482,8 +1490,6 @@ static void iso_conn_ready(struct iso_conn *conn) parent->sk_data_ready(parent); release_sock(parent); - - iso_conn_unlock(conn); } } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 9c24947aa41e..adfc3ea06d08 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -45,7 +45,7 @@ #define LE_FLOWCTL_MAX_CREDITS 65535 bool disable_ertm; -bool enable_ecred; +bool enable_ecred = IS_ENABLED(CONFIG_BT_LE_L2CAP_ECRED); static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD; @@ -2683,14 +2683,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (IS_ERR(skb)) return PTR_ERR(skb); - /* Channel lock is released before requesting new skb and then - * reacquired thus we need to recheck channel state. - */ - if (chan->state != BT_CONNECTED) { - kfree_skb(skb); - return -ENOTCONN; - } - l2cap_do_send(chan, skb); return len; } @@ -2735,14 +2727,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (IS_ERR(skb)) return PTR_ERR(skb); - /* Channel lock is released before requesting new skb and then - * reacquired thus we need to recheck channel state. - */ - if (chan->state != BT_CONNECTED) { - kfree_skb(skb); - return -ENOTCONN; - } - l2cap_do_send(chan, skb); err = len; break; @@ -2763,14 +2747,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) */ err = l2cap_segment_sdu(chan, &seg_queue, msg, len); - /* The channel could have been closed while segmenting, - * check that it is still connected. - */ - if (chan->state != BT_CONNECTED) { - __skb_queue_purge(&seg_queue); - err = -ENOTCONN; - } - if (err) break; @@ -4453,7 +4429,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, chan->ident = cmd->ident; l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, len, rsp); - chan->num_conf_rsp++; + if (chan->num_conf_rsp < L2CAP_CONF_MAX_CONF_RSP) + chan->num_conf_rsp++; /* Reset config buffer. */ chan->conf_len = 0; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ca8f07f3542b..eebe256104bc 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1624,6 +1624,14 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan, if (!skb) return ERR_PTR(err); + /* Channel lock is released before requesting new skb and then + * reacquired thus we need to recheck channel state. 
+ */ + if (chan->state != BT_CONNECTED) { + kfree_skb(skb); + return ERR_PTR(-ENOTCONN); + } + skb->priority = sk->sk_priority; bt_cb(skb)->l2cap.chan = chan; diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 469a0c95b6e8..53a796ac078c 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -170,7 +170,7 @@ __u8 bt_status(int err) case -EMLINK: return 0x09; - case EALREADY: + case -EALREADY: return 0x0b; case -EBUSY: @@ -191,7 +191,7 @@ __u8 bt_status(int err) case -ECONNABORTED: return 0x16; - case ELOOP: + case -ELOOP: return 0x17; case -EPROTONOSUPPORT: diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a92e7e485feb..7add66f30e4d 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -859,6 +859,12 @@ static u32 get_supported_settings(struct hci_dev *hdev) hdev->set_bdaddr) settings |= MGMT_SETTING_CONFIGURATION; + if (cis_central_capable(hdev)) + settings |= MGMT_SETTING_CIS_CENTRAL; + + if (cis_peripheral_capable(hdev)) + settings |= MGMT_SETTING_CIS_PERIPHERAL; + settings |= MGMT_SETTING_PHY_CONFIGURATION; return settings; @@ -932,6 +938,12 @@ static u32 get_current_settings(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) settings |= MGMT_SETTING_WIDEBAND_SPEECH; + if (cis_central_capable(hdev)) + settings |= MGMT_SETTING_CIS_CENTRAL; + + if (cis_peripheral_capable(hdev)) + settings |= MGMT_SETTING_CIS_PERIPHERAL; + return settings; } @@ -7373,9 +7385,8 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, /* To avoid client trying to guess when to poll again for information we * calculate conn info age as random value between min/max set in hdev. */ - conn_info_age = hdev->conn_info_min_age + - prandom_u32_max(hdev->conn_info_max_age - - hdev->conn_info_min_age); + conn_info_age = get_random_u32_inclusive(hdev->conn_info_min_age, + hdev->conn_info_max_age - 1); /* Query controller to refresh cached values if they are too old or were * never read. @@ -8859,7 +8870,7 @@ static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev, * extra parameters we don't know about will be ignored in this request. 
*/ if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, MGMT_STATUS_INVALID_PARAMS); flags = __le32_to_cpu(cp->flags); diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index 6a8b7e84293d..bdf978605d5a 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -27,7 +27,7 @@ struct mgmt_mesh_tx { struct sock *sk; u8 handle; u8 instance; - u8 param[sizeof(struct mgmt_cp_mesh_send) + 29]; + u8 param[sizeof(struct mgmt_cp_mesh_send) + 31]; }; struct mgmt_pending_cmd { diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 7324764384b6..053ef8f25fae 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -35,6 +35,8 @@ #include <net/bluetooth/l2cap.h> #include <net/bluetooth/rfcomm.h> +#include <trace/events/sock.h> + #define VERSION "1.11" static bool disable_cfc; @@ -186,6 +188,8 @@ static void rfcomm_l2state_change(struct sock *sk) static void rfcomm_l2data_ready(struct sock *sk) { + trace_sk_data_ready(sk); + BT_DBG("%p", sk); rfcomm_schedule(); } @@ -590,7 +594,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) ret = rfcomm_dlc_send_frag(d, frag); if (ret < 0) { - kfree_skb(frag); + dev_kfree_skb_irq(frag); goto unlock; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 21e24da4847f..4397e14ff560 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -391,6 +391,7 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a addr->sa_family != AF_BLUETOOTH) return -EINVAL; + sock_hold(sk); lock_sock(sk); if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { @@ -410,14 +411,18 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a d->sec_level = rfcomm_pi(sk)->sec_level; d->role_switch = rfcomm_pi(sk)->role_switch; + /* Drop sock lock to avoid potential deadlock with the RFCOMM lock */ + release_sock(sk); err = rfcomm_dlc_open(d, &rfcomm_pi(sk)->src, &sa->rc_bdaddr, sa->rc_channel); - if (!err) + lock_sock(sk); + if (!err && !sock_flag(sk, SOCK_ZAPPED)) err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); done: release_sock(sk); + sock_put(sk); return err; } diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 11f853d0500f..70663229b3cc 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -605,7 +605,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE, iv, 2, 1 + len); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iv, 2, 1 + len); l2cap_chan_send(chan, &msg, 1 + len); diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 2d434c1f4617..ff4f89a2b02a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -124,8 +124,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err < 0) goto out; - set_memory_ro((long)image, 1); - set_memory_x((long)image, 1); + set_memory_rox((long)image, 1); prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args); @@ -155,6 +154,23 @@ static bool bpf_dummy_ops_is_valid_access(int off, int size, return bpf_tracing_btf_ctx_access(off, size, type, prog, info); } +static int bpf_dummy_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + 
u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_dummy_ops, test_sleepable): + break; + default: + if (prog->aux->sleepable) + return -EINVAL; + } + + return 0; +} + static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, @@ -209,6 +225,7 @@ static void bpf_dummy_unreg(void *kdata) struct bpf_struct_ops bpf_bpf_dummy_ops = { .verifier_ops = &bpf_dummy_verifier_ops, .init = bpf_dummy_init, + .check_member = bpf_dummy_ops_check_member, .init_member = bpf_dummy_init_member, .reg = bpf_dummy_reg, .unreg = bpf_dummy_unreg, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6094ef7cffcd..6f3d654b3339 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -234,7 +234,7 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes, int i, n; LIST_HEAD(list); - n = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, (void **)skbs); + n = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, (void **)skbs); if (unlikely(n == 0)) { for (i = 0; i < nframes; i++) xdp_return_frame(frames[i]); @@ -396,10 +396,12 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { run_ctx.prog_item = &item; + local_bh_disable(); if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = bpf_prog_run(prog, ctx); + local_bh_enable(); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); @@ -484,12 +486,11 @@ out: __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in vmlinux BTF"); -int noinline bpf_fentry_test1(int a) +__bpf_kfunc int bpf_fentry_test1(int a) { return a + 1; } EXPORT_SYMBOL_GPL(bpf_fentry_test1); -ALLOW_ERROR_INJECTION(bpf_fentry_test1, ERRNO); int noinline bpf_fentry_test2(int a, u64 b) { @@ -530,27 +531,35 @@ int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) return (long)arg->a; } -int noinline bpf_modify_return_test(int a, int *b) +__bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; return a + *b; } -u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) +__bpf_kfunc u64 bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) { return a + b + c + d; } -int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b) +__bpf_kfunc int bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b) { return a + b; } -struct sock * noinline bpf_kfunc_call_test3(struct sock *sk) +__bpf_kfunc struct sock *bpf_kfunc_call_test3(struct sock *sk) { return sk; } +long noinline bpf_kfunc_call_test4(signed char a, short b, int c, long d) +{ + /* Provoke the compiler to assume that the caller has sign-extended a, + * b and c on platforms where this is required (e.g. s390x). 
+ */ + return (long)a + (long)b + (long)c + d; +} + struct prog_test_member1 { int a; }; @@ -575,21 +584,21 @@ static struct prog_test_ref_kfunc prog_test_struct = { .cnt = REFCOUNT_INIT(1), }; -noinline struct prog_test_ref_kfunc * +__bpf_kfunc struct prog_test_ref_kfunc * bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) { refcount_inc(&prog_test_struct.cnt); return &prog_test_struct; } -noinline struct prog_test_member * +__bpf_kfunc struct prog_test_member * bpf_kfunc_call_memb_acquire(void) { WARN_ON_ONCE(1); return NULL; } -noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) +__bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) { if (!p) return; @@ -597,11 +606,11 @@ noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) refcount_dec(&p->cnt); } -noinline void bpf_kfunc_call_memb_release(struct prog_test_member *p) +__bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p) { } -noinline void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p) +__bpf_kfunc void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p) { WARN_ON_ONCE(1); } @@ -614,12 +623,14 @@ static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const i return (int *)p; } -noinline int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) +__bpf_kfunc int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, + const int rdwr_buf_size) { return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size); } -noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) +__bpf_kfunc int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, + const int rdonly_buf_size) { return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); } @@ -629,16 +640,17 @@ noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, * Acquire functions must return struct pointers, so these ones are * failing. 
*/ -noinline int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) +__bpf_kfunc int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, + const int rdonly_buf_size) { return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); } -noinline void bpf_kfunc_call_int_mem_release(int *p) +__bpf_kfunc void bpf_kfunc_call_int_mem_release(int *p) { } -noinline struct prog_test_ref_kfunc * +__bpf_kfunc struct prog_test_ref_kfunc * bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **pp, int a, int b) { struct prog_test_ref_kfunc *p = READ_ONCE(*pp); @@ -687,58 +699,72 @@ struct prog_test_fail3 { char arr2[]; }; -noinline void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) +__bpf_kfunc void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) +{ +} + +__bpf_kfunc void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) { } -noinline void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) +__bpf_kfunc void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) { } -noinline void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) +__bpf_kfunc void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p) { } -noinline void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p) +__bpf_kfunc void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p) { } -noinline void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p) +__bpf_kfunc void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p) { } -noinline void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p) +__bpf_kfunc void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz) { } -noinline void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz) +__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len) { } -noinline void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len) +__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len) { } -noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len) +__bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) { } -noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) +__bpf_kfunc void bpf_kfunc_call_test_destructive(void) { } -noinline void bpf_kfunc_call_test_destructive(void) +__bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused) { + return arg; } __diag_pop(); -ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); +BTF_SET8_START(bpf_test_modify_return_ids) +BTF_ID_FLAGS(func, bpf_modify_return_test) +BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) +BTF_SET8_END(bpf_test_modify_return_ids) + +static const struct btf_kfunc_id_set bpf_test_modify_return_set = { + .owner = THIS_MODULE, + .set = &bpf_test_modify_return_ids, +}; BTF_SET8_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) BTF_ID_FLAGS(func, bpf_kfunc_call_test2) BTF_ID_FLAGS(func, bpf_kfunc_call_test3) +BTF_ID_FLAGS(func, bpf_kfunc_call_test4) BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) @@ -760,6 +786,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_SET8_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 
user_size, @@ -1128,7 +1155,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, } sock_init_data(NULL, sk); - skb = build_skb(data, 0); + skb = slab_build_skb(data); if (!skb) { kfree(data); kfree(ctx); @@ -1293,6 +1320,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) return -EINVAL; + if (bpf_prog_is_dev_bound(prog->aux)) + return -EINVAL; + if (do_live) { if (!batch_size) batch_size = NAPI_POLL_WEIGHT; @@ -1666,7 +1696,8 @@ static int __init bpf_prog_test_run_init(void) }; int ret; - ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); + ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 228fd5b20f10..24f01ff113f0 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -262,14 +262,14 @@ static void release_nbp(struct kobject *kobj) kfree(p); } -static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) +static void brport_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { struct net_bridge_port *p = kobj_to_brport(kobj); net_ns_get_ownership(dev_net(p->dev), uid, gid); } -static struct kobj_type brport_ktype = { +static const struct kobj_type brport_ktype = { #ifdef CONFIG_SYSFS .sysfs_ops = &brport_sysfs_ops, #endif diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index ae7d93c08880..25c48d81a597 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -259,7 +259,7 @@ static int __mdb_fill_info(struct sk_buff *skb, #endif } else { ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); - e.state = MDB_PG_FLAGS_PERMANENT; + e.state = MDB_PERMANENT; } e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, @@ -421,8 +421,6 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); - cb->seq = net->dev_base_seq; - for_each_netdev_rcu(net, dev) { if (netif_is_bridge_master(dev)) { struct net_bridge *br = netdev_priv(dev); @@ -663,51 +661,80 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } -static bool is_valid_mdb_entry(struct br_mdb_entry *entry, - struct netlink_ext_ack *extack) +static const struct nla_policy +br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = { + [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY, + sizeof(struct in_addr), + sizeof(struct in6_addr)), +}; + +static const struct nla_policy +br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = { + [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol), +}; + +static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { + [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, + sizeof(struct in_addr), + sizeof(struct in6_addr)), + [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, + MCAST_INCLUDE), + [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), + [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), +}; + +static int validate_mdb_entry(const struct nlattr *attr, + struct netlink_ext_ack *extack) { + struct br_mdb_entry *entry = nla_data(attr); + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + 
NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length"); + return -EINVAL; + } + if (entry->ifindex == 0) { NL_SET_ERR_MSG_MOD(extack, "Zero entry ifindex is not allowed"); - return false; + return -EINVAL; } if (entry->addr.proto == htons(ETH_P_IP)) { if (!ipv4_is_multicast(entry->addr.u.ip4)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is not multicast"); - return false; + return -EINVAL; } if (ipv4_is_local_multicast(entry->addr.u.ip4)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is local multicast"); - return false; + return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) } else if (entry->addr.proto == htons(ETH_P_IPV6)) { if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 entry group address is link-local all nodes"); - return false; + return -EINVAL; } #endif } else if (entry->addr.proto == 0) { /* L2 mdb */ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { NL_SET_ERR_MSG_MOD(extack, "L2 entry group is not multicast"); - return false; + return -EINVAL; } } else { NL_SET_ERR_MSG_MOD(extack, "Unknown entry protocol"); - return false; + return -EINVAL; } if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { NL_SET_ERR_MSG_MOD(extack, "Unknown entry state"); - return false; + return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG_MOD(extack, "Invalid entry VLAN id"); - return false; + return -EINVAL; } - return true; + return 0; } static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, @@ -748,12 +775,6 @@ static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, return true; } -static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { - [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, - sizeof(struct in_addr), - sizeof(struct in6_addr)), -}; - static struct net_bridge_mcast * __br_mdb_choose_context(struct net_bridge *br, const struct br_mdb_entry *entry, @@ -786,21 +807,317 @@ out: return brmctx; } +static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + unsigned char flags) +{ + unsigned long now = jiffies; + + pg->flags = flags; + pg->rt_protocol = cfg->rt_protocol; + if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) + mod_timer(&pg->timer, + now + brmctx->multicast_membership_interval); + else + del_timer(&pg->timer); + + br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); + + return 0; +} + +static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, + struct net_bridge_mdb_entry *mp, + struct net_bridge_mcast *brmctx, + unsigned char flags, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + unsigned long now = jiffies; + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, cfg->br)) != NULL; + pp = &p->next) { + if (p->key.port == cfg->p) { + if (!(cfg->nlflags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port"); + return -EEXIST; + } + return br_mdb_replace_group_sg(cfg, mp, p, brmctx, + flags); + } + if ((unsigned long)p->key.port < (unsigned long)cfg->p) + break; + } + + p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, + MCAST_INCLUDE, cfg->rt_protocol, extack); + if (unlikely(!p)) + return -ENOMEM; + + rcu_assign_pointer(*pp, p); + if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); + br_mdb_notify(cfg->br->dev, mp, p, 
RTM_NEWMDB); + + /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for + * proper replication. + */ + if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) { + struct net_bridge_mdb_entry *star_mp; + struct br_ip star_group; + + star_group = p->key.addr; + memset(&star_group.src, 0, sizeof(star_group.src)); + star_mp = br_mdb_ip_get(cfg->br, &star_group); + if (star_mp) + br_multicast_sg_add_exclude_ports(star_mp, p); + } + + return 0; +} + +static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, + struct br_ip *src_ip, + struct net_bridge_mcast *brmctx, + struct netlink_ext_ack *extack) +{ + struct net_bridge_mdb_entry *sgmp; + struct br_mdb_config sg_cfg; + struct br_ip sg_ip; + u8 flags = 0; + + sg_ip = cfg->group; + sg_ip.src = src_ip->src; + sgmp = br_multicast_new_group(cfg->br, &sg_ip); + if (IS_ERR(sgmp)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry"); + return PTR_ERR(sgmp); + } + + if (cfg->entry->state == MDB_PERMANENT) + flags |= MDB_PG_FLAGS_PERMANENT; + if (cfg->filter_mode == MCAST_EXCLUDE) + flags |= MDB_PG_FLAGS_BLOCKED; + + memset(&sg_cfg, 0, sizeof(sg_cfg)); + sg_cfg.br = cfg->br; + sg_cfg.p = cfg->p; + sg_cfg.entry = cfg->entry; + sg_cfg.group = sg_ip; + sg_cfg.src_entry = true; + sg_cfg.filter_mode = MCAST_INCLUDE; + sg_cfg.rt_protocol = cfg->rt_protocol; + sg_cfg.nlflags = cfg->nlflags; + return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); +} + +static int br_mdb_add_group_src(const struct br_mdb_config *cfg, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + struct br_mdb_src_entry *src, + struct netlink_ext_ack *extack) +{ + struct net_bridge_group_src *ent; + unsigned long now = jiffies; + int err; + + ent = br_multicast_find_group_src(pg, &src->addr); + if (!ent) { + ent = br_multicast_new_group_src(pg, &src->addr); + if (!ent) { + NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry"); + return -ENOSPC; + } + } else if (!(cfg->nlflags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "Source entry already exists"); + return -EEXIST; + } + + if (cfg->filter_mode == MCAST_INCLUDE && + cfg->entry->state == MDB_TEMPORARY) + mod_timer(&ent->timer, now + br_multicast_gmi(brmctx)); + else + del_timer(&ent->timer); + + /* Install a (S, G) forwarding entry for the source. 
*/ + err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack); + if (err) + goto err_del_sg; + + ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED; + + return 0; + +err_del_sg: + __br_multicast_del_group_src(ent); + return err; +} + +static void br_mdb_del_group_src(struct net_bridge_port_group *pg, + struct br_mdb_src_entry *src) +{ + struct net_bridge_group_src *ent; + + ent = br_multicast_find_group_src(pg, &src->addr); + if (WARN_ON_ONCE(!ent)) + return; + br_multicast_del_group_src(ent, false); +} + +static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + struct netlink_ext_ack *extack) +{ + int i, err; + + for (i = 0; i < cfg->num_src_entries; i++) { + err = br_mdb_add_group_src(cfg, pg, brmctx, + &cfg->src_entries[i], extack); + if (err) + goto err_del_group_srcs; + } + + return 0; + +err_del_group_srcs: + for (i--; i >= 0; i--) + br_mdb_del_group_src(pg, &cfg->src_entries[i]); + return err; +} + +static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + struct netlink_ext_ack *extack) +{ + struct net_bridge_group_src *ent; + struct hlist_node *tmp; + int err; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags |= BR_SGRP_F_DELETE; + + err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack); + if (err) + goto err_clear_delete; + + hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) { + if (ent->flags & BR_SGRP_F_DELETE) + br_multicast_del_group_src(ent, false); + } + + return 0; + +err_clear_delete: + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags &= ~BR_SGRP_F_DELETE; + return err; +} + +static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + unsigned char flags, + struct netlink_ext_ack *extack) +{ + unsigned long now = jiffies; + int err; + + err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack); + if (err) + return err; + + pg->flags = flags; + pg->filter_mode = cfg->filter_mode; + pg->rt_protocol = cfg->rt_protocol; + if (!(flags & MDB_PG_FLAGS_PERMANENT) && + cfg->filter_mode == MCAST_EXCLUDE) + mod_timer(&pg->timer, + now + brmctx->multicast_membership_interval); + else + del_timer(&pg->timer); + + br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); + + if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) + br_multicast_star_g_handle_mode(pg, cfg->filter_mode); + + return 0; +} + +static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, + struct net_bridge_mdb_entry *mp, + struct net_bridge_mcast *brmctx, + unsigned char flags, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + unsigned long now = jiffies; + int err; + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, cfg->br)) != NULL; + pp = &p->next) { + if (p->key.port == cfg->p) { + if (!(cfg->nlflags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port"); + return -EEXIST; + } + return br_mdb_replace_group_star_g(cfg, mp, p, brmctx, + flags, extack); + } + if ((unsigned long)p->key.port < (unsigned long)cfg->p) + break; + } + + p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, + cfg->filter_mode, cfg->rt_protocol, + extack); + if (unlikely(!p)) + return -ENOMEM; + + err = br_mdb_add_group_srcs(cfg, p, brmctx, extack); + if (err) + goto 
err_del_port_group; + + rcu_assign_pointer(*pp, p); + if (!(flags & MDB_PG_FLAGS_PERMANENT) && + cfg->filter_mode == MCAST_EXCLUDE) + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); + br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); + /* If we are adding a new EXCLUDE port group (*, G), it needs to be + * also added to all (S, G) entries for proper replication. + */ + if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) && + cfg->filter_mode == MCAST_EXCLUDE) + br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); + + return 0; + +err_del_port_group: + br_multicast_del_port_group(p); + return err; +} + static int br_mdb_add_group(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { - struct net_bridge_mdb_entry *mp, *star_mp; - struct net_bridge_port_group __rcu **pp; struct br_mdb_entry *entry = cfg->entry; struct net_bridge_port *port = cfg->p; + struct net_bridge_mdb_entry *mp; struct net_bridge *br = cfg->br; - struct net_bridge_port_group *p; struct net_bridge_mcast *brmctx; struct br_ip group = cfg->group; - unsigned long now = jiffies; unsigned char flags = 0; - struct br_ip star_group; - u8 filter_mode; brmctx = __br_mdb_choose_context(br, entry, extack); if (!brmctx) @@ -823,55 +1140,13 @@ static int br_mdb_add_group(const struct br_mdb_config *cfg, return 0; } - for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; - pp = &p->next) { - if (p->key.port == port) { - NL_SET_ERR_MSG_MOD(extack, "Group is already joined by port"); - return -EEXIST; - } - if ((unsigned long)p->key.port < (unsigned long)port) - break; - } - - filter_mode = br_multicast_is_star_g(&group) ? MCAST_EXCLUDE : - MCAST_INCLUDE; - if (entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; - p = br_multicast_new_port_group(port, &group, *pp, flags, NULL, - filter_mode, RTPROT_STATIC); - if (unlikely(!p)) { - NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); - return -ENOMEM; - } - rcu_assign_pointer(*pp, p); - if (entry->state == MDB_TEMPORARY) - mod_timer(&p->timer, - now + brmctx->multicast_membership_interval); - br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); - /* if we are adding a new EXCLUDE port group (*,G) it needs to be also - * added to all S,G entries for proper replication, if we are adding - * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be - * added to it for proper replication - */ - if (br_multicast_should_handle_mode(brmctx, group.proto)) { - switch (filter_mode) { - case MCAST_EXCLUDE: - br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); - break; - case MCAST_INCLUDE: - star_group = p->key.addr; - memset(&star_group.src, 0, sizeof(star_group.src)); - star_mp = br_mdb_ip_get(br, &star_group); - if (star_mp) - br_multicast_sg_add_exclude_ports(star_mp, p); - break; - } - } - - return 0; + if (br_multicast_is_star_g(&group)) + return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack); + else + return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack); } static int __br_mdb_add(const struct br_mdb_config *cfg, @@ -886,6 +1161,76 @@ static int __br_mdb_add(const struct br_mdb_config *cfg, return ret; } +static int br_mdb_config_src_entry_init(struct nlattr *src_entry, + struct br_mdb_src_entry *src, + __be16 proto, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[MDBE_SRCATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry, + br_mdbe_src_list_entry_pol, extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS)) + return 
-EINVAL; + + if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack)) + return -EINVAL; + + src->addr.proto = proto; + nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS], + nla_len(tb[MDBE_SRCATTR_ADDRESS])); + + return 0; +} + +static int br_mdb_config_src_list_init(struct nlattr *src_list, + struct br_mdb_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nlattr *src_entry; + int rem, err; + int i = 0; + + nla_for_each_nested(src_entry, src_list, rem) + cfg->num_src_entries++; + + if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)", + PG_SRC_ENT_LIMIT - 1); + return -EINVAL; + } + + cfg->src_entries = kcalloc(cfg->num_src_entries, + sizeof(struct br_mdb_src_entry), GFP_KERNEL); + if (!cfg->src_entries) + return -ENOMEM; + + nla_for_each_nested(src_entry, src_list, rem) { + err = br_mdb_config_src_entry_init(src_entry, + &cfg->src_entries[i], + cfg->entry->addr.proto, + extack); + if (err) + goto err_src_entry_init; + i++; + } + + return 0; + +err_src_entry_init: + kfree(cfg->src_entries); + return err; +} + +static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg) +{ + kfree(cfg->src_entries); +} + static int br_mdb_config_attrs_init(struct nlattr *set_attrs, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) @@ -905,9 +1250,63 @@ static int br_mdb_config_attrs_init(struct nlattr *set_attrs, __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs); + if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(&cfg->group)) { + NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries"); + return -EINVAL; + } + cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]); + } else { + cfg->filter_mode = MCAST_EXCLUDE; + } + + if (mdb_attrs[MDBE_ATTR_SRC_LIST]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(&cfg->group)) { + NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries"); + return -EINVAL; + } + if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) { + NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode"); + return -EINVAL; + } + err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST], + cfg, extack); + if (err) + return err; + } + + if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list"); + return -EINVAL; + } + + if (mdb_attrs[MDBE_ATTR_RTPROT]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups"); + return -EINVAL; + } + cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]); + } + return 0; } +static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = { + [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 }, + [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + validate_mdb_entry, + sizeof(struct br_mdb_entry)), + [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) @@ -918,11 +1317,14 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, int err; err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, - MDBA_SET_ENTRY_MAX, NULL, extack); + MDBA_SET_ENTRY_MAX, 
mdba_policy, extack); if (err) return err; memset(cfg, 0, sizeof(*cfg)); + cfg->filter_mode = MCAST_EXCLUDE; + cfg->rt_protocol = RTPROT_STATIC; + cfg->nlflags = nlh->nlmsg_flags; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { @@ -957,14 +1359,8 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute"); return -EINVAL; } - if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) { - NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length"); - return -EINVAL; - } cfg->entry = nla_data(tb[MDBA_SET_ENTRY]); - if (!is_valid_mdb_entry(cfg->entry, extack)) - return -EINVAL; if (cfg->entry->ifindex != cfg->br->dev->ifindex) { struct net_device *pdev; @@ -996,6 +1392,11 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, return 0; } +static void br_mdb_config_fini(struct br_mdb_config *cfg) +{ + br_mdb_config_src_list_fini(cfg); +} + static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1009,28 +1410,29 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + err = -EINVAL; /* host join errors which can happen before creating the group */ if (!cfg.p && !br_group_is_l2(&cfg.group)) { /* don't allow any flags for host-joined IP groups */ if (cfg.entry->state) { NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); - return -EINVAL; + goto out; } if (!br_multicast_is_star_g(&cfg.group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); - return -EINVAL; + goto out; } } if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); - return -EINVAL; + goto out; } if (cfg.p) { if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); - return -EINVAL; + goto out; } vg = nbp_vlan_group(cfg.p); } else { @@ -1052,6 +1454,8 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = __br_mdb_add(&cfg, extack); } +out: + br_mdb_config_fini(&cfg); return err; } @@ -1127,6 +1531,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, err = __br_mdb_del(&cfg); } + br_mdb_config_fini(&cfg); return err; } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index db4c3900ae95..96d1fc78dd39 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -31,6 +31,7 @@ #include <net/ip6_checksum.h> #include <net/addrconf.h> #endif +#include <trace/events/bridge.h> #include "br_private.h" #include "br_private_mcast_eht.h" @@ -234,6 +235,29 @@ out: return pmctx; } +static struct net_bridge_mcast_port * +br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid) +{ + struct net_bridge_mcast_port *pmctx = NULL; + struct net_bridge_vlan *vlan; + + lockdep_assert_held_once(&port->br->multicast_lock); + + if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return NULL; + + /* Take RCU to access the vlan. 
*/ + rcu_read_lock(); + + vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid); + if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) + pmctx = &vlan->port_mcast_ctx; + + rcu_read_unlock(); + + return pmctx; +} + /* when snooping we need to check if the contexts should be used * in the following order: * - if pmctx is non-NULL (port), check if it should be used @@ -552,7 +576,8 @@ static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src, continue; if (p->rt_protocol != RTPROT_KERNEL && - (p->flags & MDB_PG_FLAGS_PERMANENT)) + (p->flags & MDB_PG_FLAGS_PERMANENT) && + !(src->flags & BR_SGRP_F_USER_ADDED)) break; if (fastleave) @@ -605,7 +630,7 @@ static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) WARN_ON(!hlist_unhashed(&mp->mdb_node)); WARN_ON(mp->ports); - del_timer_sync(&mp->timer); + timer_shutdown_sync(&mp->timer); kfree_rcu(mp, rcu); } @@ -646,22 +671,122 @@ static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc) src = container_of(gc, struct net_bridge_group_src, mcast_gc); WARN_ON(!hlist_unhashed(&src->node)); - del_timer_sync(&src->timer); + timer_shutdown_sync(&src->timer); kfree_rcu(src, rcu); } -void br_multicast_del_group_src(struct net_bridge_group_src *src, - bool fastleave) +void __br_multicast_del_group_src(struct net_bridge_group_src *src) { struct net_bridge *br = src->pg->key.port->br; - br_multicast_fwd_src_remove(src, fastleave); hlist_del_init_rcu(&src->node); src->pg->src_ents--; hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); } +void br_multicast_del_group_src(struct net_bridge_group_src *src, + bool fastleave) +{ + br_multicast_fwd_src_remove(src, fastleave); + __br_multicast_del_group_src(src); +} + +static int +br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx, + struct netlink_ext_ack *extack, + const char *what) +{ + u32 max = READ_ONCE(pmctx->mdb_max_entries); + u32 n = READ_ONCE(pmctx->mdb_n_entries); + + if (max && n >= max) { + NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u", + what, n, max); + return -E2BIG; + } + + WRITE_ONCE(pmctx->mdb_n_entries, n + 1); + return 0; +} + +static void br_multicast_port_ngroups_dec_one(struct net_bridge_mcast_port *pmctx) +{ + u32 n = READ_ONCE(pmctx->mdb_n_entries); + + WARN_ON_ONCE(n == 0); + WRITE_ONCE(pmctx->mdb_n_entries, n - 1); +} + +static int br_multicast_port_ngroups_inc(struct net_bridge_port *port, + const struct br_ip *group, + struct netlink_ext_ack *extack) +{ + struct net_bridge_mcast_port *pmctx; + int err; + + lockdep_assert_held_once(&port->br->multicast_lock); + + /* Always count on the port context. */ + err = br_multicast_port_ngroups_inc_one(&port->multicast_ctx, extack, + "Port"); + if (err) { + trace_br_mdb_full(port->dev, group); + return err; + } + + /* Only count on the VLAN context if VID is given, and if snooping on + * that VLAN is enabled. 
+ */ + if (!group->vid) + return 0; + + pmctx = br_multicast_port_vid_to_port_ctx(port, group->vid); + if (!pmctx) + return 0; + + err = br_multicast_port_ngroups_inc_one(pmctx, extack, "Port-VLAN"); + if (err) { + trace_br_mdb_full(port->dev, group); + goto dec_one_out; + } + + return 0; + +dec_one_out: + br_multicast_port_ngroups_dec_one(&port->multicast_ctx); + return err; +} + +static void br_multicast_port_ngroups_dec(struct net_bridge_port *port, u16 vid) +{ + struct net_bridge_mcast_port *pmctx; + + lockdep_assert_held_once(&port->br->multicast_lock); + + if (vid) { + pmctx = br_multicast_port_vid_to_port_ctx(port, vid); + if (pmctx) + br_multicast_port_ngroups_dec_one(pmctx); + } + br_multicast_port_ngroups_dec_one(&port->multicast_ctx); +} + +u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx) +{ + return READ_ONCE(pmctx->mdb_n_entries); +} + +void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max) +{ + WRITE_ONCE(pmctx->mdb_max_entries, max); +} + +u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx) +{ + return READ_ONCE(pmctx->mdb_max_entries); +} + static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc) { struct net_bridge_port_group *pg; @@ -670,8 +795,8 @@ static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc) WARN_ON(!hlist_unhashed(&pg->mglist)); WARN_ON(!hlist_empty(&pg->src_list)); - del_timer_sync(&pg->rexmit_timer); - del_timer_sync(&pg->timer); + timer_shutdown_sync(&pg->rexmit_timer); + timer_shutdown_sync(&pg->timer); kfree_rcu(pg, rcu); } @@ -696,6 +821,7 @@ void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, } else { br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); } + br_multicast_port_ngroups_dec(pg->key.port, pg->key.addr.vid); hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); @@ -1159,6 +1285,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, return mp; if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { + trace_br_mdb_full(br->dev, group); br_mc_disabled_update(br->dev, false, NULL); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); return ERR_PTR(-E2BIG); @@ -1232,7 +1359,7 @@ br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip) return NULL; } -static struct net_bridge_group_src * +struct net_bridge_group_src * br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip) { struct net_bridge_group_src *grp_src; @@ -1278,14 +1405,22 @@ struct net_bridge_port_group *br_multicast_new_port_group( unsigned char flags, const unsigned char *src, u8 filter_mode, - u8 rt_protocol) + u8 rt_protocol, + struct netlink_ext_ack *extack) { struct net_bridge_port_group *p; + int err; - p = kzalloc(sizeof(*p), GFP_ATOMIC); - if (unlikely(!p)) + err = br_multicast_port_ngroups_inc(port, group, extack); + if (err) return NULL; + p = kzalloc(sizeof(*p), GFP_ATOMIC); + if (unlikely(!p)) { + NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); + goto dec_out; + } + p->key.addr = *group; p->key.port = port; p->flags = flags; @@ -1299,8 +1434,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( if (!br_multicast_is_star_g(group) && rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode, br_sg_port_rht_params)) { - kfree(p); - return NULL; + NL_SET_ERR_MSG_MOD(extack, "Couldn't insert new port group"); + goto free_out; } rcu_assign_pointer(p->next, next); @@ -1314,6 +1449,25 @@ struct 
net_bridge_port_group *br_multicast_new_port_group( eth_broadcast_addr(p->eth_addr); return p; + +free_out: + kfree(p); +dec_out: + br_multicast_port_ngroups_dec(port, group->vid); + return NULL; +} + +void br_multicast_del_port_group(struct net_bridge_port_group *p) +{ + struct net_bridge_port *port = p->key.port; + __u16 vid = p->key.addr.vid; + + hlist_del_init(&p->mglist); + if (!br_multicast_is_star_g(&p->key.addr)) + rhashtable_remove_fast(&port->br->sg_port_tbl, &p->rhnode, + br_sg_port_rht_params); + kfree(p); + br_multicast_port_ngroups_dec(port, vid); } void br_multicast_host_join(const struct net_bridge_mcast *brmctx, @@ -1381,7 +1535,7 @@ __br_multicast_add_group(struct net_bridge_mcast *brmctx, } p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src, - filter_mode, RTPROT_KERNEL); + filter_mode, RTPROT_KERNEL, NULL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); goto out; @@ -1927,6 +2081,25 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) br_ip4_multicast_add_router(brmctx, pmctx); br_ip6_multicast_add_router(brmctx, pmctx); } + + if (br_multicast_port_ctx_is_vlan(pmctx)) { + struct net_bridge_port_group *pg; + u32 n = 0; + + /* The mcast_n_groups counter might be wrong. First, + * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries + * are flushed, thus mcast_n_groups after the toggle does not + * reflect the true values. And second, permanent entries added + * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected + * either. Thus we have to refresh the counter. + */ + + hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) { + if (pg->key.addr.vid == pmctx->vlan->vid) + n++; + } + WRITE_ONCE(pmctx->mdb_n_entries, n); + } } void br_multicast_enable_port(struct net_bridge_port *port) diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c index f91c071d1608..c126aa4e7551 100644 --- a/net/bridge/br_multicast_eht.c +++ b/net/bridge/br_multicast_eht.c @@ -142,7 +142,7 @@ static void br_multicast_destroy_eht_set_entry(struct net_bridge_mcast_gc *gc) set_h = container_of(gc, struct net_bridge_group_eht_set_entry, mcast_gc); WARN_ON(!RB_EMPTY_NODE(&set_h->rb_node)); - del_timer_sync(&set_h->timer); + timer_shutdown_sync(&set_h->timer); kfree(set_h); } @@ -154,7 +154,7 @@ static void br_multicast_destroy_eht_set(struct net_bridge_mcast_gc *gc) WARN_ON(!RB_EMPTY_NODE(&eht_set->rb_node)); WARN_ON(!RB_EMPTY_ROOT(&eht_set->entry_tree)); - del_timer_sync(&eht_set->timer); + timer_shutdown_sync(&eht_set->timer); kfree(eht_set); } diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index f20f4373ff40..638a4d5359db 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -214,7 +214,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; - len = ntohs(iph->tot_len); + len = skb_ip_totlen(skb); if (skb->len < len) { __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; @@ -871,6 +871,7 @@ static unsigned int ip_sabotage_in(void *priv, if (nf_bridge && !nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev) && !netif_is_l3_slave(skb->dev)) { + nf_bridge_info_free(skb); state->okfn(state->net, state->sk, skb); return NF_STOLEN; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4316cc82ae17..9173e52b89e2 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -202,6 +202,8 @@ static inline size_t br_port_info_size(void) + 
nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_N_GROUPS */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_MAX_GROUPS */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ @@ -298,7 +300,11 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, p->multicast_eht_hosts_limit) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, - p->multicast_eht_hosts_cnt)) + p->multicast_eht_hosts_cnt) || + nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS, + br_multicast_ngroups_get(&p->multicast_ctx)) || + nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS, + br_multicast_ngroups_get_max(&p->multicast_ctx))) return -EMSGSIZE; #endif @@ -858,6 +864,8 @@ static int br_afspec(struct net_bridge *br, } static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { + [IFLA_BRPORT_UNSPEC] = { .strict_start_type = + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 }, [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, [IFLA_BRPORT_COST] = { .type = NLA_U32 }, [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, @@ -881,6 +889,8 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MAB] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, + [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, + [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -1015,6 +1025,13 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], if (err) return err; } + + if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) { + u32 max_groups; + + max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]); + br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups); + } #endif if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index 8914290c75d4..17abf092f7ca 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -188,6 +188,9 @@ initvars: } static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = { + [IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC] = { + .strict_start_type = IFLA_BRIDGE_VLAN_TUNNEL_FLAGS + 1 + }, [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 }, [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 }, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3997e16c15fc..cef5f6ea850c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -93,11 +93,21 @@ struct bridge_mcast_stats { struct u64_stats_sync syncp; }; +struct br_mdb_src_entry { + struct br_ip addr; +}; + struct br_mdb_config { struct net_bridge *br; struct net_bridge_port *p; struct br_mdb_entry *entry; struct br_ip group; + bool src_entry; + u8 filter_mode; + u16 nlflags; + struct br_mdb_src_entry *src_entries; + int num_src_entries; + u8 rt_protocol; }; #endif @@ -116,6 +126,8 @@ struct net_bridge_mcast_port { struct hlist_node ip6_rlist; #endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; + u32 mdb_n_entries; + u32 mdb_max_entries; #endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ }; @@ -300,6 +312,7 @@ struct net_bridge_fdb_flush_desc { #define BR_SGRP_F_DELETE BIT(0) #define BR_SGRP_F_SEND BIT(1) #define 
BR_SGRP_F_INSTALLED BIT(2) +#define BR_SGRP_F_USER_ADDED BIT(3) struct net_bridge_mcast_gc { struct hlist_node gc_node; @@ -945,7 +958,9 @@ br_multicast_new_port_group(struct net_bridge_port *port, const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, - u8 filter_mode, u8 rt_protocol); + u8 filter_mode, u8 rt_protocol, + struct netlink_ext_ack *extack); +void br_multicast_del_port_group(struct net_bridge_port_group *p); int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, @@ -963,6 +978,9 @@ void br_multicast_uninit_stats(struct net_bridge *br); void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest); +u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx); +void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max); +u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx); void br_mdb_init(void); void br_mdb_uninit(void); void br_multicast_host_join(const struct net_bridge_mcast *brmctx, @@ -974,6 +992,10 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg); struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); +struct net_bridge_group_src * +br_multicast_new_group_src(struct net_bridge_port_group *pg, + struct br_ip *src_ip); +void __br_multicast_del_group_src(struct net_bridge_group_src *src); void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave); void br_multicast_ctx_init(struct net_bridge *br, @@ -1742,7 +1764,8 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); -bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v); +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, + const struct net_bridge_port *p); size_t br_vlan_opts_nl_size(void); int br_vlan_process_options(const struct net_bridge *br, const struct net_bridge_port *p, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 7eb6fd5bb917..de18e9c1d7a7 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -104,9 +104,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, return 0; if (err) { - if (extack && !extack->_msg) - NL_SET_ERR_MSG_MOD(extack, - "bridge flag offload is not supported"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "bridge flag offload is not supported"); return -EOPNOTSUPP; } @@ -115,9 +114,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, err = switchdev_port_attr_set(p->dev, &attr, extack); if (err) { - if (extack && !extack->_msg) - NL_SET_ERR_MSG_MOD(extack, - "error setting offload flag on port"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "error setting offload flag on port"); return err; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index bc75fa1e4666..8a3dbc09ba38 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1816,6 +1816,7 @@ out_err: /* v_opts is used to dump the options which must be equal in the whole range */ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts, + const struct net_bridge_port *p, u16 flags, bool 
dump_stats) { @@ -1842,7 +1843,7 @@ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, goto out_err; if (v_opts) { - if (!br_vlan_opts_fill(skb, v_opts)) + if (!br_vlan_opts_fill(skb, v_opts, p)) goto out_err; if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) @@ -1925,7 +1926,7 @@ void br_vlan_notify(const struct net_bridge *br, goto out_kfree; } - if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false)) + if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false)) goto out_err; nlmsg_end(skb, nlh); @@ -2030,7 +2031,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, if (!br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, - vlan_flags, dump_stats)) { + p, vlan_flags, dump_stats)) { err = -EMSGSIZE; break; } @@ -2056,7 +2057,7 @@ update_end: else if (!dump_global && !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, - br_vlan_flags(range_start, pvid), + p, br_vlan_flags(range_start, pvid), dump_stats)) err = -EMSGSIZE; } @@ -2131,6 +2132,8 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT }, + [BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, }; static int br_vlan_rtm_process_one(struct net_device *dev, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index a2724d03278c..e378c2f3a9e2 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -48,7 +48,8 @@ bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, curr_mc_rtr == range_mc_rtr; } -bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, + const struct net_bridge_port *p) { if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) || !__vlan_tun_put(skb, v)) @@ -58,6 +59,12 @@ bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, br_vlan_multicast_router(v))) return false; + if (p && !br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx) && + (nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS, + br_multicast_ngroups_get(&v->port_mcast_ctx)) || + nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS, + br_multicast_ngroups_get_max(&v->port_mcast_ctx)))) + return false; #endif return true; @@ -70,6 +77,8 @@ size_t br_vlan_opts_nl_size(void) + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */ #endif + 0; } @@ -212,6 +221,22 @@ static int br_vlan_process_one_opts(const struct net_bridge *br, return err; *changed = true; } + if (tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]) { + u32 val; + + if (!p) { + NL_SET_ERR_MSG_MOD(extack, "Can't set mcast_max_groups for non-port vlans"); + return -EINVAL; + } + if (br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx)) { + NL_SET_ERR_MSG_MOD(extack, "Multicast snooping disabled on this VLAN"); + return -EINVAL; + } + + val = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]); + br_multicast_ngroups_set_max(&v->port_mcast_ctx, val); + *changed = 
true; + } #endif return 0; diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 73242962be5d..71056ee84773 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -212,7 +212,7 @@ static int nf_ct_br_ip_check(const struct sk_buff *skb) iph->version != 4) return -1; - len = ntohs(iph->tot_len); + len = skb_ip_totlen(skb); if (skb->len < nhoff + len || len < (iph->ihl * 4)) return -1; @@ -256,7 +256,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct iphdr))) return NF_ACCEPT; - len = ntohs(ip_hdr(skb)->tot_len); + len = skb_ip_totlen(skb); if (pskb_trim_rcsum(skb, len)) return NF_ACCEPT; @@ -366,42 +366,12 @@ static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, return br_dev_queue_push_xmit(net, sk, skb); } -static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - switch (skb->protocol) { - case htons(ETH_P_IP): - protoff = skb_network_offset(skb) + ip_hdrlen(skb); - break; - case htons(ETH_P_IPV6): { - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) - return nf_conntrack_confirm(skb); - } - break; - default: - return NF_ACCEPT; - } - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int ret; - ret = nf_ct_bridge_confirm(skb); + ret = nf_confirm(priv, skb, state); if (ret != NF_ACCEPT) return ret; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 748be7253248..4eebcc66c19a 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -533,10 +533,6 @@ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) goto err; - ret = -EINVAL; - if (unlikely(msg->msg_iter.nr_segs == 0) || - unlikely(msg->msg_iter.iov->iov_base == NULL)) - goto err; noblock = msg->msg_flags & MSG_DONTWAIT; timeo = sock_sndtimeo(sk, noblock); @@ -1015,6 +1011,7 @@ static void caif_sock_destructor(struct sock *sk) return; } sk_stream_kill_queues(&cf_sk->sk); + WARN_ON_ONCE(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index cc405d8c7c30..8480684f2762 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -269,11 +269,15 @@ int cfctrl_linkup_request(struct cflayer *layer, default: pr_warn("Request setup of bad link type = %d\n", param->linktype); + cfpkt_destroy(pkt); return -EINVAL; } req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) + if (!req) { + cfpkt_destroy(pkt); return -ENOMEM; + } + req->client_layer = user_layer; req->cmd = CFCTRL_CMD_LINK_SETUP; req->param = *param; diff --git a/net/can/af_can.c b/net/can/af_can.c index 27dcdcc0b808..7343fd487dbe 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -446,7 +446,6 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, struct hlist_head *rcv_list; struct can_dev_rcv_lists *dev_rcv_lists; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; - int err = 0; /* insert new receiver (dev,canid,mask) -> (func,data) */ @@ -481,7 +480,7 @@ int can_rx_register(struct net 
*net, struct net_device *dev, canid_t can_id, rcv_lists_stats->rcv_entries); spin_unlock_bh(&net->can.rcvlists_lock); - return err; + return 0; } EXPORT_SYMBOL(can_rx_register); @@ -677,7 +676,7 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || (!can_is_can_skb(skb)))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); @@ -692,7 +691,7 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || (!can_is_canfd_skb(skb)))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); @@ -707,7 +706,7 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || (!can_is_canxl_skb(skb)))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); diff --git a/net/can/gw.c b/net/can/gw.c index 23a3d89cad81..37528826935e 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1139,6 +1139,13 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, if (gwj->dst.dev->type != ARPHRD_CAN) goto out; + /* is sending the skb back to the incoming interface intended? 
*/ + if (gwj->src.dev == gwj->dst.dev && + !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) { + err = -EINVAL; + goto out; + } + ASSERT_RTNL(); err = cgw_register_filter(net, gwj); diff --git a/net/can/isotp.c b/net/can/isotp.c index 608f8c24ae46..9bc344851704 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -140,7 +140,7 @@ struct isotp_sock { canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; - struct hrtimer rxtimer, txtimer; + struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; @@ -871,7 +871,7 @@ static void isotp_rcv_echo(struct sk_buff *skb, void *data) } /* start timer to send next consecutive frame with correct delay */ - hrtimer_start(&so->txtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); + hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) @@ -879,49 +879,39 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; - enum hrtimer_restart restart = HRTIMER_NORESTART; - switch (so->tx.state) { - case ISOTP_SENDING: + /* don't handle timeouts in IDLE state */ + if (so->tx.state == ISOTP_IDLE) + return HRTIMER_NORESTART; - /* cfecho should be consumed by isotp_rcv_echo() here */ - if (!so->cfecho) { - /* start timeout for unlikely lost echo skb */ - hrtimer_set_expires(&so->txtimer, - ktime_add(ktime_get(), - ktime_set(ISOTP_ECHO_TIMEOUT, 0))); - restart = HRTIMER_RESTART; + /* we did not get any flow control or echo frame in time */ - /* push out the next consecutive frame */ - isotp_send_cframe(so); - break; - } + /* report 'communication error on send' */ + sk->sk_err = ECOMM; + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); - /* cfecho has not been cleared in isotp_rcv_echo() */ - pr_notice_once("can-isotp: cfecho %08X timeout\n", so->cfecho); - fallthrough; - - case ISOTP_WAIT_FC: - case ISOTP_WAIT_FIRST_FC: + /* reset tx state */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); - /* we did not get any flow control frame in time */ + return HRTIMER_NORESTART; +} - /* report 'communication error on send' */ - sk->sk_err = ECOMM; - if (!sock_flag(sk, SOCK_DEAD)) - sk_error_report(sk); +static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + txfrtimer); - /* reset tx state */ - so->tx.state = ISOTP_IDLE; - wake_up_interruptible(&so->wait); - break; + /* start echo timeout handling and cover below protocol error */ + hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), + HRTIMER_MODE_REL_SOFT); - default: - WARN_ONCE(1, "can-isotp: tx timer state %08X cfecho %08X\n", - so->tx.state, so->cfecho); - } + /* cfecho should be consumed by isotp_rcv_echo() here */ + if (so->tx.state == ISOTP_SENDING && !so->cfecho) + isotp_send_cframe(so); - return restart; + return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -1162,6 +1152,10 @@ static int isotp_release(struct socket *sock) /* wait for complete transmission of current pdu */ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + /* force state machines to be idle also when a signal occurred */ + so->tx.state = ISOTP_IDLE; + so->rx.state = ISOTP_IDLE; + spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { 
spin_unlock(&isotp_notifier_lock); @@ -1194,6 +1188,7 @@ static int isotp_release(struct socket *sock) } } + hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); @@ -1225,6 +1220,9 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (len < ISOTP_MIN_NAMELEN) return -EINVAL; + if (addr->can_family != AF_CAN) + return -EINVAL; + /* sanitize tx CAN identifier */ if (tx_id & CAN_EFF_FLAG) tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); @@ -1597,6 +1595,8 @@ static int isotp_init(struct sock *sk) so->rxtimer.function = isotp_rx_timer_handler; hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txtimer.function = isotp_tx_timer_handler; + hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + so->txfrtimer.function = isotp_txfr_timer_handler; init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c index f33c47327927..ca4ad6cdd5cb 100644 --- a/net/can/j1939/address-claim.c +++ b/net/can/j1939/address-claim.c @@ -165,6 +165,46 @@ static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) * leaving this function. */ ecu = j1939_ecu_get_by_name_locked(priv, name); + + if (ecu && ecu->addr == skcb->addr.sa) { + /* The ISO 11783-5 standard, in "4.5.2 - Address claim + * requirements", states: + * d) No CF shall begin, or resume, transmission on the + * network until 250 ms after it has successfully claimed + * an address except when responding to a request for + * address-claimed. + * + * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim + * prioritization" show that the CF begins the transmission + * after 250 ms from the first AC (address-claimed) message + * even if it sends another AC message during that time window + * to resolve the address contention with another CF. + * + * As stated in "4.4.2.3 - Address-claimed message": + * In order to successfully claim an address, the CF sending + * an address claimed message shall not receive a contending + * claim from another CF for at least 250 ms. + * + * As stated in "4.4.3.2 - NAME management (NM) message": + * 1) A commanding CF can + * d) request that a CF with a specified NAME transmit + * the address-claimed message with its current NAME. + * 2) A target CF shall + * d) send an address-claimed message in response to a + * request for a matching NAME + * + * Taking the above arguments into account, the 250 ms wait is + * requested only during network initialization. + * + * Do not restart the timer on AC message if both the NAME and + * the address match and so if the address has already been + * claimed (timer has expired) or the AC message has been sent + * to resolve the contention with another CF (timer is still + * running). 
+ */ + goto out_ecu_put; + } + if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) ecu = j1939_ecu_create_locked(priv, name); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index b670ba03a675..7e90f9e61d9b 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -189,7 +189,7 @@ activate_next: int time_ms = 0; if (err) - time_ms = 10 + prandom_u32_max(16); + time_ms = 10 + get_random_u32_below(16); j1939_tp_schedule_txtimer(first, time_ms); } diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index f26f4cfa9e63..fce9b9ebf13f 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1092,10 +1092,6 @@ static bool j1939_session_deactivate(struct j1939_session *session) bool active; j1939_session_list_lock(priv); - /* This function should be called with a session ref-count of at - * least 2. - */ - WARN_ON_ONCE(kref_read(&session->kref) < 2); active = j1939_session_deactivate_locked(session); j1939_session_list_unlock(priv); @@ -1168,7 +1164,7 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) if (session->tx_retry < J1939_XTP_TX_RETRY_LIMIT) { session->tx_retry++; j1939_tp_schedule_txtimer(session, - 10 + prandom_u32_max(16)); + 10 + get_random_u32_below(16)); } else { netdev_alert(priv->ndev, "%s: 0x%p: tx retry count reached\n", __func__, session); diff --git a/net/can/raw.c b/net/can/raw.c index 3eb7d3e2b541..f64469b98260 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -132,8 +132,8 @@ static void raw_rcv(struct sk_buff *oskb, void *data) return; /* make sure to not pass oversized frames to the socket */ - if ((can_is_canfd_skb(oskb) && !ro->fd_frames && !ro->xl_frames) || - (can_is_canxl_skb(oskb) && !ro->xl_frames)) + if ((!ro->fd_frames && can_is_canfd_skb(oskb)) || + (!ro->xl_frames && can_is_canxl_skb(oskb))) return; /* eliminate multiple filter matches for the same skb */ @@ -523,6 +523,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, struct can_filter sfilter; /* single filter */ struct net_device *dev = NULL; can_err_mask_t err_mask = 0; + int fd_frames; int count = 0; int err = 0; @@ -664,12 +665,17 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; case CAN_RAW_FD_FRAMES: - if (optlen != sizeof(ro->fd_frames)) + if (optlen != sizeof(fd_frames)) return -EINVAL; - if (copy_from_sockptr(&ro->fd_frames, optval, optlen)) + if (copy_from_sockptr(&fd_frames, optval, optlen)) return -EFAULT; + /* Enabling CAN XL includes CAN FD */ + if (ro->xl_frames && !fd_frames) + return -EINVAL; + + ro->fd_frames = fd_frames; break; case CAN_RAW_XL_FRAMES: @@ -679,6 +685,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) return -EFAULT; + /* Enabling CAN XL includes CAN FD */ + if (ro->xl_frames) + ro->fd_frames = ro->xl_frames; break; case CAN_RAW_JOIN_FILTERS: @@ -786,6 +795,25 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return 0; } +static bool raw_bad_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) +{ + /* Classical CAN -> no checks for flags and device capabilities */ + if (can_is_can_skb(skb)) + return false; + + /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ + if (ro->fd_frames && can_is_canfd_skb(skb) && + (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) + return false; + + /* CAN XL -> needs to be enabled and a CAN XL device */ + if (ro->xl_frames && can_is_canxl_skb(skb) && + can_is_canxl_dev_mtu(mtu)) + return 
false; + + return true; +} + static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; @@ -833,20 +861,8 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto free_skb; err = -EINVAL; - if (ro->xl_frames && can_is_canxl_dev_mtu(dev->mtu)) { - /* CAN XL, CAN FD and Classical CAN */ - if (!can_is_canxl_skb(skb) && !can_is_canfd_skb(skb) && - !can_is_can_skb(skb)) - goto free_skb; - } else if (ro->fd_frames && dev->mtu == CANFD_MTU) { - /* CAN FD and Classical CAN */ - if (!can_is_canfd_skb(skb) && !can_is_can_skb(skb)) - goto free_skb; - } else { - /* Classical CAN */ - if (!can_is_can_skb(skb)) - goto free_skb; - } + if (raw_bad_txframe(ro, skb, dev->mtu)) + goto free_skb; sockcm_init(&sockc, sk); if (msg->msg_controllen) { @@ -857,6 +873,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->dev = dev; skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, sockc.tsflags); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index dfa237fbd5a3..cd7b0bf5369e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -17,6 +17,7 @@ #endif /* CONFIG_BLOCK */ #include <linux/dns_resolver.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> @@ -344,6 +345,9 @@ static void con_sock_state_closed(struct ceph_connection *con) static void ceph_sock_data_ready(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; + + trace_sk_data_ready(sk); + if (atomic_read(&con->msgr->stopping)) { return; } @@ -446,6 +450,7 @@ int ceph_tcp_connect(struct ceph_connection *con) if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_use_task_frag = false; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 3ddbde87e4d6..d1787d7d33ef 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -30,7 +30,7 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) if (!buf) msg.msg_flags |= MSG_TRUNC; - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; @@ -49,7 +49,7 @@ static int ceph_tcp_recvpage(struct socket *sock, struct page *page, int r; BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); + iov_iter_bvec(&msg.msg_iter, ITER_DEST, &bvec, 1, length); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index cc8ff81a50b7..3009028c4fa2 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -168,7 +168,7 @@ static int do_try_sendpage(struct socket *sock, struct iov_iter *it) bv.bv_offset, bv.bv_len, CEPH_MSG_FLAGS); } else { - iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len); ret = sock_sendmsg(sock, &msg); } if (ret <= 0) { @@ -225,7 +225,7 @@ static void reset_in_kvecs(struct ceph_connection *con) WARN_ON(iov_iter_count(&con->v2.in_iter)); con->v2.in_kvec_cnt = 0; - iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0); + iov_iter_kvec(&con->v2.in_iter, ITER_DEST, con->v2.in_kvecs, 0, 0); } static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) @@ 
-233,7 +233,7 @@ static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) WARN_ON(iov_iter_count(&con->v2.in_iter)); con->v2.in_bvec = *bv; - iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len); + iov_iter_bvec(&con->v2.in_iter, ITER_DEST, &con->v2.in_bvec, 1, bv->bv_len); } static void set_in_skip(struct ceph_connection *con, int len) @@ -241,7 +241,7 @@ static void set_in_skip(struct ceph_connection *con, int len) WARN_ON(iov_iter_count(&con->v2.in_iter)); dout("%s con %p len %d\n", __func__, con, len); - iov_iter_discard(&con->v2.in_iter, READ, len); + iov_iter_discard(&con->v2.in_iter, ITER_DEST, len); } static void add_out_kvec(struct ceph_connection *con, void *buf, int len) @@ -265,7 +265,7 @@ static void reset_out_kvecs(struct ceph_connection *con) con->v2.out_kvec_cnt = 0; - iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0); + iov_iter_kvec(&con->v2.out_iter, ITER_SOURCE, con->v2.out_kvecs, 0, 0); con->v2.out_iter_sendpage = false; } @@ -277,7 +277,7 @@ static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv, con->v2.out_bvec = *bv; con->v2.out_iter_sendpage = zerocopy; - iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + iov_iter_bvec(&con->v2.out_iter, ITER_SOURCE, &con->v2.out_bvec, 1, con->v2.out_bvec.bv_len); } @@ -290,7 +290,7 @@ static void set_out_bvec_zero(struct ceph_connection *con) con->v2.out_bvec.bv_offset = 0; con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); con->v2.out_iter_sendpage = true; - iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + iov_iter_bvec(&con->v2.out_iter, ITER_SOURCE, &con->v2.out_bvec, 1, con->v2.out_bvec.bv_len); } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index db60217f911b..faabad6603db 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -222,7 +222,7 @@ static void pick_new_mon(struct ceph_mon_client *monc) max--; } - n = prandom_u32_max(max); + n = get_random_u32_below(max); if (o >= 0 && n >= o) n++; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4e4f1e4bc265..11c04e7d928e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1479,7 +1479,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, static int pick_random_replica(const struct ceph_osds *acting) { - int i = prandom_u32_max(acting->size); + int i = get_random_u32_below(acting->size); dout("%s picked osd%d, primary osd%d\n", __func__, acting->osds[i], acting->primary); diff --git a/net/compat.c b/net/compat.c index 385f04a6be2f..161b7bea1f62 100644 --- a/net/compat.c +++ b/net/compat.c @@ -95,7 +95,8 @@ int get_compat_msghdr(struct msghdr *kmsg, if (err) return err; - err = import_iovec(save_addr ? READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen, + err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE, + compat_ptr(msg.msg_iov), msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? 
err : 0; } diff --git a/net/core/Makefile b/net/core/Makefile index 5857cec87b83..8f367813bc68 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -12,7 +12,8 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o xdp.o flow_offload.o gro.o + fib_notifier.o xdp.o flow_offload.o gro.o \ + netdev-genl.o netdev-genl-gen.o obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o @@ -33,7 +34,6 @@ obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o -obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 9d2288c0736e..bb378c33f542 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -310,7 +310,6 @@ bpf_sk_storage_ptr(void *owner) return &sk->sk_bpf_storage; } -BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -321,7 +320,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &sk_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, diff --git a/net/core/dev.c b/net/core/dev.c index b76fb37b381e..18dc8d75ead9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1614,6 +1614,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) + N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1840,7 +1841,7 @@ EXPORT_SYMBOL(register_netdevice_notifier_net); * @nb: notifier * * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the + * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. 
* @@ -1869,14 +1870,6 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } -void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, - struct notifier_block *nb) -{ - rtnl_lock(); - __move_netdevice_notifier_net(src_net, dst_net, nb); - rtnl_unlock(); -} - int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -3001,6 +2994,8 @@ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); + if (size < READ_ONCE(dev->gso_ipv4_max_size)) + netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); @@ -5024,7 +5019,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) - trace_consume_skb(skb); + trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, SKB_DROP_REASON_NOT_SPECIFIED); @@ -6616,17 +6611,16 @@ static int napi_threaded_poll(void *data) static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; - unsigned long flags; /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ if (!READ_ONCE(sd->defer_list)) return; - spin_lock_irqsave(&sd->defer_lock, flags); + spin_lock_irq(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; sd->defer_count = 0; - spin_unlock_irqrestore(&sd->defer_lock, flags); + spin_unlock_irq(&sd->defer_lock); while (skb != NULL) { next = skb->next; @@ -8319,9 +8313,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) } } if (dev->flags != old_flags) { - pr_info("device %s %s promiscuous mode\n", - dev->name, - dev->flags & IFF_PROMISC ? "entered" : "left"); + netdev_info(dev, "%s promiscuous mode\n", + dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, @@ -8389,6 +8382,8 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) } } if (dev->flags ^ old_flags) { + netdev_info(dev, "%s allmulticast mode\n", + dev->flags & IFF_ALLMULTI ? 
"entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) @@ -9224,8 +9219,12 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } - if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { - NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); + if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { + NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { @@ -10375,7 +10374,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = atomic_long_read(&src[i]); + dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); @@ -10611,6 +10610,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; + dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; @@ -10830,6 +10831,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_shutdown(dev); dev_xdp_uninstall(dev); + bpf_dev_bound_netdev_unregister(dev); netdev_offload_xstats_disable_all(dev); diff --git a/net/core/dev.h b/net/core/dev.h index 814ed5b7b960..e075e198092c 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -9,6 +9,7 @@ struct net_device; struct netdev_bpf; struct netdev_phys_item_id; struct netlink_ext_ack; +struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ @@ -100,6 +101,8 @@ static inline void netif_set_gso_max_size(struct net_device *dev, { /* dev->gso_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_size, size); + if (size <= GSO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gso_ipv4_max_size, size); } static inline void netif_set_gso_max_segs(struct net_device *dev, @@ -114,6 +117,23 @@ static inline void netif_set_gro_max_size(struct net_device *dev, { /* This pairs with the READ_ONCE() in skb_gro_receive() */ WRITE_ONCE(dev->gro_max_size, size); + if (size <= GRO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gro_ipv4_max_size, size); } +static inline void netif_set_gso_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_ipv4_max_size, size); +} + +static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_ipv4_max_size, size); +} + +int rps_cpumask_housekeeping(struct cpumask *mask); #endif diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c index 049cfbc58aa9..90e7e3811ae7 100644 --- a/net/core/dev_addr_lists_test.c +++ b/net/core/dev_addr_lists_test.c @@ -71,11 +71,11 @@ static void 
dev_addr_test_basic(struct kunit *test) memset(addr, 2, sizeof(addr)); eth_hw_addr_set(netdev, addr); - KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr))); + KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); memset(addr, 3, sizeof(addr)); dev_addr_set(netdev, addr); - KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr))); + KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); } static void dev_addr_test_sync_one(struct kunit *test) diff --git a/net/core/dst.c b/net/core/dst.c index bc9c9be4e080..31c08a3386d3 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -82,12 +82,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, if (ops->gc && !(flags & DST_NOCOUNT) && - dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) { - pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); - return NULL; - } - } + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) @@ -174,7 +170,7 @@ void dst_release(struct dst_entry *dst) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); if (!newrefcnt) - call_rcu(&dst->rcu_head, dst_destroy_rcu); + call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); @@ -316,6 +312,8 @@ void metadata_dst_free(struct metadata_dst *md_dst) if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif + if (md_dst->type == METADATA_XFRM) + dst_release(md_dst->u.xfrm_info.dst_orig); kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); @@ -340,16 +338,18 @@ EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { -#ifdef CONFIG_DST_CACHE int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); +#ifdef CONFIG_DST_CACHE if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); - } #endif + if (one_md_dst->type == METADATA_XFRM) + dst_release(one_md_dst->u.xfrm_info.dst_orig); + } free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/failover.c b/net/core/failover.c index 655411c4ca51..2a140b3ea669 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev) goto err_upper_link; } - slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; + slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: @@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev) netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) diff --git a/net/core/filter.c b/net/core/filter.c index 37baaa6b8fc3..1d6f165923bf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -80,6 +80,7 @@ #include <net/tls.h> #include <net/xdp.h> #include <net/mptcp.h> +#include <net/netfilter/nf_conntrack_bpf.h> 
static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -3179,15 +3180,18 @@ static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { + void *old_data; + /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; - skb_postpull_rcsum(skb, skb->data + off, len); - memmove(skb->data + len, skb->data, off); + old_data = skb->data; __skb_pull(skb, len); + skb_postpull_rcsum(skb, old_data + off, len); + memmove(skb->data, old_data, off); return 0; } @@ -3377,13 +3381,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) +#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ - BPF_ADJ_ROOM_ENCAP_L2_MASK)) + BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ + BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) @@ -3497,6 +3505,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; @@ -3515,6 +3524,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + skb->protocol = htons(ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) + skb->protocol = htons(ETH_P_IP); + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -3604,6 +3621,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOTSUPP; } + if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + if (!shrink) + return -EINVAL; + + switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: + len_min = sizeof(struct iphdr); + break; + case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: + len_min = sizeof(struct ipv6hdr); + break; + default: + return -EINVAL; + } + } + len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || @@ -4124,9 +4157,13 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * - * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), - * which will flush all the different bulk queues, thus completing the - * redirect. + * 3. Before exiting its NAPI poll loop, the driver will call + * xdp_do_flush(), which will flush all the different bulk queues, + * thus completing the redirect. Note that xdp_do_flush() must be + * called before napi_complete_done() in the driver, as the + * XDP_REDIRECT logic relies on being inside a single NAPI instance + * through to the xdp_do_flush() call for RCU protection of all + * in-kernel data structures. 
*/ /* * Pointers to the map entries will be kept around for this whole sequence of @@ -4281,16 +4318,13 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; - /* XDP_REDIRECT is not fully supported yet for xdp frags since - * not all XDP capable drivers can map non-linear xdp_frame in - * ndo_xdp_xmit. - */ - if (unlikely(xdp_buff_has_frags(xdp) && - map_type != BPF_MAP_TYPE_CPUMAP)) - return -EOPNOTSUPP; + if (map_type == BPF_MAP_TYPE_XSKMAP) { + /* XDP_REDIRECT is not supported AF_XDP yet. */ + if (unlikely(xdp_buff_has_frags(xdp))) + return -EOPNOTSUPP; - if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + } return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); @@ -4614,7 +4648,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | + BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -4652,6 +4687,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags &= ~TUNNEL_CSUM; if (flags & BPF_F_SEQ_NUMBER) info->key.tun_flags |= TUNNEL_SEQ; + if (flags & BPF_F_NO_TUNNEL_KEY) + info->key.tun_flags &= ~TUNNEL_KEY; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -5168,7 +5205,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_prot->setsockopt != tcp_setsockopt) + if (sk->sk_protocol != IPPROTO_TCP) return -EINVAL; switch (optname) { @@ -5630,6 +5667,15 @@ static const struct bpf_func_proto bpf_bind_proto = { }; #ifdef CONFIG_XFRM + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +struct metadata_dst __percpu *xfrm_bpf_md_dst; +EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); + +#endif + BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { @@ -5676,12 +5722,8 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { #endif #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) -static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, - const struct neighbour *neigh, - const struct net_device *dev, u32 mtu) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) { - memcpy(params->dmac, neigh->ha, ETH_ALEN); - memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; if (mtu) @@ -5792,21 +5834,29 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (likely(nhc->nhc_gw_family != AF_INET6)) { if (nhc->nhc_gw_family) params->ipv4_dst = nhc->nhc_gw.ipv4; - - neigh = __ipv4_neigh_lookup_noref(dev, - (__force u32)params->ipv4_dst); } else { struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; params->family = AF_INET6; *dst = nhc->nhc_gw.ipv6; - neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); } - if (!neigh) + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) + goto set_fwd_params; + + if (likely(nhc->nhc_gw_family != AF_INET6)) + neigh = __ipv4_neigh_lookup_noref(dev, + (__force u32)params->ipv4_dst); + else + neigh = __ipv6_neigh_lookup_noref_stub(dev, 
params->ipv6_dst); + + if (!neigh || !(neigh->nud_state & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); - return bpf_fib_set_fwd_params(params, neigh, dev, mtu); +set_fwd_params: + return bpf_fib_set_fwd_params(params, mtu); } #endif @@ -5914,24 +5964,33 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) + goto set_fwd_params; + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. */ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); - if (!neigh) + if (!neigh || !(neigh->nud_state & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); - return bpf_fib_set_fwd_params(params, neigh, dev, mtu); +set_fwd_params: + return bpf_fib_set_fwd_params(params, mtu); } #endif +#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ + BPF_FIB_LOOKUP_SKIP_NEIGH) + BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) { if (plen < sizeof(*params)) return -EINVAL; - if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; switch (params->family) { @@ -5969,7 +6028,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, if (plen < sizeof(*params)) return -EINVAL; - if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; if (params->tot_len) @@ -6831,9 +6890,6 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, FIELD)); \ } while (0) - if (insn > insn_buf) - return insn - insn_buf; - switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != @@ -7490,7 +7546,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, @@ -7522,7 +7578,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, @@ -7992,6 +8048,19 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) default: return bpf_sk_base_func_proto(func_id); } + +#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) + /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The + * kfuncs are defined in two different modules, and we want to be able + * to use them interchangably with the same BTF type ID. Because modules + * can't de-duplicate BTF IDs between each other, we need the type to be + * referenced in the vmlinux BTF or the verifier will get confused about + * the different types. So we add this dummy type reference which will + * be included in vmlinux BTF, allowing both modules to refer to the + * same type ID. 
+ */ + BTF_TYPE_EMIT(struct nf_conn___init); +#endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; @@ -8705,7 +8774,7 @@ static bool xdp_is_valid_access(int off, int size, } if (type == BPF_WRITE) { - if (bpf_prog_is_dev_bound(prog->aux)) { + if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); @@ -10118,9 +10187,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) - if (insn > insn_buf) - return insn - insn_buf; - switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 4fcbdd71c59f..fae9c4694186 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -208,7 +208,7 @@ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) est = xchg((__force struct net_rate_estimator **)rate_est, NULL); if (est) { - del_timer_sync(&est->timer); + timer_shutdown_sync(&est->timer); kfree_rcu(est, rcu); } } diff --git a/net/core/gro.c b/net/core/gro.c index fd8c6a7e8d3e..a606705a0859 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -162,16 +162,27 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) struct sk_buff *lp; int segs; - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ - gro_max_size = READ_ONCE(p->dev->gro_max_size); + /* Do not splice page pool based packets w/ non-page pool + * packets. This can result in reference count issues as page + * pool pages will not decrement the reference count and will + * instead be immediately returned to the pool or have frag + * count decremented. + */ + if (p->pp_recycle != skb->pp_recycle) + return -ETOOMANYREFS; + + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ + gro_max_size = p->protocol == htons(ETH_P_IPV6) ? + READ_ONCE(p->dev->gro_max_size) : + READ_ONCE(p->dev->gro_ipv4_max_size); if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { - if (p->protocol != htons(ETH_P_IPV6) || - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || - ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || + (p->protocol == htons(ETH_P_IPV6) && + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } @@ -505,8 +516,9 @@ found_ptype: NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; - /* Only support TCP at the moment. */ - if (!skb_is_gso_tcp(skb)) + /* Only support TCP and non DODGY users. */ + if (!skb_is_gso_tcp(skb) || + (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) NAPI_GRO_CB(skb)->flush = 1; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 952a54763358..6798f6d2423b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -111,7 +111,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? prandom_u32_max(base) + (base >> 1) : 0; + return base ? 
get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -269,7 +269,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || - time_after(tref, n->updated)) + !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); @@ -289,7 +289,17 @@ static int neigh_forced_gc(struct neigh_table *tbl) static void neigh_add_timer(struct neighbour *n, unsigned long when) { + /* Use safe distance from the jiffies - LONG_MAX point while timer + * is running in DELAY/PROBE state but still show to user space + * large times in the past. + */ + unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); + neigh_hold(n); + if (!time_in_range(n->confirmed, mint, jiffies)) + n->confirmed = mint; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); @@ -1001,12 +1011,14 @@ static void neigh_periodic_work(struct work_struct *work) goto next_elt; } - if (time_before(n->used, n->confirmed)) + if (time_before(n->used, n->confirmed) && + time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || - time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + !time_in_range_open(jiffies, n->used, + n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; neigh_mark_dead(n); write_unlock(&n->lock); @@ -1662,11 +1674,21 @@ static void neigh_proxy_process(struct timer_list *t) spin_unlock(&tbl->proxy_queue.lock); } +static unsigned long neigh_proxy_delay(struct neigh_parms *p) +{ + /* If proxy_delay is zero, do not call get_random_u32_below() + * as it is undefined behavior. + */ + unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); + + return proxy_delay ? + jiffies + get_random_u32_below(proxy_delay) : jiffies; +} + void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long sched_next = jiffies + - prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 679b84cc8794..15e3f4606b5f 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -831,42 +831,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) return len < PAGE_SIZE ? 
len : -EINVAL; } -static ssize_t store_rps_map(struct netdev_rx_queue *queue, - const char *buf, size_t len) +static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue, + cpumask_var_t mask) { - struct rps_map *old_map, *map; - cpumask_var_t mask; - int err, cpu, i; static DEFINE_MUTEX(rps_map_mutex); - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); - if (err) { - free_cpumask_var(mask); - return err; - } - - if (!cpumask_empty(mask)) { - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); - if (cpumask_empty(mask)) { - free_cpumask_var(mask); - return -EINVAL; - } - } + struct rps_map *old_map, *map; + int cpu, i; map = kzalloc(max_t(unsigned int, RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), GFP_KERNEL); - if (!map) { - free_cpumask_var(mask); + if (!map) return -ENOMEM; - } i = 0; for_each_cpu_and(cpu, mask, cpu_online_mask) @@ -893,9 +869,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, if (old_map) kfree_rcu(old_map, rcu); + return 0; +} +int rps_cpumask_housekeeping(struct cpumask *mask) +{ + if (!cpumask_empty(mask)) { + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); + if (cpumask_empty(mask)) + return -EINVAL; + } + return 0; +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + const char *buf, size_t len) +{ + cpumask_var_t mask; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) + goto out; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto out; + + err = netdev_rx_queue_set_rps_mask(queue, mask); + +out: free_cpumask_var(mask); - return len; + return err ? 
: len; } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, @@ -1020,7 +1032,7 @@ static void rx_queue_release(struct kobject *kobj) netdev_put(queue->dev, &queue->dev_tracker); } -static const void *rx_queue_namespace(struct kobject *kobj) +static const void *rx_queue_namespace(const struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); struct device *dev = &queue->dev->dev; @@ -1032,7 +1044,7 @@ static const void *rx_queue_namespace(struct kobject *kobj) return ns; } -static void rx_queue_get_ownership(struct kobject *kobj, +static void rx_queue_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct net *net = rx_queue_namespace(kobj); @@ -1040,7 +1052,7 @@ static void rx_queue_get_ownership(struct kobject *kobj, net_ns_get_ownership(net, uid, gid); } -static struct kobj_type rx_queue_ktype __ro_after_init = { +static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .default_groups = rx_queue_default_groups, @@ -1048,6 +1060,18 @@ static struct kobj_type rx_queue_ktype __ro_after_init = { .get_ownership = rx_queue_get_ownership, }; +static int rx_queue_default_mask(struct net_device *dev, + struct netdev_rx_queue *queue) +{ +#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) + struct cpumask *rps_default_mask = READ_ONCE(dev_net(dev)->core.rps_default_mask); + + if (rps_default_mask && !cpumask_empty(rps_default_mask)) + return netdev_rx_queue_set_rps_mask(queue, rps_default_mask); +#endif + return 0; +} + static int rx_queue_add_kobject(struct net_device *dev, int index) { struct netdev_rx_queue *queue = dev->_rx + index; @@ -1071,6 +1095,10 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) goto err; } + error = rx_queue_default_mask(dev, queue); + if (error) + goto err; + kobject_uevent(kobj, KOBJ_ADD); return error; @@ -1623,7 +1651,7 @@ static void netdev_queue_release(struct kobject *kobj) netdev_put(queue->dev, &queue->dev_tracker); } -static const void *netdev_queue_namespace(struct kobject *kobj) +static const void *netdev_queue_namespace(const struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); struct device *dev = &queue->dev->dev; @@ -1635,7 +1663,7 @@ static const void *netdev_queue_namespace(struct kobject *kobj) return ns; } -static void netdev_queue_get_ownership(struct kobject *kobj, +static void netdev_queue_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct net *net = netdev_queue_namespace(kobj); @@ -1643,7 +1671,7 @@ static void netdev_queue_get_ownership(struct kobject *kobj, net_ns_get_ownership(net, uid, gid); } -static struct kobj_type netdev_queue_ktype __ro_after_init = { +static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .default_groups = netdev_queue_default_groups, @@ -1873,9 +1901,9 @@ const struct kobj_ns_type_operations net_ns_type_operations = { }; EXPORT_SYMBOL_GPL(net_ns_type_operations); -static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) +static int netdev_uevent(const struct device *d, struct kobj_uevent_env *env) { - struct net_device *dev = to_net_dev(d); + const struct net_device *dev = to_net_dev(d); int retval; /* pass interface to uevent. 
*/ @@ -1910,16 +1938,16 @@ static void netdev_release(struct device *d) netdev_freemem(dev); } -static const void *net_namespace(struct device *d) +static const void *net_namespace(const struct device *d) { - struct net_device *dev = to_net_dev(d); + const struct net_device *dev = to_net_dev(d); return dev_net(dev); } -static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid) +static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) { - struct net_device *dev = to_net_dev(d); + const struct net_device *dev = to_net_dev(d); const struct net *net = dev_net(dev); net_ns_get_ownership(net, uid, gid); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c40cd8dd75c7..805b7385dd8d 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -41,6 +41,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full); #endif #if IS_ENABLED(CONFIG_PAGE_POOL) @@ -61,3 +62,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); + +EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5581d22cc191..7b69cf882b8e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -137,12 +137,12 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) return 0; if (ops->id && ops->size) { -cleanup: ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; } +cleanup: kfree(data); out: @@ -304,6 +304,12 @@ struct net *get_net_ns_by_id(const struct net *net, int id) } EXPORT_SYMBOL_GPL(get_net_ns_by_id); +/* init code that must occur even if setup_net() is not called. */ +static __net_init void preinit_net(struct net *net) +{ + ref_tracker_dir_init(&net->notrefcnt_tracker, 128); +} + /* * setup_net runs the initializers for the network namespace object. 
*/ @@ -316,7 +322,6 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128); - ref_tracker_dir_init(&net->notrefcnt_tracker, 128); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); @@ -472,6 +477,8 @@ struct net *copy_net_ns(unsigned long flags, rv = -ENOMEM; goto dec_ucounts; } + + preinit_net(net); refcount_set(&net->passive, 1); net->ucounts = ucounts; get_user_ns(user_ns); @@ -1118,6 +1125,7 @@ void __init net_ns_init(void) init_net.key_domain = &init_net_key_domain; #endif down_write(&pernet_ops_rwsem); + preinit_net(&init_net); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c new file mode 100644 index 000000000000..48812ec843f5 --- /dev/null +++ b/net/core/netdev-genl-gen.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netdev-genl-gen.h" + +#include <linux/netdev.h> + +/* NETDEV_CMD_DEV_GET - do */ +static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { + [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* Ops table for netdev */ +static const struct genl_split_ops netdev_nl_ops[2] = { + { + .cmd = NETDEV_CMD_DEV_GET, + .doit = netdev_nl_dev_get_doit, + .policy = netdev_dev_get_nl_policy, + .maxattr = NETDEV_A_DEV_IFINDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_DEV_GET, + .dumpit = netdev_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +}; + +static const struct genl_multicast_group netdev_nl_mcgrps[] = { + [NETDEV_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family netdev_nl_family __ro_after_init = { + .name = NETDEV_FAMILY_NAME, + .version = NETDEV_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = netdev_nl_ops, + .n_split_ops = ARRAY_SIZE(netdev_nl_ops), + .mcgrps = netdev_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), +}; diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h new file mode 100644 index 000000000000..b16dc7e026bb --- /dev/null +++ b/net/core/netdev-genl-gen.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_NETDEV_GEN_H +#define _LINUX_NETDEV_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <linux/netdev.h> + +int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); + +enum { + NETDEV_NLGRP_MGMT, +}; + +extern struct genl_family netdev_nl_family; + +#endif /* _LINUX_NETDEV_GEN_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c new file mode 100644 index 000000000000..a4270fafdf11 --- /dev/null +++ b/net/core/netdev-genl.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/sock.h> + +#include "netdev-genl-gen.h" + +static int +netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, + u32 portid, u32 seq, int flags, u32 cmd) +{ + 
void *hdr; + + hdr = genlmsg_put(rsp, portid, seq, &netdev_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, + netdev->xdp_features, NETDEV_A_DEV_PAD)) { + genlmsg_cancel(rsp, hdr); + return -EINVAL; + } + + genlmsg_end(rsp, hdr); + + return 0; +} + +static void +netdev_genl_dev_notify(struct net_device *netdev, int cmd) +{ + struct sk_buff *ntf; + + if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), + NETDEV_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + if (netdev_nl_dev_fill(netdev, ntf, 0, 0, 0, cmd)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, + 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); +} + +int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *netdev; + struct sk_buff *rsp; + u32 ifindex; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + rtnl_lock(); + + netdev = __dev_get_by_index(genl_info_net(info), ifindex); + if (netdev) + err = netdev_nl_dev_fill(netdev, rsp, info->snd_portid, + info->snd_seq, 0, info->genlhdr->cmd); + else + err = -ENODEV; + + rtnl_unlock(); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + int idx = 0, s_idx; + int h, s_h; + int err; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + rtnl_lock(); + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(netdev, head, index_hlist) { + if (idx < s_idx) + goto cont; + err = netdev_nl_dev_fill(netdev, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + NETDEV_CMD_DEV_GET); + if (err < 0) + break; +cont: + idx++; + } + } + + rtnl_unlock(); + + if (err != -EMSGSIZE) + return err; + + cb->args[1] = idx; + cb->args[0] = h; + cb->seq = net->dev_base_seq; + + return skb->len; +} + +static int netdev_genl_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REGISTER: + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + break; + case NETDEV_UNREGISTER: + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + break; + case NETDEV_XDP_FEAT_CHANGE: + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block netdev_genl_nb = { + .notifier_call = netdev_genl_netdevice_event, +}; + +static int __init netdev_genl_init(void) +{ + int err; + + err = register_netdevice_notifier(&netdev_genl_nb); + if (err) + return err; + + err = genl_register_family(&netdev_nl_family); + if (err) + goto err_unreg_ntf; + + return 0; + +err_unreg_ntf: + unregister_netdevice_notifier(&netdev_genl_nb); + return err; +} + +subsys_initcall(netdev_genl_init); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 9be762e1d042..a089b704b986 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -682,7 +682,7 @@ int netpoll_setup(struct netpoll 
*np) } if (!netif_running(ndev)) { - unsigned long atmost, atleast; + unsigned long atmost; np_info(np, "device %s not up yet, forcing it\n", np->dev_name); @@ -694,7 +694,6 @@ int netpoll_setup(struct netpoll *np) } rtnl_unlock(); - atleast = jiffies + HZ/10; atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { if (time_after(jiffies, atmost)) { @@ -704,15 +703,6 @@ int netpoll_setup(struct netpoll *np) msleep(1); } - /* If carrier appears to come up instantly, we don't - * trust it and pause so that we don't pump all our - * queued console messages into the bitbucket. - */ - - if (time_before(jiffies, atleast)) { - np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n"); - msleep(4000); - } rtnl_lock(); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b203d8660e4..193c18799865 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -511,8 +511,8 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page) static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) { int ret; - /* BH protection not needed if current is serving softirq */ - if (in_serving_softirq()) + /* BH protection not needed if current is softirq */ + if (in_softirq()) ret = ptr_ring_produce(&pool->ring, page); else ret = ptr_ring_produce_bh(&pool->ring, page); @@ -570,7 +570,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq() && + if (allow_direct && in_softirq() && page_pool_recycle_in_cache(page, pool)) return NULL; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index c3763056c554..760238196db1 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2324,7 +2324,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = prandom_u32_max(pkt_dev->cflows); + flow = get_random_u32_below(pkt_dev->cflows); pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2380,9 +2380,8 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = prandom_u32_max(pkt_dev->queue_map_max - - pkt_dev->queue_map_min + 1) + - pkt_dev->queue_map_min; + t = get_random_u32_inclusive(pkt_dev->queue_map_min, + pkt_dev->queue_map_max); } else { t = pkt_dev->cur_queue_map + 1; if (t > pkt_dev->queue_map_max) @@ -2411,7 +2410,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = prandom_u32_max(pkt_dev->src_mac_count); + mc = get_random_u32_below(pkt_dev->src_mac_count); else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2437,7 +2436,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = prandom_u32_max(pkt_dev->dst_mac_count); + mc = get_random_u32_below(pkt_dev->dst_mac_count); else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2469,18 +2468,17 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = prandom_u32_max(4096); + pkt_dev->vlan_id = get_random_u32_below(4096); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = prandom_u32_max(4096); + pkt_dev->svlan_id = get_random_u32_below(4096); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if 
(pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = prandom_u32_max( - pkt_dev->udp_src_max - pkt_dev->udp_src_min) + - pkt_dev->udp_src_min; + pkt_dev->cur_udp_src = get_random_u32_inclusive(pkt_dev->udp_src_min, + pkt_dev->udp_src_max - 1); else { pkt_dev->cur_udp_src++; @@ -2491,9 +2489,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = prandom_u32_max( - pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + - pkt_dev->udp_dst_min; + pkt_dev->cur_udp_dst = get_random_u32_inclusive(pkt_dev->udp_dst_min, + pkt_dev->udp_dst_max - 1); } else { pkt_dev->cur_udp_dst++; if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) @@ -2508,7 +2505,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = prandom_u32_max(imx - imn) + imn; + t = get_random_u32_inclusive(imn, imx - 1); else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2530,8 +2527,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_IPDST_RND) { do { - t = prandom_u32_max(imx - imn) + - imn; + t = get_random_u32_inclusive(imn, imx - 1); s = htonl(t); } while (ipv4_is_loopback(s) || ipv4_is_multicast(s) || @@ -2578,9 +2574,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = prandom_u32_max(pkt_dev->max_pkt_size - - pkt_dev->min_pkt_size) + - pkt_dev->min_pkt_size; + t = get_random_u32_inclusive(pkt_dev->min_pkt_size, + pkt_dev->max_pkt_size - 1); } else { t = pkt_dev->cur_pkt_size + 1; if (t > pkt_dev->max_pkt_size) @@ -2589,7 +2584,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->cur_pkt_size = t; } else if (pkt_dev->n_imix_entries > 0) { struct imix_pkt *entry; - __u32 t = prandom_u32_max(IMIX_PRECISION); + __u32 t = get_random_u32_below(IMIX_PRECISION); __u8 entry_index = pkt_dev->imix_distribution[t]; entry = &pkt_dev->imix_entries[entry_index]; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64289bc98887..5d8eb57867a9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -58,7 +58,7 @@ #include "dev.h" #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 40 +#define RTNL_SLAVE_MAX_TYPE 42 struct rtnl_link { rtnl_doit_func doit; @@ -1074,6 +1074,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ @@ -1807,6 +1809,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || #ifdef CONFIG_RPS @@ -1968,6 +1972,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] 
= { .type = NLA_REJECT }, + [IFLA_GSO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2883,6 +2889,29 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); + + if (max_size > dev->tso_max_size) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_ipv4_max_size ^ max_size) { + netif_set_gso_ipv4_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); + + if (dev->gro_ipv4_max_size ^ gro_max_size) { + netif_set_gro_ipv4_max_size(dev, gro_max_size); + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); @@ -3325,6 +3354,10 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) + netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) + netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; } diff --git a/net/core/scm.c b/net/core/scm.c index 5c356f0dee30..acb7d776fa6e 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -229,6 +229,8 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) if (msg->msg_control_is_user) { struct cmsghdr __user *cm = msg->msg_control_user; + check_object_size(data, cmlen - sizeof(*cm), true); + if (!user_write_access_begin(cm, cmlen)) goto efault; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4bf95e36ed16..eb7d33b41e71 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,15 +79,44 @@ #include <linux/capability.h> #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> +#include <linux/textsearch.h> #include "dev.h" #include "sock_destructor.h" -struct kmem_cache *skbuff_head_cache __ro_after_init; +struct kmem_cache *skbuff_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif + +/* skb_small_head_cache and related code is only supported + * for CONFIG_SLAB and CONFIG_SLUB. + * As soon as SLOB is removed from the kernel, we can clean up this. + */ +#if !defined(CONFIG_SLOB) +# define HAVE_SKB_SMALL_HEAD_CACHE 1 +#endif + +#ifdef HAVE_SKB_SMALL_HEAD_CACHE +static struct kmem_cache *skb_small_head_cache __ro_after_init; + +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) + +/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. + * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique + * size, and we can differentiate heads from skb_small_head_cache + * vs system slabs by looking at their size (skb_end_offset()). + */ +#define SKB_SMALL_HEAD_CACHE_SIZE \ + (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? 
\ + (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ + SKB_SMALL_HEAD_SIZE) + +#define SKB_SMALL_HEAD_HEADROOM \ + SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) +#endif /* HAVE_SKB_SMALL_HEAD_CACHE */ + int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -256,7 +285,7 @@ static struct sk_buff *napi_skb_cache_get(void) struct sk_buff *skb; if (unlikely(!nc->skb_count)) { - nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, + nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache, GFP_ATOMIC, NAPI_SKB_CACHE_BULK, nc->skb_cache); @@ -265,17 +294,15 @@ static struct sk_buff *napi_skb_cache_get(void) } skb = nc->skb_cache[--nc->skb_count]; - kasan_unpoison_object_data(skbuff_head_cache, skb); + kasan_unpoison_object_data(skbuff_cache, skb); return skb; } -/* Caller must provide SKB that is memset cleared */ -static void __build_skb_around(struct sk_buff *skb, void *data, - unsigned int frag_size) +static inline void __finalize_skb_around(struct sk_buff *skb, void *data, + unsigned int size) { struct skb_shared_info *shinfo; - unsigned int size = frag_size ? : ksize(data); size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -297,15 +324,71 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb_set_kcov_handle(skb, kcov_common_handle()); } +static inline void *__slab_build_skb(struct sk_buff *skb, void *data, + unsigned int *size) +{ + void *resized; + + /* Must find the allocation size (and grow it to match). */ + *size = ksize(data); + /* krealloc() will immediately return "data" when + * "ksize(data)" is requested: it is the existing upper + * bounds. As a result, GFP_ATOMIC will be ignored. Note + * that this "new" pointer needs to be passed back to the + * caller for use so the __alloc_size hinting will be + * tracked correctly. + */ + resized = krealloc(data, *size, GFP_ATOMIC); + WARN_ON_ONCE(resized != data); + return resized; +} + +/* build_skb() variant which can operate on slab buffers. + * Note that this should be used sparingly as slab buffers + * cannot be combined efficiently by GRO! + */ +struct sk_buff *slab_build_skb(void *data) +{ + struct sk_buff *skb; + unsigned int size; + + skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + data = __slab_build_skb(skb, data, &size); + __finalize_skb_around(skb, data, size); + + return skb; +} +EXPORT_SYMBOL(slab_build_skb); + +/* Caller must provide SKB that is memset cleared */ +static void __build_skb_around(struct sk_buff *skb, void *data, + unsigned int frag_size) +{ + unsigned int size = frag_size; + + /* frag_size == 0 is considered deprecated now. Callers + * using slab buffer should use slab_build_skb() instead. + */ + if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) + data = __slab_build_skb(skb, data, &size); + + __finalize_skb_around(skb, data, size); +} + /** * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data (must not be 0) * * Allocate a new &sk_buff. Caller provides space holding head and - * skb_shared_info. @data must have been allocated by kmalloc() only if - * @frag_size is 0, otherwise data should come from the page allocator - * or vmalloc() + * skb_shared_info. @data must have been allocated from the page + * allocator or vmalloc(). 
(A @frag_size of 0 to indicate a kmalloc() + * allocation is deprecated, and callers should use slab_build_skb() + * instead.) * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : @@ -320,7 +403,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -332,8 +415,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) /* build_skb() is wrapper over __build_skb(), that specifically * takes care of skb->head and skb->pfmemalloc - * This means that if @frag_size is not zero, then @data must be backed - * by a page fragment, not kmalloc() or vmalloc() */ struct sk_buff *build_skb(void *data, unsigned int frag_size) { @@ -352,7 +433,7 @@ EXPORT_SYMBOL(build_skb); * build_skb_around - build a network buffer around provided skb * @skb: sk_buff provide by caller, must be memset cleared * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data */ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) @@ -374,7 +455,7 @@ EXPORT_SYMBOL(build_skb_around); /** * __napi_build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data * * Version of __build_skb() that uses NAPI percpu caches to obtain * skbuff_head instead of inplace allocation. @@ -398,7 +479,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) /** * napi_build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data * * Version of __napi_build_skb() that takes care of skb->head_frag * and skb->pfmemalloc when the data is a page or page fragment. @@ -425,17 +506,37 @@ EXPORT_SYMBOL(napi_build_skb); * may be used. Otherwise, the packet data may be discarded until enough * memory is free */ -static void *kmalloc_reserve(size_t size, gfp_t flags, int node, +static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, bool *pfmemalloc) { - void *obj; bool ret_pfmemalloc = false; + unsigned int obj_size; + void *obj; + + obj_size = SKB_HEAD_ALIGN(*size); +#ifdef HAVE_SKB_SMALL_HEAD_CACHE + if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && + !(flags & KMALLOC_NOT_NORMAL_BITS)) { + /* skb_small_head_cache has non power of two size, + * likely forcing SLUB to use order-3 pages. + * We deliberately attempt a NOMEMALLOC allocation only. + */ + obj = kmem_cache_alloc_node(skb_small_head_cache, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + if (obj) { + *size = SKB_SMALL_HEAD_CACHE_SIZE; + goto out; + } + } +#endif + *size = obj_size = kmalloc_size_roundup(obj_size); /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. 
*/ - obj = kmalloc_node_track_caller(size, + obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); if (obj || !(gfp_pfmemalloc_allowed(flags))) @@ -443,7 +544,7 @@ static void *kmalloc_reserve(size_t size, gfp_t flags, int node, /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(size, flags, node); + obj = kmalloc_node_track_caller(obj_size, flags, node); out: if (pfmemalloc) @@ -480,12 +581,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, { struct kmem_cache *cache; struct sk_buff *skb; - unsigned int osize; bool pfmemalloc; u8 *data; cache = (flags & SKB_ALLOC_FCLONE) - ? skbuff_fclone_cache : skbuff_head_cache; + ? skbuff_fclone_cache : skbuff_cache; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; @@ -505,18 +605,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - osize = kmalloc_size_roundup(size); - data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc); + data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - size = SKB_WITH_OVERHEAD(osize); - prefetchw(data + size); + prefetchw(data + SKB_WITH_OVERHEAD(size)); /* * Only clear those fields we need to clear, not those that we will @@ -524,7 +620,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * the tail pointer in struct sk_buff! 
*/ memset(skb, 0, offsetof(struct sk_buff, tail)); - __build_skb_around(skb, data, osize); + __build_skb_around(skb, data, size); skb->pfmemalloc = pfmemalloc; if (flags & SKB_ALLOC_FCLONE) { @@ -579,8 +675,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, goto skb_success; } - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); + len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; @@ -679,8 +774,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, data = page_frag_alloc_1k(&nc->page_small, gfp_mask); pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); } else { - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); + len = SKB_HEAD_ALIGN(len); data = page_frag_alloc(&nc->page, len, gfp_mask); pfmemalloc = nc->page.pfmemalloc; @@ -756,6 +850,16 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data) return page_pool_return_skb_page(virt_to_page(data)); } +static void skb_kfree_head(void *head, unsigned int end_offset) +{ +#ifdef HAVE_SKB_SMALL_HEAD_CACHE + if (end_offset == SKB_SMALL_HEAD_HEADROOM) + kmem_cache_free(skb_small_head_cache, head); + else +#endif + kfree(head); +} + static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; @@ -765,7 +869,7 @@ static void skb_free_head(struct sk_buff *skb) return; skb_free_frag(head); } else { - kfree(head); + skb_kfree_head(head, skb_end_offset(skb)); } } @@ -817,7 +921,7 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: - kmem_cache_free(skbuff_head_cache, skb); + kmem_cache_free(skbuff_cache, skb); return; case SKB_FCLONE_ORIG: @@ -878,6 +982,21 @@ void __kfree_skb(struct sk_buff *skb) } EXPORT_SYMBOL(__kfree_skb); +static __always_inline +bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + if (unlikely(!skb_unref(skb))) + return false; + + DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + + if (reason == SKB_CONSUMED) + trace_consume_skb(skb, __builtin_return_address(0)); + else + trace_kfree_skb(skb, __builtin_return_address(0), reason); + return true; +} + /** * kfree_skb_reason - free an sk_buff with special reason * @skb: buffer to free @@ -890,28 +1009,58 @@ EXPORT_SYMBOL(__kfree_skb); void __fix_address kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { - if (unlikely(!skb_unref(skb))) + if (__kfree_skb_reason(skb, reason)) + __kfree_skb(skb); +} +EXPORT_SYMBOL(kfree_skb_reason); + +#define KFREE_SKB_BULK_SIZE 16 + +struct skb_free_array { + unsigned int skb_count; + void *skb_array[KFREE_SKB_BULK_SIZE]; +}; + +static void kfree_skb_add_bulk(struct sk_buff *skb, + struct skb_free_array *sa, + enum skb_drop_reason reason) +{ + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); return; + } - DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + skb_release_all(skb, reason); + sa->skb_array[sa->skb_count++] = skb; - if (reason == SKB_CONSUMED) - trace_consume_skb(skb); - else - trace_kfree_skb(skb, __builtin_return_address(0), reason); - __kfree_skb(skb); + if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { + kmem_cache_free_bulk(skbuff_cache, KFREE_SKB_BULK_SIZE, + sa->skb_array); + sa->skb_count = 0; + } } -EXPORT_SYMBOL(kfree_skb_reason); -void kfree_skb_list_reason(struct sk_buff *segs, - enum skb_drop_reason reason) +void __fix_address 
+kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) { + struct skb_free_array sa; + + sa.skb_count = 0; + while (segs) { struct sk_buff *next = segs->next; - kfree_skb_reason(segs, reason); + if (__kfree_skb_reason(segs, reason)) { + skb_poison_list(segs); + kfree_skb_add_bulk(segs, &sa, reason); + } + segs = next; } + + if (sa.skb_count) + kmem_cache_free_bulk(skbuff_cache, sa.skb_count, sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); @@ -1040,7 +1189,7 @@ void consume_skb(struct sk_buff *skb) if (!skb_unref(skb)) return; - trace_consume_skb(skb); + trace_consume_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); @@ -1055,7 +1204,7 @@ EXPORT_SYMBOL(consume_skb); */ void __consume_stateless_skb(struct sk_buff *skb) { - trace_consume_skb(skb); + trace_consume_skb(skb, __builtin_return_address(0)); skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1065,15 +1214,15 @@ static void napi_skb_cache_put(struct sk_buff *skb) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 i; - kasan_poison_object_data(skbuff_head_cache, skb); + kasan_poison_object_data(skbuff_cache, skb); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) - kasan_unpoison_object_data(skbuff_head_cache, + kasan_unpoison_object_data(skbuff_cache, nc->skb_cache[i]); - kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, + kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF, nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } @@ -1111,7 +1260,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; /* if reaching here SKB is ready to free */ - trace_consume_skb(skb); + trace_consume_skb(skb, __builtin_return_address(0)); /* if SKB is a clone, don't handle this case */ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { @@ -1257,14 +1406,18 @@ EXPORT_SYMBOL_GPL(skb_morph); int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { - unsigned long max_pg, num_pg, new_pg, old_pg; + unsigned long max_pg, num_pg, new_pg, old_pg, rlim; struct user_struct *user; if (capable(CAP_IPC_LOCK) || !size) return 0; + rlim = rlimit(RLIMIT_MEMLOCK); + if (rlim == RLIM_INFINITY) + return 0; + num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ - max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + max_pg = rlim >> PAGE_SHIFT; user = mmp->user ? : current_user(); old_pg = atomic_long_read(&user->locked_vm); @@ -1657,7 +1810,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + n = kmem_cache_alloc(skbuff_cache, gfp_mask); if (!n) return NULL; @@ -1839,10 +1992,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; size = SKB_WITH_OVERHEAD(size); @@ -1905,7 +2055,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, return 0; nofrags: - kfree(data); + skb_kfree_head(data, size); nodata: return -ENOMEM; } @@ -2428,6 +2578,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) insp = list; } else { /* Eaten partially. 
*/ + if (skb_is_gso(skb) && !list->head_frag && + skb_headlen(list)) + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; if (skb_shared(list)) { /* Sucks! We need to fork list. :-( */ @@ -4043,7 +4196,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_shinfo(skb)->frag_list = NULL; - do { + while (list_skb) { nskb = list_skb; list_skb = list_skb->next; @@ -4089,8 +4242,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, if (skb_needs_linearize(nskb, features) && __skb_linearize(nskb)) goto err_linearize; - - } while (list_skb); + } skb->truesize = skb->truesize - delta_truesize; skb->data_len = skb->data_len - delta_len; @@ -4528,7 +4680,7 @@ static void skb_extensions_init(void) {} void __init skb_init(void) { - skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", + skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, @@ -4540,6 +4692,19 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +#ifdef HAVE_SKB_SMALL_HEAD_CACHE + /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. + * struct skb_shared_info is located at the end of skb->head, + * and should not be copied to/from user. + */ + skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", + SKB_SMALL_HEAD_CACHE_SIZE, + 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, + 0, + SKB_SMALL_HEAD_HEADROOM, + NULL); +#endif skb_extensions_init(); } @@ -5394,7 +5559,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); - kmem_cache_free(skbuff_head_cache, skb); + kmem_cache_free(skbuff_cache, skb); } else { __kfree_skb(skb); } @@ -6188,10 +6353,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); @@ -6207,7 +6369,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, if (skb_cloned(skb)) { /* drop the old head gracefully */ if (skb_orphan_frags(skb, gfp_mask)) { - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) @@ -6307,10 +6469,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); @@ -6318,7 +6477,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } shinfo = (struct skb_shared_info *)(data + size); @@ -6354,7 +6513,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. 
*/ if (skb_has_frag_list(skb)) kfree_skb_list(skb_shinfo(skb)->frag_list); - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } skb_release_data(skb, SKB_CONSUMED); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e6b9ced3eda8..f81883759d38 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -8,6 +8,7 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/tls.h> +#include <trace/events/sock.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -886,13 +887,16 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { - if (psock->sk_redir) + if (psock->sk_redir) { sock_put(psock->sk_redir); - psock->sk_redir = msg->sk_redir; - if (!psock->sk_redir) { + psock->sk_redir = NULL; + } + if (!msg->sk_redir) { ret = __SK_DROP; goto out; } + psock->redir_ingress = sk_msg_to_ingress(msg); + psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: @@ -1111,6 +1115,8 @@ static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; + trace_sk_data_ready(sk); + rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { @@ -1207,6 +1213,8 @@ static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; + trace_sk_data_ready(sk); + if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) return; sock->ops->read_skb(sk, sk_psock_verdict_recv); diff --git a/net/core/sock.c b/net/core/sock.c index 4571914a4aa8..341c565dbc26 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -901,13 +901,20 @@ int sock_set_timestamping(struct sock *sk, int optname, if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID_TCP && + !(val & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) + atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); + else + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } @@ -1524,6 +1531,8 @@ set_sndbuf: ret = -EINVAL; break; } + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() */ WRITE_ONCE(sk->sk_txrehash, (u8)val); break; @@ -1793,7 +1802,8 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len); + return security_socket_getpeersec_stream(sock, + optval, optlen, len); case SO_MARK: v.val = sk->sk_mark; @@ -2330,17 +2340,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); - /* Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy). - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. 
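For the SOF_TIMESTAMPING_OPT_ID_TCP handling added to sock_set_timestamping() above, a rough userspace sketch of enabling it on a TCP socket (assumed usage, not taken from this patch): the new bit is only accepted together with SOF_TIMESTAMPING_OPT_ID, and it makes the generated tskey count from write_seq rather than snd_una.

#include <sys/socket.h>
#include <linux/net_tstamp.h>

static int enable_tcp_tx_tskey(int fd)
{
	unsigned int val = SOF_TIMESTAMPING_TX_ACK |
			   SOF_TIMESTAMPING_SOFTWARE |
			   SOF_TIMESTAMPING_OPT_ID |
			   SOF_TIMESTAMPING_OPT_ID_TCP |
			   SOF_TIMESTAMPING_OPT_TSONLY;

	/* The kernel rejects OPT_ID_TCP without OPT_ID with -EINVAL. */
	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}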
-acme - */ - sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); @@ -2365,17 +2364,22 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); -static void sk_trim_gso_size(struct sock *sk) +static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { - if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) - return; + bool is_ipv6 = false; + u32 max_size; + #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6 && - sk_is_tcp(sk) && - !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) - return; + is_ipv6 = (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif - sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ + max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : + READ_ONCE(dst->dev->gso_ipv4_max_size); + if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) + max_size = GSO_LEGACY_MAX_SIZE; + + return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) @@ -2395,10 +2399,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); - sk_trim_gso_size(sk); - sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } @@ -3283,6 +3284,8 @@ void sock_def_readable(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) @@ -3371,7 +3374,7 @@ void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) } EXPORT_SYMBOL(sk_stop_timer_sync); -void sock_init_data(struct socket *sock, struct sock *sk) +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; @@ -3382,6 +3385,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; + sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); @@ -3390,11 +3394,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; - sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); - sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } + sk->sk_uid = uid; rwlock_init(&sk->sk_callback_lock); if (sk->sk_kern_sock) @@ -3442,7 +3445,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; - sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* @@ -3453,6 +3455,16 @@ void sock_init_data(struct socket *sock, struct sock *sk) refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } +EXPORT_SYMBOL(sock_init_data_uid); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + kuid_t uid = sock ? 
+ SOCK_INODE(sock)->i_uid : + make_kuid(sock_net(sk)->user_ns, 0); + + sock_init_data_uid(sock, sk, uid); +} EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) @@ -3687,8 +3699,6 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); - sk_refcnt_debug_release(sk); - sock_put(sk); } EXPORT_SYMBOL(sk_common_release); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 81beb16ab1eb..a68a7290a3b2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -349,11 +349,13 @@ static void sock_map_free(struct bpf_map *map) sk = xchg(psk, NULL); if (sk) { + sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); + sock_put(sk); } } @@ -1567,15 +1569,16 @@ void sock_map_unhash(struct sock *sk) psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; + saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + } else { + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); } - - saved_unhash = psock->saved_unhash; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; + if (saved_unhash) + saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); @@ -1588,17 +1591,18 @@ void sock_map_destroy(struct sock *sk) psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->destroy) - sk->sk_prot->destroy(sk); - return; + saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + } else { + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + sk_psock_put(sk, psock); } - - saved_destroy = psock->saved_destroy; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - sk_psock_stop(psock); - sk_psock_put(sk, psock); - saved_destroy(sk); + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; + if (saved_destroy) + saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); @@ -1613,16 +1617,21 @@ void sock_map_close(struct sock *sk, long timeout) if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); - return sk->sk_prot->close(sk, timeout); + saved_close = READ_ONCE(sk->sk_prot)->close; + } else { + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + release_sock(sk); + cancel_work_sync(&psock->work); + sk_psock_put(sk, psock); } - - saved_close = psock->saved_close; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - sk_psock_stop(psock); - release_sock(sk); - cancel_work_sync(&psock->work); - sk_psock_put(sk, psock); + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); diff --git a/net/core/stream.c b/net/core/stream.c index 75fded8495f5..434446ab14c5 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -123,7 +123,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) - current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2; + current_timeo = vm_wait = get_random_u32_below(HZ / 5) + 2; add_wait_queue(sk_sleep(sk), &wait); @@ -196,6 +196,12 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. 
*/ __skb_queue_purge(&sk->sk_receive_queue); + /* Next, the error queue. + * We need to use queue lock, because other threads might + * add packets to the queue without socket lock being held. + */ + skb_queue_purge(&sk->sk_error_queue); + /* Next, the write queue. */ WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue)); @@ -203,7 +209,6 @@ void sk_stream_kill_queues(struct sock *sk) sk_mem_reclaim_final(sk); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk->sk_forward_alloc); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5b1ce656baa1..74842b453407 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -16,6 +16,7 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/sched/isolation.h> #include <net/ip.h> #include <net/sock.h> @@ -45,7 +46,82 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); +#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) +static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) +{ + char kbuf[128]; + int len; + + if (*ppos || !*lenp) { + *lenp = 0; + return; + } + + len = min(sizeof(kbuf) - 1, *lenp); + len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); + if (!len) { + *lenp = 0; + return; + } + + if (len < *lenp) + kbuf[len++] = '\n'; + memcpy(buffer, kbuf, len); + *lenp = len; + *ppos += len; +} +#endif + #ifdef CONFIG_RPS + +static struct cpumask *rps_default_mask_cow_alloc(struct net *net) +{ + struct cpumask *rps_default_mask; + + if (net->core.rps_default_mask) + return net->core.rps_default_mask; + + rps_default_mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!rps_default_mask) + return NULL; + + /* pairs with READ_ONCE in rx_queue_default_mask() */ + WRITE_ONCE(net->core.rps_default_mask, rps_default_mask); + return rps_default_mask; +} + +static int rps_default_mask_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = (struct net *)table->data; + int err = 0; + + rtnl_lock(); + if (write) { + struct cpumask *rps_default_mask = rps_default_mask_cow_alloc(net); + + err = -ENOMEM; + if (!rps_default_mask) + goto done; + + err = cpumask_parse(buffer, rps_default_mask); + if (err) + goto done; + + err = rps_cpumask_housekeeping(rps_default_mask); + if (err) + goto done; + } else { + dump_cpumask(buffer, lenp, ppos, + net->core.rps_default_mask ? 
: cpu_none_mask); + } + +done: + rtnl_unlock(); + return err; +} + static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -155,13 +231,6 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { - char kbuf[128]; - - if (*ppos || !*lenp) { - *lenp = 0; - goto done; - } - cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { @@ -171,17 +240,7 @@ write_unlock: } rcu_read_unlock(); - len = min(sizeof(kbuf) - 1, *lenp); - len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); - if (!len) { - *lenp = 0; - goto done; - } - if (len < *lenp) - kbuf[len++] = '\n'; - memcpy(buffer, kbuf, len); - *lenp = len; - *ppos += len; + dump_cpumask(buffer, lenp, ppos, mask); } done: @@ -598,6 +657,14 @@ static struct ctl_table net_core_table[] = { }; static struct ctl_table netns_core_table[] = { +#if IS_ENABLED(CONFIG_RPS) + { + .procname = "rps_default_mask", + .data = &init_net, + .mode = 0644, + .proc_handler = rps_default_mask_sysctl + }, +#endif { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, @@ -643,11 +710,6 @@ static __net_init int sysctl_core_net_init(struct net *net) for (tmp = tbl; tmp->procname; tmp++) tmp->data += (char *)net - (char *)&init_net; - - /* Don't export any sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; - } } net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); @@ -670,6 +732,9 @@ static __net_exit void sysctl_core_net_exit(struct net *net) tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); +#if IS_ENABLED(CONFIG_RPS) + kfree(net->core.rps_default_mask); +#endif kfree(tbl); } diff --git a/net/core/tso.c b/net/core/tso.c index 4148f6d48953..e00796e3b146 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -5,14 +5,6 @@ #include <net/tso.h> #include <asm/unaligned.h> -/* Calculate expected number of TX descriptors */ -int tso_count_descs(const struct sk_buff *skb) -{ - /* The Marvell Way */ - return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; -} -EXPORT_SYMBOL(tso_count_descs); - void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 844c9d99dc0e..8c92fc553317 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -4,6 +4,8 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> @@ -602,8 +604,7 @@ EXPORT_SYMBOL_GPL(xdp_warn); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) { - n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, - n_skb, skbs); + n_skb = kmem_cache_alloc_bulk(skbuff_cache, gfp, n_skb, skbs); if (unlikely(!n_skb)) return -ENOMEM; @@ -672,7 +673,7 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, { struct sk_buff *skb; - skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -709,3 +710,84 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) return nxdpf; } + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +/** + * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. + * @ctx: XDP context pointer. 
+ * @timestamp: Return value pointer. + * + * Returns 0 on success or ``-errno`` on error. + */ +__bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + return -EOPNOTSUPP; +} + +/** + * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. + * @ctx: XDP context pointer. + * @hash: Return value pointer. + * + * Returns 0 on success or ``-errno`` on error. + */ +__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash) +{ + return -EOPNOTSUPP; +} + +__diag_pop(); + +BTF_SET8_START(xdp_metadata_kfunc_ids) +#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, 0) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC +BTF_SET8_END(xdp_metadata_kfunc_ids) + +static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { + .owner = THIS_MODULE, + .set = &xdp_metadata_kfunc_ids, +}; + +BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) +#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + +u32 bpf_xdp_metadata_kfunc_id(int id) +{ + /* xdp_metadata_kfunc_ids is sorted and can't be used */ + return xdp_metadata_kfunc_ids_unsorted[id]; +} + +bool bpf_dev_bound_kfunc_id(u32 btf_id) +{ + return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id); +} + +static int __init xdp_metadata_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); +} +late_initcall(xdp_metadata_init); + +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ + dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; + if (support_sg) + dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT_SG; + + call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); + +void xdp_features_clear_redirect_target(struct net_device *dev) +{ + dev->xdp_features &= ~(NETDEV_XDP_ACT_NDO_XMIT | + NETDEV_XDP_ACT_NDO_XMIT_SG); + call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index f9949e051f49..c0c438128575 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -178,6 +178,7 @@ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { }; static LIST_HEAD(dcb_app_list); +static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) @@ -1099,11 +1100,46 @@ out: return err; } +/* Set or delete APP table or rewrite table entries. The APP struct is validated + * and the appropriate callback function is called. + */ +static int dcbnl_app_table_setdel(struct nlattr *attr, + struct net_device *netdev, + int (*setdel)(struct net_device *dev, + struct dcb_app *app)) +{ + struct dcb_app *app_data; + enum ieee_attrs_app type; + struct nlattr *attr_itr; + int rem, err; + + nla_for_each_nested(attr_itr, attr, rem) { + type = nla_type(attr_itr); + + if (!dcbnl_app_attr_type_validate(type)) + continue; + + if (nla_len(attr_itr) < sizeof(struct dcb_app)) + return -ERANGE; + + app_data = nla_data(attr_itr); + + if (!dcbnl_app_selector_validate(type, app_data->selector)) + return -EINVAL; + + err = setdel(netdev, app_data); + if (err) + return err; + } + + return 0; +} + /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. 
*/ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; - struct nlattr *ieee, *app; + struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; @@ -1206,6 +1242,27 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); + rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); + if (!rewr) + return -EMSGSIZE; + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == netdev->ifindex) { + enum ieee_attrs_app type = + dcbnl_app_attr_type_get(itr->app.selector); + err = nla_put(skb, type, sizeof(itr->app), &itr->app); + if (err) { + spin_unlock_bh(&dcb_lock); + nla_nest_cancel(skb, rewr); + return -EMSGSIZE; + } + } + } + + spin_unlock_bh(&dcb_lock); + nla_nest_end(skb, rewr); + if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) @@ -1567,37 +1624,20 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, goto err; } - if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { - struct nlattr *attr; - int rem; - - nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { - enum ieee_attrs_app type = nla_type(attr); - struct dcb_app *app_data; - - if (!dcbnl_app_attr_type_validate(type)) - continue; - - if (nla_len(attr) < sizeof(struct dcb_app)) { - err = -ERANGE; - goto err; - } - - app_data = nla_data(attr); - - if (!dcbnl_app_selector_validate(type, - app_data->selector)) { - err = -EINVAL; - goto err; - } + if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], + netdev, + ops->dcbnl_setrewr ?: dcb_setrewr); + if (err) + goto err; + } - if (ops->ieee_setapp) - err = ops->ieee_setapp(netdev, app_data); - else - err = dcb_ieee_setapp(netdev, app_data); - if (err) - goto err; - } + if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], + netdev, ops->ieee_setapp ?: + dcb_ieee_setapp); + if (err) + goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { @@ -1684,31 +1724,19 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { - struct nlattr *attr; - int rem; - - nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { - enum ieee_attrs_app type = nla_type(attr); - struct dcb_app *app_data; - - if (!dcbnl_app_attr_type_validate(type)) - continue; - - app_data = nla_data(attr); - - if (!dcbnl_app_selector_validate(type, - app_data->selector)) { - err = -EINVAL; - goto err; - } + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], + netdev, ops->ieee_delapp ?: + dcb_ieee_delapp); + if (err) + goto err; + } - if (ops->ieee_delapp) - err = ops->ieee_delapp(netdev, app_data); - else - err = dcb_ieee_delapp(netdev, app_data); - if (err) - goto err; - } + if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], + netdev, + ops->dcbnl_delrewr ?: dcb_delrewr); + if (err) + goto err; } err: @@ -1939,6 +1967,22 @@ out: return ret; } +static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app, + int ifindex, int proto) +{ + struct dcb_app_type *itr; + + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->app.selector == app->selector && + itr->app.priority == app->priority && + itr->ifindex == ifindex && + ((proto == -1) || itr->app.protocol == proto)) + return itr; + } + + return NULL; +} + static struct dcb_app_type 
*dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { @@ -1955,7 +1999,8 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, return NULL; } -static int dcb_app_add(const struct dcb_app *app, int ifindex) +static int dcb_app_add(struct list_head *list, const struct dcb_app *app, + int ifindex) { struct dcb_app_type *entry; @@ -1965,7 +2010,7 @@ static int dcb_app_add(const struct dcb_app *app, int ifindex) memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; - list_add(&entry->list, &dcb_app_list); + list_add(&entry->list, list); return 0; } @@ -2028,7 +2073,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) } /* App type does not exist add new application type */ if (new->priority) - err = dcb_app_add(new, dev->ifindex); + err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) @@ -2061,6 +2106,63 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) } EXPORT_SYMBOL(dcb_ieee_getapp_mask); +/* Get protocol value from rewrite entry. */ +u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) +{ + struct dcb_app_type *itr; + u16 proto = 0; + + spin_lock_bh(&dcb_lock); + itr = dcb_rewr_lookup(app, dev->ifindex, -1); + if (itr) + proto = itr->app.protocol; + spin_unlock_bh(&dcb_lock); + + return proto; +} +EXPORT_SYMBOL(dcb_getrewr); + + /* Add rewrite entry to the rewrite list. */ +int dcb_setrewr(struct net_device *dev, struct dcb_app *new) +{ + int err; + + spin_lock_bh(&dcb_lock); + /* Search for existing match and abort if found. */ + if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { + err = -EEXIST; + goto out; + } + + err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); +out: + spin_unlock_bh(&dcb_lock); + + return err; +} +EXPORT_SYMBOL(dcb_setrewr); + +/* Delete rewrite entry from the rewrite list. */ +int dcb_delrewr(struct net_device *dev, struct dcb_app *del) +{ + struct dcb_app_type *itr; + int err = -ENOENT; + + spin_lock_bh(&dcb_lock); + /* Search for existing match and remove it. */ + itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); + if (itr) { + list_del(&itr->list); + kfree(itr); + err = 0; + } + + spin_unlock_bh(&dcb_lock); + + return err; +} +EXPORT_SYMBOL(dcb_delrewr); + /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface @@ -2088,7 +2190,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) goto out; } - err = dcb_app_add(new, dev->ifindex); + err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) @@ -2130,6 +2232,58 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); +/* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from + * priorities to the PCP and DEI values assigned to that priority. 
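The new rewrite-table helpers above mirror the existing APP-table API: dcb_setrewr() stores a priority-to-protocol rewrite entry, dcb_getrewr() reads the stored protocol back, and dcb_delrewr() removes the entry. The sketch below is illustrative only and is not part of the patch; the netdev and the chosen priority/PCP values are hypothetical and merely show the calling convention.

/* Hedged sketch: map skb priority 3 to PCP value 5 on a hypothetical
 * device. dcb_setrewr() returns -EEXIST for a duplicate entry and
 * dcb_delrewr() returns -ENOENT when nothing matches.
 */
static int example_pcp_rewrite(struct net_device *dev)
{
	struct dcb_app app = {
		.selector = DCB_APP_SEL_PCP,	/* PCP/DEI rewrite entry */
		.priority = 3,			/* traffic priority */
		.protocol = 5,			/* PCP value to rewrite to */
	};
	int err;

	err = dcb_setrewr(dev, &app);
	if (err)
		return err;

	/* Lookup is by selector/priority; returns the stored protocol (PCP). */
	if (dcb_getrewr(dev, &app) != 5)
		return -EINVAL;

	return dcb_delrewr(dev, &app);
}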
+ */ +void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, + struct dcb_rewr_prio_pcp_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 prio; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == DCB_APP_SEL_PCP && + itr->app.protocol < 16 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) { + prio = itr->app.priority; + p_map->map[prio] |= 1 << itr->app.protocol; + } + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); + +/* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from + * priorities to the DSCP values assigned to that priority. + */ +void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 prio; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && + itr->app.protocol < 64 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) { + prio = itr->app.priority; + p_map->map[prio] |= 1ULL << itr->app.protocol; + } + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); + /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4260fe466993..b9d7c3dd1cb3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -551,11 +551,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); } return newsk; @@ -615,7 +613,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) @@ -679,7 +677,6 @@ ipv6_pktoptions: np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &DCCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); memmove(IP6CB(opt_skb), &DCCP_SKB_CB(opt_skb)->header.h6, sizeof(struct inet6_skb_parm)); diff --git a/net/devlink/Makefile b/net/devlink/Makefile new file mode 100644 index 000000000000..ef91a76646a3 --- /dev/null +++ b/net/devlink/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := leftover.o core.o netlink.o dev.o health.o diff --git a/net/devlink/core.c b/net/devlink/core.c new file mode 100644 index 000000000000..777b091ef74d --- /dev/null +++ b/net/devlink/core.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> + +#include "devl_internal.h" + +DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); + +void *devlink_priv(struct devlink *devlink) +{ + return &devlink->priv; +} +EXPORT_SYMBOL_GPL(devlink_priv); + +struct devlink *priv_to_devlink(void *priv) +{ + return container_of(priv, struct devlink, priv); +} +EXPORT_SYMBOL_GPL(priv_to_devlink); + +struct device *devlink_to_dev(const struct devlink *devlink) +{ + return devlink->dev; +} +EXPORT_SYMBOL_GPL(devlink_to_dev); + +struct net *devlink_net(const struct devlink *devlink) +{ + return read_pnet(&devlink->_net); +} +EXPORT_SYMBOL_GPL(devlink_net); + +void devl_assert_locked(struct devlink *devlink) +{ + lockdep_assert_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_assert_locked); + +#ifdef CONFIG_LOCKDEP +/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ +bool devl_lock_is_held(struct devlink *devlink) +{ + return lockdep_is_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock_is_held); +#endif + +void devl_lock(struct devlink *devlink) +{ + mutex_lock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock); + +int devl_trylock(struct devlink *devlink) +{ + return mutex_trylock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_trylock); + +void devl_unlock(struct devlink *devlink) +{ + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_unlock); + +/** + * devlink_try_get() - try to obtain a reference on a devlink instance + * @devlink: instance to reference + * + * Obtain a reference on a devlink instance. A reference on a devlink instance + * only implies that it's safe to take the instance lock. It does not imply + * that the instance is registered, use devl_is_registered() after taking + * the instance lock to check registration status. 
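The reference/lock ordering described above can be illustrated with a short, hypothetical walker built on the devlinks_xa_for_each_registered_get() iterator from devl_internal.h; it is the same pattern the pernet pre_exit handler later in this file follows, and the body under the lock is only a placeholder.

/* Illustrative only: reference -> lock -> devl_is_registered() check,
 * with devlink_put() releasing the reference taken by the iterator.
 */
static void example_visit_instances(struct net *net)
{
	struct devlink *devlink;
	unsigned long index;

	devlinks_xa_for_each_registered_get(net, index, devlink) {
		devl_lock(devlink);
		if (devl_is_registered(devlink)) {
			/* instance objects may be touched here */
		}
		devl_unlock(devlink);
		devlink_put(devlink);
	}
}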
+ */ +struct devlink *__must_check devlink_try_get(struct devlink *devlink) +{ + if (refcount_inc_not_zero(&devlink->refcount)) + return devlink; + return NULL; +} + +static void devlink_release(struct work_struct *work) +{ + struct devlink *devlink; + + devlink = container_of(to_rcu_work(work), struct devlink, rwork); + + mutex_destroy(&devlink->lock); + lockdep_unregister_key(&devlink->lock_key); + kfree(devlink); +} + +void devlink_put(struct devlink *devlink) +{ + if (refcount_dec_and_test(&devlink->refcount)) + queue_rcu_work(system_wq, &devlink->rwork); +} + +struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) +{ + struct devlink *devlink = NULL; + + rcu_read_lock(); +retry: + devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); + if (!devlink) + goto unlock; + + if (!devlink_try_get(devlink)) + goto next; + if (!net_eq(devlink_net(devlink), net)) { + devlink_put(devlink); + goto next; + } +unlock: + rcu_read_unlock(); + return devlink; + +next: + (*indexp)++; + goto retry; +} + +/** + * devl_register - Register devlink instance + * @devlink: devlink + */ +int devl_register(struct devlink *devlink) +{ + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + devl_assert_locked(devlink); + + xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); + devlink_notify_register(devlink); + + return 0; +} +EXPORT_SYMBOL_GPL(devl_register); + +void devlink_register(struct devlink *devlink) +{ + devl_lock(devlink); + devl_register(devlink); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_register); + +/** + * devl_unregister - Unregister devlink instance + * @devlink: devlink + */ +void devl_unregister(struct devlink *devlink) +{ + ASSERT_DEVLINK_REGISTERED(devlink); + devl_assert_locked(devlink); + + devlink_notify_unregister(devlink); + xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); +} +EXPORT_SYMBOL_GPL(devl_unregister); + +void devlink_unregister(struct devlink *devlink) +{ + devl_lock(devlink); + devl_unregister(devlink); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_unregister); + +/** + * devlink_alloc_ns - Allocate new devlink instance resources + * in specific namespace + * + * @ops: ops + * @priv_size: size of user private data + * @net: net namespace + * @dev: parent device + * + * Allocate new devlink instance resources, including devlink index + * and name. 
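Together with devlink_priv(), devlink_register(), devlink_unregister() and devlink_free() shown above, devlink_alloc_ns() is normally driven from a driver's probe/remove path. A minimal, hypothetical sketch follows; the empty ops table, the use of init_net and the generic parent device are assumptions made purely for illustration.

/* Hypothetical driver lifecycle, for illustration only. */
struct example_priv {
	struct devlink *devlink;
};

static const struct devlink_ops example_devlink_ops = {
	/* no reload/selftest callbacks in this sketch */
};

static int example_probe(struct device *parent)
{
	struct example_priv *priv;
	struct devlink *devlink;

	devlink = devlink_alloc_ns(&example_devlink_ops, sizeof(*priv),
				   &init_net, parent);
	if (!devlink)
		return -ENOMEM;

	priv = devlink_priv(devlink);
	priv->devlink = devlink;

	/* set up ports, params, regions, ... before going visible */
	devlink_register(devlink);
	return 0;
}

static void example_remove(struct example_priv *priv)
{
	devlink_unregister(priv->devlink);
	devlink_free(priv->devlink);
}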
+ */ +struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, + size_t priv_size, struct net *net, + struct device *dev) +{ + struct devlink *devlink; + static u32 last_id; + int ret; + + WARN_ON(!ops || !dev); + if (!devlink_reload_actions_valid(ops)) + return NULL; + + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + if (!devlink) + return NULL; + + ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret < 0) + goto err_xa_alloc; + + devlink->netdevice_nb.notifier_call = devlink_port_netdevice_event; + ret = register_netdevice_notifier(&devlink->netdevice_nb); + if (ret) + goto err_register_netdevice_notifier; + + devlink->dev = dev; + devlink->ops = ops; + xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); + xa_init_flags(&devlink->params, XA_FLAGS_ALLOC); + xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); + write_pnet(&devlink->_net, net); + INIT_LIST_HEAD(&devlink->rate_list); + INIT_LIST_HEAD(&devlink->linecard_list); + INIT_LIST_HEAD(&devlink->sb_list); + INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + INIT_LIST_HEAD(&devlink->resource_list); + INIT_LIST_HEAD(&devlink->region_list); + INIT_LIST_HEAD(&devlink->reporter_list); + INIT_LIST_HEAD(&devlink->trap_list); + INIT_LIST_HEAD(&devlink->trap_group_list); + INIT_LIST_HEAD(&devlink->trap_policer_list); + INIT_RCU_WORK(&devlink->rwork, devlink_release); + lockdep_register_key(&devlink->lock_key); + mutex_init(&devlink->lock); + lockdep_set_class(&devlink->lock, &devlink->lock_key); + refcount_set(&devlink->refcount, 1); + + return devlink; + +err_register_netdevice_notifier: + xa_erase(&devlinks, devlink->index); +err_xa_alloc: + kfree(devlink); + return NULL; +} +EXPORT_SYMBOL_GPL(devlink_alloc_ns); + +/** + * devlink_free - Free devlink instance resources + * + * @devlink: devlink + */ +void devlink_free(struct devlink *devlink) +{ + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + + WARN_ON(!list_empty(&devlink->trap_policer_list)); + WARN_ON(!list_empty(&devlink->trap_group_list)); + WARN_ON(!list_empty(&devlink->trap_list)); + WARN_ON(!list_empty(&devlink->reporter_list)); + WARN_ON(!list_empty(&devlink->region_list)); + WARN_ON(!list_empty(&devlink->resource_list)); + WARN_ON(!list_empty(&devlink->dpipe_table_list)); + WARN_ON(!list_empty(&devlink->sb_list)); + WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(!list_empty(&devlink->linecard_list)); + WARN_ON(!xa_empty(&devlink->ports)); + + xa_destroy(&devlink->snapshot_ids); + xa_destroy(&devlink->params); + xa_destroy(&devlink->ports); + + WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb)); + + xa_erase(&devlinks, devlink->index); + + devlink_put(devlink); +} +EXPORT_SYMBOL_GPL(devlink_free); + +static void __net_exit devlink_pernet_pre_exit(struct net *net) +{ + struct devlink *devlink; + u32 actions_performed; + unsigned long index; + int err; + + /* In case network namespace is getting destroyed, reload + * all devlink instances from this namespace into init_net. 
+ */ + devlinks_xa_for_each_registered_get(net, index, devlink) { + devl_lock(devlink); + err = 0; + if (devl_is_registered(devlink)) + err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, + &actions_performed, NULL); + devl_unlock(devlink); + devlink_put(devlink); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); + } +} + +static struct pernet_operations devlink_pernet_ops __net_initdata = { + .pre_exit = devlink_pernet_pre_exit, +}; + +static int __init devlink_init(void) +{ + int err; + + err = genl_register_family(&devlink_nl_family); + if (err) + goto out; + err = register_pernet_subsys(&devlink_pernet_ops); + +out: + WARN_ON(err); + return err; +} + +subsys_initcall(devlink_init); diff --git a/net/devlink/dev.c b/net/devlink/dev.c new file mode 100644 index 000000000000..bf1d6f1bcfc7 --- /dev/null +++ b/net/devlink/dev.c @@ -0,0 +1,1346 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> +#include <net/sock.h> +#include "devl_internal.h" + +struct devlink_info_req { + struct sk_buff *msg; + void (*version_cb)(const char *version_name, + enum devlink_info_version_type version_type, + void *version_cb_priv); + void *version_cb_priv; +}; + +struct devlink_reload_combination { + enum devlink_reload_action action; + enum devlink_reload_limit limit; +}; + +static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { + { + /* can't reinitialize driver with no down time */ + .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, + }, +}; + +static bool +devlink_reload_combination_is_invalid(enum devlink_reload_action action, + enum devlink_reload_limit limit) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) + if (devlink_reload_invalid_combinations[i].action == action && + devlink_reload_invalid_combinations[i].limit == limit) + return true; + return false; +} + +static bool +devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) +{ + return test_bit(action, &devlink->ops->reload_actions); +} + +static bool +devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) +{ + return test_bit(limit, &devlink->ops->reload_limits); +} + +static int devlink_reload_stat_put(struct sk_buff *msg, + enum devlink_reload_limit limit, u32 value) +{ + struct nlattr *reload_stats_entry; + + reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); + if (!reload_stats_entry) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || + nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) + goto nla_put_failure; + nla_nest_end(msg, reload_stats_entry); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, reload_stats_entry); + return -EMSGSIZE; +} + +static int +devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) +{ + struct nlattr *reload_stats_attr, *act_info, *act_stats; + int i, j, stat_idx; + u32 value; + + if (!is_remote) + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + else + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); + + if (!reload_stats_attr) + return -EMSGSIZE; + + for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { + if ((!is_remote && 
+ !devlink_reload_action_is_supported(devlink, i)) || + i == DEVLINK_RELOAD_ACTION_UNSPEC) + continue; + act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); + if (!act_info) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) + goto action_info_nest_cancel; + act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); + if (!act_stats) + goto action_info_nest_cancel; + + for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { + /* Remote stats are shown even if not locally supported. + * Stats of actions with unspecified limit are shown + * though drivers don't need to register unspecified + * limit. + */ + if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && + !devlink_reload_limit_is_supported(devlink, j)) || + devlink_reload_combination_is_invalid(i, j)) + continue; + + stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; + if (!is_remote) + value = devlink->stats.reload_stats[stat_idx]; + else + value = devlink->stats.remote_reload_stats[stat_idx]; + if (devlink_reload_stat_put(msg, j, value)) + goto action_stats_nest_cancel; + } + nla_nest_end(msg, act_stats); + nla_nest_end(msg, act_info); + } + nla_nest_end(msg, reload_stats_attr); + return 0; + +action_stats_nest_cancel: + nla_nest_cancel(msg, act_stats); +action_info_nest_cancel: + nla_nest_cancel(msg, act_info); +nla_put_failure: + nla_nest_cancel(msg, reload_stats_attr); + return -EMSGSIZE; +} + +static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + struct nlattr *dev_stats; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) + goto nla_put_failure; + + dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); + if (!dev_stats) + goto nla_put_failure; + + if (devlink_reload_stats_put(msg, devlink, false)) + goto dev_stats_nest_cancel; + if (devlink_reload_stats_put(msg, devlink, true)) + goto dev_stats_nest_cancel; + + nla_nest_end(msg, dev_stats); + genlmsg_end(msg, hdr); + return 0; + +dev_stats_nest_cancel: + nla_nest_cancel(msg, dev_stats); +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +void devlink_notify(struct devlink *devlink, enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) +{ + return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 
NLM_F_MULTI); +} + +const struct devlink_cmd devl_cmd_get = { + .dump_one = devlink_nl_cmd_get_dump_one, +}; + +static void devlink_reload_failed_set(struct devlink *devlink, + bool reload_failed) +{ + if (devlink->reload_failed == reload_failed) + return; + devlink->reload_failed = reload_failed; + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +bool devlink_is_reload_failed(const struct devlink *devlink) +{ + return devlink->reload_failed; +} +EXPORT_SYMBOL_GPL(devlink_is_reload_failed); + +static void +__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, + enum devlink_reload_limit limit, u32 actions_performed) +{ + unsigned long actions = actions_performed; + int stat_idx; + int action; + + for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { + stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; + reload_stats[stat_idx]++; + } + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +static void +devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, + u32 actions_performed) +{ + __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, + actions_performed); +} + +/** + * devlink_remote_reload_actions_performed - Update devlink on reload actions + * performed which are not a direct result of devlink reload call. + * + * This should be called by a driver after performing reload actions in case it was not + * a result of devlink reload call. For example fw_activate was performed as a result + * of devlink reload triggered fw_activate on another host. + * The motivation for this function is to keep data on reload actions performed on this + * function whether it was done due to direct devlink reload call or not. + * + * @devlink: devlink + * @limit: reload limit + * @actions_performed: bitmask of actions performed + */ +void devlink_remote_reload_actions_performed(struct devlink *devlink, + enum devlink_reload_limit limit, + u32 actions_performed) +{ + if (WARN_ON(!actions_performed || + actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || + limit > DEVLINK_RELOAD_LIMIT_MAX)) + return; + + __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, + actions_performed); +} +EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); + +static struct net *devlink_netns_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; + struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; + struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; + struct net *net; + + if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { + NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); + return ERR_PTR(-EINVAL); + } + + if (netns_pid_attr) { + net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); + } else if (netns_fd_attr) { + net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); + } else if (netns_id_attr) { + net = get_net_ns_by_id(sock_net(skb->sk), + nla_get_u32(netns_id_attr)); + if (!net) + net = ERR_PTR(-EINVAL); + } else { + WARN_ON(1); + net = ERR_PTR(-EINVAL); + } + if (IS_ERR(net)) { + NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); + return ERR_PTR(-EINVAL); + } + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EPERM); + } + return net; +} + +static void devlink_reload_netns_change(struct devlink *devlink, + struct net *curr_net, + struct net *dest_net) 
+{ + /* Userspace needs to be notified about devlink objects + * removed from original and entering new network namespace. + * The rest of the devlink objects are re-created during + * reload process so the notifications are generated separatelly. + */ + devlink_notify_unregister(devlink); + write_pnet(&devlink->_net, dest_net); + devlink_notify_register(devlink); +} + +int devlink_reload(struct devlink *devlink, struct net *dest_net, + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack) +{ + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + struct net *curr_net; + int err; + + memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats)); + + err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); + if (err) + return err; + + curr_net = devlink_net(devlink); + if (dest_net && !net_eq(dest_net, curr_net)) + devlink_reload_netns_change(devlink, curr_net, dest_net); + + if (action == DEVLINK_RELOAD_ACTION_DRIVER_REINIT) + devlink_params_driverinit_load_new(devlink); + + err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); + devlink_reload_failed_set(devlink, !!err); + if (err) + return err; + + WARN_ON(!(*actions_performed & BIT(action))); + /* Catch driver on updating the remote action within devlink reload */ + WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats))); + devlink_reload_stats_update(devlink, limit, *actions_performed); + return 0; +} + +static int +devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, + enum devlink_command cmd, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, + actions_performed)) + goto nla_put_failure; + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; +} + +int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + enum devlink_reload_action action; + enum devlink_reload_limit limit; + struct net *dest_net = NULL; + u32 actions_performed; + int err; + + err = devlink_resources_validate(devlink, NULL, info); + if (err) { + NL_SET_ERR_MSG(info->extack, "resources size validation failed"); + return err; + } + + if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); + else + action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; + + if (!devlink_reload_action_is_supported(devlink, action)) { + NL_SET_ERR_MSG(info->extack, "Requested reload action is not supported by the driver"); + return -EOPNOTSUPP; + } + + limit = DEVLINK_RELOAD_LIMIT_UNSPEC; + if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { + struct nla_bitfield32 limits; + u32 limits_selected; + + limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); + limits_selected = limits.value & limits.selector; + if (!limits_selected) { + NL_SET_ERR_MSG(info->extack, "Invalid limit selected"); + return -EINVAL; + } + for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; 
limit++) + if (limits_selected & BIT(limit)) + break; + /* UAPI enables multiselection, but currently it is not used */ + if (limits_selected != BIT(limit)) { + NL_SET_ERR_MSG(info->extack, "Multiselection of limit is not supported"); + return -EOPNOTSUPP; + } + if (!devlink_reload_limit_is_supported(devlink, limit)) { + NL_SET_ERR_MSG(info->extack, "Requested limit is not supported by the driver"); + return -EOPNOTSUPP; + } + if (devlink_reload_combination_is_invalid(action, limit)) { + NL_SET_ERR_MSG(info->extack, "Requested limit is invalid for this action"); + return -EINVAL; + } + } + if (info->attrs[DEVLINK_ATTR_NETNS_PID] || + info->attrs[DEVLINK_ATTR_NETNS_FD] || + info->attrs[DEVLINK_ATTR_NETNS_ID]) { + dest_net = devlink_netns_get(skb, info); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + if (!net_eq(dest_net, devlink_net(devlink)) && + action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { + NL_SET_ERR_MSG_MOD(info->extack, + "Changing namespace is only supported for reinit action"); + return -EOPNOTSUPP; + } + } + + err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); + + if (dest_net) + put_net(dest_net); + + if (err) + return err; + /* For backward compatibility generate reply only if attributes used by user */ + if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) + return 0; + + return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, + DEVLINK_CMD_RELOAD, info); +} + +bool devlink_reload_actions_valid(const struct devlink_ops *ops) +{ + const struct devlink_reload_combination *comb; + int i; + + if (!devlink_reload_supported(ops)) { + if (WARN_ON(ops->reload_actions)) + return false; + return true; + } + + if (WARN_ON(!ops->reload_actions || + ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) + return false; + + if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || + ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) + return false; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { + comb = &devlink_reload_invalid_combinations[i]; + if (ops->reload_actions == BIT(comb->action) && + ops->reload_limits == BIT(comb->limit)) + return false; + } + return true; +} + +static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + const struct devlink_ops *ops = devlink->ops; + enum devlink_eswitch_encap_mode encap_mode; + u8 inline_mode; + void *hdr; + int err = 0; + u16 mode; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto nla_put_failure; + + if (ops->eswitch_mode_get) { + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + goto nla_put_failure; + err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); + if (err) + goto nla_put_failure; + } + + if (ops->eswitch_inline_mode_get) { + err = ops->eswitch_inline_mode_get(devlink, &inline_mode); + if (err) + goto nla_put_failure; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, + inline_mode); + if (err) + goto nla_put_failure; + } + + if (ops->eswitch_encap_mode_get) { + err = ops->eswitch_encap_mode_get(devlink, &encap_mode); + if (err) + goto nla_put_failure; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); + if (err) + goto nla_put_failure; + } + + genlmsg_end(msg, hdr); + return 0; + 
+nla_put_failure: + genlmsg_cancel(msg, hdr); + return err; +} + +int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, + info->snd_portid, info->snd_seq, 0); + + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + enum devlink_eswitch_encap_mode encap_mode; + u8 inline_mode; + int err = 0; + u16 mode; + + if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { + if (!ops->eswitch_mode_set) + return -EOPNOTSUPP; + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = devlink_rate_nodes_check(devlink, mode, info->extack); + if (err) + return err; + err = ops->eswitch_mode_set(devlink, mode, info->extack); + if (err) + return err; + } + + if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { + if (!ops->eswitch_inline_mode_set) + return -EOPNOTSUPP; + inline_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); + err = ops->eswitch_inline_mode_set(devlink, inline_mode, + info->extack); + if (err) + return err; + } + + if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { + if (!ops->eswitch_encap_mode_set) + return -EOPNOTSUPP; + encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); + err = ops->eswitch_encap_mode_set(devlink, encap_mode, + info->extack); + if (err) + return err; + } + + return 0; +} + +int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) +{ + if (!req->msg) + return 0; + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); +} +EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); + +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn) +{ + if (!req->msg) + return 0; + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, + bsn); +} +EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); + +static int devlink_info_version_put(struct devlink_info_req *req, int attr, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + struct nlattr *nest; + int err; + + if (req->version_cb) + req->version_cb(version_name, version_type, + req->version_cb_priv); + + if (!req->msg) + return 0; + + nest = nla_nest_start_noflag(req->msg, attr); + if (!nest) + return -EMSGSIZE; + + err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, + version_name); + if (err) + goto nla_put_failure; + + err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, + version_value); + if (err) + goto nla_put_failure; + + nla_nest_end(req->msg, nest); + + return 0; + +nla_put_failure: + nla_nest_cancel(req->msg, nest); + return err; +} + +int devlink_info_version_fixed_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); +} +EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); + +int devlink_info_version_stored_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, 
DEVLINK_ATTR_INFO_VERSION_STORED, + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); + +int devlink_info_version_stored_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, + version_name, version_value, + version_type); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext); + +int devlink_info_version_running_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put); + +int devlink_info_version_running_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, + version_name, version_value, + version_type); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); + +static int devlink_nl_driver_info_get(struct device_driver *drv, + struct devlink_info_req *req) +{ + if (!drv) + return 0; + + if (drv->name[0]) + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, + drv->name); + + return 0; +} + +static int +devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, struct netlink_ext_ack *extack) +{ + struct device *dev = devlink_to_dev(devlink); + struct devlink_info_req req = {}; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto err_cancel_msg; + + req.msg = msg; + if (devlink->ops->info_get) { + err = devlink->ops->info_get(devlink, &req, extack); + if (err) + goto err_cancel_msg; + } + + err = devlink_nl_driver_info_get(dev->driver, &req); + if (err) + goto err_cancel_msg; + + genlmsg_end(msg, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(msg, hdr); + return err; +} + +int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) +{ + int err; + + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} + +const struct devlink_cmd devl_cmd_info_get = { + .dump_one = devlink_nl_cmd_info_get_dump_one, +}; + +static int devlink_nl_flash_update_fill(struct sk_buff *msg, + struct devlink *devlink, + enum devlink_command cmd, + struct devlink_flash_notify *params) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (cmd != 
DEVLINK_CMD_FLASH_UPDATE_STATUS) + goto out; + + if (params->status_msg && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, + params->status_msg)) + goto nla_put_failure; + if (params->component && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, + params->component)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, + params->done, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, + params->total, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, + params->timeout, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void __devlink_flash_update_notify(struct devlink *devlink, + enum devlink_command cmd, + struct devlink_flash_notify *params) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && + cmd != DEVLINK_CMD_FLASH_UPDATE_END && + cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_flash_update_fill(msg, devlink, cmd, params); + if (err) + goto out_free_msg; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + return; + +out_free_msg: + nlmsg_free(msg); +} + +static void devlink_flash_update_begin_notify(struct devlink *devlink) +{ + struct devlink_flash_notify params = {}; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE, + &params); +} + +static void devlink_flash_update_end_notify(struct devlink *devlink) +{ + struct devlink_flash_notify params = {}; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_END, + &params); +} + +void devlink_flash_update_status_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + struct devlink_flash_notify params = { + .status_msg = status_msg, + .component = component, + .done = done, + .total = total, + }; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + &params); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); + +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout) +{ + struct devlink_flash_notify params = { + .status_msg = status_msg, + .component = component, + .timeout = timeout, + }; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + &params); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); + +struct devlink_flash_component_lookup_ctx { + const char *lookup_name; + bool lookup_name_found; +}; + +static void +devlink_flash_component_lookup_cb(const char *version_name, + enum devlink_info_version_type version_type, + void *version_cb_priv) +{ + struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv; + + if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT || + lookup_ctx->lookup_name_found) + return; + + lookup_ctx->lookup_name_found = + !strcmp(lookup_ctx->lookup_name, version_name); +} + +static int devlink_flash_component_get(struct devlink *devlink, + struct nlattr *nla_component, + const char **p_component, + struct netlink_ext_ack *extack) +{ + struct devlink_flash_component_lookup_ctx lookup_ctx = 
{}; + struct devlink_info_req req = {}; + const char *component; + int ret; + + if (!nla_component) + return 0; + + component = nla_data(nla_component); + + if (!devlink->ops->info_get) { + NL_SET_ERR_MSG_ATTR(extack, nla_component, + "component update is not supported by this device"); + return -EOPNOTSUPP; + } + + lookup_ctx.lookup_name = component; + req.version_cb = devlink_flash_component_lookup_cb; + req.version_cb_priv = &lookup_ctx; + + ret = devlink->ops->info_get(devlink, &req, NULL); + if (ret) + return ret; + + if (!lookup_ctx.lookup_name_found) { + NL_SET_ERR_MSG_ATTR(extack, nla_component, + "selected component is not supported by this device"); + return -EINVAL; + } + *p_component = component; + return 0; +} + +int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *nla_overwrite_mask, *nla_file_name; + struct devlink_flash_update_params params = {}; + struct devlink *devlink = info->user_ptr[0]; + const char *file_name; + u32 supported_params; + int ret; + + if (!devlink->ops->flash_update) + return -EOPNOTSUPP; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME)) + return -EINVAL; + + ret = devlink_flash_component_get(devlink, + info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT], + &params.component, info->extack); + if (ret) + return ret; + + supported_params = devlink->ops->supported_flash_update_params; + + nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; + if (nla_overwrite_mask) { + struct nla_bitfield32 sections; + + if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask, + "overwrite settings are not supported by this device"); + return -EOPNOTSUPP; + } + sections = nla_get_bitfield32(nla_overwrite_mask); + params.overwrite_mask = sections.value & sections.selector; + } + + nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; + file_name = nla_data(nla_file_name); + ret = request_firmware(&params.fw, file_name, devlink->dev); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, + "failed to locate the requested firmware file"); + return ret; + } + + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, &params, info->extack); + devlink_flash_update_end_notify(devlink); + + release_firmware(params.fw); + + return ret; +} + +static void __devlink_compat_running_version(struct devlink *devlink, + char *buf, size_t len) +{ + struct devlink_info_req req = {}; + const struct nlattr *nlattr; + struct sk_buff *msg; + int rem, err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + req.msg = msg; + err = devlink->ops->info_get(devlink, &req, NULL); + if (err) + goto free_msg; + + nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { + const struct nlattr *kv; + int rem_kv; + + if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) + continue; + + nla_for_each_nested(kv, nlattr, rem_kv) { + if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) + continue; + + strlcat(buf, nla_data(kv), len); + strlcat(buf, " ", len); + } + } +free_msg: + nlmsg_free(msg); +} + +void devlink_compat_running_version(struct devlink *devlink, + char *buf, size_t len) +{ + if (!devlink->ops->info_get) + return; + + devl_lock(devlink); + if (devl_is_registered(devlink)) + __devlink_compat_running_version(devlink, buf, len); + devl_unlock(devlink); +} + +int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) +{ + struct 
devlink_flash_update_params params = {}; + int ret; + + devl_lock(devlink); + if (!devl_is_registered(devlink)) { + ret = -ENODEV; + goto out_unlock; + } + + if (!devlink->ops->flash_update) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = request_firmware(&params.fw, file_name, devlink->dev); + if (ret) + goto out_unlock; + + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, &params, NULL); + devlink_flash_update_end_notify(devlink); + + release_firmware(params.fw); +out_unlock: + devl_unlock(devlink); + + return ret; +} + +static int +devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink, + u32 portid, u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *selftests; + void *hdr; + int err; + int i; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, + DEVLINK_CMD_SELFTESTS_GET); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto err_cancel_msg; + + selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); + if (!selftests) + goto err_cancel_msg; + + for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; + i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { + if (devlink->ops->selftest_check(devlink, i, extack)) { + err = nla_put_flag(msg, i); + if (err) + goto err_cancel_msg; + } + } + + nla_nest_end(msg, selftests); + genlmsg_end(msg, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(msg, hdr); + return err; +} + +int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + if (!devlink->ops->selftest_check) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid, + info->snd_seq, 0, info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_selftests_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) +{ + if (!devlink->ops->selftest_check) + return 0; + + return devlink_nl_selftests_fill(msg, devlink, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); +} + +const struct devlink_cmd devl_cmd_selftests_get = { + .dump_one = devlink_nl_cmd_selftests_get_dump_one, +}; + +static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, + enum devlink_selftest_status test_status) +{ + struct nlattr *result_attr; + + result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT); + if (!result_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) || + nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, + test_status)) + goto nla_put_failure; + + nla_nest_end(skb, result_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, result_attr); + return -EMSGSIZE; +} + +static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { + [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, +}; + +int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1]; + struct devlink *devlink = info->user_ptr[0]; + struct nlattr *attrs, *selftests; + struct sk_buff *msg; + void *hdr; + int err; + int i; + + if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) + return -EOPNOTSUPP; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS)) + return 
-EINVAL; + + attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; + + err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs, + devlink_selftest_nl_policy, info->extack); + if (err < 0) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = -EMSGSIZE; + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); + if (!selftests) + goto genlmsg_cancel; + + for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; + i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { + enum devlink_selftest_status test_status; + + if (nla_get_flag(tb[i])) { + if (!devlink->ops->selftest_check(devlink, i, + info->extack)) { + if (devlink_selftest_result_put(msg, i, + DEVLINK_SELFTEST_STATUS_SKIP)) + goto selftests_nest_cancel; + continue; + } + + test_status = devlink->ops->selftest_run(devlink, i, + info->extack); + if (devlink_selftest_result_put(msg, i, test_status)) + goto selftests_nest_cancel; + } + } + + nla_nest_end(msg, selftests); + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +selftests_nest_cancel: + nla_nest_cancel(msg, selftests); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return err; +} diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h new file mode 100644 index 000000000000..e133f423294a --- /dev/null +++ b/net/devlink/devl_internal.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/xarray.h> +#include <net/devlink.h> +#include <net/net_namespace.h> + +#define DEVLINK_REGISTERED XA_MARK_1 + +#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ + (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) + +struct devlink_dev_stats { + u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; +}; + +struct devlink { + u32 index; + struct xarray ports; + struct list_head rate_list; + struct list_head sb_list; + struct list_head dpipe_table_list; + struct list_head resource_list; + struct xarray params; + struct list_head region_list; + struct list_head reporter_list; + struct devlink_dpipe_headers *dpipe_headers; + struct list_head trap_list; + struct list_head trap_group_list; + struct list_head trap_policer_list; + struct list_head linecard_list; + const struct devlink_ops *ops; + struct xarray snapshot_ids; + struct devlink_dev_stats stats; + struct device *dev; + possible_net_t _net; + /* Serializes access to devlink instance specific objects such as + * port, sb, dpipe, resource, params, region, traps and more. + */ + struct mutex lock; + struct lock_class_key lock_key; + u8 reload_failed:1; + refcount_t refcount; + struct rcu_work rwork; + struct notifier_block netdevice_nb; + char priv[] __aligned(NETDEV_ALIGN); +}; + +extern struct xarray devlinks; +extern struct genl_family devlink_nl_family; + +/* devlink instances are open to the access from the user space after + * devlink_register() call. Such logical barrier allows us to have certain + * expectations related to locking. 
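
The selftest handlers above only advertise and run tests that a driver opts into through the selftest_check and selftest_run devlink ops. As a minimal driver-side sketch (illustrative only: the my_drv names and the my_drv_flash_test() helper are hypothetical, while the op signatures and the DEVLINK_ATTR_SELFTEST_ID_* / DEVLINK_SELFTEST_STATUS_* constants are the ones used by the handlers above), the hookup could look like:

/* Illustrative driver-side selftest hookup; my_drv names are hypothetical. */
static bool my_drv_selftest_check(struct devlink *devlink, unsigned int id,
				  struct netlink_ext_ack *extack)
{
	/* only the flash selftest is offered by this device */
	return id == DEVLINK_ATTR_SELFTEST_ID_FLASH;
}

static enum devlink_selftest_status
my_drv_selftest_run(struct devlink *devlink, unsigned int id,
		    struct netlink_ext_ack *extack)
{
	if (id != DEVLINK_ATTR_SELFTEST_ID_FLASH)
		return DEVLINK_SELFTEST_STATUS_SKIP;

	/* my_drv_flash_test() is a hypothetical helper exercising the flash */
	return my_drv_flash_test(devlink_priv(devlink)) ?
	       DEVLINK_SELFTEST_STATUS_FAIL : DEVLINK_SELFTEST_STATUS_PASS;
}

static const struct devlink_ops my_drv_devlink_ops = {
	.selftest_check	= my_drv_selftest_check,
	.selftest_run	= my_drv_selftest_run,
};

With such ops registered, DEVLINK_CMD_SELFTESTS_GET reports the flash test as available and DEVLINK_CMD_SELFTESTS_RUN returns one result nest per requested test, built by devlink_selftest_result_put() above.
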
+ * + * Before *_register() - we are in initialization stage and no parallel + * access possible to the devlink instance. All drivers perform that phase + * by implicitly holding device_lock. + * + * After *_register() - users and driver can access devlink instance at + * the same time. + */ +#define ASSERT_DEVLINK_REGISTERED(d) \ + WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) +#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ + WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) + +/* Iterate over devlink pointers which were possible to get reference to. + * devlink_put() needs to be called for each iterated devlink pointer + * in loop body in order to release the reference. + */ +#define devlinks_xa_for_each_registered_get(net, index, devlink) \ + for (index = 0; (devlink = devlinks_xa_find_get(net, &index)); index++) + +struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp); + +static inline bool devl_is_registered(struct devlink *devlink) +{ + devl_assert_locked(devlink); + return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); +} + +/* Netlink */ +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) +#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) +#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) +#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) + +enum devlink_multicast_groups { + DEVLINK_MCGRP_CONFIG, +}; + +/* state held across netlink dumps */ +struct devlink_nl_dump_state { + unsigned long instance; + int idx; + union { + /* DEVLINK_CMD_REGION_READ */ + struct { + u64 start_offset; + }; + /* DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET */ + struct { + u64 dump_ts; + }; + }; +}; + +struct devlink_cmd { + int (*dump_one)(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb); +}; + +extern const struct genl_small_ops devlink_nl_ops[56]; + +struct devlink * +devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); + +void devlink_notify_unregister(struct devlink *devlink); +void devlink_notify_register(struct devlink *devlink); + +int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, + struct netlink_callback *cb); + +static inline struct devlink_nl_dump_state * +devlink_dump_state(struct netlink_callback *cb) +{ + NL_ASSERT_DUMP_CTX_FITS(struct devlink_nl_dump_state); + + return (struct devlink_nl_dump_state *)cb->ctx; +} + +static inline int +devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) +{ + if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) + return -EMSGSIZE; + if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) + return -EMSGSIZE; + return 0; +} + +/* Commands */ +extern const struct devlink_cmd devl_cmd_get; +extern const struct devlink_cmd devl_cmd_port_get; +extern const struct devlink_cmd devl_cmd_sb_get; +extern const struct devlink_cmd devl_cmd_sb_pool_get; +extern const struct devlink_cmd devl_cmd_sb_port_pool_get; +extern const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get; +extern const struct devlink_cmd devl_cmd_param_get; +extern const struct devlink_cmd devl_cmd_region_get; +extern const struct devlink_cmd devl_cmd_info_get; +extern const struct devlink_cmd devl_cmd_health_reporter_get; +extern const struct devlink_cmd devl_cmd_trap_get; +extern const struct devlink_cmd devl_cmd_trap_group_get; +extern const struct devlink_cmd devl_cmd_trap_policer_get; +extern const struct devlink_cmd devl_cmd_rate_get; +extern const struct devlink_cmd devl_cmd_linecard_get; +extern const 
struct devlink_cmd devl_cmd_selftests_get; + +/* Notify */ +void devlink_notify(struct devlink *devlink, enum devlink_command cmd); + +/* Ports */ +int devlink_port_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr); + +struct devlink_port * +devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info); +struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs); + +/* Reload */ +bool devlink_reload_actions_valid(const struct devlink_ops *ops); +int devlink_reload(struct devlink *devlink, struct net *dest_net, + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack); + +static inline bool devlink_reload_supported(const struct devlink_ops *ops) +{ + return ops->reload_down && ops->reload_up; +} + +/* Params */ +void devlink_params_driverinit_load_new(struct devlink *devlink); + +/* Resources */ +struct devlink_resource; +int devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info); + +/* Line cards */ +struct devlink_linecard; + +struct devlink_linecard * +devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info); + +/* Rates */ +int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack); +struct devlink_rate * +devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info); +struct devlink_rate * +devlink_rate_node_get_from_info(struct devlink *devlink, + struct genl_info *info); +/* Devlink nl cmds */ +int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, + struct genl_info *info); diff --git a/net/devlink/health.c b/net/devlink/health.c new file mode 100644 index 000000000000..0839706d5741 --- /dev/null +++ b/net/devlink/health.c @@ -0,0 +1,1333 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> +#include <net/sock.h> +#include <trace/events/devlink.h> +#include "devl_internal.h" + +struct devlink_fmsg_item { + struct list_head list; + int attrtype; + u8 nla_type; + u16 len; + int value[]; +}; + +struct devlink_fmsg { + struct list_head item_list; + bool putting_binary; /* This flag forces enclosing of binary data + * in an array brackets. It forces using + * of designated API: + * devlink_fmsg_binary_pair_nest_start() + * devlink_fmsg_binary_pair_nest_end() + */ +}; + +static struct devlink_fmsg *devlink_fmsg_alloc(void) +{ + struct devlink_fmsg *fmsg; + + fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); + if (!fmsg) + return NULL; + + INIT_LIST_HEAD(&fmsg->item_list); + + return fmsg; +} + +static void devlink_fmsg_free(struct devlink_fmsg *fmsg) +{ + struct devlink_fmsg_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) { + list_del(&item->list); + kfree(item); + } + kfree(fmsg); +} + +struct devlink_health_reporter { + struct list_head list; + void *priv; + const struct devlink_health_reporter_ops *ops; + struct devlink *devlink; + struct devlink_port *devlink_port; + struct devlink_fmsg *dump_fmsg; + struct mutex dump_lock; /* lock parallel read/write from dump buffers */ + u64 graceful_period; + bool auto_recover; + bool auto_dump; + u8 health_state; + u64 dump_ts; + u64 dump_real_ts; + u64 error_count; + u64 recovery_count; + u64 last_recovery_ts; +}; + +void * +devlink_health_reporter_priv(struct devlink_health_reporter *reporter) +{ + return reporter->priv; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); + +static struct devlink_health_reporter * +__devlink_health_reporter_find_by_name(struct list_head *reporter_list, + const char *reporter_name) +{ + struct devlink_health_reporter *reporter; + + list_for_each_entry(reporter, reporter_list, list) + if (!strcmp(reporter->ops->name, reporter_name)) + return reporter; + return NULL; +} + +static struct devlink_health_reporter * +devlink_health_reporter_find_by_name(struct devlink *devlink, + const char *reporter_name) +{ + return __devlink_health_reporter_find_by_name(&devlink->reporter_list, + reporter_name); +} + +static struct devlink_health_reporter * +devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, + const char *reporter_name) +{ + return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, + reporter_name); +} + +static struct devlink_health_reporter * +__devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + if (WARN_ON(graceful_period && !ops->recover)) + return ERR_PTR(-EINVAL); + + reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); + if (!reporter) + return ERR_PTR(-ENOMEM); + + reporter->priv = priv; + reporter->ops = ops; + reporter->devlink = devlink; + reporter->graceful_period = graceful_period; + reporter->auto_recover = !!ops->recover; + reporter->auto_dump = !!ops->dump; + mutex_init(&reporter->dump_lock); + return reporter; +} + +/** + * devl_port_health_reporter_create() - create devlink health reporter for + * specified port instance + * + * @port: devlink_port to which health reports will relate + * @ops: devlink health reporter ops + * @graceful_period: min time (in msec) between recovery attempts + * @priv: driver priv pointer + */ +struct devlink_health_reporter * +devl_port_health_reporter_create(struct 
devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + devl_assert_locked(port->devlink); + + if (__devlink_health_reporter_find_by_name(&port->reporter_list, + ops->name)) + return ERR_PTR(-EEXIST); + + reporter = __devlink_health_reporter_create(port->devlink, ops, + graceful_period, priv); + if (IS_ERR(reporter)) + return reporter; + + reporter->devlink_port = port; + list_add_tail(&reporter->list, &port->reporter_list); + return reporter; +} +EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); + +struct devlink_health_reporter * +devlink_port_health_reporter_create(struct devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + struct devlink *devlink = port->devlink; + + devl_lock(devlink); + reporter = devl_port_health_reporter_create(port, ops, + graceful_period, priv); + devl_unlock(devlink); + return reporter; +} +EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); + +/** + * devl_health_reporter_create - create devlink health reporter + * + * @devlink: devlink instance which the health reports will relate + * @ops: devlink health reporter ops + * @graceful_period: min time (in msec) between recovery attempts + * @priv: driver priv pointer + */ +struct devlink_health_reporter * +devl_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + devl_assert_locked(devlink); + + if (devlink_health_reporter_find_by_name(devlink, ops->name)) + return ERR_PTR(-EEXIST); + + reporter = __devlink_health_reporter_create(devlink, ops, + graceful_period, priv); + if (IS_ERR(reporter)) + return reporter; + + list_add_tail(&reporter->list, &devlink->reporter_list); + return reporter; +} +EXPORT_SYMBOL_GPL(devl_health_reporter_create); + +struct devlink_health_reporter * +devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + devl_lock(devlink); + reporter = devl_health_reporter_create(devlink, ops, + graceful_period, priv); + devl_unlock(devlink); + return reporter; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_create); + +static void +devlink_health_reporter_free(struct devlink_health_reporter *reporter) +{ + mutex_destroy(&reporter->dump_lock); + if (reporter->dump_fmsg) + devlink_fmsg_free(reporter->dump_fmsg); + kfree(reporter); +} + +/** + * devl_health_reporter_destroy() - destroy devlink health reporter + * + * @reporter: devlink health reporter to destroy + */ +void +devl_health_reporter_destroy(struct devlink_health_reporter *reporter) +{ + devl_assert_locked(reporter->devlink); + + list_del(&reporter->list); + devlink_health_reporter_free(reporter); +} +EXPORT_SYMBOL_GPL(devl_health_reporter_destroy); + +void +devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) +{ + struct devlink *devlink = reporter->devlink; + + devl_lock(devlink); + devl_health_reporter_destroy(reporter); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); + +static int +devlink_nl_health_reporter_fill(struct sk_buff *msg, + struct devlink_health_reporter *reporter, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + struct devlink *devlink = reporter->devlink; + struct nlattr *reporter_attr; + void 
*hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + if (reporter->devlink_port) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index)) + goto genlmsg_cancel; + } + reporter_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_HEALTH_REPORTER); + if (!reporter_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, + reporter->ops->name)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, + reporter->health_state)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, + reporter->error_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, + reporter->recovery_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->recover && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, + reporter->graceful_period, + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->recover && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, + reporter->auto_recover)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, + jiffies_to_msecs(reporter->dump_ts), + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, + reporter->dump_real_ts, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->dump && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + reporter->auto_dump)) + goto reporter_nest_cancel; + + nla_nest_end(msg, reporter_attr); + genlmsg_end(msg, hdr); + return 0; + +reporter_nest_cancel: + nla_nest_cancel(msg, reporter_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static struct devlink_health_reporter * +devlink_health_reporter_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + struct devlink_port *devlink_port; + char *reporter_name; + + if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) + return NULL; + + reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); + devlink_port = devlink_port_get_from_attrs(devlink, attrs); + if (IS_ERR(devlink_port)) + return devlink_health_reporter_find_by_name(devlink, + reporter_name); + else + return devlink_port_health_reporter_find_by_name(devlink_port, + reporter_name); +} + +static struct devlink_health_reporter * +devlink_health_reporter_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_health_reporter_get_from_attrs(devlink, info->attrs); +} + +int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + struct sk_buff *msg; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_health_reporter_fill(msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + info->snd_portid, info->snd_seq, + 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback 
*cb) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_health_reporter *reporter; + struct devlink_port *port; + unsigned long port_index; + int idx = 0; + int err; + + list_for_each_entry(reporter, &devlink->reporter_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_health_reporter_fill(msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + return err; + } + idx++; + } + xa_for_each(&devlink->ports, port_index, port) { + list_for_each_entry(reporter, &port->reporter_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_health_reporter_fill(msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + return err; + } + idx++; + } + } + + return 0; +} + +const struct devlink_cmd devl_cmd_health_reporter_get = { + .dump_one = devlink_nl_cmd_health_reporter_get_dump_one, +}; + +int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->recover && + (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + return -EOPNOTSUPP; + + if (!reporter->ops->dump && + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + return -EOPNOTSUPP; + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + reporter->graceful_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) + reporter->auto_recover = + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + reporter->auto_dump = + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); + + return 0; +} + +static void devlink_recover_notify(struct devlink_health_reporter *reporter, + enum devlink_command cmd) +{ + struct devlink *devlink = reporter->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void +devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) +{ + reporter->recovery_count++; + reporter->last_recovery_ts = jiffies; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); + +static int +devlink_health_reporter_recover(struct devlink_health_reporter *reporter, + void *priv_ctx, struct netlink_ext_ack *extack) +{ + int err; + + if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return 0; + + if (!reporter->ops->recover) + return -EOPNOTSUPP; + + err = reporter->ops->recover(reporter, priv_ctx, extack); + if (err) + return err; + + devlink_health_reporter_recovery_done(reporter); + reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; + devlink_recover_notify(reporter, 
DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + + return 0; +} + +static void +devlink_health_dump_clear(struct devlink_health_reporter *reporter) +{ + if (!reporter->dump_fmsg) + return; + devlink_fmsg_free(reporter->dump_fmsg); + reporter->dump_fmsg = NULL; +} + +static int devlink_health_do_dump(struct devlink_health_reporter *reporter, + void *priv_ctx, + struct netlink_ext_ack *extack) +{ + int err; + + if (!reporter->ops->dump) + return 0; + + if (reporter->dump_fmsg) + return 0; + + reporter->dump_fmsg = devlink_fmsg_alloc(); + if (!reporter->dump_fmsg) { + err = -ENOMEM; + return err; + } + + err = devlink_fmsg_obj_nest_start(reporter->dump_fmsg); + if (err) + goto dump_err; + + err = reporter->ops->dump(reporter, reporter->dump_fmsg, + priv_ctx, extack); + if (err) + goto dump_err; + + err = devlink_fmsg_obj_nest_end(reporter->dump_fmsg); + if (err) + goto dump_err; + + reporter->dump_ts = jiffies; + reporter->dump_real_ts = ktime_get_real_ns(); + + return 0; + +dump_err: + devlink_health_dump_clear(reporter); + return err; +} + +int devlink_health_report(struct devlink_health_reporter *reporter, + const char *msg, void *priv_ctx) +{ + enum devlink_health_reporter_state prev_health_state; + struct devlink *devlink = reporter->devlink; + unsigned long recover_ts_threshold; + int ret; + + /* write a log message of the current error */ + WARN_ON(!msg); + trace_devlink_health_report(devlink, reporter->ops->name, msg); + reporter->error_count++; + prev_health_state = reporter->health_state; + reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + + /* abort if the previous error wasn't recovered */ + recover_ts_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->graceful_period); + if (reporter->auto_recover && + (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || + (reporter->last_recovery_ts && reporter->recovery_count && + time_is_after_jiffies(recover_ts_threshold)))) { + trace_devlink_health_recover_aborted(devlink, + reporter->ops->name, + reporter->health_state, + jiffies - + reporter->last_recovery_ts); + return -ECANCELED; + } + + if (reporter->auto_dump) { + mutex_lock(&reporter->dump_lock); + /* store current dump of current error, for later analysis */ + devlink_health_do_dump(reporter, priv_ctx, NULL); + mutex_unlock(&reporter->dump_lock); + } + + if (!reporter->auto_recover) + return 0; + + devl_lock(devlink); + ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL); + devl_unlock(devlink); + + return ret; +} +EXPORT_SYMBOL_GPL(devlink_health_report); + +void +devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, + enum devlink_health_reporter_state state) +{ + if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && + state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) + return; + + if (reporter->health_state == state) + return; + + reporter->health_state = state; + trace_devlink_health_reporter_state_update(reporter->devlink, + reporter->ops->name, state); + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); + +int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + return devlink_health_reporter_recover(reporter, NULL, 
info->extack); +} + +static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, + int attrtype) +{ + struct devlink_fmsg_item *item; + + item = kzalloc(sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->attrtype = attrtype; + list_add_tail(&item->list, &fmsg->item_list); + + return 0; +} + +int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); + +static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); +} + +int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); + +#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) + +static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) +{ + struct devlink_fmsg_item *item; + + if (fmsg->putting_binary) + return -EINVAL; + + if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) + return -EMSGSIZE; + + item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->nla_type = NLA_NUL_STRING; + item->len = strlen(name) + 1; + item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; + memcpy(&item->value, name, item->len); + list_add_tail(&item->list, &fmsg->item_list); + + return 0; +} + +int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) +{ + int err; + + if (fmsg->putting_binary) + return -EINVAL; + + err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); + if (err) + return err; + + err = devlink_fmsg_put_name(fmsg, name); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); + +int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); + +int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name) +{ + int err; + + if (fmsg->putting_binary) + return -EINVAL; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); + +int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) +{ + int err; + + if (fmsg->putting_binary) + return -EINVAL; + + err = devlink_fmsg_nest_end(fmsg); + if (err) + return err; + + err = devlink_fmsg_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); + +int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name) +{ + int err; + + err = devlink_fmsg_arr_pair_nest_start(fmsg, name); + if (err) + return err; + + fmsg->putting_binary = true; + return err; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start); + +int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg) +{ + if (!fmsg->putting_binary) + return -EINVAL; + + fmsg->putting_binary = false; + return devlink_fmsg_arr_pair_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end); + +static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, + const void *value, u16 value_len, + u8 value_nla_type) +{ + 
struct devlink_fmsg_item *item; + + if (value_len > DEVLINK_FMSG_MAX_SIZE) + return -EMSGSIZE; + + item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->nla_type = value_nla_type; + item->len = value_len; + item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; + memcpy(&item->value, value, item->len); + list_add_tail(&item->list, &fmsg->item_list); + + return 0; +} + +static int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); +} + +static int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); +} + +int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); + +static int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); +} + +int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) +{ + if (fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, + NLA_NUL_STRING); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); + +int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) +{ + if (!fmsg->putting_binary) + return -EINVAL; + + return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); + +int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, + bool value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_bool_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); + +int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, + u8 value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_u8_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); + +int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, + u32 value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_u32_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); + +int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, + u64 value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_u64_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); + +int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, + const char *value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_string_put(fmsg, value); + if (err) + 
return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); + +int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, + const void *value, u32 value_len) +{ + u32 data_size; + int end_err; + u32 offset; + int err; + + err = devlink_fmsg_binary_pair_nest_start(fmsg, name); + if (err) + return err; + + for (offset = 0; offset < value_len; offset += data_size) { + data_size = value_len - offset; + if (data_size > DEVLINK_FMSG_MAX_SIZE) + data_size = DEVLINK_FMSG_MAX_SIZE; + err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); + if (err) + break; + /* Exit from loop with a break (instead of + * return) to make sure putting_binary is turned off in + * devlink_fmsg_binary_pair_nest_end + */ + } + + end_err = devlink_fmsg_binary_pair_nest_end(fmsg); + if (end_err) + err = end_err; + + return err; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); + +static int +devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) +{ + switch (msg->nla_type) { + case NLA_FLAG: + case NLA_U8: + case NLA_U32: + case NLA_U64: + case NLA_NUL_STRING: + case NLA_BINARY: + return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, + msg->nla_type); + default: + return -EINVAL; + } +} + +static int +devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) +{ + int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; + u8 tmp; + + switch (msg->nla_type) { + case NLA_FLAG: + /* Always provide flag data, regardless of its value */ + tmp = *(bool *)msg->value; + + return nla_put_u8(skb, attrtype, tmp); + case NLA_U8: + return nla_put_u8(skb, attrtype, *(u8 *)msg->value); + case NLA_U32: + return nla_put_u32(skb, attrtype, *(u32 *)msg->value); + case NLA_U64: + return nla_put_u64_64bit(skb, attrtype, *(u64 *)msg->value, + DEVLINK_ATTR_PAD); + case NLA_NUL_STRING: + return nla_put_string(skb, attrtype, (char *)&msg->value); + case NLA_BINARY: + return nla_put(skb, attrtype, msg->len, (void *)&msg->value); + default: + return -EINVAL; + } +} + +static int +devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, + int *start) +{ + struct devlink_fmsg_item *item; + struct nlattr *fmsg_nlattr; + int err = 0; + int i = 0; + + fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG); + if (!fmsg_nlattr) + return -EMSGSIZE; + + list_for_each_entry(item, &fmsg->item_list, list) { + if (i < *start) { + i++; + continue; + } + + switch (item->attrtype) { + case DEVLINK_ATTR_FMSG_OBJ_NEST_START: + case DEVLINK_ATTR_FMSG_PAIR_NEST_START: + case DEVLINK_ATTR_FMSG_ARR_NEST_START: + case DEVLINK_ATTR_FMSG_NEST_END: + err = nla_put_flag(skb, item->attrtype); + break; + case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: + err = devlink_fmsg_item_fill_type(item, skb); + if (err) + break; + err = devlink_fmsg_item_fill_data(item, skb); + break; + case DEVLINK_ATTR_FMSG_OBJ_NAME: + err = nla_put_string(skb, item->attrtype, + (char *)&item->value); + break; + default: + err = -EINVAL; + break; + } + if (!err) + *start = ++i; + else + break; + } + + nla_nest_end(skb, fmsg_nlattr); + return err; +} + +static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, + struct genl_info *info, + enum devlink_command cmd, int flags) +{ + struct nlmsghdr *nlh; + struct sk_buff *skb; + bool last = false; + int index = 0; + void *hdr; + int err; + + while (!last) { + int tmp_index = index; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = 
genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, flags | NLM_F_MULTI, cmd); + if (!hdr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = devlink_fmsg_prepare_skb(fmsg, skb, &index); + if (!err) + last = true; + else if (err != -EMSGSIZE || tmp_index == index) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + err = genlmsg_reply(skb, info); + if (err) + return err; + } + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + return genlmsg_reply(skb, info); + +nla_put_failure: + nlmsg_free(skb); + return err; +} + +static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, + struct netlink_callback *cb, + enum devlink_command cmd) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + int index = state->idx; + int tmp_index = index; + void *hdr; + int err; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); + if (!hdr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = devlink_fmsg_prepare_skb(fmsg, skb, &index); + if ((err && err != -EMSGSIZE) || tmp_index == index) + goto nla_put_failure; + + state->idx = index; + genlmsg_end(skb, hdr); + return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return err; +} + +int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + struct devlink_fmsg *fmsg; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->diagnose) + return -EOPNOTSUPP; + + fmsg = devlink_fmsg_alloc(); + if (!fmsg) + return -ENOMEM; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + goto out; + + err = reporter->ops->diagnose(reporter, fmsg, info->extack); + if (err) + goto out; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + goto out; + + err = devlink_fmsg_snd(fmsg, info, + DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); + +out: + devlink_fmsg_free(fmsg); + return err; +} + +static struct devlink_health_reporter * +devlink_health_reporter_get_from_cb(struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct devlink_health_reporter *reporter; + struct nlattr **attrs = info->attrs; + struct devlink *devlink; + + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); + if (IS_ERR(devlink)) + return NULL; + devl_unlock(devlink); + + reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); + devlink_put(devlink); + return reporter; +} + +int devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_health_reporter *reporter; + int err; + + reporter = devlink_health_reporter_get_from_cb(cb); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->dump) + return -EOPNOTSUPP; + + mutex_lock(&reporter->dump_lock); + if (!state->idx) { + err = devlink_health_do_dump(reporter, NULL, cb->extack); + if (err) + goto unlock; + state->dump_ts = reporter->dump_ts; + } + if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) { + NL_SET_ERR_MSG(cb->extack, "Dump trampled, please retry"); + err = -EAGAIN; + goto unlock; + 
} + + err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); +unlock: + mutex_unlock(&reporter->dump_lock); + return err; +} + +int devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->dump) + return -EOPNOTSUPP; + + mutex_lock(&reporter->dump_lock); + devlink_health_dump_clear(reporter); + mutex_unlock(&reporter->dump_lock); + return 0; +} + +int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->test) + return -EOPNOTSUPP; + + return reporter->ops->test(reporter, info->extack); +} diff --git a/net/core/devlink.c b/net/devlink/leftover.c index ab40ebcb4aea..dffca2f9bfa7 100644 --- a/net/core/devlink.c +++ b/net/devlink/leftover.c @@ -31,58 +31,12 @@ #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> -#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ - (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) - -struct devlink_dev_stats { - u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; - u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; -}; - -struct devlink { - u32 index; - struct xarray ports; - struct list_head rate_list; - struct list_head sb_list; - struct list_head dpipe_table_list; - struct list_head resource_list; - struct list_head param_list; - struct list_head region_list; - struct list_head reporter_list; - struct mutex reporters_lock; /* protects reporter_list */ - struct devlink_dpipe_headers *dpipe_headers; - struct list_head trap_list; - struct list_head trap_group_list; - struct list_head trap_policer_list; - struct list_head linecard_list; - struct mutex linecards_lock; /* protects linecard_list */ - const struct devlink_ops *ops; - u64 features; - struct xarray snapshot_ids; - struct devlink_dev_stats stats; - struct device *dev; - possible_net_t _net; - /* Serializes access to devlink instance specific objects such as - * port, sb, dpipe, resource, params, region, traps and more. 
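
The health reporter and fmsg helpers collected in health.c above are consumed by drivers roughly as in the sketch below. This is illustrative only: my_drv, my_drv_reset_rx(), MY_DRV_RX_GRACE_MS and the counters are hypothetical, while devl_health_reporter_create(), devlink_health_reporter_priv(), devlink_health_report() and the devlink_fmsg_*_pair_put() helpers are the ones defined above.

/* Illustrative driver-side usage of the devlink health API; "my_drv" pieces
 * are hypothetical placeholders.
 */
struct my_drv {
	struct devlink_health_reporter *rx_reporter;
	u32 rx_dropped;
	bool ring_stalled;
};

static int my_drv_rx_dump(struct devlink_health_reporter *reporter,
			  struct devlink_fmsg *fmsg, void *priv_ctx,
			  struct netlink_ext_ack *extack)
{
	struct my_drv *drv = devlink_health_reporter_priv(reporter);
	int err;

	err = devlink_fmsg_u32_pair_put(fmsg, "rx_dropped", drv->rx_dropped);
	if (err)
		return err;
	return devlink_fmsg_bool_pair_put(fmsg, "ring_stalled",
					  drv->ring_stalled);
}

static int my_drv_rx_recover(struct devlink_health_reporter *reporter,
			     void *priv_ctx, struct netlink_ext_ack *extack)
{
	/* my_drv_reset_rx() is a hypothetical hardware reset helper */
	return my_drv_reset_rx(devlink_health_reporter_priv(reporter));
}

static const struct devlink_health_reporter_ops my_drv_rx_reporter_ops = {
	.name		= "rx",
	.recover	= my_drv_rx_recover,
	.dump		= my_drv_rx_dump,
};

/* probe path, with the devlink instance lock held */
static int my_drv_health_init(struct my_drv *drv, struct devlink *devlink)
{
	drv->rx_reporter = devl_health_reporter_create(devlink,
						       &my_drv_rx_reporter_ops,
						       MY_DRV_RX_GRACE_MS, drv);
	return PTR_ERR_OR_ZERO(drv->rx_reporter);
}

/* error path: auto-dump and, subject to the graceful period, auto-recover
 * are then driven by devlink_health_report() in the core code above
 */
static void my_drv_handle_rx_stall(struct my_drv *drv)
{
	drv->ring_stalled = true;
	devlink_health_report(drv->rx_reporter, "rx ring stalled", NULL);
}

User space requests such as DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE and DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET then stream the accumulated fmsg contents via devlink_fmsg_snd() and devlink_fmsg_dumpit() shown above.
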
- */ - struct mutex lock; - struct lock_class_key lock_key; - u8 reload_failed:1; - refcount_t refcount; - struct completion comp; - struct rcu_head rcu; - struct notifier_block netdevice_nb; - char priv[] __aligned(NETDEV_ALIGN); -}; - -struct devlink_linecard_ops; -struct devlink_linecard_type; +#include "devl_internal.h" struct devlink_linecard { struct list_head list; struct devlink *devlink; unsigned int index; - refcount_t refcount; const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; @@ -122,24 +76,6 @@ struct devlink_resource { void *occ_get_priv; }; -void *devlink_priv(struct devlink *devlink) -{ - return &devlink->priv; -} -EXPORT_SYMBOL_GPL(devlink_priv); - -struct devlink *priv_to_devlink(void *priv) -{ - return container_of(priv, struct devlink, priv); -} -EXPORT_SYMBOL_GPL(priv_to_devlink); - -struct device *devlink_to_dev(const struct devlink *devlink) -{ - return devlink->dev; -} -EXPORT_SYMBOL_GPL(devlink_to_dev); - static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { { .name = "destination mac", @@ -207,176 +143,6 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), }; -static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { - [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, -}; - -static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); -#define DEVLINK_REGISTERED XA_MARK_1 -#define DEVLINK_UNREGISTERING XA_MARK_2 - -/* devlink instances are open to the access from the user space after - * devlink_register() call. Such logical barrier allows us to have certain - * expectations related to locking. - * - * Before *_register() - we are in initialization stage and no parallel - * access possible to the devlink instance. All drivers perform that phase - * by implicitly holding device_lock. - * - * After *_register() - users and driver can access devlink instance at - * the same time. - */ -#define ASSERT_DEVLINK_REGISTERED(d) \ - WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) -#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ - WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) - -struct net *devlink_net(const struct devlink *devlink) -{ - return read_pnet(&devlink->_net); -} -EXPORT_SYMBOL_GPL(devlink_net); - -static void __devlink_put_rcu(struct rcu_head *head) -{ - struct devlink *devlink = container_of(head, struct devlink, rcu); - - complete(&devlink->comp); -} - -void devlink_put(struct devlink *devlink) -{ - if (refcount_dec_and_test(&devlink->refcount)) - /* Make sure unregister operation that may await the completion - * is unblocked only after all users are after the end of - * RCU grace period. - */ - call_rcu(&devlink->rcu, __devlink_put_rcu); -} - -struct devlink *__must_check devlink_try_get(struct devlink *devlink) -{ - if (refcount_inc_not_zero(&devlink->refcount)) - return devlink; - return NULL; -} - -void devl_assert_locked(struct devlink *devlink) -{ - lockdep_assert_held(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_assert_locked); - -#ifdef CONFIG_LOCKDEP -/* For use in conjunction with LOCKDEP only e.g. 
rcu_dereference_protected() */ -bool devl_lock_is_held(struct devlink *devlink) -{ - return lockdep_is_held(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_lock_is_held); -#endif - -void devl_lock(struct devlink *devlink) -{ - mutex_lock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_lock); - -int devl_trylock(struct devlink *devlink) -{ - return mutex_trylock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_trylock); - -void devl_unlock(struct devlink *devlink) -{ - mutex_unlock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_unlock); - -static struct devlink * -devlinks_xa_find_get(struct net *net, unsigned long *indexp, xa_mark_t filter, - void * (*xa_find_fn)(struct xarray *, unsigned long *, - unsigned long, xa_mark_t)) -{ - struct devlink *devlink; - - rcu_read_lock(); -retry: - devlink = xa_find_fn(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); - if (!devlink) - goto unlock; - - /* In case devlink_unregister() was already called and "unregistering" - * mark was set, do not allow to get a devlink reference here. - * This prevents live-lock of devlink_unregister() wait for completion. - */ - if (xa_get_mark(&devlinks, *indexp, DEVLINK_UNREGISTERING)) - goto retry; - - /* For a possible retry, the xa_find_after() should be always used */ - xa_find_fn = xa_find_after; - if (!devlink_try_get(devlink)) - goto retry; - if (!net_eq(devlink_net(devlink), net)) { - devlink_put(devlink); - goto retry; - } -unlock: - rcu_read_unlock(); - return devlink; -} - -static struct devlink *devlinks_xa_find_get_first(struct net *net, - unsigned long *indexp, - xa_mark_t filter) -{ - return devlinks_xa_find_get(net, indexp, filter, xa_find); -} - -static struct devlink *devlinks_xa_find_get_next(struct net *net, - unsigned long *indexp, - xa_mark_t filter) -{ - return devlinks_xa_find_get(net, indexp, filter, xa_find_after); -} - -/* Iterate over devlink pointers which were possible to get reference to. - * devlink_put() needs to be called for each iterated devlink pointer - * in loop body in order to release the reference. 
- */ -#define devlinks_xa_for_each_get(net, index, devlink, filter) \ - for (index = 0, \ - devlink = devlinks_xa_find_get_first(net, &index, filter); \ - devlink; devlink = devlinks_xa_find_get_next(net, &index, filter)) - -#define devlinks_xa_for_each_registered_get(net, index, devlink) \ - devlinks_xa_for_each_get(net, index, devlink, DEVLINK_REGISTERED) - -static struct devlink *devlink_get_from_attrs(struct net *net, - struct nlattr **attrs) -{ - struct devlink *devlink; - unsigned long index; - char *busname; - char *devname; - - if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) - return ERR_PTR(-EINVAL); - - busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); - devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); - - devlinks_xa_for_each_registered_get(net, index, devlink) { - if (strcmp(devlink->dev->bus->name, busname) == 0 && - strcmp(dev_name(devlink->dev), devname) == 0) - return devlink; - devlink_put(devlink); - } - - return ERR_PTR(-ENODEV); -} - #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ @@ -390,8 +156,8 @@ static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, return xa_load(&devlink->ports, port_index); } -static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, - struct nlattr **attrs) +struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_PORT_INDEX]) { u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); @@ -405,8 +171,8 @@ static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, return ERR_PTR(-EINVAL); } -static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, - struct genl_info *info) +struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, + struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } @@ -466,13 +232,13 @@ devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return devlink_rate_node_get_by_name(devlink, rate_node_name); } -static struct devlink_rate * +struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } -static struct devlink_rate * +struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; @@ -511,11 +277,7 @@ devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); struct devlink_linecard *linecard; - mutex_lock(&devlink->linecards_lock); linecard = devlink_linecard_get_by_index(devlink, linecard_index); - if (linecard) - refcount_inc(&linecard->refcount); - mutex_unlock(&devlink->linecards_lock); if (!linecard) return ERR_PTR(-ENODEV); return linecard; @@ -523,20 +285,12 @@ devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return ERR_PTR(-EINVAL); } -static struct devlink_linecard * +struct devlink_linecard * devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_linecard_get_from_attrs(devlink, info->attrs); } -static void devlink_linecard_put(struct devlink_linecard *linecard) -{ - if (refcount_dec_and_test(&linecard->refcount)) { - mutex_destroy(&linecard->state_lock); - kfree(linecard); - } -} - struct devlink_sb { struct 
list_head list; unsigned int index; @@ -838,104 +592,6 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) return NULL; } -#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) -#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) -#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) -#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) -#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) - -static int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_linecard *linecard; - struct devlink_port *devlink_port; - struct devlink *devlink; - int err; - - devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs); - if (IS_ERR(devlink)) - return PTR_ERR(devlink); - devl_lock(devlink); - info->user_ptr[0] = devlink; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { - devlink_port = devlink_port_get_from_info(devlink, info); - if (IS_ERR(devlink_port)) { - err = PTR_ERR(devlink_port); - goto unlock; - } - info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { - devlink_port = devlink_port_get_from_info(devlink, info); - if (!IS_ERR(devlink_port)) - info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { - struct devlink_rate *devlink_rate; - - devlink_rate = devlink_rate_get_from_info(devlink, info); - if (IS_ERR(devlink_rate)) { - err = PTR_ERR(devlink_rate); - goto unlock; - } - info->user_ptr[1] = devlink_rate; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { - struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_from_info(devlink, info); - if (IS_ERR(rate_node)) { - err = PTR_ERR(rate_node); - goto unlock; - } - info->user_ptr[1] = rate_node; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) { - err = PTR_ERR(linecard); - goto unlock; - } - info->user_ptr[1] = linecard; - } - return 0; - -unlock: - devl_unlock(devlink); - devlink_put(devlink); - return err; -} - -static void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_linecard *linecard; - struct devlink *devlink; - - devlink = info->user_ptr[0]; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = info->user_ptr[1]; - devlink_linecard_put(linecard); - } - devl_unlock(devlink); - devlink_put(devlink); -} - -static struct genl_family devlink_nl_family; - -enum devlink_multicast_groups { - DEVLINK_MCGRP_CONFIG, -}; - -static const struct genl_multicast_group devlink_nl_mcgrps[] = { - [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, -}; - -static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) -{ - if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) - return -EMSGSIZE; - if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) - return -EMSGSIZE; - return 0; -} - static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink) { struct nlattr *nested_attr; @@ -972,185 +628,6 @@ size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } -struct devlink_reload_combination { - enum devlink_reload_action action; - enum devlink_reload_limit limit; -}; - -static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { - { - /* can't reinitialize driver with no down time 
*/ - .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, - }, -}; - -static bool -devlink_reload_combination_is_invalid(enum devlink_reload_action action, - enum devlink_reload_limit limit) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) - if (devlink_reload_invalid_combinations[i].action == action && - devlink_reload_invalid_combinations[i].limit == limit) - return true; - return false; -} - -static bool -devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) -{ - return test_bit(action, &devlink->ops->reload_actions); -} - -static bool -devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) -{ - return test_bit(limit, &devlink->ops->reload_limits); -} - -static int devlink_reload_stat_put(struct sk_buff *msg, - enum devlink_reload_limit limit, u32 value) -{ - struct nlattr *reload_stats_entry; - - reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); - if (!reload_stats_entry) - return -EMSGSIZE; - - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || - nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) - goto nla_put_failure; - nla_nest_end(msg, reload_stats_entry); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, reload_stats_entry); - return -EMSGSIZE; -} - -static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) -{ - struct nlattr *reload_stats_attr, *act_info, *act_stats; - int i, j, stat_idx; - u32 value; - - if (!is_remote) - reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); - else - reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); - - if (!reload_stats_attr) - return -EMSGSIZE; - - for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { - if ((!is_remote && - !devlink_reload_action_is_supported(devlink, i)) || - i == DEVLINK_RELOAD_ACTION_UNSPEC) - continue; - act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); - if (!act_info) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) - goto action_info_nest_cancel; - act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); - if (!act_stats) - goto action_info_nest_cancel; - - for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { - /* Remote stats are shown even if not locally supported. - * Stats of actions with unspecified limit are shown - * though drivers don't need to register unspecified - * limit. 
- */ - if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && - !devlink_reload_limit_is_supported(devlink, j)) || - devlink_reload_combination_is_invalid(i, j)) - continue; - - stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; - if (!is_remote) - value = devlink->stats.reload_stats[stat_idx]; - else - value = devlink->stats.remote_reload_stats[stat_idx]; - if (devlink_reload_stat_put(msg, j, value)) - goto action_stats_nest_cancel; - } - nla_nest_end(msg, act_stats); - nla_nest_end(msg, act_info); - } - nla_nest_end(msg, reload_stats_attr); - return 0; - -action_stats_nest_cancel: - nla_nest_cancel(msg, act_stats); -action_info_nest_cancel: - nla_nest_cancel(msg, act_info); -nla_put_failure: - nla_nest_cancel(msg, reload_stats_attr); - return -EMSGSIZE; -} - -static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - struct nlattr *dev_stats; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) - goto nla_put_failure; - - dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); - if (!dev_stats) - goto nla_put_failure; - - if (devlink_reload_stats_put(msg, devlink, false)) - goto dev_stats_nest_cancel; - if (devlink_reload_stats_put(msg, devlink, true)) - goto dev_stats_nest_cancel; - - nla_nest_end(msg, dev_stats); - genlmsg_end(msg, hdr); - return 0; - -dev_stats_nest_cancel: - nla_nest_cancel(msg, dev_stats); -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); - WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - static int devlink_nl_port_attrs_put(struct sk_buff *msg, struct devlink_port *devlink_port) { @@ -1333,13 +810,12 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops, } if (!devlink_port_fn_state_valid(state)) { WARN_ON_ONCE(1); - NL_SET_ERR_MSG_MOD(extack, "Invalid state read from driver"); + NL_SET_ERR_MSG(extack, "Invalid state read from driver"); return -EINVAL; } if (!devlink_port_fn_opstate_valid(opstate)) { WARN_ON_ONCE(1); - NL_SET_ERR_MSG_MOD(extack, - "Invalid operational state read from driver"); + NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); return -EINVAL; } if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || @@ -1537,47 +1013,40 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - 
devl_lock(devlink); - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; - u32 id = NETLINK_CB(cb->skb).portid; + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; + u32 id = NETLINK_CB(cb->skb).portid; - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, NULL); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, NULL); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_rate_get = { + .dump_one = devlink_nl_cmd_rate_get_dump_one, +}; + static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1612,55 +1081,6 @@ devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, return false; } -static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start) { - idx++; - devlink_put(devlink); - continue; - } - - err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - devlink_put(devlink); - if (err) - goto out; - idx++; - } -out: - cb->args[0] = idx; - return msg->len; -} - static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1683,43 +1103,34 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; - unsigned long index, port_index; - int start = cb->args[0]; - int idx = 0; - int err; + unsigned long port_index; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - xa_for_each(&devlink->ports, port_index, devlink_port) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_port_fill(msg, devlink_port, - DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; + xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { + err = devlink_nl_port_fill(msg, devlink_port, + DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->extack); 
+ if (err) { + state->idx = port_index; + break; } - devl_unlock(devlink); - devlink_put(devlink); } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_port_get = { + .dump_one = devlink_nl_cmd_port_get_dump_one, +}; + static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) @@ -1753,16 +1164,16 @@ static int devlink_port_function_hw_addr_set(struct devlink_port *port, hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); if (hw_addr_len > MAX_ADDR_LEN) { - NL_SET_ERR_MSG_MOD(extack, "Port function hardware address too long"); + NL_SET_ERR_MSG(extack, "Port function hardware address too long"); return -EINVAL; } if (port->type == DEVLINK_PORT_TYPE_ETH) { if (hw_addr_len != ETH_ALEN) { - NL_SET_ERR_MSG_MOD(extack, "Address must be 6 bytes for Ethernet device"); + NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); return -EINVAL; } if (!is_unicast_ether_addr(hw_addr)) { - NL_SET_ERR_MSG_MOD(extack, "Non-unicast hardware address unsupported"); + NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); return -EINVAL; } } @@ -1838,7 +1249,7 @@ static int devlink_port_function_set(struct devlink_port *port, err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, devlink_function_nl_policy, extack); if (err < 0) { - NL_SET_ERR_MSG_MOD(extack, "Fail to parse port function attributes"); + NL_SET_ERR_MSG(extack, "Fail to parse port function attributes"); return err; } @@ -1917,14 +1328,14 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, if (!devlink_port->attrs.splittable) { /* Split ports cannot be split. */ if (devlink_port->attrs.split) - NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split further"); + NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); else - NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split"); + NL_SET_ERR_MSG(info->extack, "Port cannot be split"); return -EINVAL; } if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid split count"); + NL_SET_ERR_MSG(info->extack, "Invalid split count"); return -EINVAL; } @@ -1988,7 +1399,7 @@ static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { - NL_SET_ERR_MSG_MOD(extack, "Port flavour or PCI PF are not specified"); + NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); return -EINVAL; } new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); @@ -2036,7 +1447,7 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_INDEX)) { - NL_SET_ERR_MSG_MOD(extack, "Port index is not specified"); + NL_SET_ERR_MSG(extack, "Port index is not specified"); return -EINVAL; } port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); @@ -2078,13 +1489,13 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, return -ENODEV; if (parent == devlink_rate) { - NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed"); + NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed"); return -EINVAL; } if (devlink_rate_is_node(devlink_rate) && devlink_rate_is_parent_node(devlink_rate, parent->parent)) { - NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node."); + NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node."); return -EEXIST; } @@ -2193,16 +1604,16 @@ 
static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, if (type == DEVLINK_RATE_TYPE_LEAF) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { - NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs"); + NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { - NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs"); + NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_leaf_parent_set) { - NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs"); + NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { @@ -2219,16 +1630,16 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { - NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); + NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { - NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes"); + NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_node_parent_set) { - NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes"); + NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { @@ -2279,7 +1690,7 @@ static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, ops = devlink->ops; if (!ops || !ops->rate_node_new || !ops->rate_node_del) { - NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported"); + NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); return -EOPNOTSUPP; } @@ -2335,7 +1746,7 @@ static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, int err; if (refcount_read(&rate_node->refcnt) > 1) { - NL_SET_ERR_MSG_MOD(info->extack, "Node has children. Cannot delete node."); + NL_SET_ERR_MSG(info->extack, "Node has children. 
Cannot delete node."); return -EBUSY; } @@ -2462,46 +1873,42 @@ static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_linecard *linecard; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - mutex_lock(&devlink->linecards_lock); - list_for_each_entry(linecard, &devlink->linecard_list, list) { - if (idx < start) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); - goto out; - } + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < state->idx) { idx++; + continue; } - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + cb->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + state->idx = idx; + break; + } + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_linecard_get = { + .dump_one = devlink_nl_cmd_linecard_get_dump_one, +}; + static struct devlink_linecard_type * devlink_linecard_type_lookup(struct devlink_linecard *linecard, const char *type) @@ -2527,26 +1934,26 @@ static int devlink_linecard_type_set(struct devlink_linecard *linecard, mutex_lock(&linecard->state_lock); if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); + NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); err = -EBUSY; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); + NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); err = -EBUSY; goto out; } linecard_type = devlink_linecard_type_lookup(linecard, type); if (!linecard_type) { - NL_SET_ERR_MSG_MOD(extack, "Unsupported line card type provided"); + NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); err = -EINVAL; goto out; } if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - NL_SET_ERR_MSG_MOD(extack, "Line card already provisioned"); + NL_SET_ERR_MSG(extack, "Line card already provisioned"); err = -EBUSY; /* Check if the line card is provisioned in the same * way the user asks. 
In case it is, make the operation @@ -2590,12 +1997,12 @@ static int devlink_linecard_type_unset(struct devlink_linecard *linecard, mutex_lock(&linecard->state_lock); if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); + NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); err = -EBUSY; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); + NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); err = -EBUSY; goto out; } @@ -2608,7 +2015,7 @@ static int devlink_linecard_type_unset(struct devlink_linecard *linecard, } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { - NL_SET_ERR_MSG_MOD(extack, "Line card is not provisioned"); + NL_SET_ERR_MSG(extack, "Line card is not provisioned"); err = 0; goto out; } @@ -2724,43 +2131,39 @@ static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_sb_fill(msg, devlink, devlink_sb, - DEVLINK_CMD_SB_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_sb_get = { + .dump_one = devlink_nl_cmd_sb_get_dump_one, +}; + static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, u16 pool_index, enum devlink_command cmd, @@ -2866,46 +2269,39 @@ static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, return 0; } -static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; - int idx = 0; int err = 0; + int idx = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_pool_get) - goto retry; + if (!devlink->ops->sb_pool_get) + return 0; - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_pool_get_dumpit(msg, start, &idx, devlink, - devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } 
else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_pool_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_sb_pool_get = { + .dump_one = devlink_nl_cmd_sb_pool_get_dump_one, +}; + static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, @@ -3081,46 +2477,39 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, return 0; } -static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_port_pool_get) - goto retry; - - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_port_pool_get_dumpit(msg, start, &idx, - devlink, devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + if (!devlink->ops->sb_port_pool_get) + return 0; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_port_pool_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_sb_port_pool_get = { + .dump_one = devlink_nl_cmd_sb_port_pool_get_dump_one, +}; + static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, u32 threshold, @@ -3324,47 +2713,38 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, } static int -devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_tc_pool_bind_get) - goto retry; + if (!devlink->ops->sb_tc_pool_bind_get) + return 0; - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, - devlink, - devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - 
devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get = { + .dump_one = devlink_nl_cmd_sb_tc_pool_bind_get_dump_one, +}; + static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, @@ -3452,142 +2832,19 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, return -EOPNOTSUPP; } -static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - const struct devlink_ops *ops = devlink->ops; - enum devlink_eswitch_encap_mode encap_mode; - u8 inline_mode; - void *hdr; - int err = 0; - u16 mode; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = devlink_nl_put_handle(msg, devlink); - if (err) - goto nla_put_failure; - - if (ops->eswitch_mode_get) { - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - goto nla_put_failure; - err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); - if (err) - goto nla_put_failure; - } - - if (ops->eswitch_inline_mode_get) { - err = ops->eswitch_inline_mode_get(devlink, &inline_mode); - if (err) - goto nla_put_failure; - err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, - inline_mode); - if (err) - goto nla_put_failure; - } - - if (ops->eswitch_encap_mode_get) { - err = ops->eswitch_encap_mode_get(devlink, &encap_mode); - if (err) - goto nla_put_failure; - err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); - if (err) - goto nla_put_failure; - } - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, - info->snd_portid, info->snd_seq, 0); - - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack) +int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { - NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists."); + NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; } -static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const struct devlink_ops *ops = devlink->ops; - enum devlink_eswitch_encap_mode encap_mode; - u8 inline_mode; - int err = 0; - u16 mode; - - if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { - if (!ops->eswitch_mode_set) - return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); - 
err = devlink_rate_nodes_check(devlink, mode, info->extack); - if (err) - return err; - err = ops->eswitch_mode_set(devlink, mode, info->extack); - if (err) - return err; - } - - if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { - if (!ops->eswitch_inline_mode_set) - return -EOPNOTSUPP; - inline_mode = nla_get_u8( - info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); - err = ops->eswitch_inline_mode_set(devlink, inline_mode, - info->extack); - if (err) - return err; - } - - if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { - if (!ops->eswitch_encap_mode_set) - return -EOPNOTSUPP; - encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); - err = ops->eswitch_encap_mode_set(devlink, encap_mode, - info->extack); - if (err) - return err; - } - - return 0; -} - int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_match *match) { @@ -4348,18 +3605,18 @@ devlink_resource_validate_size(struct devlink_resource *resource, u64 size, int err = 0; if (size > resource->size_params.size_max) { - NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum"); + NL_SET_ERR_MSG(extack, "Size larger than maximum"); err = -EINVAL; } if (size < resource->size_params.size_min) { - NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum"); + NL_SET_ERR_MSG(extack, "Size smaller than minimum"); err = -EINVAL; } div64_u64_rem(size, resource->size_params.size_granularity, &reminder); if (reminder) { - NL_SET_ERR_MSG_MOD(extack, "Wrong granularity"); + NL_SET_ERR_MSG(extack, "Wrong granularity"); err = -EINVAL; } @@ -4441,9 +3698,10 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, DEVLINK_ATTR_PAD)) goto nla_put_failure; - if (resource->size != resource->size_new) - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, - resource->size_new, DEVLINK_ATTR_PAD); + if (resource->size != resource->size_new && + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new, DEVLINK_ATTR_PAD)) + goto nla_put_failure; if (devlink_resource_occ_put(resource, skb)) goto nla_put_failure; if (devlink_resource_size_params_put(resource, skb)) @@ -4557,10 +3815,9 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); } -static int -devlink_resources_validate(struct devlink *devlink, - struct devlink_resource *resource, - struct genl_info *info) +int devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info) { struct list_head *resource_list; int err = 0; @@ -4580,743 +3837,6 @@ devlink_resources_validate(struct devlink *devlink, return err; } -static struct net *devlink_netns_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; - struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; - struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; - struct net *net; - - if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { - NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified"); - return ERR_PTR(-EINVAL); - } - - if (netns_pid_attr) { - net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); - } else if (netns_fd_attr) { - net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); - } else if (netns_id_attr) { - net = get_net_ns_by_id(sock_net(skb->sk), - nla_get_u32(netns_id_attr)); - if (!net) - net = ERR_PTR(-EINVAL); - } else { - WARN_ON(1); - net = 
ERR_PTR(-EINVAL); - } - if (IS_ERR(net)) { - NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace"); - return ERR_PTR(-EINVAL); - } - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { - put_net(net); - return ERR_PTR(-EPERM); - } - return net; -} - -static void devlink_param_notify(struct devlink *devlink, - unsigned int port_index, - struct devlink_param_item *param_item, - enum devlink_command cmd); - -static void devlink_ns_change_notify(struct devlink *devlink, - struct net *dest_net, struct net *curr_net, - bool new) -{ - struct devlink_param_item *param_item; - enum devlink_command cmd; - - /* Userspace needs to be notified about devlink objects - * removed from original and entering new network namespace. - * The rest of the devlink objects are re-created during - * reload process so the notifications are generated separatelly. - */ - - if (!dest_net || net_eq(dest_net, curr_net)) - return; - - if (new) - devlink_notify(devlink, DEVLINK_CMD_NEW); - - cmd = new ? DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL; - list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, cmd); - - if (!new) - devlink_notify(devlink, DEVLINK_CMD_DEL); -} - -static bool devlink_reload_supported(const struct devlink_ops *ops) -{ - return ops->reload_down && ops->reload_up; -} - -static void devlink_reload_failed_set(struct devlink *devlink, - bool reload_failed) -{ - if (devlink->reload_failed == reload_failed) - return; - devlink->reload_failed = reload_failed; - devlink_notify(devlink, DEVLINK_CMD_NEW); -} - -bool devlink_is_reload_failed(const struct devlink *devlink) -{ - return devlink->reload_failed; -} -EXPORT_SYMBOL_GPL(devlink_is_reload_failed); - -static void -__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, - enum devlink_reload_limit limit, u32 actions_performed) -{ - unsigned long actions = actions_performed; - int stat_idx; - int action; - - for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { - stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; - reload_stats[stat_idx]++; - } - devlink_notify(devlink, DEVLINK_CMD_NEW); -} - -static void -devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, - u32 actions_performed) -{ - __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, - actions_performed); -} - -/** - * devlink_remote_reload_actions_performed - Update devlink on reload actions - * performed which are not a direct result of devlink reload call. - * - * This should be called by a driver after performing reload actions in case it was not - * a result of devlink reload call. For example fw_activate was performed as a result - * of devlink reload triggered fw_activate on another host. - * The motivation for this function is to keep data on reload actions performed on this - * function whether it was done due to direct devlink reload call or not. 
- * - * @devlink: devlink - * @limit: reload limit - * @actions_performed: bitmask of actions performed - */ -void devlink_remote_reload_actions_performed(struct devlink *devlink, - enum devlink_reload_limit limit, - u32 actions_performed) -{ - if (WARN_ON(!actions_performed || - actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || - actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || - limit > DEVLINK_RELOAD_LIMIT_MAX)) - return; - - __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, - actions_performed); -} -EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); - -static int devlink_reload(struct devlink *devlink, struct net *dest_net, - enum devlink_reload_action action, enum devlink_reload_limit limit, - u32 *actions_performed, struct netlink_ext_ack *extack) -{ - u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; - struct net *curr_net; - int err; - - memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, - sizeof(remote_reload_stats)); - - curr_net = devlink_net(devlink); - devlink_ns_change_notify(devlink, dest_net, curr_net, false); - err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); - if (err) - return err; - - if (dest_net && !net_eq(dest_net, curr_net)) { - move_netdevice_notifier_net(curr_net, dest_net, - &devlink->netdevice_nb); - write_pnet(&devlink->_net, dest_net); - } - - err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); - devlink_reload_failed_set(devlink, !!err); - if (err) - return err; - - devlink_ns_change_notify(devlink, dest_net, curr_net, true); - WARN_ON(!(*actions_performed & BIT(action))); - /* Catch driver on updating the remote action within devlink reload */ - WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, - sizeof(remote_reload_stats))); - devlink_reload_stats_update(devlink, limit, *actions_performed); - return 0; -} - -static int -devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, - enum devlink_command cmd, struct genl_info *info) -{ - struct sk_buff *msg; - void *hdr; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); - if (!hdr) - goto free_msg; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, - actions_performed)) - goto nla_put_failure; - genlmsg_end(msg, hdr); - - return genlmsg_reply(msg, info); - -nla_put_failure: - genlmsg_cancel(msg, hdr); -free_msg: - nlmsg_free(msg); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - enum devlink_reload_action action; - enum devlink_reload_limit limit; - struct net *dest_net = NULL; - u32 actions_performed; - int err; - - if (!(devlink->features & DEVLINK_F_RELOAD)) - return -EOPNOTSUPP; - - err = devlink_resources_validate(devlink, NULL, info); - if (err) { - NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); - return err; - } - - if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) - action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); - else - action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; - - if (!devlink_reload_action_is_supported(devlink, action)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested reload action is not supported by the driver"); - return -EOPNOTSUPP; - } - - 
limit = DEVLINK_RELOAD_LIMIT_UNSPEC; - if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { - struct nla_bitfield32 limits; - u32 limits_selected; - - limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); - limits_selected = limits.value & limits.selector; - if (!limits_selected) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected"); - return -EINVAL; - } - for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++) - if (limits_selected & BIT(limit)) - break; - /* UAPI enables multiselection, but currently it is not used */ - if (limits_selected != BIT(limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Multiselection of limit is not supported"); - return -EOPNOTSUPP; - } - if (!devlink_reload_limit_is_supported(devlink, limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested limit is not supported by the driver"); - return -EOPNOTSUPP; - } - if (devlink_reload_combination_is_invalid(action, limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested limit is invalid for this action"); - return -EINVAL; - } - } - if (info->attrs[DEVLINK_ATTR_NETNS_PID] || - info->attrs[DEVLINK_ATTR_NETNS_FD] || - info->attrs[DEVLINK_ATTR_NETNS_ID]) { - dest_net = devlink_netns_get(skb, info); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - } - - err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); - - if (dest_net) - put_net(dest_net); - - if (err) - return err; - /* For backward compatibility generate reply only if attributes used by user */ - if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) - return 0; - - return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, - DEVLINK_CMD_RELOAD, info); -} - -static int devlink_nl_flash_update_fill(struct sk_buff *msg, - struct devlink *devlink, - enum devlink_command cmd, - struct devlink_flash_notify *params) -{ - void *hdr; - - hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) - goto out; - - if (params->status_msg && - nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, - params->status_msg)) - goto nla_put_failure; - if (params->component && - nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, - params->component)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, - params->done, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, - params->total, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, - params->timeout, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - -out: - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void __devlink_flash_update_notify(struct devlink *devlink, - enum devlink_command cmd, - struct devlink_flash_notify *params) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && - cmd != DEVLINK_CMD_FLASH_UPDATE_END && - cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_flash_update_fill(msg, devlink, cmd, params); - if (err) - goto out_free_msg; - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, 
DEVLINK_MCGRP_CONFIG, GFP_KERNEL); - return; - -out_free_msg: - nlmsg_free(msg); -} - -static void devlink_flash_update_begin_notify(struct devlink *devlink) -{ - struct devlink_flash_notify params = {}; - - __devlink_flash_update_notify(devlink, - DEVLINK_CMD_FLASH_UPDATE, - &params); -} - -static void devlink_flash_update_end_notify(struct devlink *devlink) -{ - struct devlink_flash_notify params = {}; - - __devlink_flash_update_notify(devlink, - DEVLINK_CMD_FLASH_UPDATE_END, - &params); -} - -void devlink_flash_update_status_notify(struct devlink *devlink, - const char *status_msg, - const char *component, - unsigned long done, - unsigned long total) -{ - struct devlink_flash_notify params = { - .status_msg = status_msg, - .component = component, - .done = done, - .total = total, - }; - - __devlink_flash_update_notify(devlink, - DEVLINK_CMD_FLASH_UPDATE_STATUS, - &params); -} -EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); - -void devlink_flash_update_timeout_notify(struct devlink *devlink, - const char *status_msg, - const char *component, - unsigned long timeout) -{ - struct devlink_flash_notify params = { - .status_msg = status_msg, - .component = component, - .timeout = timeout, - }; - - __devlink_flash_update_notify(devlink, - DEVLINK_CMD_FLASH_UPDATE_STATUS, - &params); -} -EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); - -struct devlink_info_req { - struct sk_buff *msg; - void (*version_cb)(const char *version_name, - enum devlink_info_version_type version_type, - void *version_cb_priv); - void *version_cb_priv; -}; - -struct devlink_flash_component_lookup_ctx { - const char *lookup_name; - bool lookup_name_found; -}; - -static void -devlink_flash_component_lookup_cb(const char *version_name, - enum devlink_info_version_type version_type, - void *version_cb_priv) -{ - struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv; - - if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT || - lookup_ctx->lookup_name_found) - return; - - lookup_ctx->lookup_name_found = - !strcmp(lookup_ctx->lookup_name, version_name); -} - -static int devlink_flash_component_get(struct devlink *devlink, - struct nlattr *nla_component, - const char **p_component, - struct netlink_ext_ack *extack) -{ - struct devlink_flash_component_lookup_ctx lookup_ctx = {}; - struct devlink_info_req req = {}; - const char *component; - int ret; - - if (!nla_component) - return 0; - - component = nla_data(nla_component); - - if (!devlink->ops->info_get) { - NL_SET_ERR_MSG_ATTR(extack, nla_component, - "component update is not supported by this device"); - return -EOPNOTSUPP; - } - - lookup_ctx.lookup_name = component; - req.version_cb = devlink_flash_component_lookup_cb; - req.version_cb_priv = &lookup_ctx; - - ret = devlink->ops->info_get(devlink, &req, NULL); - if (ret) - return ret; - - if (!lookup_ctx.lookup_name_found) { - NL_SET_ERR_MSG_ATTR(extack, nla_component, - "selected component is not supported by this device"); - return -EINVAL; - } - *p_component = component; - return 0; -} - -static int devlink_nl_cmd_flash_update(struct sk_buff *skb, - struct genl_info *info) -{ - struct nlattr *nla_overwrite_mask, *nla_file_name; - struct devlink_flash_update_params params = {}; - struct devlink *devlink = info->user_ptr[0]; - const char *file_name; - u32 supported_params; - int ret; - - if (!devlink->ops->flash_update) - return -EOPNOTSUPP; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME)) - return -EINVAL; - - ret = devlink_flash_component_get(devlink, -
info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT], - &params.component, info->extack); - if (ret) - return ret; - - supported_params = devlink->ops->supported_flash_update_params; - - nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; - if (nla_overwrite_mask) { - struct nla_bitfield32 sections; - - if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) { - NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask, - "overwrite settings are not supported by this device"); - return -EOPNOTSUPP; - } - sections = nla_get_bitfield32(nla_overwrite_mask); - params.overwrite_mask = sections.value & sections.selector; - } - - nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; - file_name = nla_data(nla_file_name); - ret = request_firmware(&params.fw, file_name, devlink->dev); - if (ret) { - NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file"); - return ret; - } - - devlink_flash_update_begin_notify(devlink); - ret = devlink->ops->flash_update(devlink, &params, info->extack); - devlink_flash_update_end_notify(devlink); - - release_firmware(params.fw); - - return ret; -} - -static int -devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink, - u32 portid, u32 seq, int flags, - struct netlink_ext_ack *extack) -{ - struct nlattr *selftests; - void *hdr; - int err; - int i; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, - DEVLINK_CMD_SELFTESTS_GET); - if (!hdr) - return -EMSGSIZE; - - err = -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto err_cancel_msg; - - selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); - if (!selftests) - goto err_cancel_msg; - - for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; - i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { - if (devlink->ops->selftest_check(devlink, i, extack)) { - err = nla_put_flag(msg, i); - if (err) - goto err_cancel_msg; - } - } - - nla_nest_end(msg, selftests); - genlmsg_end(msg, hdr); - return 0; - -err_cancel_msg: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - if (!devlink->ops->selftest_check) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid, - info->snd_seq, 0, info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_selftests_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start || !devlink->ops->selftest_check) - goto inc; - - devl_lock(devlink); - err = devlink_nl_selftests_fill(msg, devlink, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->extack); - devl_unlock(devlink); - if (err) { - devlink_put(devlink); - break; - } -inc: - idx++; - devlink_put(devlink); - } - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, - enum devlink_selftest_status test_status) -{ - struct nlattr *result_attr; - - result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT); - if (!result_attr) - return -EMSGSIZE; - - if
(nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) || - nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, - test_status)) - goto nla_put_failure; - - nla_nest_end(skb, result_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, result_attr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_selftests_run(struct sk_buff *skb, - struct genl_info *info) -{ - struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1]; - struct devlink *devlink = info->user_ptr[0]; - struct nlattr *attrs, *selftests; - struct sk_buff *msg; - void *hdr; - int err; - int i; - - if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) - return -EOPNOTSUPP; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS)) - return -EINVAL; - - attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; - - err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs, - devlink_selftest_nl_policy, info->extack); - if (err < 0) - return err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = -EMSGSIZE; - hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, - &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN); - if (!hdr) - goto free_msg; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); - if (!selftests) - goto genlmsg_cancel; - - for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; - i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { - enum devlink_selftest_status test_status; - - if (nla_get_flag(tb[i])) { - if (!devlink->ops->selftest_check(devlink, i, - info->extack)) { - if (devlink_selftest_result_put(msg, i, - DEVLINK_SELFTEST_STATUS_SKIP)) - goto selftests_nest_cancel; - continue; - } - - test_status = devlink->ops->selftest_run(devlink, i, - info->extack); - if (devlink_selftest_result_put(msg, i, test_status)) - goto selftests_nest_cancel; - } - } - - nla_nest_end(msg, selftests); - genlmsg_end(msg, hdr); - return genlmsg_reply(msg, info); - -selftests_nest_cancel: - nla_nest_cancel(msg, selftests); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); -free_msg: - nlmsg_free(msg); - return err; -} - static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, @@ -5433,26 +3953,22 @@ static int devlink_param_driver_verify(const struct devlink_param *param) } static struct devlink_param_item * -devlink_param_find_by_name(struct list_head *param_list, - const char *param_name) +devlink_param_find_by_name(struct xarray *params, const char *param_name) { struct devlink_param_item *param_item; + unsigned long param_id; - list_for_each_entry(param_item, param_list, list) + xa_for_each(params, param_id, param_item) { if (!strcmp(param_item->param->name, param_name)) return param_item; + } return NULL; } static struct devlink_param_item * -devlink_param_find_by_id(struct list_head *param_list, u32 param_id) +devlink_param_find_by_id(struct xarray *params, u32 param_id) { - struct devlink_param_item *param_item; - - list_for_each_entry(param_item, param_list, list) - if (param_item->param->id == param_id) - return param_item; - return NULL; + return xa_load(params, param_id); } static bool @@ -5571,9 +4087,12 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (!devlink_param_cmode_is_supported(param, i)) continue; if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { - if (!param_item->driverinit_value_valid) + if (param_item->driverinit_value_new_valid) + param_value[i] = param_item->driverinit_value_new; + else if 
(param_item->driverinit_value_valid) + param_value[i] = param_item->driverinit_value; + else return -EOPNOTSUPP; - param_value[i] = param_item->driverinit_value; } else { ctx.cmode = i; err = devlink_param_get(devlink, param, &ctx); @@ -5650,7 +4169,13 @@ static void devlink_param_notify(struct devlink *devlink, WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && cmd != DEVLINK_CMD_PORT_PARAM_NEW && cmd != DEVLINK_CMD_PORT_PARAM_DEL); - ASSERT_DEVLINK_REGISTERED(devlink); + + /* devlink_notify_register() / devlink_notify_unregister() + * will replay the notifications if the params are added/removed + * outside of the lifetime of the instance. + */ + if (!devl_is_registered(devlink)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -5666,48 +4191,36 @@ static void devlink_param_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; + unsigned long param_id; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(param_item, &devlink->param_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_param_fill(msg, devlink, 0, param_item, - DEVLINK_CMD_PARAM_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; + xa_for_each_start(&devlink->params, param_id, param_item, state->idx) { + err = devlink_nl_param_fill(msg, devlink, 0, param_item, + DEVLINK_CMD_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = param_id; + break; } - devl_unlock(devlink); - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_cmd devl_cmd_param_get = { + .dump_one = devlink_nl_cmd_param_get_dump_one, +}; + static int devlink_param_type_get_from_info(struct genl_info *info, enum devlink_param_type *param_type) @@ -5784,8 +4297,7 @@ devlink_param_value_get_from_info(const struct devlink_param *param, } static struct devlink_param_item * -devlink_param_get_from_info(struct list_head *param_list, - struct genl_info *info) +devlink_param_get_from_info(struct xarray *params, struct genl_info *info) { char *param_name; @@ -5793,7 +4305,7 @@ devlink_param_get_from_info(struct list_head *param_list, return NULL; param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); - return devlink_param_find_by_name(param_list, param_name); + return devlink_param_find_by_name(params, param_name); } static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, @@ -5804,7 +4316,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, struct sk_buff *msg; int err; - param_item = devlink_param_get_from_info(&devlink->param_list, info); + param_item = devlink_param_get_from_info(&devlink->params, info); if (!param_item) return -EINVAL; @@ -5825,7 +4337,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, 
static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, unsigned int port_index, - struct list_head *param_list, + struct xarray *params, struct genl_info *info, enum devlink_command cmd) { @@ -5837,7 +4349,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, union devlink_param_value value; int err = 0; - param_item = devlink_param_get_from_info(param_list, info); + param_item = devlink_param_get_from_info(params, info); if (!param_item) return -EINVAL; param = param_item->param; @@ -5862,11 +4374,8 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return -EOPNOTSUPP; if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { - if (param->type == DEVLINK_PARAM_TYPE_STRING) - strcpy(param_item->driverinit_value.vstr, value.vstr); - else - param_item->driverinit_value = value; - param_item->driverinit_value_valid = true; + param_item->driverinit_value_new = value; + param_item->driverinit_value_new_valid = true; } else { if (!param->set) return -EOPNOTSUPP; @@ -5886,28 +4395,28 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; - return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->param_list, + return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->params, info, DEVLINK_CMD_PARAM_NEW); } static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { - NL_SET_ERR_MSG_MOD(cb->extack, "Port params are not supported"); + NL_SET_ERR_MSG(cb->extack, "Port params are not supported"); return msg->len; } static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { - NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported"); + NL_SET_ERR_MSG(info->extack, "Port params are not supported"); return -EINVAL; } static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported"); + NL_SET_ERR_MSG(info->extack, "Port params are not supported"); return -EINVAL; } @@ -6371,21 +4880,20 @@ out: return err; } -static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, - struct netlink_callback *cb, - struct devlink *devlink, - int *idx, - int start) +static int +devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; struct devlink_port *port; unsigned long port_index; - int err = 0; + int idx = 0; + int err; - devl_lock(devlink); list_for_each_entry(region, &devlink->region_list, list) { - if (*idx < start) { - (*idx)++; + if (idx < state->idx) { + idx++; continue; } err = devlink_nl_region_fill(msg, devlink, @@ -6393,43 +4901,28 @@ static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, region); - if (err) - goto out; - (*idx)++; + if (err) { + state->idx = idx; + return err; + } + idx++; } xa_for_each(&devlink->ports, port_index, port) { - err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, idx, - start); - if (err) - goto out; + err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, + state->idx); + if (err) { + state->idx = idx; + return err; + } } -out: - devl_unlock(devlink); - return err; + return 0; } -static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned 
long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, - &idx, start); - devlink_put(devlink); - if (err) - goto out; - } -out: - cb->args[0] = idx; - return msg->len; -} +const struct devlink_cmd devl_cmd_region_get = { + .dump_one = devlink_nl_cmd_region_get_dump_one, +}; static int devlink_nl_cmd_region_del(struct sk_buff *skb, struct genl_info *info) @@ -6492,7 +4985,7 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) { - NL_SET_ERR_MSG_MOD(info->extack, "No region name provided"); + NL_SET_ERR_MSG(info->extack, "No region name provided"); return -EINVAL; } @@ -6512,19 +5005,19 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) region = devlink_region_get_by_name(devlink, region_name); if (!region) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist"); + NL_SET_ERR_MSG(info->extack, "The requested region does not exist"); return -EINVAL; } if (!region->ops->snapshot) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot"); + NL_SET_ERR_MSG(info->extack, "The requested region does not support taking an immediate snapshot"); return -EOPNOTSUPP; } mutex_lock(®ion->snapshot_lock); if (region->cur_snapshots == region->max_snapshots) { - NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots"); + NL_SET_ERR_MSG(info->extack, "The region has reached the maximum number of stored snapshots"); err = -ENOSPC; goto unlock; } @@ -6534,7 +5027,7 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) snapshot_id = nla_get_u32(snapshot_id_attr); if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + NL_SET_ERR_MSG(info->extack, "The requested snapshot id is already in use"); err = -EEXIST; goto unlock; } @@ -6545,7 +5038,7 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) } else { err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); if (err) { - NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id"); + NL_SET_ERR_MSG(info->extack, "Failed to allocate a new snapshot id"); goto unlock; } } @@ -6712,6 +5205,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; @@ -6725,14 +5219,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, void *hdr; int err; - start_offset = *((u64 *)&cb->args[0]); + start_offset = state->start_offset; - devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) return PTR_ERR(devlink); - devl_lock(devlink); - if (!attrs[DEVLINK_ATTR_REGION_NAME]) { NL_SET_ERR_MSG(cb->extack, "No region name provided"); err = -EINVAL; @@ -6864,7 +5356,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - *((u64 *)&cb->args[0]) = ret_offset; + state->start_offset = ret_offset; nla_nest_end(skb, chunks_attr); 
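This and the other dumpit conversions in the patch replace hand-rolled cb->args[] bookkeeping with a devlink_nl_dump_state fetched via devlink_dump_state(cb), and move per-command iteration behind a devl_cmd_*.dump_one hook. Those helpers are defined elsewhere in the series; the sketch below only illustrates the underlying technique of keeping typed resume state in the netlink callback scratch area, with made-up names:

/* Illustrative only: typed dump-resume state overlaid on cb->ctx,
 * not the real devlink_nl_dump_state definition.
 */
#include <linux/build_bug.h>
#include <net/netlink.h>

struct my_dump_state {
	unsigned long instance;		/* which instance we stopped at */
	unsigned long idx;		/* which object within it */
	u64 start_offset;		/* e.g. a region read offset */
};

static struct my_dump_state *my_dump_state(struct netlink_callback *cb)
{
	/* cb->ctx is scratch space reserved for the dump originator */
	BUILD_BUG_ON(sizeof(struct my_dump_state) > sizeof(cb->ctx));
	return (struct my_dump_state *)cb->ctx;
}

A dump handler can then resume from state->idx or state->start_offset when it is re-entered, instead of decoding cb->args[] slots by hand.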
genlmsg_end(skb, hdr); @@ -6880,1605 +5372,6 @@ out_unlock: return err; } -int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) -{ - if (!req->msg) - return 0; - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); -} -EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); - -int devlink_info_board_serial_number_put(struct devlink_info_req *req, - const char *bsn) -{ - if (!req->msg) - return 0; - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, - bsn); -} -EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); - -static int devlink_info_version_put(struct devlink_info_req *req, int attr, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - struct nlattr *nest; - int err; - - if (req->version_cb) - req->version_cb(version_name, version_type, - req->version_cb_priv); - - if (!req->msg) - return 0; - - nest = nla_nest_start_noflag(req->msg, attr); - if (!nest) - return -EMSGSIZE; - - err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, - version_name); - if (err) - goto nla_put_failure; - - err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, - version_value); - if (err) - goto nla_put_failure; - - nla_nest_end(req->msg, nest); - - return 0; - -nla_put_failure: - nla_nest_cancel(req->msg, nest); - return err; -} - -int devlink_info_version_fixed_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); - -int devlink_info_version_stored_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); - -int devlink_info_version_stored_put_ext(struct devlink_info_req *req, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, - version_name, version_value, - version_type); -} -EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext); - -int devlink_info_version_running_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_running_put); - -int devlink_info_version_running_put_ext(struct devlink_info_req *req, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, - version_name, version_value, - version_type); -} -EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); - -static int devlink_nl_driver_info_get(struct device_driver *drv, - struct devlink_info_req *req) -{ - if (!drv) - return 0; - - if (drv->name[0]) - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, - drv->name); - - return 0; -} - -static int -devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, struct netlink_ext_ack *extack) -{ - struct device *dev = devlink_to_dev(devlink); - struct 
devlink_info_req req = {}; - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto err_cancel_msg; - - req.msg = msg; - if (devlink->ops->info_get) { - err = devlink->ops->info_get(devlink, &req, extack); - if (err) - goto err_cancel_msg; - } - - err = devlink_nl_driver_info_get(dev->driver, &req); - if (err) - goto err_cancel_msg; - - genlmsg_end(msg, hdr); - return 0; - -err_cancel_msg: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, - info->snd_portid, info->snd_seq, 0, - info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start) - goto inc; - - devl_lock(devlink); - err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->extack); - devl_unlock(devlink); - if (err == -EOPNOTSUPP) - err = 0; - else if (err) { - devlink_put(devlink); - break; - } -inc: - idx++; - devlink_put(devlink); - } - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -struct devlink_fmsg_item { - struct list_head list; - int attrtype; - u8 nla_type; - u16 len; - int value[]; -}; - -struct devlink_fmsg { - struct list_head item_list; - bool putting_binary; /* This flag forces enclosing of binary data - * in an array brackets. 
It forces using - * of designated API: - * devlink_fmsg_binary_pair_nest_start() - * devlink_fmsg_binary_pair_nest_end() - */ -}; - -static struct devlink_fmsg *devlink_fmsg_alloc(void) -{ - struct devlink_fmsg *fmsg; - - fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); - if (!fmsg) - return NULL; - - INIT_LIST_HEAD(&fmsg->item_list); - - return fmsg; -} - -static void devlink_fmsg_free(struct devlink_fmsg *fmsg) -{ - struct devlink_fmsg_item *item, *tmp; - - list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) { - list_del(&item->list); - kfree(item); - } - kfree(fmsg); -} - -static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, - int attrtype) -{ - struct devlink_fmsg_item *item; - - item = kzalloc(sizeof(*item), GFP_KERNEL); - if (!item) - return -ENOMEM; - - item->attrtype = attrtype; - list_add_tail(&item->list, &fmsg->item_list); - - return 0; -} - -int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); - -static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); -} - -int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_nest_end(fmsg); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); - -#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) - -static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) -{ - struct devlink_fmsg_item *item; - - if (fmsg->putting_binary) - return -EINVAL; - - if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) - return -EMSGSIZE; - - item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); - if (!item) - return -ENOMEM; - - item->nla_type = NLA_NUL_STRING; - item->len = strlen(name) + 1; - item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; - memcpy(&item->value, name, item->len); - list_add_tail(&item->list, &fmsg->item_list); - - return 0; -} - -int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) -{ - int err; - - if (fmsg->putting_binary) - return -EINVAL; - - err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); - if (err) - return err; - - err = devlink_fmsg_put_name(fmsg, name); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); - -int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_nest_end(fmsg); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); - -int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name) -{ - int err; - - if (fmsg->putting_binary) - return -EINVAL; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); - -int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) -{ - int err; - - if (fmsg->putting_binary) - return -EINVAL; - - err = devlink_fmsg_nest_end(fmsg); - if (err) - return err; - - err = devlink_fmsg_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); - -int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name) -{ - int err; - - err 
= devlink_fmsg_arr_pair_nest_start(fmsg, name); - if (err) - return err; - - fmsg->putting_binary = true; - return err; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start); - -int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg) -{ - if (!fmsg->putting_binary) - return -EINVAL; - - fmsg->putting_binary = false; - return devlink_fmsg_arr_pair_nest_end(fmsg); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end); - -static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, - const void *value, u16 value_len, - u8 value_nla_type) -{ - struct devlink_fmsg_item *item; - - if (value_len > DEVLINK_FMSG_MAX_SIZE) - return -EMSGSIZE; - - item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); - if (!item) - return -ENOMEM; - - item->nla_type = value_nla_type; - item->len = value_len; - item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; - memcpy(&item->value, value, item->len); - list_add_tail(&item->list, &fmsg->item_list); - - return 0; -} - -static int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); -} - -static int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); -} - -int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); - -static int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); -} - -int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, - NLA_NUL_STRING); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); - -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) -{ - if (!fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); - -int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, - bool value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_bool_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); - -int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, - u8 value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_u8_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); - -int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, - u32 value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_u32_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); - -int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, - 
u64 value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_u64_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); - -int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, - const char *value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_string_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); - -int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u32 value_len) -{ - u32 data_size; - int end_err; - u32 offset; - int err; - - err = devlink_fmsg_binary_pair_nest_start(fmsg, name); - if (err) - return err; - - for (offset = 0; offset < value_len; offset += data_size) { - data_size = value_len - offset; - if (data_size > DEVLINK_FMSG_MAX_SIZE) - data_size = DEVLINK_FMSG_MAX_SIZE; - err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); - if (err) - break; - /* Exit from loop with a break (instead of - * return) to make sure putting_binary is turned off in - * devlink_fmsg_binary_pair_nest_end - */ - } - - end_err = devlink_fmsg_binary_pair_nest_end(fmsg); - if (end_err) - err = end_err; - - return err; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); - -static int -devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) -{ - switch (msg->nla_type) { - case NLA_FLAG: - case NLA_U8: - case NLA_U32: - case NLA_U64: - case NLA_NUL_STRING: - case NLA_BINARY: - return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, - msg->nla_type); - default: - return -EINVAL; - } -} - -static int -devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) -{ - int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; - u8 tmp; - - switch (msg->nla_type) { - case NLA_FLAG: - /* Always provide flag data, regardless of its value */ - tmp = *(bool *) msg->value; - - return nla_put_u8(skb, attrtype, tmp); - case NLA_U8: - return nla_put_u8(skb, attrtype, *(u8 *) msg->value); - case NLA_U32: - return nla_put_u32(skb, attrtype, *(u32 *) msg->value); - case NLA_U64: - return nla_put_u64_64bit(skb, attrtype, *(u64 *) msg->value, - DEVLINK_ATTR_PAD); - case NLA_NUL_STRING: - return nla_put_string(skb, attrtype, (char *) &msg->value); - case NLA_BINARY: - return nla_put(skb, attrtype, msg->len, (void *) &msg->value); - default: - return -EINVAL; - } -} - -static int -devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, - int *start) -{ - struct devlink_fmsg_item *item; - struct nlattr *fmsg_nlattr; - int i = 0; - int err; - - fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG); - if (!fmsg_nlattr) - return -EMSGSIZE; - - list_for_each_entry(item, &fmsg->item_list, list) { - if (i < *start) { - i++; - continue; - } - - switch (item->attrtype) { - case DEVLINK_ATTR_FMSG_OBJ_NEST_START: - case DEVLINK_ATTR_FMSG_PAIR_NEST_START: - case DEVLINK_ATTR_FMSG_ARR_NEST_START: - case DEVLINK_ATTR_FMSG_NEST_END: - err = nla_put_flag(skb, item->attrtype); - break; - case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: - err = devlink_fmsg_item_fill_type(item, skb); - if (err) - break; - err = devlink_fmsg_item_fill_data(item, skb); - break; - case DEVLINK_ATTR_FMSG_OBJ_NAME: - err = nla_put_string(skb, item->attrtype, - 
(char *) &item->value); - break; - default: - err = -EINVAL; - break; - } - if (!err) - *start = ++i; - else - break; - } - - nla_nest_end(skb, fmsg_nlattr); - return err; -} - -static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, - struct genl_info *info, - enum devlink_command cmd, int flags) -{ - struct nlmsghdr *nlh; - struct sk_buff *skb; - bool last = false; - int index = 0; - void *hdr; - int err; - - while (!last) { - int tmp_index = index; - - skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, flags | NLM_F_MULTI, cmd); - if (!hdr) { - err = -EMSGSIZE; - goto nla_put_failure; - } - - err = devlink_fmsg_prepare_skb(fmsg, skb, &index); - if (!err) - last = true; - else if (err != -EMSGSIZE || tmp_index == index) - goto nla_put_failure; - - genlmsg_end(skb, hdr); - err = genlmsg_reply(skb, info); - if (err) - return err; - } - - skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return -ENOMEM; - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = -EMSGSIZE; - goto nla_put_failure; - } - - return genlmsg_reply(skb, info); - -nla_put_failure: - nlmsg_free(skb); - return err; -} - -static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, - struct netlink_callback *cb, - enum devlink_command cmd) -{ - int index = cb->args[0]; - int tmp_index = index; - void *hdr; - int err; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); - if (!hdr) { - err = -EMSGSIZE; - goto nla_put_failure; - } - - err = devlink_fmsg_prepare_skb(fmsg, skb, &index); - if ((err && err != -EMSGSIZE) || tmp_index == index) - goto nla_put_failure; - - cb->args[0] = index; - genlmsg_end(skb, hdr); - return skb->len; - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return err; -} - -struct devlink_health_reporter { - struct list_head list; - void *priv; - const struct devlink_health_reporter_ops *ops; - struct devlink *devlink; - struct devlink_port *devlink_port; - struct devlink_fmsg *dump_fmsg; - struct mutex dump_lock; /* lock parallel read/write from dump buffers */ - u64 graceful_period; - bool auto_recover; - bool auto_dump; - u8 health_state; - u64 dump_ts; - u64 dump_real_ts; - u64 error_count; - u64 recovery_count; - u64 last_recovery_ts; - refcount_t refcount; -}; - -void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter) -{ - return reporter->priv; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); - -static struct devlink_health_reporter * -__devlink_health_reporter_find_by_name(struct list_head *reporter_list, - struct mutex *list_lock, - const char *reporter_name) -{ - struct devlink_health_reporter *reporter; - - lockdep_assert_held(list_lock); - list_for_each_entry(reporter, reporter_list, list) - if (!strcmp(reporter->ops->name, reporter_name)) - return reporter; - return NULL; -} - -static struct devlink_health_reporter * -devlink_health_reporter_find_by_name(struct devlink *devlink, - const char *reporter_name) -{ - return __devlink_health_reporter_find_by_name(&devlink->reporter_list, - &devlink->reporters_lock, - reporter_name); -} - -static struct devlink_health_reporter * -devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, - const char *reporter_name) -{ - return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, - 
&devlink_port->reporters_lock, - reporter_name); -} - -static struct devlink_health_reporter * -__devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) -{ - struct devlink_health_reporter *reporter; - - if (WARN_ON(graceful_period && !ops->recover)) - return ERR_PTR(-EINVAL); - - reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); - if (!reporter) - return ERR_PTR(-ENOMEM); - - reporter->priv = priv; - reporter->ops = ops; - reporter->devlink = devlink; - reporter->graceful_period = graceful_period; - reporter->auto_recover = !!ops->recover; - reporter->auto_dump = !!ops->dump; - mutex_init(&reporter->dump_lock); - refcount_set(&reporter->refcount, 1); - return reporter; -} - -/** - * devlink_port_health_reporter_create - create devlink health reporter for - * specified port instance - * - * @port: devlink_port which should contain the new reporter - * @ops: ops - * @graceful_period: to avoid recovery loops, in msecs - * @priv: priv - */ -struct devlink_health_reporter * -devlink_port_health_reporter_create(struct devlink_port *port, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) -{ - struct devlink_health_reporter *reporter; - - mutex_lock(&port->reporters_lock); - if (__devlink_health_reporter_find_by_name(&port->reporter_list, - &port->reporters_lock, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } - - reporter = __devlink_health_reporter_create(port->devlink, ops, - graceful_period, priv); - if (IS_ERR(reporter)) - goto unlock; - - reporter->devlink_port = port; - list_add_tail(&reporter->list, &port->reporter_list); -unlock: - mutex_unlock(&port->reporters_lock); - return reporter; -} -EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); - -/** - * devlink_health_reporter_create - create devlink health reporter - * - * @devlink: devlink - * @ops: ops - * @graceful_period: to avoid recovery loops, in msecs - * @priv: priv - */ -struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) -{ - struct devlink_health_reporter *reporter; - - mutex_lock(&devlink->reporters_lock); - if (devlink_health_reporter_find_by_name(devlink, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } - - reporter = __devlink_health_reporter_create(devlink, ops, - graceful_period, priv); - if (IS_ERR(reporter)) - goto unlock; - - list_add_tail(&reporter->list, &devlink->reporter_list); -unlock: - mutex_unlock(&devlink->reporters_lock); - return reporter; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_create); - -static void -devlink_health_reporter_free(struct devlink_health_reporter *reporter) -{ - mutex_destroy(&reporter->dump_lock); - if (reporter->dump_fmsg) - devlink_fmsg_free(reporter->dump_fmsg); - kfree(reporter); -} - -static void -devlink_health_reporter_put(struct devlink_health_reporter *reporter) -{ - if (refcount_dec_and_test(&reporter->refcount)) - devlink_health_reporter_free(reporter); -} - -static void -__devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - list_del(&reporter->list); - devlink_health_reporter_put(reporter); -} - -/** - * devlink_health_reporter_destroy - destroy devlink health reporter - * - * @reporter: devlink health reporter to destroy - */ -void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - struct mutex *lock = &reporter->devlink->reporters_lock; - - 
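The reporter create/destroy code above keeps the existing driver-facing API while it is being carved out of devlink.c. As a reminder of how a driver consumes that API, here is a hedged sketch; my_dev, my_fw_recover and the 3000 ms graceful period are illustrative, not taken from this patch:

/* Illustrative only: registering a health reporter from a driver. */
#include <linux/err.h>
#include <net/devlink.h>

struct my_dev {
	struct devlink *devlink;
	struct devlink_health_reporter *fw_reporter;
};

static int my_fw_recover(struct devlink_health_reporter *reporter,
			 void *priv_ctx, struct netlink_ext_ack *extack)
{
	struct my_dev *mdev = devlink_health_reporter_priv(reporter);

	/* reset the firmware here; priv_ctx carries the error context */
	(void)mdev;
	(void)priv_ctx;
	return 0;
}

static const struct devlink_health_reporter_ops my_fw_reporter_ops = {
	.name = "fw",
	.recover = my_fw_recover,
};

static int my_dev_health_init(struct my_dev *mdev)
{
	mdev->fw_reporter =
		devlink_health_reporter_create(mdev->devlink,
					       &my_fw_reporter_ops,
					       3000, /* graceful period, ms */
					       mdev);
	return PTR_ERR_OR_ZERO(mdev->fw_reporter);
}

devlink_port_health_reporter_create() follows the same shape for per-port reporters.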
mutex_lock(lock); - __devlink_health_reporter_destroy(reporter); - mutex_unlock(lock); -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); - -/** - * devlink_port_health_reporter_destroy - destroy devlink port health reporter - * - * @reporter: devlink health reporter to destroy - */ -void -devlink_port_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - struct mutex *lock = &reporter->devlink_port->reporters_lock; - - mutex_lock(lock); - __devlink_health_reporter_destroy(reporter); - mutex_unlock(lock); -} -EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); - -static int -devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink_health_reporter *reporter, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - struct devlink *devlink = reporter->devlink; - struct nlattr *reporter_attr; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - if (reporter->devlink_port) { - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index)) - goto genlmsg_cancel; - } - reporter_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_HEALTH_REPORTER); - if (!reporter_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, - reporter->ops->name)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, - reporter->health_state)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, - reporter->error_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, - reporter->recovery_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, - reporter->graceful_period, - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, - reporter->auto_recover)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, - jiffies_to_msecs(reporter->dump_ts), - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, - reporter->dump_real_ts, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->dump && - nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, - reporter->auto_dump)) - goto reporter_nest_cancel; - - nla_nest_end(msg, reporter_attr); - genlmsg_end(msg, hdr); - return 0; - -reporter_nest_cancel: - nla_nest_end(msg, reporter_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_recover_notify(struct devlink_health_reporter *reporter, - enum devlink_command cmd) -{ - struct devlink *devlink = reporter->devlink; - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -void -devlink_health_reporter_recovery_done(struct devlink_health_reporter 
*reporter) -{ - reporter->recovery_count++; - reporter->last_recovery_ts = jiffies; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); - -static int -devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx, struct netlink_ext_ack *extack) -{ - int err; - - if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) - return 0; - - if (!reporter->ops->recover) - return -EOPNOTSUPP; - - err = reporter->ops->recover(reporter, priv_ctx, extack); - if (err) - return err; - - devlink_health_reporter_recovery_done(reporter); - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - - return 0; -} - -static void -devlink_health_dump_clear(struct devlink_health_reporter *reporter) -{ - if (!reporter->dump_fmsg) - return; - devlink_fmsg_free(reporter->dump_fmsg); - reporter->dump_fmsg = NULL; -} - -static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx, - struct netlink_ext_ack *extack) -{ - int err; - - if (!reporter->ops->dump) - return 0; - - if (reporter->dump_fmsg) - return 0; - - reporter->dump_fmsg = devlink_fmsg_alloc(); - if (!reporter->dump_fmsg) { - err = -ENOMEM; - return err; - } - - err = devlink_fmsg_obj_nest_start(reporter->dump_fmsg); - if (err) - goto dump_err; - - err = reporter->ops->dump(reporter, reporter->dump_fmsg, - priv_ctx, extack); - if (err) - goto dump_err; - - err = devlink_fmsg_obj_nest_end(reporter->dump_fmsg); - if (err) - goto dump_err; - - reporter->dump_ts = jiffies; - reporter->dump_real_ts = ktime_get_real_ns(); - - return 0; - -dump_err: - devlink_health_dump_clear(reporter); - return err; -} - -int devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx) -{ - enum devlink_health_reporter_state prev_health_state; - struct devlink *devlink = reporter->devlink; - unsigned long recover_ts_threshold; - int ret; - - /* write a log message of the current error */ - WARN_ON(!msg); - trace_devlink_health_report(devlink, reporter->ops->name, msg); - reporter->error_count++; - prev_health_state = reporter->health_state; - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - - /* abort if the previous error wasn't recovered */ - recover_ts_threshold = reporter->last_recovery_ts + - msecs_to_jiffies(reporter->graceful_period); - if (reporter->auto_recover && - (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || - (reporter->last_recovery_ts && reporter->recovery_count && - time_is_after_jiffies(recover_ts_threshold)))) { - trace_devlink_health_recover_aborted(devlink, - reporter->ops->name, - reporter->health_state, - jiffies - - reporter->last_recovery_ts); - return -ECANCELED; - } - - if (reporter->auto_dump) { - mutex_lock(&reporter->dump_lock); - /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx, NULL); - mutex_unlock(&reporter->dump_lock); - } - - if (!reporter->auto_recover) - return 0; - - devl_lock(devlink); - ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL); - devl_unlock(devlink); - - return ret; -} -EXPORT_SYMBOL_GPL(devlink_health_report); - -static struct devlink_health_reporter * -devlink_health_reporter_get_from_attrs(struct devlink *devlink, - struct nlattr **attrs) -{ - struct devlink_health_reporter *reporter; - struct devlink_port *devlink_port; - char *reporter_name; - - if 
(!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) - return NULL; - - reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); - devlink_port = devlink_port_get_from_attrs(devlink, attrs); - if (IS_ERR(devlink_port)) { - mutex_lock(&devlink->reporters_lock); - reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink->reporters_lock); - } else { - mutex_lock(&devlink_port->reporters_lock); - reporter = devlink_port_health_reporter_find_by_name(devlink_port, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink_port->reporters_lock); - } - - return reporter; -} - -static struct devlink_health_reporter * -devlink_health_reporter_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - return devlink_health_reporter_get_from_attrs(devlink, info->attrs); -} - -static struct devlink_health_reporter * -devlink_health_reporter_get_from_cb(struct netlink_callback *cb) -{ - const struct genl_dumpit_info *info = genl_dumpit_info(cb); - struct devlink_health_reporter *reporter; - struct nlattr **attrs = info->attrs; - struct devlink *devlink; - - devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); - if (IS_ERR(devlink)) - return NULL; - - reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); - devlink_put(devlink); - return reporter; -} - -void -devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, - enum devlink_health_reporter_state state) -{ - if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && - state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) - return; - - if (reporter->health_state == state) - return; - - reporter->health_state = state; - trace_devlink_health_reporter_state_update(reporter->devlink, - reporter->ops->name, state); - devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); - -static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - struct sk_buff *msg; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto out; - } - - err = devlink_nl_health_reporter_fill(msg, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - info->snd_portid, info->snd_seq, - 0); - if (err) { - nlmsg_free(msg); - goto out; - } - - err = genlmsg_reply(msg, info); -out: - devlink_health_reporter_put(reporter); - return err; -} - -static int -devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_health_reporter *reporter; - unsigned long index, port_index; - struct devlink_port *port; - struct devlink *devlink; - int start = cb->args[0]; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - mutex_lock(&devlink->reporters_lock); - list_for_each_entry(reporter, &devlink->reporter_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_health_reporter_fill( - msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - mutex_unlock(&devlink->reporters_lock); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&devlink->reporters_lock); - 
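devlink_health_report(), shown a bit further up, is what a driver calls from its error path: it bumps the error counter, optionally captures an auto dump, and then drives ops->recover() subject to the graceful period. A hedged usage sketch, with my_err_ctx and the surrounding names invented for illustration:

/* Illustrative only: feeding an error into the health-report flow. */
#include <net/devlink.h>

struct my_err_ctx {
	u32 syndrome;
};

static int my_handle_fw_fatal(struct devlink_health_reporter *reporter,
			      u32 syndrome)
{
	struct my_err_ctx ctx = { .syndrome = syndrome };

	/* counts the error, snapshots a dump if auto_dump is enabled,
	 * and may invoke ops->recover() depending on the grace period
	 */
	return devlink_health_report(reporter, "fw fatal error", &ctx);
}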
devlink_put(devlink); - } - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - xa_for_each(&devlink->ports, port_index, port) { - mutex_lock(&port->reporters_lock); - list_for_each_entry(reporter, &port->reporter_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_health_reporter_fill( - msg, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { - mutex_unlock(&port->reporters_lock); - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&port->reporters_lock); - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int -devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->recover && - (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) { - err = -EOPNOTSUPP; - goto out; - } - if (!reporter->ops->dump && - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { - err = -EOPNOTSUPP; - goto out; - } - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) - reporter->graceful_period = - nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) - reporter->auto_recover = - nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) - reporter->auto_dump = - nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); - - devlink_health_reporter_put(reporter); - return 0; -out: - devlink_health_reporter_put(reporter); - return err; -} - -static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - err = devlink_health_reporter_recover(reporter, NULL, info->extack); - - devlink_health_reporter_put(reporter); - return err; -} - -static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - struct devlink_fmsg *fmsg; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->diagnose) { - devlink_health_reporter_put(reporter); - return -EOPNOTSUPP; - } - - fmsg = devlink_fmsg_alloc(); - if (!fmsg) { - devlink_health_reporter_put(reporter); - return -ENOMEM; - } - - err = devlink_fmsg_obj_nest_start(fmsg); - if (err) - goto out; - - err = reporter->ops->diagnose(reporter, fmsg, info->extack); - if (err) - goto out; - - err = devlink_fmsg_obj_nest_end(fmsg); - if (err) - goto out; - - err = devlink_fmsg_snd(fmsg, info, - DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); - -out: - devlink_fmsg_free(fmsg); - devlink_health_reporter_put(reporter); - return err; -} - -static int -devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct devlink_health_reporter *reporter; - 
u64 start = cb->args[0]; - int err; - - reporter = devlink_health_reporter_get_from_cb(cb); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->dump) { - err = -EOPNOTSUPP; - goto out; - } - mutex_lock(&reporter->dump_lock); - if (!start) { - err = devlink_health_do_dump(reporter, NULL, cb->extack); - if (err) - goto unlock; - cb->args[1] = reporter->dump_ts; - } - if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) { - NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry"); - err = -EAGAIN; - goto unlock; - } - - err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); -unlock: - mutex_unlock(&reporter->dump_lock); -out: - devlink_health_reporter_put(reporter); - return err; -} - -static int -devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->dump) { - devlink_health_reporter_put(reporter); - return -EOPNOTSUPP; - } - - mutex_lock(&reporter->dump_lock); - devlink_health_dump_clear(reporter); - mutex_unlock(&reporter->dump_lock); - devlink_health_reporter_put(reporter); - return 0; -} - -static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->test) { - devlink_health_reporter_put(reporter); - return -EOPNOTSUPP; - } - - err = reporter->ops->test(reporter, info->extack); - - devlink_health_reporter_put(reporter); - return err; -} - struct devlink_stats { u64_stats_t rx_bytes; u64_stats_t rx_packets; @@ -8789,7 +5682,7 @@ static int devlink_nl_cmd_trap_get_doit(struct sk_buff *skb, trap_item = devlink_trap_item_get_from_info(devlink, info); if (!trap_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap"); + NL_SET_ERR_MSG(extack, "Device did not register this trap"); return -ENOENT; } @@ -8810,43 +5703,39 @@ err_trap_fill: return err; } -static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_fill(msg, devlink, trap_item, - DEVLINK_CMD_TRAP_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return 
msg->len; + + return err; } +const struct devlink_cmd devl_cmd_trap_get = { + .dump_one = devlink_nl_cmd_trap_get_dump_one, +}; + static int __devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, enum devlink_trap_action trap_action, @@ -8856,7 +5745,7 @@ static int __devlink_trap_action_set(struct devlink *devlink, if (trap_item->action != trap_action && trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { - NL_SET_ERR_MSG_MOD(extack, "Cannot change action of non-drop traps. Skipping"); + NL_SET_ERR_MSG(extack, "Cannot change action of non-drop traps. Skipping"); return 0; } @@ -8882,7 +5771,7 @@ static int devlink_trap_action_set(struct devlink *devlink, err = devlink_trap_action_get_from_info(info, &trap_action); if (err) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action"); + NL_SET_ERR_MSG(info->extack, "Invalid trap action"); return -EINVAL; } @@ -8902,7 +5791,7 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, trap_item = devlink_trap_item_get_from_info(devlink, info); if (!trap_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap"); + NL_SET_ERR_MSG(extack, "Device did not register this trap"); return -ENOENT; } @@ -9004,7 +5893,7 @@ static int devlink_nl_cmd_trap_group_get_doit(struct sk_buff *skb, group_item = devlink_trap_group_item_get_from_info(devlink, info); if (!group_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group"); + NL_SET_ERR_MSG(extack, "Device did not register this trap group"); return -ENOENT; } @@ -9025,46 +5914,41 @@ err_trap_group_fill: return err; } -static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - enum devlink_command cmd = DEVLINK_CMD_TRAP_GROUP_NEW; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; - u32 portid = NETLINK_CB(cb->skb).portid; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(group_item, &devlink->trap_group_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_group_fill(msg, devlink, - group_item, cmd, - portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_trap_group_fill(msg, devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_trap_group_get = { + .dump_one = devlink_nl_cmd_trap_group_get_dump_one, +}; + static int __devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, @@ -9118,7 +6002,7 @@ devlink_trap_group_action_set(struct devlink *devlink, err = devlink_trap_action_get_from_info(info, &trap_action); if (err) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action"); + NL_SET_ERR_MSG(info->extack, "Invalid trap action"); return -EINVAL; } @@ -9140,6 +6024,7 @@ static int 
devlink_trap_group_set(struct devlink *devlink, struct netlink_ext_ack *extack = info->extack; const struct devlink_trap_policer *policer; struct nlattr **attrs = info->attrs; + u32 policer_id; int err; if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) @@ -9148,17 +6033,11 @@ static int devlink_trap_group_set(struct devlink *devlink, if (!devlink->ops->trap_group_set) return -EOPNOTSUPP; - policer_item = group_item->policer_item; - if (attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) { - u32 policer_id; - - policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); - policer_item = devlink_trap_policer_item_lookup(devlink, - policer_id); - if (policer_id && !policer_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); - return -ENOENT; - } + policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); + if (policer_id && !policer_item) { + NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); + return -ENOENT; } policer = policer_item ? policer_item->policer : NULL; @@ -9186,7 +6065,7 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, group_item = devlink_trap_group_item_get_from_info(devlink, info); if (!group_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group"); + NL_SET_ERR_MSG(extack, "Device did not register this trap group"); return -ENOENT; } @@ -9203,7 +6082,7 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, err_trap_group_set: if (modified) - NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already"); + NL_SET_ERR_MSG(extack, "Trap group set failed, but some changes were committed already"); return err; } @@ -9308,7 +6187,7 @@ static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb, policer_item = devlink_trap_policer_item_get_from_info(devlink, info); if (!policer_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } @@ -9329,46 +6208,40 @@ err_trap_policer_fill: return err; } -static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; - u32 portid = NETLINK_CB(cb->skb).portid; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(policer_item, &devlink->trap_policer_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_policer_fill(msg, devlink, - policer_item, cmd, - portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - 
return msg->len; + + return err; } +const struct devlink_cmd devl_cmd_trap_policer_get = { + .dump_one = devlink_nl_cmd_trap_policer_get_dump_one, +}; + static int devlink_trap_policer_set(struct devlink *devlink, struct devlink_trap_policer_item *policer_item, @@ -9389,22 +6262,22 @@ devlink_trap_policer_set(struct devlink *devlink, burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); if (rate < policer_item->policer->min_rate) { - NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit"); + NL_SET_ERR_MSG(extack, "Policer rate lower than limit"); return -EINVAL; } if (rate > policer_item->policer->max_rate) { - NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit"); + NL_SET_ERR_MSG(extack, "Policer rate higher than limit"); return -EINVAL; } if (burst < policer_item->policer->min_burst) { - NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit"); + NL_SET_ERR_MSG(extack, "Policer burst size lower than limit"); return -EINVAL; } if (burst > policer_item->policer->max_burst) { - NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit"); + NL_SET_ERR_MSG(extack, "Policer burst size higher than limit"); return -EINVAL; } @@ -9434,95 +6307,26 @@ static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, policer_item = devlink_trap_policer_item_get_from_info(devlink, info); if (!policer_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } return devlink_trap_policer_set(devlink, policer_item, info); } -static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { - [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = - DEVLINK_ATTR_TRAP_POLICER_ID }, - [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO, - DEVLINK_PORT_TYPE_IB), - [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY, - DEVLINK_ESWITCH_MODE_SWITCHDEV), - [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, - [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, - [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, - [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, - [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, - 
[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = - NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), - [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, - [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, - [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, - [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, - [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, - [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, - [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_ACTION_MAX), - [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), - [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, - [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, - [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, - [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, - [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, - [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, - [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, - [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, - [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, - [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, -}; - -static const struct genl_small_ops devlink_nl_ops[] = { +const struct genl_small_ops devlink_nl_ops[56] = { { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_get_doit, - .dumpit = devlink_nl_cmd_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_PORT_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_get_doit, - .dumpit = devlink_nl_cmd_port_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9536,7 +6340,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_RATE_GET, .doit = devlink_nl_cmd_rate_get_doit, - .dumpit = devlink_nl_cmd_rate_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, /* can be retrieved by unprivileged users */ }, @@ -9584,7 +6388,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_LINECARD_GET, .doit = devlink_nl_cmd_linecard_get_doit, - .dumpit = devlink_nl_cmd_linecard_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, /* can be retrieved by unprivileged users */ }, @@ -9598,14 +6402,14 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, - .dumpit = devlink_nl_cmd_sb_get_dumpit, + 
.dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_get_doit, - .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9618,7 +6422,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_get_doit, - .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9633,7 +6437,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, - .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9714,7 +6518,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_PARAM_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_param_get_doit, - .dumpit = devlink_nl_cmd_param_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9742,7 +6546,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_REGION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_get_doit, - .dumpit = devlink_nl_cmd_region_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .flags = GENL_ADMIN_PERM, }, { @@ -9768,14 +6572,14 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_INFO_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_info_get_doit, - .dumpit = devlink_nl_cmd_info_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_get_doit, - .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, /* can be retrieved by unprivileged users */ }, @@ -9830,7 +6634,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_GET, .doit = devlink_nl_cmd_trap_get_doit, - .dumpit = devlink_nl_cmd_trap_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9841,7 +6645,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_GROUP_GET, .doit = devlink_nl_cmd_trap_group_get_doit, - .dumpit = devlink_nl_cmd_trap_group_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9852,7 +6656,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_POLICER_GET, .doit = devlink_nl_cmd_trap_policer_get_doit, - .dumpit = devlink_nl_cmd_trap_policer_get_dumpit, + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9863,7 +6667,7 @@ static const struct 
genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_SELFTESTS_GET, .doit = devlink_nl_cmd_selftests_get_doit, - .dumpit = devlink_nl_cmd_selftests_get_dumpit + .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -9871,148 +6675,9 @@ static const struct genl_small_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_selftests_run, .flags = GENL_ADMIN_PERM, }, + /* -- No new ops here! Use split ops going forward! -- */ }; -static struct genl_family devlink_nl_family __ro_after_init = { - .name = DEVLINK_GENL_NAME, - .version = DEVLINK_GENL_VERSION, - .maxattr = DEVLINK_ATTR_MAX, - .policy = devlink_nl_policy, - .netnsok = true, - .parallel_ops = true, - .pre_doit = devlink_nl_pre_doit, - .post_doit = devlink_nl_post_doit, - .module = THIS_MODULE, - .small_ops = devlink_nl_ops, - .n_small_ops = ARRAY_SIZE(devlink_nl_ops), - .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, - .mcgrps = devlink_nl_mcgrps, - .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), -}; - -static bool devlink_reload_actions_valid(const struct devlink_ops *ops) -{ - const struct devlink_reload_combination *comb; - int i; - - if (!devlink_reload_supported(ops)) { - if (WARN_ON(ops->reload_actions)) - return false; - return true; - } - - if (WARN_ON(!ops->reload_actions || - ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || - ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) - return false; - - if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || - ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) - return false; - - for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { - comb = &devlink_reload_invalid_combinations[i]; - if (ops->reload_actions == BIT(comb->action) && - ops->reload_limits == BIT(comb->limit)) - return false; - } - return true; -} - -/** - * devlink_set_features - Set devlink supported features - * - * @devlink: devlink - * @features: devlink support features - * - * This interface allows us to set reload ops separatelly from - * the devlink_alloc. - */ -void devlink_set_features(struct devlink *devlink, u64 features) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - WARN_ON(features & DEVLINK_F_RELOAD && - !devlink_reload_supported(devlink->ops)); - devlink->features = features; -} -EXPORT_SYMBOL_GPL(devlink_set_features); - -static int devlink_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr); - -/** - * devlink_alloc_ns - Allocate new devlink instance resources - * in specific namespace - * - * @ops: ops - * @priv_size: size of user private data - * @net: net namespace - * @dev: parent device - * - * Allocate new devlink instance resources, including devlink index - * and name. 
- */ -struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, - size_t priv_size, struct net *net, - struct device *dev) -{ - struct devlink *devlink; - static u32 last_id; - int ret; - - WARN_ON(!ops || !dev); - if (!devlink_reload_actions_valid(ops)) - return NULL; - - devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); - if (!devlink) - return NULL; - - ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, - &last_id, GFP_KERNEL); - if (ret < 0) - goto err_xa_alloc; - - devlink->netdevice_nb.notifier_call = devlink_netdevice_event; - ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); - if (ret) - goto err_register_netdevice_notifier; - - devlink->dev = dev; - devlink->ops = ops; - xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); - xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); - write_pnet(&devlink->_net, net); - INIT_LIST_HEAD(&devlink->rate_list); - INIT_LIST_HEAD(&devlink->linecard_list); - INIT_LIST_HEAD(&devlink->sb_list); - INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); - INIT_LIST_HEAD(&devlink->resource_list); - INIT_LIST_HEAD(&devlink->param_list); - INIT_LIST_HEAD(&devlink->region_list); - INIT_LIST_HEAD(&devlink->reporter_list); - INIT_LIST_HEAD(&devlink->trap_list); - INIT_LIST_HEAD(&devlink->trap_group_list); - INIT_LIST_HEAD(&devlink->trap_policer_list); - lockdep_register_key(&devlink->lock_key); - mutex_init(&devlink->lock); - lockdep_set_class(&devlink->lock, &devlink->lock_key); - mutex_init(&devlink->reporters_lock); - mutex_init(&devlink->linecards_lock); - refcount_set(&devlink->refcount, 1); - init_completion(&devlink->comp); - - return devlink; - -err_register_netdevice_notifier: - xa_erase(&devlinks, devlink->index); -err_xa_alloc: - kfree(devlink); - return NULL; -} -EXPORT_SYMBOL_GPL(devlink_alloc_ns); - static void devlink_trap_policer_notify(struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, @@ -10025,7 +6690,7 @@ static void devlink_trap_notify(struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd); -static void devlink_notify_register(struct devlink *devlink) +void devlink_notify_register(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; struct devlink_trap_group_item *group_item; @@ -10036,6 +6701,7 @@ static void devlink_notify_register(struct devlink *devlink) struct devlink_rate *rate_node; struct devlink_region *region; unsigned long port_index; + unsigned long param_id; devlink_notify(devlink, DEVLINK_CMD_NEW); list_for_each_entry(linecard, &devlink->linecard_list, list) @@ -10061,12 +6727,12 @@ static void devlink_notify_register(struct devlink *devlink) list_for_each_entry(region, &devlink->region_list, list) devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - list_for_each_entry(param_item, &devlink->param_list, list) + xa_for_each(&devlink->params, param_id, param_item) devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } -static void devlink_notify_unregister(struct devlink *devlink) +void devlink_notify_unregister(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; struct devlink_trap_group_item *group_item; @@ -10076,8 +6742,9 @@ static void devlink_notify_unregister(struct devlink *devlink) struct devlink_rate *rate_node; struct devlink_region *region; unsigned long port_index; + unsigned long param_id; - list_for_each_entry_reverse(param_item, &devlink->param_list, list) + xa_for_each(&devlink->params, param_id, param_item) 
devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); @@ -10103,79 +6770,6 @@ static void devlink_notify_unregister(struct devlink *devlink) devlink_notify(devlink, DEVLINK_CMD_DEL); } -/** - * devlink_register - Register devlink instance - * - * @devlink: devlink - */ -void devlink_register(struct devlink *devlink) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - /* Make sure that we are in .probe() routine */ - - xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - devlink_notify_register(devlink); -} -EXPORT_SYMBOL_GPL(devlink_register); - -/** - * devlink_unregister - Unregister devlink instance - * - * @devlink: devlink - */ -void devlink_unregister(struct devlink *devlink) -{ - ASSERT_DEVLINK_REGISTERED(devlink); - /* Make sure that we are in .remove() routine */ - - xa_set_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); - devlink_put(devlink); - wait_for_completion(&devlink->comp); - - devlink_notify_unregister(devlink); - xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - xa_clear_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); -} -EXPORT_SYMBOL_GPL(devlink_unregister); - -/** - * devlink_free - Free devlink instance resources - * - * @devlink: devlink - */ -void devlink_free(struct devlink *devlink) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - mutex_destroy(&devlink->linecards_lock); - mutex_destroy(&devlink->reporters_lock); - mutex_destroy(&devlink->lock); - lockdep_unregister_key(&devlink->lock_key); - WARN_ON(!list_empty(&devlink->trap_policer_list)); - WARN_ON(!list_empty(&devlink->trap_group_list)); - WARN_ON(!list_empty(&devlink->trap_list)); - WARN_ON(!list_empty(&devlink->reporter_list)); - WARN_ON(!list_empty(&devlink->region_list)); - WARN_ON(!list_empty(&devlink->param_list)); - WARN_ON(!list_empty(&devlink->resource_list)); - WARN_ON(!list_empty(&devlink->dpipe_table_list)); - WARN_ON(!list_empty(&devlink->sb_list)); - WARN_ON(!list_empty(&devlink->rate_list)); - WARN_ON(!list_empty(&devlink->linecard_list)); - WARN_ON(!xa_empty(&devlink->ports)); - - xa_destroy(&devlink->snapshot_ids); - xa_destroy(&devlink->ports); - - WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), - &devlink->netdevice_nb)); - - xa_erase(&devlinks, devlink->index); - - kfree(devlink); -} -EXPORT_SYMBOL_GPL(devlink_free); - static void devlink_port_type_warn(struct work_struct *work) { WARN(true, "Type was not set for devlink port."); @@ -10275,12 +6869,9 @@ int devl_port_register(struct devlink *devlink, devlink_port->index = port_index; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); - mutex_init(&devlink_port->reporters_lock); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); - if (err) { - mutex_destroy(&devlink_port->reporters_lock); + if (err) return err; - } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); @@ -10331,7 +6922,6 @@ void devl_port_unregister(struct devlink_port *devlink_port) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); - mutex_destroy(&devlink_port->reporters_lock); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); @@ -10476,8 +7066,8 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) } EXPORT_SYMBOL_GPL(devlink_port_type_clear); -static int devlink_netdevice_event(struct notifier_block *nb, - unsigned long 
event, void *ptr) +int devlink_port_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; @@ -10499,6 +7089,8 @@ static int devlink_netdevice_event(struct notifier_block *nb, break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this @@ -10508,6 +7100,8 @@ static int devlink_netdevice_event(struct notifier_block *nb, netdev); break; case NETDEV_UNREGISTER: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. @@ -10907,7 +7501,7 @@ static void devlink_linecard_types_fini(struct devlink_linecard *linecard) } /** - * devlink_linecard_create - Create devlink linecard + * devl_linecard_create - Create devlink linecard * * @devlink: devlink * @linecard_index: driver-specific numerical identifier of the linecard @@ -10920,8 +7514,8 @@ static void devlink_linecard_types_fini(struct devlink_linecard *linecard) * Return: Line card structure or an ERR_PTR() encoded error code. */ struct devlink_linecard * -devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, - const struct devlink_linecard_ops *ops, void *priv) +devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv) { struct devlink_linecard *linecard; int err; @@ -10930,17 +7524,12 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, !ops->types_count || !ops->types_get)) return ERR_PTR(-EINVAL); - mutex_lock(&devlink->linecards_lock); - if (devlink_linecard_index_exists(devlink, linecard_index)) { - mutex_unlock(&devlink->linecards_lock); + if (devlink_linecard_index_exists(devlink, linecard_index)) return ERR_PTR(-EEXIST); - } linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); - if (!linecard) { - mutex_unlock(&devlink->linecards_lock); + if (!linecard) return ERR_PTR(-ENOMEM); - } linecard->devlink = devlink; linecard->index = linecard_index; @@ -10953,35 +7542,29 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, if (err) { mutex_destroy(&linecard->state_lock); kfree(linecard); - mutex_unlock(&devlink->linecards_lock); return ERR_PTR(err); } list_add_tail(&linecard->list, &devlink->linecard_list); - refcount_set(&linecard->refcount, 1); - mutex_unlock(&devlink->linecards_lock); devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); return linecard; } -EXPORT_SYMBOL_GPL(devlink_linecard_create); +EXPORT_SYMBOL_GPL(devl_linecard_create); /** - * devlink_linecard_destroy - Destroy devlink linecard + * devl_linecard_destroy - Destroy devlink linecard * * @linecard: devlink linecard */ -void devlink_linecard_destroy(struct devlink_linecard *linecard) +void devl_linecard_destroy(struct devlink_linecard *linecard) { - struct devlink *devlink = linecard->devlink; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - mutex_lock(&devlink->linecards_lock); list_del(&linecard->list); devlink_linecard_types_fini(linecard); - mutex_unlock(&devlink->linecards_lock); - devlink_linecard_put(linecard); + 
mutex_destroy(&linecard->state_lock); + kfree(linecard); } -EXPORT_SYMBOL_GPL(devlink_linecard_destroy); +EXPORT_SYMBOL_GPL(devl_linecard_destroy); /** * devlink_linecard_provision_set - Set provisioning on linecard @@ -11584,8 +8167,53 @@ static int devlink_param_verify(const struct devlink_param *param) return devlink_param_driver_verify(param); } +static int devlink_param_register(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + int err; + + WARN_ON(devlink_param_verify(param)); + WARN_ON(devlink_param_find_by_name(&devlink->params, param->name)); + + if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) + WARN_ON(param->get || param->set); + else + WARN_ON(!param->get || !param->set); + + param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); + if (!param_item) + return -ENOMEM; + + param_item->param = param; + + err = xa_insert(&devlink->params, param->id, param_item, GFP_KERNEL); + if (err) + goto err_xa_insert; + + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; + +err_xa_insert: + kfree(param_item); + return err; +} + +static void devlink_param_unregister(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->params, param->id); + if (WARN_ON(!param_item)) + return; + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); + xa_erase(&devlink->params, param->id); + kfree(param_item); +} + /** - * devlink_params_register - register configuration parameters + * devl_params_register - register configuration parameters * * @devlink: devlink * @params: configuration parameters array @@ -11593,14 +8221,14 @@ static int devlink_param_verify(const struct devlink_param *param) * * Register the configuration parameters supported by the driver. 
*/ -int devlink_params_register(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) +int devl_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) { const struct devlink_param *param = params; int i, err; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); + lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) { err = devlink_param_register(devlink, param); @@ -11617,124 +8245,103 @@ rollback: devlink_param_unregister(devlink, param); return err; } +EXPORT_SYMBOL_GPL(devl_params_register); + +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + int err; + + devl_lock(devlink); + err = devl_params_register(devlink, params, params_count); + devl_unlock(devlink); + return err; +} EXPORT_SYMBOL_GPL(devlink_params_register); /** - * devlink_params_unregister - unregister configuration parameters + * devl_params_unregister - unregister configuration parameters * @devlink: devlink * @params: configuration parameters to unregister * @params_count: number of parameters provided */ -void devlink_params_unregister(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) +void devl_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) { const struct devlink_param *param = params; int i; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); + lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) devlink_param_unregister(devlink, param); } -EXPORT_SYMBOL_GPL(devlink_params_unregister); +EXPORT_SYMBOL_GPL(devl_params_unregister); -/** - * devlink_param_register - register one configuration parameter - * - * @devlink: devlink - * @param: one configuration parameter - * - * Register the configuration parameter supported by the driver. - * Return: returns 0 on successful registration or error code otherwise. 
- */ -int devlink_param_register(struct devlink *devlink, - const struct devlink_param *param) -{ - struct devlink_param_item *param_item; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - WARN_ON(devlink_param_verify(param)); - WARN_ON(devlink_param_find_by_name(&devlink->param_list, param->name)); - - if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) - WARN_ON(param->get || param->set); - else - WARN_ON(!param->get || !param->set); - - param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); - if (!param_item) - return -ENOMEM; - - param_item->param = param; - - list_add_tail(¶m_item->list, &devlink->param_list); - return 0; -} -EXPORT_SYMBOL_GPL(devlink_param_register); - -/** - * devlink_param_unregister - unregister one configuration parameter - * @devlink: devlink - * @param: configuration parameter to unregister - */ -void devlink_param_unregister(struct devlink *devlink, - const struct devlink_param *param) +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) { - struct devlink_param_item *param_item; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - param_item = - devlink_param_find_by_name(&devlink->param_list, param->name); - WARN_ON(!param_item); - list_del(¶m_item->list); - kfree(param_item); + devl_lock(devlink); + devl_params_unregister(devlink, params, params_count); + devl_unlock(devlink); } -EXPORT_SYMBOL_GPL(devlink_param_unregister); +EXPORT_SYMBOL_GPL(devlink_params_unregister); /** - * devlink_param_driverinit_value_get - get configuration parameter - * value for driver initializing + * devl_param_driverinit_value_get - get configuration parameter + * value for driver initializing * * @devlink: devlink * @param_id: parameter ID - * @init_val: value of parameter in driverinit configuration mode + * @val: pointer to store the value of parameter in driverinit + * configuration mode * * This function should be used by the driver to get driverinit * configuration for initialization after reload command. + * + * Note that lockless call of this function relies on the + * driver to maintain following basic sane behavior: + * 1) Driver ensures a call to this function cannot race with + * registering/unregistering the parameter with the same parameter ID. + * 2) Driver ensures a call to this function cannot race with + * devl_param_driverinit_value_set() call with the same parameter ID. + * 3) Driver ensures a call to this function cannot race with + * reload operation. + * If the driver is not able to comply, it has to take the devlink->lock + * while calling this. 
*/ -int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *init_val) +int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *val) { struct devlink_param_item *param_item; - if (!devlink_reload_supported(devlink->ops)) + if (WARN_ON(!devlink_reload_supported(devlink->ops))) return -EOPNOTSUPP; - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + param_item = devlink_param_find_by_id(&devlink->params, param_id); if (!param_item) return -EINVAL; - if (!param_item->driverinit_value_valid || - !devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) + if (!param_item->driverinit_value_valid) return -EOPNOTSUPP; - if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) - strcpy(init_val->vstr, param_item->driverinit_value.vstr); - else - *init_val = param_item->driverinit_value; + if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT))) + return -EOPNOTSUPP; + + *val = param_item->driverinit_value; return 0; } -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); +EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get); /** - * devlink_param_driverinit_value_set - set value of configuration - * parameter for driverinit - * configuration mode + * devl_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode * * @devlink: devlink * @param_id: parameter ID @@ -11743,34 +8350,48 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); * This function should be used by the driver to set driverinit * configuration mode default value. */ -int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, - union devlink_param_value init_val) +void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) { struct devlink_param_item *param_item; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); + devl_assert_locked(devlink); - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); - if (!param_item) - return -EINVAL; + param_item = devlink_param_find_by_id(&devlink->params, param_id); + if (WARN_ON(!param_item)) + return; - if (!devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) - return -EOPNOTSUPP; + if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT))) + return; - if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) - strcpy(param_item->driverinit_value.vstr, init_val.vstr); - else - param_item->driverinit_value = init_val; + param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; - return 0; + + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set); + +void devlink_params_driverinit_load_new(struct devlink *devlink) +{ + struct devlink_param_item *param_item; + unsigned long param_id; + + xa_for_each(&devlink->params, param_id, param_item) { + if (!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT) || + !param_item->driverinit_value_new_valid) + continue; + param_item->driverinit_value = param_item->driverinit_value_new; + param_item->driverinit_value_valid = true; + param_item->driverinit_value_new_valid = false; + } } -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); /** - * devlink_param_value_changed - notify devlink on a parameter's value - * change. 
Should be called by the driver - * right after the change. + * devl_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. * * @devlink: devlink * @param_id: parameter ID @@ -11779,16 +8400,16 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); * change, excluding driverinit configuration mode. * For driverinit configuration mode driver should use the function */ -void devlink_param_value_changed(struct devlink *devlink, u32 param_id) +void devl_param_value_changed(struct devlink *devlink, u32 param_id) { struct devlink_param_item *param_item; - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + param_item = devlink_param_find_by_id(&devlink->params, param_id); WARN_ON(!param_item); devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } -EXPORT_SYMBOL_GPL(devlink_param_value_changed); +EXPORT_SYMBOL_GPL(devl_param_value_changed); /** * devl_region_create - create a new address region @@ -11924,8 +8545,10 @@ void devl_region_destroy(struct devlink_region *region) devl_assert_locked(devlink); /* Free all snapshots of region */ + mutex_lock(®ion->snapshot_lock); list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) devlink_region_snapshot_del(region, snapshot); + mutex_unlock(®ion->snapshot_lock); list_del(®ion->list); mutex_destroy(®ion->snapshot_lock); @@ -12872,76 +9495,6 @@ devl_trap_policers_unregister(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devl_trap_policers_unregister); -static void __devlink_compat_running_version(struct devlink *devlink, - char *buf, size_t len) -{ - struct devlink_info_req req = {}; - const struct nlattr *nlattr; - struct sk_buff *msg; - int rem, err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - req.msg = msg; - err = devlink->ops->info_get(devlink, &req, NULL); - if (err) - goto free_msg; - - nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { - const struct nlattr *kv; - int rem_kv; - - if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) - continue; - - nla_for_each_nested(kv, nlattr, rem_kv) { - if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) - continue; - - strlcat(buf, nla_data(kv), len); - strlcat(buf, " ", len); - } - } -free_msg: - nlmsg_free(msg); -} - -void devlink_compat_running_version(struct devlink *devlink, - char *buf, size_t len) -{ - if (!devlink->ops->info_get) - return; - - devl_lock(devlink); - __devlink_compat_running_version(devlink, buf, len); - devl_unlock(devlink); -} - -int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) -{ - struct devlink_flash_update_params params = {}; - int ret; - - if (!devlink->ops->flash_update) - return -EOPNOTSUPP; - - ret = request_firmware(¶ms.fw, file_name, devlink->dev); - if (ret) - return ret; - - devl_lock(devlink); - devlink_flash_update_begin_notify(devlink); - ret = devlink->ops->flash_update(devlink, ¶ms, NULL); - devlink_flash_update_end_notify(devlink); - devl_unlock(devlink); - - release_firmware(params.fw); - - return ret; -} - int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len) { @@ -12977,47 +9530,3 @@ int devlink_compat_switch_id_get(struct net_device *dev, return 0; } - -static void __net_exit devlink_pernet_pre_exit(struct net *net) -{ - struct devlink *devlink; - u32 actions_performed; - unsigned long index; - int err; - - /* In case network namespace is getting destroyed, reload - * all devlink instances from this namespace into init_net. 
- */ - devlinks_xa_for_each_registered_get(net, index, devlink) { - WARN_ON(!(devlink->features & DEVLINK_F_RELOAD)); - mutex_lock(&devlink->lock); - err = devlink_reload(devlink, &init_net, - DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_LIMIT_UNSPEC, - &actions_performed, NULL); - mutex_unlock(&devlink->lock); - if (err && err != -EOPNOTSUPP) - pr_warn("Failed to reload devlink instance into init_net\n"); - devlink_put(devlink); - } -} - -static struct pernet_operations devlink_pernet_ops __net_initdata = { - .pre_exit = devlink_pernet_pre_exit, -}; - -static int __init devlink_init(void) -{ - int err; - - err = genl_register_family(&devlink_nl_family); - if (err) - goto out; - err = register_pernet_subsys(&devlink_pernet_ops); - -out: - WARN_ON(err); - return err; -} - -subsys_initcall(devlink_init); diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c new file mode 100644 index 000000000000..7a332eb70f70 --- /dev/null +++ b/net/devlink/netlink.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> +#include <net/sock.h> + +#include "devl_internal.h" + +static const struct genl_multicast_group devlink_nl_mcgrps[] = { + [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, +}; + +static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = + DEVLINK_ATTR_TRAP_POLICER_ID }, + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO, + DEVLINK_PORT_TYPE_IB), + [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY, + DEVLINK_ESWITCH_MODE_SWITCHDEV), + [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, + [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, + [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, + [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, + [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, + [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = + 
NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), + [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, + [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_ACTION_MAX), + [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), + [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, +}; + +struct devlink * +devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) +{ + struct devlink *devlink; + unsigned long index; + char *busname; + char *devname; + + if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) + return ERR_PTR(-EINVAL); + + busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); + devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + + devlinks_xa_for_each_registered_get(net, index, devlink) { + devl_lock(devlink); + if (devl_is_registered(devlink) && + strcmp(devlink->dev->bus->name, busname) == 0 && + strcmp(dev_name(devlink->dev), devname) == 0) + return devlink; + devl_unlock(devlink); + devlink_put(devlink); + } + + return ERR_PTR(-ENODEV); +} + +static int devlink_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink_linecard *linecard; + struct devlink_port *devlink_port; + struct devlink *devlink; + int err; + + devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs); + if (IS_ERR(devlink)) + return PTR_ERR(devlink); + + info->user_ptr[0] = devlink; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + devlink_port = devlink_port_get_from_info(devlink, info); + if (IS_ERR(devlink_port)) { + err = PTR_ERR(devlink_port); + goto unlock; + } + info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { + devlink_port = devlink_port_get_from_info(devlink, info); + if (!IS_ERR(devlink_port)) + info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { + struct devlink_rate *devlink_rate; + + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) { + err = PTR_ERR(devlink_rate); + goto unlock; + } + info->user_ptr[1] = 
devlink_rate; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) { + err = PTR_ERR(rate_node); + goto unlock; + } + info->user_ptr[1] = rate_node; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) { + err = PTR_ERR(linecard); + goto unlock; + } + info->user_ptr[1] = linecard; + } + return 0; + +unlock: + devl_unlock(devlink); + devlink_put(devlink); + return err; +} + +static void devlink_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink; + + devlink = info->user_ptr[0]; + devl_unlock(devlink); + devlink_put(devlink); +} + +static const struct devlink_cmd *devl_cmds[] = { + [DEVLINK_CMD_GET] = &devl_cmd_get, + [DEVLINK_CMD_PORT_GET] = &devl_cmd_port_get, + [DEVLINK_CMD_SB_GET] = &devl_cmd_sb_get, + [DEVLINK_CMD_SB_POOL_GET] = &devl_cmd_sb_pool_get, + [DEVLINK_CMD_SB_PORT_POOL_GET] = &devl_cmd_sb_port_pool_get, + [DEVLINK_CMD_SB_TC_POOL_BIND_GET] = &devl_cmd_sb_tc_pool_bind_get, + [DEVLINK_CMD_PARAM_GET] = &devl_cmd_param_get, + [DEVLINK_CMD_REGION_GET] = &devl_cmd_region_get, + [DEVLINK_CMD_INFO_GET] = &devl_cmd_info_get, + [DEVLINK_CMD_HEALTH_REPORTER_GET] = &devl_cmd_health_reporter_get, + [DEVLINK_CMD_TRAP_GET] = &devl_cmd_trap_get, + [DEVLINK_CMD_TRAP_GROUP_GET] = &devl_cmd_trap_group_get, + [DEVLINK_CMD_TRAP_POLICER_GET] = &devl_cmd_trap_policer_get, + [DEVLINK_CMD_RATE_GET] = &devl_cmd_rate_get, + [DEVLINK_CMD_LINECARD_GET] = &devl_cmd_linecard_get, + [DEVLINK_CMD_SELFTESTS_GET] = &devl_cmd_selftests_get, +}; + +int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + const struct devlink_cmd *cmd; + struct devlink *devlink; + int err = 0; + + cmd = devl_cmds[info->op.cmd]; + + while ((devlink = devlinks_xa_find_get(sock_net(msg->sk), + &state->instance))) { + devl_lock(devlink); + + if (devl_is_registered(devlink)) + err = cmd->dump_one(msg, devlink, cb); + else + err = 0; + + devl_unlock(devlink); + devlink_put(devlink); + + if (err) + break; + + state->instance++; + + /* restart sub-object walk for the next instance */ + state->idx = 0; + } + + if (err != -EMSGSIZE) + return err; + return msg->len; +} + +struct genl_family devlink_nl_family __ro_after_init = { + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .policy = devlink_nl_policy, + .netnsok = true, + .parallel_ops = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, + .module = THIS_MODULE, + .small_ops = devlink_nl_ops, + .n_small_ops = ARRAY_SIZE(devlink_nl_ops), + .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, + .mcgrps = devlink_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), +}; diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 3aced951d5ab..01e54b46ae0b 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -337,7 +337,7 @@ static int __init init_dns_resolver(void) * this is used to prevent malicious redirections from being installed * with add_key(). 
*/ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/net/dsa/master.c b/net/dsa/master.c index 26d90140d271..22d3f16b0e6d 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -299,7 +299,7 @@ static ssize_t tagging_show(struct device *d, struct device_attribute *attr, struct net_device *dev = to_net_dev(d); struct dsa_port *cpu_dp = dev->dsa_ptr; - return sprintf(buf, "%s\n", + return sysfs_emit(buf, "%s\n", dsa_tag_protocol_to_str(cpu_dp->tag_ops)); } @@ -464,9 +464,7 @@ int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp, err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack); if (err) { - if (extack && !extack->_msg) - NL_SET_ERR_MSG_MOD(extack, - "CPU port failed to join LAG"); + NL_SET_ERR_MSG_WEAK_MOD(extack, "CPU port failed to join LAG"); goto out_master_teardown; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index aab79c355224..6957971c2db2 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1117,6 +1117,40 @@ static void dsa_slave_net_selftest(struct net_device *ndev, net_selftest(ndev, etest, buf); } +static int dsa_slave_get_mm(struct net_device *dev, + struct ethtool_mm_state *state) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->get_mm) + return -EOPNOTSUPP; + + return ds->ops->get_mm(ds, dp->index, state); +} + +static int dsa_slave_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, + struct netlink_ext_ack *extack) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->set_mm) + return -EOPNOTSUPP; + + return ds->ops->set_mm(ds, dp->index, cfg, extack); +} + +static void dsa_slave_get_mm_stats(struct net_device *dev, + struct ethtool_mm_stats *stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_mm_stats) + ds->ops->get_mm_stats(ds, dp->index, stats); +} + static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -2205,6 +2239,9 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, .self_test = dsa_slave_net_selftest, + .get_mm = dsa_slave_get_mm, + .set_mm = dsa_slave_set_mm, + .get_mm_stats = dsa_slave_get_mm_stats, }; static const struct dcbnl_rtnl_ops __maybe_unused dsa_slave_dcbnl_ops = { @@ -2655,9 +2692,8 @@ static int dsa_slave_changeupper(struct net_device *dev, if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { - if (extack && !extack->_msg) - NL_SET_ERR_MSG_MOD(extack, - "Offloading not supported"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "Offloading not supported"); err = 0; } err = notifier_from_errno(err); @@ -2670,8 +2706,8 @@ static int dsa_slave_changeupper(struct net_device *dev, err = dsa_port_lag_join(dp, info->upper_dev, info->upper_info, extack); if (err == -EOPNOTSUPP) { - NL_SET_ERR_MSG_MOD(info->info.extack, - "Offloading not supported"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "Offloading not supported"); err = 0; } err = notifier_from_errno(err); @@ -2683,8 +2719,8 @@ static int dsa_slave_changeupper(struct net_device *dev, if (info->linking) { err = dsa_port_hsr_join(dp, info->upper_dev); if (err == -EOPNOTSUPP) { - NL_SET_ERR_MSG_MOD(info->info.extack, - "Offloading not supported"); + NL_SET_ERR_MSG_WEAK_MOD(extack, + "Offloading not supported"); err = 0; } err = notifier_from_errno(err); diff --git a/net/dsa/tag.c 
b/net/dsa/tag.c index 383721e167d6..b2fba1a003ce 100644 --- a/net/dsa/tag.c +++ b/net/dsa/tag.c @@ -33,6 +33,9 @@ static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, struct dsa_switch *ds = p->dp->ds; unsigned int type; + if (!ds->ops->port_rxtstamp) + return false; + if (skb_headroom(skb) < ETH_HLEN) return false; @@ -45,10 +48,7 @@ static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, if (type == PTP_CLASS_NONE) return false; - if (likely(ds->ops->port_rxtstamp)) - return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); - - return false; + return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); } static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index b1263917fcb2..5ee9ef00954e 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -415,6 +415,7 @@ static void dsa_tag_8021q_teardown(struct dsa_switch *ds) int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto) { struct dsa_8021q_context *ctx; + int err; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -427,7 +428,15 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto) ds->tag_8021q_ctx = ctx; - return dsa_tag_8021q_setup(ds); + err = dsa_tag_8021q_setup(ds); + if (err) + goto err_free; + + return 0; + +err_free: + kfree(ctx); + return err; } EXPORT_SYMBOL_GPL(dsa_tag_8021q_register); diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index 71884296fc70..03a1fb9c87a9 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -51,7 +51,8 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, return NULL; } - pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN); + if (pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN)) + return NULL; dsa_default_offload_fwd_mark(skb); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0f6ae143afc9..0eb1c7784c3d 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -4,8 +4,10 @@ * Copyright (c) 2017 Microchip Technology */ +#include <linux/dsa/ksz_common.h> #include <linux/etherdevice.h> #include <linux/list.h> +#include <linux/ptp_classify.h> #include <net/dsa.h> #include "tag.h" @@ -16,9 +18,71 @@ #define LAN937X_NAME "lan937x" /* Typically only one byte is used for tail tag. 
*/ +#define KSZ_PTP_TAG_LEN 4 #define KSZ_EGRESS_TAG_LEN 1 #define KSZ_INGRESS_TAG_LEN 1 +#define KSZ_HWTS_EN 0 + +struct ksz_tagger_private { + struct ksz_tagger_data data; /* Must be first */ + unsigned long state; + struct kthread_worker *xmit_worker; +}; + +static struct ksz_tagger_private * +ksz_tagger_private(struct dsa_switch *ds) +{ + return ds->tagger_data; +} + +static void ksz_hwtstamp_set_state(struct dsa_switch *ds, bool on) +{ + struct ksz_tagger_private *priv = ksz_tagger_private(ds); + + if (on) + set_bit(KSZ_HWTS_EN, &priv->state); + else + clear_bit(KSZ_HWTS_EN, &priv->state); +} + +static void ksz_disconnect(struct dsa_switch *ds) +{ + struct ksz_tagger_private *priv = ds->tagger_data; + + kthread_destroy_worker(priv->xmit_worker); + kfree(priv); + ds->tagger_data = NULL; +} + +static int ksz_connect(struct dsa_switch *ds) +{ + struct ksz_tagger_data *tagger_data; + struct kthread_worker *xmit_worker; + struct ksz_tagger_private *priv; + int ret; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + ds->dst->index, ds->index); + if (IS_ERR(xmit_worker)) { + ret = PTR_ERR(xmit_worker); + kfree(priv); + return ret; + } + + priv->xmit_worker = xmit_worker; + /* Export functions for switch driver use */ + tagger_data = &priv->data; + tagger_data->hwtstamp_set_state = ksz_hwtstamp_set_state; + ds->tagger_data = priv; + + return 0; +} + static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, struct net_device *dev, unsigned int port, unsigned int len) @@ -27,7 +91,8 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, if (!skb->dev) return NULL; - pskb_trim_rcsum(skb, skb->len - len); + if (pskb_trim_rcsum(skb, skb->len - len)) + return NULL; dsa_default_offload_fwd_mark(skb); @@ -91,17 +156,20 @@ DSA_TAG_DRIVER(ksz8795_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME); /* - * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS. + * For Ingress (Host -> KSZ9477), 2/6 bytes are added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)| + * FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if PTP is enabled in the Hardware) * tag0 : Prioritization (not used now) * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) * - * For Egress (KSZ9477 -> Host), 1 byte is added before FCS. + * For Egress (KSZ9477 -> Host), 1/5 bytes is added before FCS. 
* --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if bit 7 of tag0 is set) * tag0 : zero-based value represents port * (eg, 0x00=port1, 0x02=port3, 0x06=port7) */ @@ -110,12 +178,100 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME); #define KSZ9477_PTP_TAG_LEN 4 #define KSZ9477_PTP_TAG_INDICATION 0x80 +#define KSZ9477_TAIL_TAG_PRIO GENMASK(8, 7) #define KSZ9477_TAIL_TAG_OVERRIDE BIT(9) #define KSZ9477_TAIL_TAG_LOOKUP BIT(10) +static void ksz_rcv_timestamp(struct sk_buff *skb, u8 *tag) +{ + u8 *tstamp_raw = tag - KSZ_PTP_TAG_LEN; + ktime_t tstamp; + + tstamp = ksz_decode_tstamp(get_unaligned_be32(tstamp_raw)); + KSZ_SKB_CB(skb)->tstamp = tstamp; +} + +/* Time stamp tag *needs* to be inserted if PTP is enabled in hardware. + * Regardless of Whether it is a PTP frame or not. + */ +static void ksz_xmit_timestamp(struct dsa_port *dp, struct sk_buff *skb) +{ + struct ksz_tagger_private *priv; + struct ptp_header *ptp_hdr; + unsigned int ptp_type; + u32 tstamp_raw = 0; + s64 correction; + + priv = ksz_tagger_private(dp->ds); + + if (!test_bit(KSZ_HWTS_EN, &priv->state)) + return; + + if (!KSZ_SKB_CB(skb)->update_correction) + goto output_tag; + + ptp_type = KSZ_SKB_CB(skb)->ptp_type; + + ptp_hdr = ptp_parse_header(skb, ptp_type); + if (!ptp_hdr) + goto output_tag; + + correction = (s64)get_unaligned_be64(&ptp_hdr->correction); + + if (correction < 0) { + struct timespec64 ts; + + ts = ns_to_timespec64(-correction >> 16); + tstamp_raw = ((ts.tv_sec & 3) << 30) | ts.tv_nsec; + + /* Set correction field to 0 and update UDP checksum */ + ptp_header_update_correction(skb, ptp_type, ptp_hdr, 0); + } + +output_tag: + put_unaligned_be32(tstamp_raw, skb_put(skb, KSZ_PTP_TAG_LEN)); +} + +/* Defer transmit if waiting for egress time stamp is required. */ +static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb) +{ + struct ksz_tagger_data *tagger_data = ksz_tagger_data(dp->ds); + struct ksz_tagger_private *priv = ksz_tagger_private(dp->ds); + void (*xmit_work_fn)(struct kthread_work *work); + struct sk_buff *clone = KSZ_SKB_CB(skb)->clone; + struct ksz_deferred_xmit_work *xmit_work; + struct kthread_worker *xmit_worker; + + if (!clone) + return skb; /* no deferred xmit for this packet */ + + xmit_work_fn = tagger_data->xmit_work_fn; + xmit_worker = priv->xmit_worker; + + if (!xmit_work_fn || !xmit_worker) + return NULL; + + xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + if (!xmit_work) + return NULL; + + kthread_init_work(&xmit_work->work, xmit_work_fn); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. 
+ */ + xmit_work->dp = dp; + xmit_work->skb = skb_get(skb); + + kthread_queue_work(xmit_worker, &xmit_work->work); + + return NULL; +} + static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, struct net_device *dev) { + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 prio = netdev_txq_to_tc(dev, queue_mapping); struct dsa_port *dp = dsa_slave_to_port(dev); __be16 *tag; u8 *addr; @@ -125,17 +281,21 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, return NULL; /* Tag encoding */ + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN); addr = skb_mac_header(skb); val = BIT(dp->index); + val |= FIELD_PREP(KSZ9477_TAIL_TAG_PRIO, prio); + if (is_link_local_ether_addr(addr)) val |= KSZ9477_TAIL_TAG_OVERRIDE; *tag = cpu_to_be16(val); - return skb; + return ksz_defer_xmit(dp, skb); } static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) @@ -146,8 +306,10 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) unsigned int len = KSZ_EGRESS_TAG_LEN; /* Extra 4-bytes PTP timestamp */ - if (tag[0] & KSZ9477_PTP_TAG_INDICATION) - len += KSZ9477_PTP_TAG_LEN; + if (tag[0] & KSZ9477_PTP_TAG_INDICATION) { + ksz_rcv_timestamp(skb, tag); + len += KSZ_PTP_TAG_LEN; + } return ksz_common_rcv(skb, dev, port, len); } @@ -157,18 +319,23 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9477, .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = KSZ9477_INGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = KSZ9477_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(ksz9477_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477, KSZ9477_NAME); +#define KSZ9893_TAIL_TAG_PRIO GENMASK(4, 3) #define KSZ9893_TAIL_TAG_OVERRIDE BIT(5) #define KSZ9893_TAIL_TAG_LOOKUP BIT(6) static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, struct net_device *dev) { + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 prio = netdev_txq_to_tc(dev, queue_mapping); struct dsa_port *dp = dsa_slave_to_port(dev); u8 *addr; u8 *tag; @@ -177,15 +344,19 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, return NULL; /* Tag encoding */ + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); addr = skb_mac_header(skb); *tag = BIT(dp->index); + *tag |= FIELD_PREP(KSZ9893_TAIL_TAG_PRIO, prio); + if (is_link_local_ether_addr(addr)) *tag |= KSZ9893_TAIL_TAG_OVERRIDE; - return skb; + return ksz_defer_xmit(dp, skb); } static const struct dsa_device_ops ksz9893_netdev_ops = { @@ -193,23 +364,28 @@ static const struct dsa_device_ops ksz9893_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9893, .xmit = ksz9893_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = KSZ_INGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = KSZ_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(ksz9893_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME); -/* For xmit, 2 bytes are added before FCS. +/* For xmit, 2/6 bytes are added before FCS. 
* --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)| + * FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if PTP is enabled in the Hardware) * tag0 : represents tag override, lookup and valid * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x80=port8) * - * For rcv, 1 byte is added before FCS. + * For rcv, 1/5 bytes is added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if bit 7 of tag0 is set) * tag0 : zero-based value represents port * (eg, 0x00=port1, 0x02=port3, 0x07=port8) */ @@ -218,11 +394,14 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME); #define LAN937X_TAIL_TAG_BLOCKING_OVERRIDE BIT(11) #define LAN937X_TAIL_TAG_LOOKUP BIT(12) #define LAN937X_TAIL_TAG_VALID BIT(13) +#define LAN937X_TAIL_TAG_PRIO GENMASK(10, 8) #define LAN937X_TAIL_TAG_PORT_MASK 7 static struct sk_buff *lan937x_xmit(struct sk_buff *skb, struct net_device *dev) { + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 prio = netdev_txq_to_tc(dev, queue_mapping); struct dsa_port *dp = dsa_slave_to_port(dev); const struct ethhdr *hdr = eth_hdr(skb); __be16 *tag; @@ -231,10 +410,14 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb, if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) return NULL; + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN); val = BIT(dp->index); + val |= FIELD_PREP(LAN937X_TAIL_TAG_PRIO, prio); + if (is_link_local_ether_addr(hdr->h_dest)) val |= LAN937X_TAIL_TAG_BLOCKING_OVERRIDE; @@ -243,7 +426,7 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb, put_unaligned_be16(val, tag); - return skb; + return ksz_defer_xmit(dp, skb); } static const struct dsa_device_ops lan937x_netdev_ops = { @@ -251,7 +434,9 @@ static const struct dsa_device_ops lan937x_netdev_ops = { .proto = DSA_TAG_PROTO_LAN937X, .xmit = lan937x_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = LAN937X_EGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = LAN937X_EGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(lan937x_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index f14f51b41491..1c2ceba4771b 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -670,7 +670,8 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, * padding and trailer we need to account for the fact that * skb->data points to skb_mac_header(skb) + ETH_HLEN. 
*/ - pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN); + if (pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN)) + return NULL; /* Trap-to-host frame, no timestamp trailer */ } else { *source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 228f13df2e18..504f954a1b28 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \ - pse-pd.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ + module.o pse-pd.o plca.o mm.o diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index c7e37130647e..61c40e889a4d 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -86,18 +86,6 @@ static int channels_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_channels_request_ops = { - .request_cmd = ETHTOOL_MSG_CHANNELS_GET, - .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY, - .hdr_attr = ETHTOOL_A_CHANNELS_HEADER, - .req_info_size = sizeof(struct channels_req_info), - .reply_data_size = sizeof(struct channels_reply_data), - - .prepare_data = channels_prepare_data, - .reply_size = channels_reply_size, - .fill_reply = channels_fill_reply, -}; - /* CHANNELS_SET */ const struct nla_policy ethnl_channels_set_policy[] = { @@ -109,36 +97,28 @@ const struct nla_policy ethnl_channels_set_policy[] = { [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 }, }; -int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_channels_validate(struct ethnl_req_info *req_info, + struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_channels && ops->set_channels ? 
1 : -EOPNOTSUPP; +} + +static int +ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) { unsigned int from_channel, old_total, i; bool mod = false, mod_combined = false; + struct net_device *dev = req_info->dev; struct ethtool_channels channels = {}; - struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; u32 err_attr, max_rxfh_in_use; - const struct ethtool_ops *ops; - struct net_device *dev; u64 max_rxnfc_in_use; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_CHANNELS_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_channels || !ops->set_channels) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ops->get_channels(dev, &channels); + dev->ethtool_ops->get_channels(dev, &channels); old_total = channels.combined_count + max(channels.rx_count, channels.tx_count); @@ -151,9 +131,8 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&channels.combined_count, tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined); mod |= mod_combined; - ret = 0; if (!mod) - goto out_ops; + return 0; /* ensure new channel counts are within limits */ if (channels.rx_count > channels.max_rx) @@ -167,10 +146,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) else err_attr = 0; if (err_attr) { - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel count exceeds maximum"); - goto out_ops; + return -EINVAL; } /* ensure there is at least one RX and one TX channel */ @@ -183,10 +161,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) if (err_attr) { if (mod_combined) err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel counts would result in no RX or TX channel being configured"); - goto out_ops; + return -EINVAL; } /* ensure the new Rx count fits within the configured Rx flow @@ -198,14 +175,12 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use)) max_rxfh_in_use = 0; if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) { - ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings"); - goto out_ops; + return -EINVAL; } if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) { - ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings"); - goto out_ops; + return -EINVAL; } /* Disabling channels, query zero-copy AF_XDP sockets */ @@ -213,21 +188,26 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) if (xsk_get_pool_from_qid(dev, i)) { - ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); - goto out_ops; + return -EINVAL; } ret = dev->ethtool_ops->set_channels(dev, &channels); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? 
ret : 1; } + +const struct ethnl_request_ops ethnl_channels_request_ops = { + .request_cmd = ETHTOOL_MSG_CHANNELS_GET, + .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY, + .hdr_attr = ETHTOOL_A_CHANNELS_HEADER, + .req_info_size = sizeof(struct channels_req_info), + .reply_data_size = sizeof(struct channels_reply_data), + + .prepare_data = channels_prepare_data, + .reply_size = channels_reply_size, + .fill_reply = channels_fill_reply, + + .set_validate = ethnl_set_channels_validate, + .set = ethnl_set_channels, + .set_ntf_cmd = ETHTOOL_MSG_CHANNELS_NTF, +}; diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 487bdf345541..443e7e642c96 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -105,7 +105,10 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ - nla_total_size(sizeof(u8)); /* _USE_CQE_MODE_RX */ + nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */ + nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ + nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */ + nla_total_size(sizeof(u32)); /* _TX_AGGR_TIME_USECS */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, @@ -180,24 +183,18 @@ static int coalesce_fill_reply(struct sk_buff *skb, coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, kcoal->use_cqe_mode_tx, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, - kcoal->use_cqe_mode_rx, supported)) + kcoal->use_cqe_mode_rx, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, + kcoal->tx_aggr_max_bytes, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, + kcoal->tx_aggr_max_frames, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, + kcoal->tx_aggr_time_usecs, supported)) return -EMSGSIZE; return 0; } -const struct ethnl_request_ops ethnl_coalesce_request_ops = { - .request_cmd = ETHTOOL_MSG_COALESCE_GET, - .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY, - .hdr_attr = ETHTOOL_A_COALESCE_HEADER, - .req_info_size = sizeof(struct coalesce_req_info), - .reply_data_size = sizeof(struct coalesce_reply_data), - - .prepare_data = coalesce_prepare_data, - .reply_size = coalesce_reply_size, - .fill_reply = coalesce_fill_reply, -}; - /* COALESCE_SET */ const struct nla_policy ethnl_coalesce_set_policy[] = { @@ -227,51 +224,49 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, }; -int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_coalesce_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct kernel_ethtool_coalesce kernel_coalesce = {}; - struct ethtool_coalesce coalesce = {}; - struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; - const struct ethtool_ops *ops; - struct net_device *dev; u32 supported_params; - bool mod = false; - int ret; u16 a; - ret = ethnl_parse_header_dev_get(&req_info, - 
tb[ETHTOOL_A_COALESCE_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; if (!ops->get_coalesce || !ops->set_coalesce) - goto out_dev; + return -EOPNOTSUPP; /* make sure that only supported parameters are present */ supported_params = ops->supported_coalesce_params; for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++) if (tb[a] && !(supported_params & attr_to_mask(a))) { - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, tb[a], "cannot modify an unsupported parameter"); - goto out_dev; + return -EINVAL; } - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ret = ops->get_coalesce(dev, &coalesce, &kernel_coalesce, - info->extack); + return 1; +} + +static int +ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct kernel_ethtool_coalesce kernel_coalesce = {}; + struct net_device *dev = req_info->dev; + struct ethtool_coalesce coalesce = {}; + struct nlattr **tb = info->attrs; + bool mod = false; + int ret; + + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, + info->extack); if (ret < 0) - goto out_ops; + return ret; ethnl_update_u32(&coalesce.rx_coalesce_usecs, tb[ETHTOOL_A_COALESCE_RX_USECS], &mod); @@ -321,21 +316,32 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod); - ret = 0; + ethnl_update_u32(&kernel_coalesce.tx_aggr_max_bytes, + tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES], &mod); + ethnl_update_u32(&kernel_coalesce.tx_aggr_max_frames, + tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod); + ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, + tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, info->extack); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? 
ret : 1; } + +const struct ethnl_request_ops ethnl_coalesce_request_ops = { + .request_cmd = ETHTOOL_MSG_COALESCE_GET, + .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY, + .hdr_attr = ETHTOOL_A_COALESCE_HEADER, + .req_info_size = sizeof(struct coalesce_req_info), + .reply_data_size = sizeof(struct coalesce_reply_data), + + .prepare_data = coalesce_prepare_data, + .reply_size = coalesce_reply_size, + .fill_reply = coalesce_fill_reply, + + .set_validate = ethnl_set_coalesce_validate, + .set = ethnl_set_coalesce, + .set_ntf_cmd = ETHTOOL_MSG_COALESCE_NTF, +}; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 21cfe8557205..5fb19050991e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -208,6 +208,9 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full), __DEFINE_LINK_MODE_NAME(800000, SR8, Full), __DEFINE_LINK_MODE_NAME(800000, VR8, Full), + __DEFINE_LINK_MODE_NAME(10, T1S, Full), + __DEFINE_LINK_MODE_NAME(10, T1S, Half), + __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -244,6 +247,8 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_X 1 #define __LINK_MODE_LANES_FX 1 #define __LINK_MODE_LANES_T1L 1 +#define __LINK_MODE_LANES_T1S 1 +#define __LINK_MODE_LANES_T1S_P2MP 1 #define __LINK_MODE_LANES_VR8 8 #define __LINK_MODE_LANES_DR8_2 8 @@ -366,6 +371,9 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full), __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full), __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full), + __DEFINE_LINK_MODE_PARAMS(10, T1S, Full), + __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), + __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -417,6 +425,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = { [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo", [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw", [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc", + [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)] = "option-id-tcp", }; static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index b1b9db810eca..28b8aaaf9bcb 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -54,4 +54,6 @@ int ethtool_get_module_info_call(struct net_device *dev, int ethtool_get_module_eeprom_call(struct net_device *dev, struct ethtool_eeprom *ee, u8 *data); +bool __ethtool_dev_mm_supported(struct net_device *dev); + #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index d73888c7d19c..e4369769817e 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -63,18 +63,6 @@ static int debug_fill_reply(struct sk_buff *skb, netif_msg_class_names, compact); } -const struct ethnl_request_ops ethnl_debug_request_ops = { - .request_cmd = ETHTOOL_MSG_DEBUG_GET, - .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, - .hdr_attr = ETHTOOL_A_DEBUG_HEADER, - .req_info_size = sizeof(struct debug_req_info), - .reply_data_size = sizeof(struct debug_reply_data), - - .prepare_data = debug_prepare_data, - .reply_size = debug_reply_size, - .fill_reply = debug_fill_reply, -}; - /* DEBUG_SET */ const struct nla_policy ethnl_debug_set_policy[] = { @@ -83,46 +71,47 @@ const struct nla_policy ethnl_debug_set_policy[] = { [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED }, }; -int 
ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_debug_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_msglevel && ops->set_msglevel ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_debug(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct net_device *dev; bool mod = false; u32 msg_mask; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_DEBUG_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ret = -EOPNOTSUPP; - if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - msg_mask = dev->ethtool_ops->get_msglevel(dev); ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT, tb[ETHTOOL_A_DEBUG_MSGMASK], netif_msg_class_names, info->extack, &mod); if (ret < 0 || !mod) - goto out_ops; + return ret; dev->ethtool_ops->set_msglevel(dev, msg_mask); - ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return 1; } + +const struct ethnl_request_ops ethnl_debug_request_ops = { + .request_cmd = ETHTOOL_MSG_DEBUG_GET, + .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, + .hdr_attr = ETHTOOL_A_DEBUG_HEADER, + .req_info_size = sizeof(struct debug_req_info), + .reply_data_size = sizeof(struct debug_reply_data), + + .prepare_data = debug_prepare_data, + .reply_size = debug_reply_size, + .fill_reply = debug_fill_reply, + + .set_validate = ethnl_set_debug_validate, + .set = ethnl_set_debug, + .set_ntf_cmd = ETHTOOL_MSG_DEBUG_NTF, +}; diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c index 45c42b2d5f17..42104bcb0e47 100644 --- a/net/ethtool/eee.c +++ b/net/ethtool/eee.c @@ -108,18 +108,6 @@ static int eee_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_eee_request_ops = { - .request_cmd = ETHTOOL_MSG_EEE_GET, - .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY, - .hdr_attr = ETHTOOL_A_EEE_HEADER, - .req_info_size = sizeof(struct eee_req_info), - .reply_data_size = sizeof(struct eee_reply_data), - - .prepare_data = eee_prepare_data, - .reply_size = eee_reply_size, - .fill_reply = eee_fill_reply, -}; - /* EEE_SET */ const struct nla_policy ethnl_eee_set_policy[] = { @@ -131,60 +119,56 @@ const struct nla_policy ethnl_eee_set_policy[] = { [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_U32 }, }; -int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_eee_validate(struct ethnl_req_info *req_info, struct genl_info *info) { - struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_eee && ops->set_eee ? 
1 : -EOPNOTSUPP; +} + +static int +ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - const struct ethtool_ops *ops; struct ethtool_eee eee = {}; - struct net_device *dev; bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_EEE_HEADER], - genl_info_net(info), info->extack, - true); + ret = dev->ethtool_ops->get_eee(dev, &eee); if (ret < 0) return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_eee || !ops->set_eee) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ret = ops->get_eee(dev, &eee); - if (ret < 0) - goto out_ops; ret = ethnl_update_bitset32(&eee.advertised, EEE_MODES_COUNT, tb[ETHTOOL_A_EEE_MODES_OURS], link_mode_names, info->extack, &mod); if (ret < 0) - goto out_ops; + return ret; ethnl_update_bool32(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod); ethnl_update_bool32(&eee.tx_lpi_enabled, tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], &mod); ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER], &mod); - ret = 0; if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_eee(dev, &eee); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_eee_request_ops = { + .request_cmd = ETHTOOL_MSG_EEE_GET, + .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY, + .hdr_attr = ETHTOOL_A_EEE_HEADER, + .req_info_size = sizeof(struct eee_req_info), + .reply_data_size = sizeof(struct eee_reply_data), + + .prepare_data = eee_prepare_data, + .reply_size = eee_reply_size, + .fill_reply = eee_fill_reply, + + .set_validate = ethnl_set_eee_validate, + .set = ethnl_set_eee, + .set_ntf_cmd = ETHTOOL_MSG_EEE_NTF, +}; diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index 9f5a134e2e01..0d9a3d153170 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -217,18 +217,6 @@ static int fec_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_fec_request_ops = { - .request_cmd = ETHTOOL_MSG_FEC_GET, - .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, - .hdr_attr = ETHTOOL_A_FEC_HEADER, - .req_info_size = sizeof(struct fec_req_info), - .reply_data_size = sizeof(struct fec_reply_data), - - .prepare_data = fec_prepare_data, - .reply_size = fec_reply_size, - .fill_reply = fec_fill_reply, -}; - /* FEC_SET */ const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { @@ -237,36 +225,28 @@ const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1), }; -int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_fec_validate(struct ethnl_req_info *req_info, struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_fecparam && ops->set_fecparam ? 
1 : -EOPNOTSUPP; +} + +static int +ethnl_set_fec(struct ethnl_req_info *req_info, struct genl_info *info) { __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {}; - struct ethnl_req_info req_info = {}; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; struct ethtool_fecparam fec = {}; - const struct ethtool_ops *ops; - struct net_device *dev; bool mod = false; u8 fec_auto; int ret; - ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEC_HEADER], - genl_info_net(info), info->extack, - true); + ret = dev->ethtool_ops->get_fecparam(dev, &fec); if (ret < 0) return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_fecparam || !ops->set_fecparam) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ret = ops->get_fecparam(dev, &fec); - if (ret < 0) - goto out_ops; ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto); @@ -275,36 +255,39 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_FEC_MODES], link_mode_names, info->extack, &mod); if (ret < 0) - goto out_ops; + return ret; ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod); - - ret = 0; if (!mod) - goto out_ops; + return 0; ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], "invalid FEC modes requested"); - goto out_ops; + return ret; } if (!fec.fec) { - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], "no FEC modes set"); - goto out_ops; + return -EINVAL; } ret = dev->ethtool_ops->set_fecparam(dev, &fec); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_FEC_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? 
ret : 1; } + +const struct ethnl_request_ops ethnl_fec_request_ops = { + .request_cmd = ETHTOOL_MSG_FEC_GET, + .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, + .hdr_attr = ETHTOOL_A_FEC_HEADER, + .req_info_size = sizeof(struct fec_req_info), + .reply_data_size = sizeof(struct fec_reply_data), + + .prepare_data = fec_prepare_data, + .reply_size = fec_reply_size, + .fill_reply = fec_fill_reply, + + .set_validate = ethnl_set_fec_validate, + .set = ethnl_set_fec, + .set_ntf_cmd = ETHTOOL_MSG_FEC_NTF, +}; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index c2f1a542e6fa..646b3e490c71 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2078,58 +2078,91 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) return ret; } -static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +static int ethtool_vzalloc_stats_array(int n_stats, u64 **data) { + if (n_stats < 0) + return n_stats; + if (n_stats > S32_MAX / sizeof(u64)) + return -ENOMEM; + if (WARN_ON_ONCE(!n_stats)) + return -EOPNOTSUPP; + + *data = vzalloc(array_size(n_stats, sizeof(u64))); + if (!*data) + return -ENOMEM; + + return 0; +} + +static int ethtool_get_phy_stats_phydev(struct phy_device *phydev, + struct ethtool_stats *stats, + u64 **data) + { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; + int n_stats, ret; + + if (!phy_ops || !phy_ops->get_sset_count || !phy_ops->get_stats) + return -EOPNOTSUPP; + + n_stats = phy_ops->get_sset_count(phydev); + + ret = ethtool_vzalloc_stats_array(n_stats, data); + if (ret) + return ret; + + stats->n_stats = n_stats; + return phy_ops->get_stats(phydev, stats, *data); +} + +static int ethtool_get_phy_stats_ethtool(struct net_device *dev, + struct ethtool_stats *stats, + u64 **data) +{ const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - struct ethtool_stats stats; - u64 *data; - int ret, n_stats; + int n_stats, ret; - if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) + if (!ops || !ops->get_sset_count || ops->get_ethtool_phy_stats) return -EOPNOTSUPP; - if (phydev && !ops->get_ethtool_phy_stats && - phy_ops && phy_ops->get_sset_count) - n_stats = phy_ops->get_sset_count(phydev); - else - n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); - if (n_stats < 0) - return n_stats; - if (n_stats > S32_MAX / sizeof(u64)) - return -ENOMEM; - WARN_ON_ONCE(!n_stats); + n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); + + ret = ethtool_vzalloc_stats_array(n_stats, data); + if (ret) + return ret; + + stats->n_stats = n_stats; + ops->get_ethtool_phy_stats(dev, stats, *data); + + return 0; +} + +static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +{ + struct phy_device *phydev = dev->phydev; + struct ethtool_stats stats; + u64 *data = NULL; + int ret = -EOPNOTSUPP; if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; - stats.n_stats = n_stats; + if (phydev) + ret = ethtool_get_phy_stats_phydev(phydev, &stats, &data); - if (n_stats) { - data = vzalloc(array_size(n_stats, sizeof(u64))); - if (!data) - return -ENOMEM; + if (ret == -EOPNOTSUPP) + ret = ethtool_get_phy_stats_ethtool(dev, &stats, &data); - if (phydev && !ops->get_ethtool_phy_stats && - phy_ops && phy_ops->get_stats) { - ret = phy_ops->get_stats(phydev, &stats, data); - if (ret < 0) - goto out; - } else { - ops->get_ethtool_phy_stats(dev, &stats, data); - } - } else { - data = NULL; - } + if (ret) + goto out; - ret = -EFAULT; - if (copy_to_user(useraddr, &stats, sizeof(stats))) 
+ if (copy_to_user(useraddr, &stats, sizeof(stats))) { + ret = -EFAULT; goto out; + } + useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) - goto out; - ret = 0; + if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64)))) + ret = -EFAULT; out: vfree(data); diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index efa0f7f48836..310dfe63292a 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -73,18 +73,6 @@ static int linkinfo_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_linkinfo_request_ops = { - .request_cmd = ETHTOOL_MSG_LINKINFO_GET, - .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, - .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, - .req_info_size = sizeof(struct linkinfo_req_info), - .reply_data_size = sizeof(struct linkinfo_reply_data), - - .prepare_data = linkinfo_prepare_data, - .reply_size = linkinfo_reply_size, - .fill_reply = linkinfo_fill_reply, -}; - /* LINKINFO_SET */ const struct nla_policy ethnl_linkinfo_set_policy[] = { @@ -95,37 +83,31 @@ const struct nla_policy ethnl_linkinfo_set_policy[] = { [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, }; -int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info, + struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + if (!ops->get_link_ksettings || !ops->set_link_ksettings) + return -EOPNOTSUPP; + return 1; +} + +static int +ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; struct ethtool_link_settings *lsettings; - struct ethnl_req_info req_info = {}; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct net_device *dev; bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_LINKINFO_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ret = -EOPNOTSUPP; - if (!dev->ethtool_ops->get_link_ksettings || - !dev->ethtool_ops->set_link_ksettings) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); - goto out_ops; + return ret; } lsettings = &ksettings.base; @@ -134,21 +116,30 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) &mod); ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); - ret = 0; if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); - if (ret < 0) + if (ret < 0) { GENL_SET_ERR_MSG(info, "link settings update failed"); - else - ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + return ret; + } -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return 1; } + +const struct ethnl_request_ops ethnl_linkinfo_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKINFO_GET, + .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, + .req_info_size = sizeof(struct linkinfo_req_info), + .reply_data_size = sizeof(struct linkinfo_reply_data), + + .prepare_data = linkinfo_prepare_data, + .reply_size = linkinfo_reply_size, + .fill_reply = linkinfo_fill_reply, + + .set_validate = ethnl_set_linkinfo_validate, + .set = 
ethnl_set_linkinfo, + .set_ntf_cmd = ETHTOOL_MSG_LINKINFO_NTF, +}; diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 126e06c713a3..fab66c169b9f 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -151,18 +151,6 @@ static int linkmodes_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_linkmodes_request_ops = { - .request_cmd = ETHTOOL_MSG_LINKMODES_GET, - .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, - .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, - .req_info_size = sizeof(struct linkmodes_req_info), - .reply_data_size = sizeof(struct linkmodes_reply_data), - - .prepare_data = linkmodes_prepare_data, - .reply_size = linkmodes_reply_size, - .fill_reply = linkmodes_fill_reply, -}; - /* LINKMODES_SET */ const struct nla_policy ethnl_linkmodes_set_policy[] = { @@ -310,59 +298,64 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, return 0; } -int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_linkmodes_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct ethtool_link_ksettings ksettings = {}; - struct ethnl_req_info req_info = {}; - struct nlattr **tb = info->attrs; - struct net_device *dev; - bool mod = false; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; int ret; - ret = ethnl_check_linkmodes(info, tb); + ret = ethnl_check_linkmodes(info, info->attrs); if (ret < 0) return ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_LINKMODES_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ret = -EOPNOTSUPP; - if (!dev->ethtool_ops->get_link_ksettings || - !dev->ethtool_ops->set_link_ksettings) - goto out_dev; + if (!ops->get_link_ksettings || !ops->set_link_ksettings) + return -EOPNOTSUPP; + return 1; +} - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; +static int +ethnl_set_linkmodes(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct ethtool_link_ksettings ksettings = {}; + struct net_device *dev = req_info->dev; + struct nlattr **tb = info->attrs; + bool mod = false; + int ret; ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); - goto out_ops; + return ret; } ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod, dev); if (ret < 0) - goto out_ops; + return ret; + if (!mod) + return 0; - if (mod) { - ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); - if (ret < 0) - GENL_SET_ERR_MSG(info, "link settings update failed"); - else - ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); + if (ret < 0) { + GENL_SET_ERR_MSG(info, "link settings update failed"); + return ret; } -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return 1; } + +const struct ethnl_request_ops ethnl_linkmodes_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKMODES_GET, + .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, + .req_info_size = sizeof(struct linkmodes_req_info), + .reply_data_size = sizeof(struct linkmodes_reply_data), + + .prepare_data = linkmodes_prepare_data, + .reply_size = linkmodes_reply_size, + .fill_reply = linkmodes_fill_reply, + + .set_validate = ethnl_set_linkmodes_validate, + .set = ethnl_set_linkmodes, + .set_ntf_cmd = ETHTOOL_MSG_LINKMODES_NTF, +}; 
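
[Editorial note, not part of the patch] The SET-side conversions repeated above (channels.c, coalesce.c, debug.c, eee.c, fec.c, linkinfo.c, linkmodes.c) and below (module.c, mm.c, netlink.c) all follow one template: the old per-command ethnl_set_*() doit, which parsed the request header and took rtnl_lock()/ethnl_ops_begin() itself, is split into a .set_validate() callback (runs without locks, returns 1 to proceed, 0 for a no-op, negative errno on error) and a .set() callback (runs under the locks taken by the new ethnl_default_set_doit(), returns 1 when configuration changed so the core emits .set_ntf_cmd, 0 when nothing changed, negative errno on error). The sketch below only illustrates that shape; "foo", ethnl_set_foo*(), ops->get_foo/set_foo, ETHTOOL_A_FOO_VAL and ETHTOOL_MSG_FOO_NTF are made-up names, while the ethnl_request_ops members and return conventions are the ones added in this series.

/* Illustrative sketch only: a SET handler pair in the new style.
 * "foo" and everything named after it is hypothetical; the
 * .set_validate/.set/.set_ntf_cmd members and their 0/1/-errno
 * conventions come from the netlink.h changes in this diff.
 */
static int
ethnl_set_foo_validate(struct ethnl_req_info *req_info, struct genl_info *info)
{
	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;

	/* called before rtnl_lock()/ethnl_ops_begin(); cheap capability check */
	return ops->get_foo && ops->set_foo ? 1 : -EOPNOTSUPP;
}

static int
ethnl_set_foo(struct ethnl_req_info *req_info, struct genl_info *info)
{
	struct net_device *dev = req_info->dev;
	struct nlattr **tb = info->attrs;
	bool mod = false;
	u32 val;
	int ret;

	/* read current state, apply attributes, write back only on change */
	ret = dev->ethtool_ops->get_foo(dev, &val);
	if (ret < 0)
		return ret;

	ethnl_update_u32(&val, tb[ETHTOOL_A_FOO_VAL], &mod);
	if (!mod)
		return 0;	/* nothing changed, core skips the notification */

	ret = dev->ethtool_ops->set_foo(dev, val);
	return ret < 0 ? ret : 1;	/* 1 => core sends .set_ntf_cmd */
}

static const struct ethnl_request_ops ethnl_foo_request_ops = {
	/* ... GET-side members unchanged ... */
	.set_validate	= ethnl_set_foo_validate,
	.set		= ethnl_set_foo,
	.set_ntf_cmd	= ETHTOOL_MSG_FOO_NTF,
};

Splitting validation out lets ethnl_default_set_doit() reject unsupported requests before taking rtnl_lock(), which is why the converted files no longer contain their own out_rtnl/out_dev unwind labels.
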
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c new file mode 100644 index 000000000000..e612856eed8c --- /dev/null +++ b/net/ethtool/mm.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022-2023 NXP + */ +#include "common.h" +#include "netlink.h" + +struct mm_req_info { + struct ethnl_req_info base; +}; + +struct mm_reply_data { + struct ethnl_reply_data base; + struct ethtool_mm_state state; + struct ethtool_mm_stats stats; +}; + +#define MM_REPDATA(__reply_base) \ + container_of(__reply_base, struct mm_reply_data, base) + +#define ETHTOOL_MM_STAT_CNT \ + (__ETHTOOL_A_MM_STAT_CNT - (ETHTOOL_A_MM_STAT_PAD + 1)) + +const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1] = { + [ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), +}; + +static int mm_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct mm_reply_data *data = MM_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_ops *ops; + int ret; + + ops = dev->ethtool_ops; + + if (!ops->get_mm) + return -EOPNOTSUPP; + + ethtool_stats_init((u64 *)&data->stats, + sizeof(data->stats) / sizeof(u64)); + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = ops->get_mm(dev, &data->state); + if (ret) + goto out_complete; + + if (ops->get_mm_stats && (req_base->flags & ETHTOOL_FLAG_STATS)) + ops->get_mm_stats(dev, &data->stats); + +out_complete: + ethnl_ops_complete(dev); + + return ret; +} + +static int mm_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + int len = 0; + + len += nla_total_size(sizeof(u8)); /* _MM_PMAC_ENABLED */ + len += nla_total_size(sizeof(u8)); /* _MM_TX_ENABLED */ + len += nla_total_size(sizeof(u8)); /* _MM_TX_ACTIVE */ + len += nla_total_size(sizeof(u8)); /* _MM_VERIFY_ENABLED */ + len += nla_total_size(sizeof(u8)); /* _MM_VERIFY_STATUS */ + len += nla_total_size(sizeof(u32)); /* _MM_VERIFY_TIME */ + len += nla_total_size(sizeof(u32)); /* _MM_MAX_VERIFY_TIME */ + len += nla_total_size(sizeof(u32)); /* _MM_TX_MIN_FRAG_SIZE */ + len += nla_total_size(sizeof(u32)); /* _MM_RX_MIN_FRAG_SIZE */ + + if (req_base->flags & ETHTOOL_FLAG_STATS) + len += nla_total_size(0) + /* _MM_STATS */ + nla_total_size_64bit(sizeof(u64)) * ETHTOOL_MM_STAT_CNT; + + return len; +} + +static int mm_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) +{ + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + if (nla_put_u64_64bit(skb, attrtype, val, ETHTOOL_A_MM_STAT_PAD)) + return -EMSGSIZE; + return 0; +} + +static int mm_put_stats(struct sk_buff *skb, + const struct ethtool_mm_stats *stats) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_MM_STATS); + if (!nest) + return -EMSGSIZE; + + if (mm_put_stat(skb, stats->MACMergeFrameAssErrorCount, + ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS) || + mm_put_stat(skb, stats->MACMergeFrameSmdErrorCount, + ETHTOOL_A_MM_STAT_SMD_ERRORS) || + mm_put_stat(skb, stats->MACMergeFrameAssOkCount, + ETHTOOL_A_MM_STAT_REASSEMBLY_OK) || + mm_put_stat(skb, stats->MACMergeFragCountRx, + ETHTOOL_A_MM_STAT_RX_FRAG_COUNT) || + mm_put_stat(skb, stats->MACMergeFragCountTx, + ETHTOOL_A_MM_STAT_TX_FRAG_COUNT) || + mm_put_stat(skb, stats->MACMergeHoldCount, + ETHTOOL_A_MM_STAT_HOLD_COUNT)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int mm_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info 
*req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct mm_reply_data *data = MM_REPDATA(reply_base); + const struct ethtool_mm_state *state = &data->state; + + if (nla_put_u8(skb, ETHTOOL_A_MM_TX_ENABLED, state->tx_enabled) || + nla_put_u8(skb, ETHTOOL_A_MM_TX_ACTIVE, state->tx_active) || + nla_put_u8(skb, ETHTOOL_A_MM_PMAC_ENABLED, state->pmac_enabled) || + nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_ENABLED, state->verify_enabled) || + nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_STATUS, state->verify_status) || + nla_put_u32(skb, ETHTOOL_A_MM_VERIFY_TIME, state->verify_time) || + nla_put_u32(skb, ETHTOOL_A_MM_MAX_VERIFY_TIME, state->max_verify_time) || + nla_put_u32(skb, ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, state->tx_min_frag_size) || + nla_put_u32(skb, ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, state->rx_min_frag_size)) + return -EMSGSIZE; + + if (req_base->flags & ETHTOOL_FLAG_STATS && + mm_put_stats(skb, &data->stats)) + return -EMSGSIZE; + + return 0; +} + +const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1] = { + [ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_MM_VERIFY_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_MM_VERIFY_TIME] = NLA_POLICY_RANGE(NLA_U32, 1, 128), + [ETHTOOL_A_MM_TX_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_MM_PMAC_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_MM_TX_MIN_FRAG_SIZE] = NLA_POLICY_RANGE(NLA_U32, 60, 252), +}; + +static void mm_state_to_cfg(const struct ethtool_mm_state *state, + struct ethtool_mm_cfg *cfg) +{ + /* We could also compare state->verify_status against + * ETHTOOL_MM_VERIFY_STATUS_DISABLED, but state->verify_enabled + * is more like an administrative state which should be seen in + * ETHTOOL_MSG_MM_GET replies. For example, a port with verification + * disabled might be in the ETHTOOL_MM_VERIFY_STATUS_INITIAL + * if it's down. + */ + cfg->verify_enabled = state->verify_enabled; + cfg->verify_time = state->verify_time; + cfg->tx_enabled = state->tx_enabled; + cfg->pmac_enabled = state->pmac_enabled; + cfg->tx_min_frag_size = state->tx_min_frag_size; +} + +static int +ethnl_set_mm_validate(struct ethnl_req_info *req_info, struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_mm && ops->set_mm ? 1 : -EOPNOTSUPP; +} + +static int ethnl_set_mm(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct net_device *dev = req_info->dev; + struct ethtool_mm_state state = {}; + struct nlattr **tb = info->attrs; + struct ethtool_mm_cfg cfg = {}; + bool mod = false; + int ret; + + ret = dev->ethtool_ops->get_mm(dev, &state); + if (ret) + return ret; + + mm_state_to_cfg(&state, &cfg); + + ethnl_update_bool(&cfg.verify_enabled, tb[ETHTOOL_A_MM_VERIFY_ENABLED], + &mod); + ethnl_update_u32(&cfg.verify_time, tb[ETHTOOL_A_MM_VERIFY_TIME], &mod); + ethnl_update_bool(&cfg.tx_enabled, tb[ETHTOOL_A_MM_TX_ENABLED], &mod); + ethnl_update_bool(&cfg.pmac_enabled, tb[ETHTOOL_A_MM_PMAC_ENABLED], + &mod); + ethnl_update_u32(&cfg.tx_min_frag_size, + tb[ETHTOOL_A_MM_TX_MIN_FRAG_SIZE], &mod); + + if (!mod) + return 0; + + if (cfg.verify_time > state.max_verify_time) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MM_VERIFY_TIME], + "verifyTime exceeds device maximum"); + return -ERANGE; + } + + ret = dev->ethtool_ops->set_mm(dev, &cfg, extack); + return ret < 0 ? 
ret : 1; +} + +const struct ethnl_request_ops ethnl_mm_request_ops = { + .request_cmd = ETHTOOL_MSG_MM_GET, + .reply_cmd = ETHTOOL_MSG_MM_GET_REPLY, + .hdr_attr = ETHTOOL_A_MM_HEADER, + .req_info_size = sizeof(struct mm_req_info), + .reply_data_size = sizeof(struct mm_reply_data), + + .prepare_data = mm_prepare_data, + .reply_size = mm_reply_size, + .fill_reply = mm_fill_reply, + + .set_validate = ethnl_set_mm_validate, + .set = ethnl_set_mm, + .set_ntf_cmd = ETHTOOL_MSG_MM_NTF, +}; + +/* Returns whether a given device supports the MAC merge layer + * (has an eMAC and a pMAC). Must be called under rtnl_lock() and + * ethnl_ops_begin(). + */ +bool __ethtool_dev_mm_supported(struct net_device *dev) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_mm_state state = {}; + int ret = -EOPNOTSUPP; + + if (ops && ops->get_mm) + ret = ops->get_mm(dev, &state); + + return !!ret; +} diff --git a/net/ethtool/module.c b/net/ethtool/module.c index 898ed436b9e4..e0d539b21423 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -91,18 +91,6 @@ static int module_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_module_request_ops = { - .request_cmd = ETHTOOL_MSG_MODULE_GET, - .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, - .hdr_attr = ETHTOOL_A_MODULE_HEADER, - .req_info_size = sizeof(struct module_req_info), - .reply_data_size = sizeof(struct module_reply_data), - - .prepare_data = module_prepare_data, - .reply_size = module_reply_size, - .fill_reply = module_fill_reply, -}; - /* MODULE_SET */ const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = { @@ -112,69 +100,62 @@ const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLI ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO), }; -static int module_set_power_mode(struct net_device *dev, struct nlattr **tb, - bool *p_mod, struct netlink_ext_ack *extack) +static int +ethnl_set_module_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct ethtool_module_power_mode_params power = {}; - struct ethtool_module_power_mode_params power_new; - const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + struct nlattr **tb = info->attrs; if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) return 0; if (!ops->get_module_power_mode || !ops->set_module_power_mode) { - NL_SET_ERR_MSG_ATTR(extack, + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], "Setting power mode policy is not supported by this device"); return -EOPNOTSUPP; } - power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); - ret = ops->get_module_power_mode(dev, &power, extack); - if (ret < 0) - return ret; - - if (power_new.policy == power.policy) - return 0; - *p_mod = true; - - return ops->set_module_power_mode(dev, &power_new, extack); + return 1; } -int ethnl_set_module(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info) { - struct ethnl_req_info req_info = {}; + struct ethtool_module_power_mode_params power = {}; + struct ethtool_module_power_mode_params power_new; + const struct ethtool_ops *ops; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct net_device *dev; - bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_MODULE_HEADER], - genl_info_net(info), info->extack, - true); + ops = dev->ethtool_ops; + + power_new.policy = 
nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); + ret = ops->get_module_power_mode(dev, &power, info->extack); if (ret < 0) return ret; - dev = req_info.dev; - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; + if (power_new.policy == power.policy) + return 0; - ret = module_set_power_mode(dev, tb, &mod, info->extack); - if (ret < 0) - goto out_ops; + ret = ops->set_module_power_mode(dev, &power_new, info->extack); + return ret < 0 ? ret : 1; +} - if (!mod) - goto out_ops; +const struct ethnl_request_ops ethnl_module_request_ops = { + .request_cmd = ETHTOOL_MSG_MODULE_GET, + .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, + .hdr_attr = ETHTOOL_A_MODULE_HEADER, + .req_info_size = sizeof(struct module_req_info), + .reply_data_size = sizeof(struct module_reply_data), - ethtool_notify(dev, ETHTOOL_MSG_MODULE_NTF, NULL); + .prepare_data = module_prepare_data, + .reply_size = module_reply_size, + .fill_reply = module_fill_reply, -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); - ethnl_parse_header_dev_put(&req_info); - return ret; -} + .set_validate = ethnl_set_module_validate, + .set = ethnl_set_module, + .set_ntf_cmd = ETHTOOL_MSG_MODULE_NTF, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index aee98be6237f..08120095cc68 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -269,25 +269,43 @@ static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, + [ETHTOOL_MSG_LINKINFO_SET] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, + [ETHTOOL_MSG_LINKMODES_SET] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, [ETHTOOL_MSG_DEBUG_GET] = ðnl_debug_request_ops, + [ETHTOOL_MSG_DEBUG_SET] = ðnl_debug_request_ops, [ETHTOOL_MSG_WOL_GET] = ðnl_wol_request_ops, + [ETHTOOL_MSG_WOL_SET] = ðnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_GET] = ðnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_GET] = ðnl_privflags_request_ops, + [ETHTOOL_MSG_PRIVFLAGS_SET] = ðnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_GET] = ðnl_rings_request_ops, + [ETHTOOL_MSG_RINGS_SET] = ðnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_GET] = ðnl_channels_request_ops, + [ETHTOOL_MSG_CHANNELS_SET] = ðnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_GET] = ðnl_coalesce_request_ops, + [ETHTOOL_MSG_COALESCE_SET] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = ðnl_pause_request_ops, + [ETHTOOL_MSG_PAUSE_SET] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = ðnl_eee_request_ops, + [ETHTOOL_MSG_EEE_SET] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_GET] = ðnl_fec_request_ops, + [ETHTOOL_MSG_FEC_SET] = ðnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, + [ETHTOOL_MSG_MODULE_SET] = ðnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, + [ETHTOOL_MSG_PSE_SET] = ðnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = ðnl_rss_request_ops, + [ETHTOOL_MSG_PLCA_GET_CFG] = ðnl_plca_cfg_request_ops, + [ETHTOOL_MSG_PLCA_SET_CFG] = ðnl_plca_cfg_request_ops, + [ETHTOOL_MSG_PLCA_GET_STATUS] = ðnl_plca_status_request_ops, + [ETHTOOL_MSG_MM_GET] = ðnl_mm_request_ops, + [ETHTOOL_MSG_MM_SET] = ðnl_mm_request_ops, }; static struct ethnl_dump_ctx 
*ethnl_dump_context(struct netlink_callback *cb) @@ -588,6 +606,52 @@ static int ethnl_default_done(struct netlink_callback *cb) return 0; } +static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + const struct ethnl_request_ops *ops; + struct ethnl_req_info req_info = {}; + const u8 cmd = info->genlhdr->cmd; + int ret; + + ops = ethnl_default_requests[cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) + return -EOPNOTSUPP; + if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) + return -EINVAL; + + ret = ethnl_parse_header_dev_get(&req_info, info->attrs[ops->hdr_attr], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + if (ops->set_validate) { + ret = ops->set_validate(&req_info, info); + /* 0 means nothing to do */ + if (ret <= 0) + goto out_dev; + } + + rtnl_lock(); + ret = ethnl_ops_begin(req_info.dev); + if (ret < 0) + goto out_rtnl; + + ret = ops->set(&req_info, info); + if (ret <= 0) + goto out_ops; + ethtool_notify(req_info.dev, ops->set_ntf_cmd, NULL); + + ret = 0; +out_ops: + ethnl_ops_complete(req_info.dev); +out_rtnl: + rtnl_unlock(); +out_dev: + ethnl_parse_header_dev_put(&req_info); + return ret; +} + static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, @@ -603,6 +667,8 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, [ETHTOOL_MSG_MODULE_NTF] = ðnl_module_request_ops, + [ETHTOOL_MSG_PLCA_NTF] = ðnl_plca_cfg_request_ops, + [ETHTOOL_MSG_MM_NTF] = ðnl_mm_request_ops, }; /* default notification handler */ @@ -696,6 +762,8 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -760,7 +828,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_LINKINFO_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_linkinfo, + .doit = ethnl_default_set_doit, .policy = ethnl_linkinfo_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1, }, @@ -776,7 +844,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_LINKMODES_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_linkmodes, + .doit = ethnl_default_set_doit, .policy = ethnl_linkmodes_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1, }, @@ -801,7 +869,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_DEBUG_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_debug, + .doit = ethnl_default_set_doit, .policy = ethnl_debug_set_policy, .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1, }, @@ -818,7 +886,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_WOL_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_wol, + .doit = ethnl_default_set_doit, .policy = ethnl_wol_set_policy, .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1, }, @@ -850,7 +918,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PRIVFLAGS_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_privflags, + .doit = ethnl_default_set_doit, .policy = ethnl_privflags_set_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) 
- 1, }, @@ -866,7 +934,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_RINGS_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_rings, + .doit = ethnl_default_set_doit, .policy = ethnl_rings_set_policy, .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1, }, @@ -882,7 +950,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_CHANNELS_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_channels, + .doit = ethnl_default_set_doit, .policy = ethnl_channels_set_policy, .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1, }, @@ -898,7 +966,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_COALESCE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_coalesce, + .doit = ethnl_default_set_doit, .policy = ethnl_coalesce_set_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1, }, @@ -914,7 +982,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PAUSE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_pause, + .doit = ethnl_default_set_doit, .policy = ethnl_pause_set_policy, .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1, }, @@ -930,7 +998,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_EEE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_eee, + .doit = ethnl_default_set_doit, .policy = ethnl_eee_set_policy, .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1, }, @@ -977,7 +1045,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_FEC_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_fec, + .doit = ethnl_default_set_doit, .policy = ethnl_fec_set_policy, .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, }, @@ -1021,7 +1089,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_MODULE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_module, + .doit = ethnl_default_set_doit, .policy = ethnl_module_set_policy, .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, }, @@ -1037,7 +1105,7 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PSE_SET, .flags = GENL_UNS_ADMIN_PERM, - .doit = ethnl_set_pse, + .doit = ethnl_default_set_doit, .policy = ethnl_pse_set_policy, .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, }, @@ -1047,6 +1115,47 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_rss_get_policy, .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PLCA_GET_CFG, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_plca_get_cfg_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_PLCA_SET_CFG, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = ethnl_plca_set_cfg_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_plca_get_status_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_MM_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_mm_get_policy, + .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_MM_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = 
ethnl_mm_set_policy, + .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 3753787ba233..f7b189ed96b2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -138,6 +138,32 @@ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, } /** + * ethnl_update_bool() - updateb bool used as bool from NLA_U8 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Use the bool value from NLA_U8 netlink attribute @attr to set bool variable + * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is + * null. Bool pointed to by @mod is set to true if this function changed the + * logical value of *dst, otherwise it is left as is. + */ +static inline void ethnl_update_bool(bool *dst, const struct nlattr *attr, + bool *mod) +{ + u8 val; + + if (!attr) + return; + val = !!nla_get_u8(attr); + if (!!*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** * ethnl_update_binary() - update binary data from NLA_BINARY attribute * @dst: value to update * @len: destination buffer length @@ -258,13 +284,14 @@ int ethnl_ops_begin(struct net_device *dev); void ethnl_ops_complete(struct net_device *dev); /** - * struct ethnl_request_ops - unified handling of GET requests + * struct ethnl_request_ops - unified handling of GET and SET requests * @request_cmd: command id for request (GET) * @reply_cmd: command id for reply (GET_REPLY) * @hdr_attr: attribute type for request header * @req_info_size: size of request info * @reply_data_size: size of reply data * @allow_nodev_do: allow non-dump request with no device identification + * @set_ntf_cmd: notification to generate on changes (SET) * @parse_request: * Parse request except common header (struct ethnl_req_info). Common * header is already filled on entry, the rest up to @repdata_offset @@ -293,6 +320,18 @@ void ethnl_ops_complete(struct net_device *dev); * used e.g. to free any additional data structures outside the main * structure which were allocated by ->prepare_data(). When processing * dump requests, ->cleanup() is called for each message. + * @set_validate: + * Check if set operation is supported for a given device, and perform + * extra input checks. Expected return values: + * - 0 if the operation is a noop for the device (rare) + * - 1 if operation should proceed to calling @set + * - negative errno on errors + * Called without any locks, just a reference on the netdev. + * @set: + * Execute the set operation. The implementation should return + * - 0 if no configuration has changed + * - 1 if configuration changed and notification should be generated + * - negative errno on errors * * Description of variable parts of GET request handling when using the * unified infrastructure. 
When used, a pointer to an instance of this @@ -309,6 +348,7 @@ struct ethnl_request_ops { unsigned int req_info_size; unsigned int reply_data_size; bool allow_nodev_do; + u8 set_ntf_cmd; int (*parse_request)(struct ethnl_req_info *req_info, struct nlattr **tb, @@ -322,6 +362,11 @@ struct ethnl_request_ops { const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); void (*cleanup_data)(struct ethnl_reply_data *reply_data); + + int (*set_validate)(struct ethnl_req_info *req_info, + struct genl_info *info); + int (*set)(struct ethnl_req_info *req_info, + struct genl_info *info); }; /* request handlers */ @@ -347,6 +392,9 @@ extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct ethnl_request_ops ethnl_pse_request_ops; extern const struct ethnl_request_ops ethnl_rss_request_ops; +extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; +extern const struct ethnl_request_ops ethnl_plca_status_request_ops; +extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -365,12 +413,12 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; -extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX_PUSH + 1]; +extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_RX_PUSH + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; -extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1]; +extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; @@ -381,33 +429,25 @@ extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INF extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; -extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; +extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1]; extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern 
const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_CONTEXT + 1]; +extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; +extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; +extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; +extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; +extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; -int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_module(struct sk_buff *skb, struct genl_info *info); -int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index a8c113d244db..6657d0b888d8 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -5,8 +5,12 @@ struct pause_req_info { struct ethnl_req_info base; + enum ethtool_mac_stats_src src; }; +#define PAUSE_REQINFO(__req_base) \ + container_of(__req_base, struct pause_req_info, base) + struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; @@ -19,13 +23,40 @@ struct pause_reply_data { const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), + [ETHTOOL_A_PAUSE_STATS_SRC] = + NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; +static int pause_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; + struct pause_req_info *req_info = PAUSE_REQINFO(req_base); + + if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { + if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { + NL_SET_ERR_MSG_MOD(extack, + "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); + return -EINVAL; + } + + src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); + } + + req_info->src = src; + + return 0; +} + static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) { + const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); + struct netlink_ext_ack *extack = info ? 
info->extack : NULL; struct pause_reply_data *data = PAUSE_REPDATA(reply_base); + enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; @@ -34,14 +65,26 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); + data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + + if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || + src == ETHTOOL_MAC_STATS_SRC_PMAC) && + !__ethtool_dev_mm_supported(dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Device does not support MAC merge layer"); + ethnl_ops_complete(dev); + return -EOPNOTSUPP; + } + dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); + ethnl_ops_complete(dev); return 0; @@ -56,6 +99,7 @@ static int pause_reply_size(const struct ethnl_req_info *req_base, if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ + nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } @@ -77,6 +121,9 @@ static int pause_put_stats(struct sk_buff *skb, const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; + if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) + return -EMSGSIZE; + nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; @@ -114,18 +161,6 @@ static int pause_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_pause_request_ops = { - .request_cmd = ETHTOOL_MSG_PAUSE_GET, - .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, - .hdr_attr = ETHTOOL_A_PAUSE_HEADER, - .req_info_size = sizeof(struct pause_req_info), - .reply_data_size = sizeof(struct pause_reply_data), - - .prepare_data = pause_prepare_data, - .reply_size = pause_reply_size, - .fill_reply = pause_fill_reply, -}; - /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { @@ -136,51 +171,49 @@ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, }; -int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_pause_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_pauseparam && ops->set_pauseparam ? 
1 : -EOPNOTSUPP; +} + +static int +ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; - struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; - const struct ethtool_ops *ops; - struct net_device *dev; bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_PAUSE_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_pauseparam || !ops->set_pauseparam) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ops->get_pauseparam(dev, &params); + dev->ethtool_ops->get_pauseparam(dev, &params); ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); - ret = 0; if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_pauseparam(dev, &params); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_pause_request_ops = { + .request_cmd = ETHTOOL_MSG_PAUSE_GET, + .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_PAUSE_HEADER, + .req_info_size = sizeof(struct pause_req_info), + .reply_data_size = sizeof(struct pause_reply_data), + + .parse_request = pause_parse_request, + .prepare_data = pause_prepare_data, + .reply_size = pause_reply_size, + .fill_reply = pause_fill_reply, + + .set_validate = ethnl_set_pause_validate, + .set = ethnl_set_pause, + .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, +}; diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c new file mode 100644 index 000000000000..5a8cab4df0c9 --- /dev/null +++ b/net/ethtool/plca.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/phy.h> +#include <linux/ethtool_netlink.h> + +#include "netlink.h" +#include "common.h" + +struct plca_req_info { + struct ethnl_req_info base; +}; + +struct plca_reply_data { + struct ethnl_reply_data base; + struct phy_plca_cfg plca_cfg; + struct phy_plca_status plca_st; +}; + +// Helpers ------------------------------------------------------------------ // + +#define PLCA_REPDATA(__reply_base) \ + container_of(__reply_base, struct plca_reply_data, base) + +static void plca_update_sint(int *dst, const struct nlattr *attr, + bool *mod) +{ + if (!attr) + return; + + *dst = nla_get_u32(attr); + *mod = true; +} + +// PLCA get configuration message ------------------------------------------- // + +const struct nla_policy ethnl_plca_get_cfg_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct plca_reply_data *data = PLCA_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_phy_ops *ops; + int ret; + + // check that the PHY device is available and connected + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out; + } + + // note: rtnl_lock is held already by ethnl_default_doit + ops = ethtool_phy_ops; + if (!ops || !ops->get_plca_cfg) { + ret = -EOPNOTSUPP; + goto out; + } + + ret =
ethnl_ops_begin(dev); + if (ret < 0) + goto out; + + memset(&data->plca_cfg, 0xff, + sizeof_field(struct plca_reply_data, plca_cfg)); + + ret = ops->get_plca_cfg(dev->phydev, &data->plca_cfg); + ethnl_ops_complete(dev); + +out: + return ret; +} + +static int plca_get_cfg_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u16)) + /* _VERSION */ + nla_total_size(sizeof(u8)) + /* _ENABLED */ + nla_total_size(sizeof(u32)) + /* _NODE_CNT */ + nla_total_size(sizeof(u32)) + /* _NODE_ID */ + nla_total_size(sizeof(u32)) + /* _TO_TIMER */ + nla_total_size(sizeof(u32)) + /* _BURST_COUNT */ + nla_total_size(sizeof(u32)); /* _BURST_TIMER */ +} + +static int plca_get_cfg_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct plca_reply_data *data = PLCA_REPDATA(reply_base); + const struct phy_plca_cfg *plca = &data->plca_cfg; + + if ((plca->version >= 0 && + nla_put_u16(skb, ETHTOOL_A_PLCA_VERSION, plca->version)) || + (plca->enabled >= 0 && + nla_put_u8(skb, ETHTOOL_A_PLCA_ENABLED, !!plca->enabled)) || + (plca->node_id >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_ID, plca->node_id)) || + (plca->node_cnt >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_CNT, plca->node_cnt)) || + (plca->to_tmr >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_TO_TMR, plca->to_tmr)) || + (plca->burst_cnt >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_CNT, plca->burst_cnt)) || + (plca->burst_tmr >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_TMR, plca->burst_tmr))) + return -EMSGSIZE; + + return 0; +}; + +// PLCA set configuration message ------------------------------------------- // + +const struct nla_policy ethnl_plca_set_cfg_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_PLCA_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_PLCA_NODE_ID] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_NODE_CNT] = NLA_POLICY_RANGE(NLA_U32, 1, 255), + [ETHTOOL_A_PLCA_TO_TMR] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_BURST_CNT] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_BURST_TMR] = NLA_POLICY_MAX(NLA_U32, 255), +}; + +static int +ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; + const struct ethtool_phy_ops *ops; + struct nlattr **tb = info->attrs; + struct phy_plca_cfg plca_cfg; + bool mod = false; + int ret; + + // check that the PHY device is available and connected + if (!dev->phydev) + return -EOPNOTSUPP; + + ops = ethtool_phy_ops; + if (!ops || !ops->set_plca_cfg) + return -EOPNOTSUPP; + + memset(&plca_cfg, 0xff, sizeof(plca_cfg)); + plca_update_sint(&plca_cfg.enabled, tb[ETHTOOL_A_PLCA_ENABLED], &mod); + plca_update_sint(&plca_cfg.node_id, tb[ETHTOOL_A_PLCA_NODE_ID], &mod); + plca_update_sint(&plca_cfg.node_cnt, tb[ETHTOOL_A_PLCA_NODE_CNT], &mod); + plca_update_sint(&plca_cfg.to_tmr, tb[ETHTOOL_A_PLCA_TO_TMR], &mod); + plca_update_sint(&plca_cfg.burst_cnt, tb[ETHTOOL_A_PLCA_BURST_CNT], + &mod); + plca_update_sint(&plca_cfg.burst_tmr, tb[ETHTOOL_A_PLCA_BURST_TMR], + &mod); + if (!mod) + return 0; + + ret = ops->set_plca_cfg(dev->phydev, &plca_cfg, info->extack); + return ret < 0 ? 
ret : 1; +} + +const struct ethnl_request_ops ethnl_plca_cfg_request_ops = { + .request_cmd = ETHTOOL_MSG_PLCA_GET_CFG, + .reply_cmd = ETHTOOL_MSG_PLCA_GET_CFG_REPLY, + .hdr_attr = ETHTOOL_A_PLCA_HEADER, + .req_info_size = sizeof(struct plca_req_info), + .reply_data_size = sizeof(struct plca_reply_data), + + .prepare_data = plca_get_cfg_prepare_data, + .reply_size = plca_get_cfg_reply_size, + .fill_reply = plca_get_cfg_fill_reply, + + .set = ethnl_set_plca, + .set_ntf_cmd = ETHTOOL_MSG_PLCA_NTF, +}; + +// PLCA get status message -------------------------------------------------- // + +const struct nla_policy ethnl_plca_get_status_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct plca_reply_data *data = PLCA_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_phy_ops *ops; + int ret; + + // check that the PHY device is available and connected + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out; + } + + // note: rtnl_lock is held already by ethnl_default_doit + ops = ethtool_phy_ops; + if (!ops || !ops->get_plca_status) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out; + + memset(&data->plca_st, 0xff, + sizeof_field(struct plca_reply_data, plca_st)); + + ret = ops->get_plca_status(dev->phydev, &data->plca_st); + ethnl_ops_complete(dev); +out: + return ret; +} + +static int plca_get_status_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)); /* _STATUS */ +} + +static int plca_get_status_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct plca_reply_data *data = PLCA_REPDATA(reply_base); + const u8 status = data->plca_st.pst; + + if (nla_put_u8(skb, ETHTOOL_A_PLCA_STATUS, !!status)) + return -EMSGSIZE; + + return 0; +}; + +const struct ethnl_request_ops ethnl_plca_status_request_ops = { + .request_cmd = ETHTOOL_MSG_PLCA_GET_STATUS, + .reply_cmd = ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, + .hdr_attr = ETHTOOL_A_PLCA_HEADER, + .req_info_size = sizeof(struct plca_req_info), + .reply_data_size = sizeof(struct plca_reply_data), + + .prepare_data = plca_get_status_prepare_data, + .reply_size = plca_get_status_reply_size, + .fill_reply = plca_get_status_fill_reply, +}; diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c index 4c7bfa81e4ab..23264a1ebf12 100644 --- a/net/ethtool/privflags.c +++ b/net/ethtool/privflags.c @@ -118,19 +118,6 @@ static void privflags_cleanup_data(struct ethnl_reply_data *reply_data) kfree(data->priv_flag_names); } -const struct ethnl_request_ops ethnl_privflags_request_ops = { - .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET, - .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, - .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER, - .req_info_size = sizeof(struct privflags_req_info), - .reply_data_size = sizeof(struct privflags_reply_data), - - .prepare_data = privflags_prepare_data, - .reply_size = privflags_reply_size, - .fill_reply = privflags_fill_reply, - .cleanup_data = privflags_cleanup_data, -}; - /* PRIVFLAGS_SET */ const struct nla_policy ethnl_privflags_set_policy[] = { @@ -139,63 +126,70 @@ const struct nla_policy ethnl_privflags_set_policy[] = { [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED }, }; -int ethnl_set_privflags(struct 
sk_buff *skb, struct genl_info *info) +static int +ethnl_set_privflags_validate(struct ethnl_req_info *req_info, + struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + if (!info->attrs[ETHTOOL_A_PRIVFLAGS_FLAGS]) + return -EINVAL; + + if (!ops->get_priv_flags || !ops->set_priv_flags || + !ops->get_sset_count || !ops->get_strings) + return -EOPNOTSUPP; + return 1; +} + +static int +ethnl_set_privflags(struct ethnl_req_info *req_info, struct genl_info *info) { const char (*names)[ETH_GSTRING_LEN] = NULL; - struct ethnl_req_info req_info = {}; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - const struct ethtool_ops *ops; - struct net_device *dev; unsigned int nflags; bool mod = false; bool compact; u32 flags; int ret; - if (!tb[ETHTOOL_A_PRIVFLAGS_FLAGS]) - return -EINVAL; ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact); if (ret < 0) return ret; - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_PRIVFLAGS_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_priv_flags || !ops->set_priv_flags || - !ops->get_sset_count || !ops->get_strings) - goto out_dev; - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names); if (ret < 0) - goto out_ops; - flags = ops->get_priv_flags(dev); + return ret; + flags = dev->ethtool_ops->get_priv_flags(dev); ret = ethnl_update_bitset32(&flags, nflags, tb[ETHTOOL_A_PRIVFLAGS_FLAGS], names, info->extack, &mod); if (ret < 0 || !mod) goto out_free; - ret = ops->set_priv_flags(dev, flags); + ret = dev->ethtool_ops->set_priv_flags(dev, flags); if (ret < 0) goto out_free; - ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL); + ret = 1; out_free: kfree(names); -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); return ret; } + +const struct ethnl_request_ops ethnl_privflags_request_ops = { + .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET, + .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, + .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER, + .req_info_size = sizeof(struct privflags_req_info), + .reply_data_size = sizeof(struct privflags_reply_data), + + .prepare_data = privflags_prepare_data, + .reply_size = privflags_reply_size, + .fill_reply = privflags_fill_reply, + .cleanup_data = privflags_cleanup_data, + + .set_validate = ethnl_set_privflags_validate, + .set = ethnl_set_privflags, + .set_ntf_cmd = ETHTOOL_MSG_PRIVFLAGS_NTF, +}; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index e8683e485dc9..a5b607b0a652 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -106,18 +106,6 @@ static int pse_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_pse_request_ops = { - .request_cmd = ETHTOOL_MSG_PSE_GET, - .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY, - .hdr_attr = ETHTOOL_A_PSE_HEADER, - .req_info_size = sizeof(struct pse_req_info), - .reply_data_size = sizeof(struct pse_reply_data), - - .prepare_data = pse_prepare_data, - .reply_size = pse_reply_size, - .fill_reply = pse_fill_reply, -}; - /* PSE_SET */ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { @@ -127,59 +115,50 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), }; -static int pse_set_pse_config(struct net_device *dev, - struct 
netlink_ext_ack *extack, - struct nlattr **tb) +static int +ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) { - struct phy_device *phydev = dev->phydev; - struct pse_control_config config = {}; + return !!info->attrs[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]; +} - /* Optional attribute. Do not return error if not set. */ - if (!tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]) - return 0; +static int +ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; + struct pse_control_config config = {}; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; /* this values are already validated by the ethnl_pse_set_policy */ config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); + phydev = dev->phydev; if (!phydev) { - NL_SET_ERR_MSG(extack, "No PHY is attached"); + NL_SET_ERR_MSG(info->extack, "No PHY is attached"); return -EOPNOTSUPP; } if (!phydev->psec) { - NL_SET_ERR_MSG(extack, "No PSE is attached"); + NL_SET_ERR_MSG(info->extack, "No PSE is attached"); return -EOPNOTSUPP; } - return pse_ethtool_set_config(phydev->psec, extack, &config); + /* Return errno directly - PSE has no notification */ + return pse_ethtool_set_config(phydev->psec, info->extack, &config); } -int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info) -{ - struct ethnl_req_info req_info = {}; - struct nlattr **tb = info->attrs; - struct net_device *dev; - int ret; - - ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_PSE_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - - dev = req_info.dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - - ret = pse_set_pse_config(dev, info->extack, tb); - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); +const struct ethnl_request_ops ethnl_pse_request_ops = { + .request_cmd = ETHTOOL_MSG_PSE_GET, + .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_PSE_HEADER, + .req_info_size = sizeof(struct pse_req_info), + .reply_data_size = sizeof(struct pse_reply_data), - ethnl_parse_header_dev_put(&req_info); + .prepare_data = pse_prepare_data, + .reply_size = pse_reply_size, + .fill_reply = pse_fill_reply, - return ret; -} + .set_validate = ethnl_set_pse_validate, + .set = ethnl_set_pse, + /* PSE has no notification */ +}; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index fa3ec8d438f7..f358cd57d094 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -56,7 +56,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */ - nla_total_size(sizeof(u8))); /* _RINGS_TX_PUSH */ + nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ + nla_total_size(sizeof(u8))); /* _RINGS_RX_PUSH */ } static int rings_fill_reply(struct sk_buff *skb, @@ -96,24 +97,13 @@ static int rings_fill_reply(struct sk_buff *skb, kr->tcp_data_split))) || (kr->cqe_size && (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) || - nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push)) + nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) || + nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push)) return -EMSGSIZE; return 0; } -const struct ethnl_request_ops ethnl_rings_request_ops = { - .request_cmd = ETHTOOL_MSG_RINGS_GET, - .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, - .hdr_attr = ETHTOOL_A_RINGS_HEADER, - .req_info_size = sizeof(struct 
rings_req_info), - .reply_data_size = sizeof(struct rings_reply_data), - - .prepare_data = rings_prepare_data, - .reply_size = rings_reply_size, - .fill_reply = rings_fill_reply, -}; - /* RINGS_SET */ const struct nla_policy ethnl_rings_set_policy[] = { @@ -126,64 +116,64 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), }; -int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_rings_validate(struct ethnl_req_info *req_info, + struct genl_info *info) { - struct kernel_ethtool_ringparam kernel_ringparam = {}; - struct ethtool_ringparam ringparam = {}; - struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; - const struct nlattr *err_attr; - const struct ethtool_ops *ops; - struct net_device *dev; - bool mod = false; - int ret; - - ret = ethnl_parse_header_dev_get(&req_info, - tb[ETHTOOL_A_RINGS_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ops = dev->ethtool_ops; - ret = -EOPNOTSUPP; - if (!ops->get_ringparam || !ops->set_ringparam) - goto out_dev; if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { - ret = -EOPNOTSUPP; NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], "setting rx buf len not supported"); - goto out_dev; + return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { - ret = -EOPNOTSUPP; NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_CQE_SIZE], "setting cqe size not supported"); - goto out_dev; + return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { - ret = -EOPNOTSUPP; NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH], "setting tx push not supported"); - goto out_dev; + return -EOPNOTSUPP; } - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - ops->get_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); + if (tb[ETHTOOL_A_RINGS_RX_PUSH] && + !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_RX_PUSH], + "setting rx push not supported"); + return -EOPNOTSUPP; + } + + return ops->get_ringparam && ops->set_ringparam ? 
1 : -EOPNOTSUPP; +} + +static int +ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct kernel_ethtool_ringparam kernel_ringparam = {}; + struct ethtool_ringparam ringparam = {}; + struct net_device *dev = req_info->dev; + struct nlattr **tb = info->attrs; + const struct nlattr *err_attr; + bool mod = false; + int ret; + + dev->ethtool_ops->get_ringparam(dev, &ringparam, + &kernel_ringparam, info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, @@ -197,9 +187,10 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); - ret = 0; + ethnl_update_u8(&kernel_ringparam.rx_push, + tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); if (!mod) - goto out_ops; + return 0; /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) @@ -213,23 +204,28 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) else err_attr = NULL; if (err_attr) { - ret = -EINVAL; NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested ring size exceeds maximum"); - goto out_ops; + return -EINVAL; } ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); - if (ret < 0) - goto out_ops; - ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return ret < 0 ? ret : 1; } + +const struct ethnl_request_ops ethnl_rings_request_ops = { + .request_cmd = ETHTOOL_MSG_RINGS_GET, + .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, + .hdr_attr = ETHTOOL_A_RINGS_HEADER, + .req_info_size = sizeof(struct rings_req_info), + .reply_data_size = sizeof(struct rings_reply_data), + + .prepare_data = rings_prepare_data, + .reply_size = rings_reply_size, + .fill_reply = rings_fill_reply, + + .set_validate = ethnl_set_rings_validate, + .set = ethnl_set_rings, + .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF, +}; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index ebe6145aed3f..be260ab34e58 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -122,10 +122,13 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, { const struct rss_reply_data *data = RSS_REPDATA(reply_base); - if (nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc) || - nla_put(skb, ETHTOOL_A_RSS_INDIR, - sizeof(u32) * data->indir_size, data->indir_table) || - nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey)) + if ((data->hfunc && + nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || + (data->indir_size && + nla_put(skb, ETHTOOL_A_RSS_INDIR, + sizeof(u32) * data->indir_size, data->indir_table)) || + (data->hkey_size && + nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey))) return -EMSGSIZE; return 0; diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index a20e0a24ff61..010ed19ccc99 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -7,6 +7,7 @@ struct stats_req_info { struct ethnl_req_info base; DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT); + enum ethtool_mac_stats_src src; }; #define STATS_REQINFO(__req_base) \ @@ -75,16 +76,19 @@ const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", }; -const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { +const struct nla_policy 
ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED }, + [ETHTOOL_A_STATS_SRC] = + NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int stats_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { + enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct stats_req_info *req_info = STATS_REQINFO(req_base); bool mod = false; int err; @@ -100,6 +104,11 @@ static int stats_parse_request(struct ethnl_req_info *req_base, return -EINVAL; } + if (tb[ETHTOOL_A_STATS_SRC]) + src = nla_get_u32(tb[ETHTOOL_A_STATS_SRC]); + + req_info->src = src; + return 0; } @@ -108,7 +117,9 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, struct genl_info *info) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); + struct netlink_ext_ack *extack = info ? info->extack : NULL; struct stats_reply_data *data = STATS_REPDATA(reply_base); + enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; @@ -116,11 +127,25 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; + if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || + src == ETHTOOL_MAC_STATS_SRC_PMAC) && + !__ethtool_dev_mm_supported(dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Device does not support MAC merge layer"); + ethnl_ops_complete(dev); + return -EOPNOTSUPP; + } + /* Mark all stats as unset (see ETHTOOL_STAT_NOT_SET) to prevent them * from being reported to user space in case driver did not set them. */ memset(&data->stats, 0xff, sizeof(data->stats)); + data->phy_stats.src = src; + data->mac_stats.src = src; + data->ctrl_stats.src = src; + data->rmon_stats.src = src; + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); @@ -146,6 +171,8 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, unsigned int n_grps = 0, n_stats = 0; int len = 0; + len += nla_total_size(sizeof(u32)); /* _STATS_SRC */ + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); n_grps++; @@ -379,6 +406,9 @@ static int stats_fill_reply(struct sk_buff *skb, const struct stats_reply_data *data = STATS_REPDATA(reply_base); int ret = 0; + if (nla_put_u32(skb, ETHTOOL_A_STATS_SRC, req_info->src)) + return -EMSGSIZE; + if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, ETH_SS_STATS_ETH_PHY, @@ -410,3 +440,130 @@ const struct ethnl_request_ops ethnl_stats_request_ops = { .reply_size = stats_reply_size, .fill_reply = stats_fill_reply, }; + +static u64 ethtool_stats_sum(u64 a, u64 b) +{ + if (a == ETHTOOL_STAT_NOT_SET) + return b; + if (b == ETHTOOL_STAT_NOT_SET) + return a; + return a + b; +} + +/* Avoid modifying the aggregation procedure every time a new counter is added + * by treating the structures as an array of u64 statistics. 
+ */ +static void ethtool_aggregate_stats(void *aggr_stats, const void *emac_stats, + const void *pmac_stats, size_t stats_size, + size_t stats_offset) +{ + size_t num_stats = stats_size / sizeof(u64); + const u64 *s1 = emac_stats + stats_offset; + const u64 *s2 = pmac_stats + stats_offset; + u64 *s = aggr_stats + stats_offset; + int i; + + for (i = 0; i < num_stats; i++) + s[i] = ethtool_stats_sum(s1[i], s2[i]); +} + +void ethtool_aggregate_mac_stats(struct net_device *dev, + struct ethtool_eth_mac_stats *mac_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_eth_mac_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_eth_mac_stats(dev, &emac); + ops->get_eth_mac_stats(dev, &pmac); + + ethtool_aggregate_stats(mac_stats, &emac, &pmac, + sizeof(mac_stats->stats), + offsetof(struct ethtool_eth_mac_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_mac_stats); + +void ethtool_aggregate_phy_stats(struct net_device *dev, + struct ethtool_eth_phy_stats *phy_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_eth_phy_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_eth_phy_stats(dev, &emac); + ops->get_eth_phy_stats(dev, &pmac); + + ethtool_aggregate_stats(phy_stats, &emac, &pmac, + sizeof(phy_stats->stats), + offsetof(struct ethtool_eth_phy_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_phy_stats); + +void ethtool_aggregate_ctrl_stats(struct net_device *dev, + struct ethtool_eth_ctrl_stats *ctrl_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_eth_ctrl_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_eth_ctrl_stats(dev, &emac); + ops->get_eth_ctrl_stats(dev, &pmac); + + ethtool_aggregate_stats(ctrl_stats, &emac, &pmac, + sizeof(ctrl_stats->stats), + offsetof(struct ethtool_eth_ctrl_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_ctrl_stats); + +void ethtool_aggregate_pause_stats(struct net_device *dev, + struct ethtool_pause_stats *pause_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_pause_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_pause_stats(dev, &emac); + ops->get_pause_stats(dev, &pmac); + + ethtool_aggregate_stats(pause_stats, &emac, &pmac, + sizeof(pause_stats->stats), + offsetof(struct ethtool_pause_stats, stats)); +} +EXPORT_SYMBOL(ethtool_aggregate_pause_stats); + +void ethtool_aggregate_rmon_stats(struct net_device *dev, + struct ethtool_rmon_stats *rmon_stats) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + const struct ethtool_rmon_hist_range *dummy; + struct ethtool_rmon_stats pmac, emac; + + memset(&emac, 0xff, sizeof(emac)); + memset(&pmac, 0xff, sizeof(pmac)); + emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; + pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; + + ops->get_rmon_stats(dev, &emac, &dummy); + ops->get_rmon_stats(dev, &pmac, &dummy); + + ethtool_aggregate_stats(rmon_stats, &emac, &pmac, + sizeof(rmon_stats->stats), + offsetof(struct ethtool_rmon_stats, stats)); +} 
+EXPORT_SYMBOL(ethtool_aggregate_rmon_stats); diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index 88f435e76481..a4a43d9e6e9d 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ -82,18 +82,6 @@ static int wol_fill_reply(struct sk_buff *skb, return 0; } -const struct ethnl_request_ops ethnl_wol_request_ops = { - .request_cmd = ETHTOOL_MSG_WOL_GET, - .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, - .hdr_attr = ETHTOOL_A_WOL_HEADER, - .req_info_size = sizeof(struct wol_req_info), - .reply_data_size = sizeof(struct wol_reply_data), - - .prepare_data = wol_prepare_data, - .reply_size = wol_reply_size, - .fill_reply = wol_fill_reply, -}; - /* WOL_SET */ const struct nla_policy ethnl_wol_set_policy[] = { @@ -104,67 +92,66 @@ const struct nla_policy ethnl_wol_set_policy[] = { .len = SOPASS_MAX }, }; -int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) +static int +ethnl_set_wol_validate(struct ethnl_req_info *req_info, struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + + return ops->get_wol && ops->set_wol ? 1 : -EOPNOTSUPP; +} + +static int +ethnl_set_wol(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; - struct ethnl_req_info req_info = {}; + struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct net_device *dev; bool mod = false; int ret; - ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_WOL_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - dev = req_info.dev; - ret = -EOPNOTSUPP; - if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) - goto out_dev; - - rtnl_lock(); - ret = ethnl_ops_begin(dev); - if (ret < 0) - goto out_rtnl; - dev->ethtool_ops->get_wol(dev, &wol); ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT, tb[ETHTOOL_A_WOL_MODES], wol_mode_names, info->extack, &mod); if (ret < 0) - goto out_ops; + return ret; if (wol.wolopts & ~wol.supported) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES], "cannot enable unsupported WoL mode"); - ret = -EINVAL; - goto out_ops; + return -EINVAL; } if (tb[ETHTOOL_A_WOL_SOPASS]) { if (!(wol.supported & WAKE_MAGICSECURE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_SOPASS], "magicsecure not supported, cannot set password"); - ret = -EINVAL; - goto out_ops; + return -EINVAL; } ethnl_update_binary(wol.sopass, sizeof(wol.sopass), tb[ETHTOOL_A_WOL_SOPASS], &mod); } if (!mod) - goto out_ops; + return 0; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) - goto out_ops; + return ret; dev->wol_enabled = !!wol.wolopts; - ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); - -out_ops: - ethnl_ops_complete(dev); -out_rtnl: - rtnl_unlock(); -out_dev: - ethnl_parse_header_dev_put(&req_info); - return ret; + return 1; } + +const struct ethnl_request_ops ethnl_wol_request_ops = { + .request_cmd = ETHTOOL_MSG_WOL_GET, + .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, + .hdr_attr = ETHTOOL_A_WOL_HEADER, + .req_info_size = sizeof(struct wol_req_info), + .reply_data_size = sizeof(struct wol_reply_data), + + .prepare_data = wol_prepare_data, + .reply_size = wol_reply_size, + .fill_reply = wol_fill_reply, + + .set_validate = ethnl_set_wol_validate, + .set = ethnl_set_wol, + .set_ntf_cmd = ETHTOOL_MSG_WOL_NTF, +}; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index af7d2cf490fb..880277c9fd07 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o 
obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o +fou-y := fou_core.o fou_nl.o obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ab4a06be489b..8db6747f892f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -156,7 +156,6 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); - sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -347,6 +346,7 @@ lookup_protocol: sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; inet->mc_loop = 1; @@ -356,8 +356,6 @@ lookup_protocol: inet->mc_list = NULL; inet->rcv_tos = 0; - sk_refcnt_debug_inc(sk); - if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket @@ -1485,6 +1483,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out; + NAPI_GRO_CB(skb)->proto = proto; id = ntohl(*(__be32 *)&iph->id); flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; @@ -1618,9 +1617,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int inet_gro_complete(struct sk_buff *skb, int nhoff) { - __be16 newlen = htons(skb->len - nhoff); struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; + __be16 totlen = iph->tot_len; int proto = iph->protocol; int err = -ENOSYS; @@ -1629,8 +1628,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) skb_set_inner_network_header(skb, nhoff); } - csum_replace2(&iph->check, iph->tot_len, newlen); - iph->tot_len = newlen; + iph_set_totlen(iph, skb->len - nhoff); + csum_replace2(&iph->check, totlen, iph->tot_len); ops = rcu_dereference(inet_offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) @@ -1665,6 +1664,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, if (rc == 0) { *sk = sock->sk; (*sk)->sk_allocation = GFP_ATOMIC; + (*sk)->sk_use_task_frag = false; /* * Unhash it so that IP input processing does not even see it, * we do not wish this socket to see incoming packets. 
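The ethtool SET conversions above (linkinfo, linkmodes, debug, wol, privflags, rings, channels, coalesce, pause, eee, fec, module, pse, plus the new PLCA and MM commands) all hang off ethnl_default_set_doit(), which parses the request header, takes RTNL, brackets the operation with ethnl_ops_begin()/ethnl_ops_complete() and emits set_ntf_cmd when the ->set() callback reports a change. A minimal sketch of a handler written against this contract follows; ETHTOOL_A_FOO_VAL, ETHTOOL_MSG_FOO_NTF and the get_foo()/set_foo() ethtool_ops members are hypothetical, used only to illustrate the documented return values (set_validate(): 0 = no-op, 1 = proceed, negative errno on error; set(): 0 = unchanged, 1 = generate notification, negative errno on error).

/* Hypothetical example only: ETHTOOL_A_FOO_VAL, ETHTOOL_MSG_FOO_NTF and the
 * get_foo()/set_foo() ops do not exist in the tree; they stand in for a
 * single u32 attribute handled by the unified SET infrastructure.
 */
static int ethnl_set_foo_validate(struct ethnl_req_info *req_info,
				  struct genl_info *info)
{
	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;

	/* called without RTNL, only a reference on the netdev is held */
	return ops->get_foo && ops->set_foo ? 1 : -EOPNOTSUPP;
}

static int ethnl_set_foo(struct ethnl_req_info *req_info,
			 struct genl_info *info)
{
	struct net_device *dev = req_info->dev;
	struct nlattr **tb = info->attrs;
	bool mod = false;
	u32 val;
	int ret;

	/* RTNL and ethnl_ops_begin() already taken by ethnl_default_set_doit() */
	dev->ethtool_ops->get_foo(dev, &val);
	ethnl_update_u32(&val, tb[ETHTOOL_A_FOO_VAL], &mod);
	if (!mod)
		return 0;	/* nothing changed, skip the notification */

	ret = dev->ethtool_ops->set_foo(dev, val);
	return ret < 0 ? ret : 1;	/* 1 => notify with ETHTOOL_MSG_FOO_NTF */
}

The matching ethtool_genl_ops[] entry then only needs .doit = ethnl_default_set_doit plus the command's set policy, while the request ops gain .set_validate, .set and .set_ntf_cmd, exactly as the pause, rings and wol hunks above do.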
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 4517d2bd186a..13fc0c185cd9 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -248,7 +248,8 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, } static int bpf_tcp_ca_check_member(const struct btf_type *t, - const struct btf_member *member) + const struct btf_member *member, + const struct bpf_prog *prog) { if (is_unsupported(__btf_member_bit_offset(t, member) / 8)) return -ENOTSUPP; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 6cd3b6c559f0..79ae7204e8ed 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -2222,7 +2222,7 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); if (len_delta != 0) { iph->ihl = 5 + (opt_len >> 2); - iph->tot_len = htons(skb->len); + iph_set_totlen(iph, skb->len); } ip_send_check(iph); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e8b9a9202fec..b0acf6e19aed 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -234,13 +234,20 @@ static void inet_free_ifa(struct in_ifaddr *ifa) call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); } +static void in_dev_free_rcu(struct rcu_head *head) +{ + struct in_device *idev = container_of(head, struct in_device, rcu_head); + + kfree(rcu_dereference_protected(idev->mc_hash, 1)); + kfree(idev); +} + void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); - kfree(rcu_dereference_protected(idev->mc_hash, 1)); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); #endif @@ -248,7 +255,7 @@ void in_dev_finish_destroy(struct in_device *idev) if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else - kfree(idev); + call_rcu(&idev->rcu_head, in_dev_free_rcu); } EXPORT_SYMBOL(in_dev_finish_destroy); @@ -298,12 +305,6 @@ out_kfree: goto out; } -static void in_dev_rcu_put(struct rcu_head *head) -{ - struct in_device *idev = container_of(head, struct in_device, rcu_head); - in_dev_put(idev); -} - static void inetdev_destroy(struct in_device *in_dev) { struct net_device *dev; @@ -328,7 +329,7 @@ static void inetdev_destroy(struct in_device *in_dev) neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); - call_rcu(&in_dev->rcu_head, in_dev_rcu_put); + in_dev_put(in_dev); } int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f361d3d56be2..b5736ef16ed2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -841,6 +841,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, return -EINVAL; } + if (!cfg->fc_table) + cfg->fc_table = RT_TABLE_MAIN; + return 0; errout: return err; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 19a662003eef..3bb890a40ed7 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/netlink.h> #include <linux/hash.h> +#include <linux/nospec.h> #include <net/arp.h> #include <net/inet_dscp.h> @@ -423,6 +424,7 @@ static struct fib_info *fib_find_info(struct fib_info *nfi) nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && nfi->fib_type == fi->fib_type && + nfi->fib_tb_id == fi->fib_tb_id && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && @@ -1021,6 +1023,7 @@ bool 
fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) if (type > RTAX_MAX) return false; + type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; diff --git a/net/ipv4/fou.c b/net/ipv4/fou_core.c index 0c3c6d0cee29..cafec9b4eee0 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou_core.c @@ -19,6 +19,8 @@ #include <uapi/linux/fou.h> #include <uapi/linux/genetlink.h> +#include "fou_nl.h" + struct fou { struct socket *sock; u8 protocol; @@ -640,20 +642,6 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) static struct genl_family fou_nl_family; -static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { - [FOU_ATTR_PORT] = { .type = NLA_U16, }, - [FOU_ATTR_AF] = { .type = NLA_U8, }, - [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, - [FOU_ATTR_TYPE] = { .type = NLA_U8, }, - [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, - [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, - [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, - [FOU_ATTR_LOCAL_V6] = { .len = sizeof(struct in6_addr), }, - [FOU_ATTR_PEER_V6] = { .len = sizeof(struct in6_addr), }, - [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, - [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, -}; - static int parse_nl_config(struct genl_info *info, struct fou_cfg *cfg) { @@ -745,7 +733,7 @@ static int parse_nl_config(struct genl_info *info, return 0; } -static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) +int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; @@ -758,7 +746,7 @@ static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) return fou_create(net, &cfg, NULL); } -static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) +int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; @@ -827,7 +815,7 @@ nla_put_failure: return -EMSGSIZE; } -static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) +int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_net *fn = net_generic(net, fou_net_id); @@ -874,7 +862,7 @@ out_free: return ret; } -static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fou_net *fn = net_generic(net, fou_net_id); @@ -897,33 +885,12 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static const struct genl_small_ops fou_nl_ops[] = { - { - .cmd = FOU_CMD_ADD, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = fou_nl_cmd_add_port, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = FOU_CMD_DEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = fou_nl_cmd_rm_port, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = FOU_CMD_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = fou_nl_cmd_get_port, - .dumpit = fou_nl_dump, - }, -}; - static struct genl_family fou_nl_family __ro_after_init = { .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, .maxattr = FOU_ATTR_MAX, - .policy = fou_nl_policy, + .policy = fou_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = fou_nl_ops, diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c new file mode 100644 index 000000000000..6c3820f41dd5 --- /dev/null +++ 
b/net/ipv4/fou_nl.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/fou.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "fou_nl.h" + +#include <linux/fou.h> + +/* Global operation policy for fou */ +const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, + [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, + [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, + [FOU_ATTR_LOCAL_V6] = { .len = 16, }, + [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, + [FOU_ATTR_PEER_V6] = { .len = 16, }, + [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, +}; + +/* Ops table for fou */ +const struct genl_small_ops fou_nl_ops[3] = { + { + .cmd = FOU_CMD_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = fou_nl_add_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = FOU_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = fou_nl_del_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = FOU_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = fou_nl_get_doit, + .dumpit = fou_nl_get_dumpit, + }, +}; diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h new file mode 100644 index 000000000000..b7a68121ce6f --- /dev/null +++ b/net/ipv4/fou_nl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/fou.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_FOU_GEN_H +#define _LINUX_FOU_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <linux/fou.h> + +/* Global operation policy for fou */ +extern const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1]; + +/* Ops table for fou */ +extern const struct genl_small_ops fou_nl_ops[3]; + +int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info); +int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info); +int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info); +int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); + +#endif /* _LINUX_FOU_GEN_H */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d5d745c3e345..8cebb476b3ab 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -263,7 +263,7 @@ bool icmp_global_allow(void) /* We want to use a credit of one in average, but need to randomize * it for security reasons. 
*/ - credit = max_t(int, credit - prandom_u32_max(3), 0); + credit = max_t(int, credit - get_random_u32_below(3), 0); rc = true; } WRITE_ONCE(icmp_global.credit, credit); @@ -296,6 +296,7 @@ static bool icmpv4_global_allow(struct net *net, int type, int code) if (icmp_global_allow()) return true; + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -325,6 +326,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (peer) inet_putpeer(peer); out: + if (!rc) + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); return rc; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 81be3e0f0e70..c920aa9a62a9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -213,7 +213,7 @@ static void igmp_stop_timer(struct ip_mc_list *im) /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { - int tv = prandom_u32_max(max_delay); + int tv = get_random_u32_below(max_delay); im->tm_running = 1; if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -222,7 +222,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = prandom_u32_max(in_dev->mr_maxdelay); + int tv = get_random_u32_below(in_dev->mr_maxdelay); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && @@ -236,7 +236,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev) static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { - int tv = prandom_u32_max(delay); + int tv = get_random_u32_below(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4a34bc7cb15e..65ad4251f6fd 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -117,7 +117,7 @@ bool inet_rcv_saddr_any(const struct sock *sk) return !sk->sk_rcv_saddr; } -void inet_get_local_port_range(struct net *net, int *low, int *high) +void inet_get_local_port_range(const struct net *net, int *low, int *high) { unsigned int seq; @@ -130,6 +130,27 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) } EXPORT_SYMBOL(inet_get_local_port_range); +void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); + int lo, hi, sk_lo, sk_hi; + + inet_get_local_port_range(net, &lo, &hi); + + sk_lo = inet->local_port_range.lo; + sk_hi = inet->local_port_range.hi; + + if (unlikely(lo <= sk_lo && sk_lo <= hi)) + lo = sk_lo; + if (unlikely(lo <= sk_hi && sk_hi <= hi)) + hi = sk_hi; + + *low = lo; + *high = hi; +} +EXPORT_SYMBOL(inet_sk_get_local_port_range); + static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) @@ -173,22 +194,40 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, return false; } +static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, + kuid_t sk_uid, bool relax, + bool reuseport_cb_ok, bool reuseport_ok) +{ + if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) + return false; + + return inet_bind_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok); +} + static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { + struct inet_timewait_sock *tw2; struct sock *sk2; sk_for_each_bound_bhash2(sk2, &tb2->owners) { - if (sk->sk_family == AF_INET && 
ipv6_only_sock(sk2)) - continue; + if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok)) + return true; + } + + twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) { + sk2 = (struct sock *)tw2; - if (inet_bind_conflict(sk, sk2, sk_uid, relax, - reuseport_cb_ok, reuseport_ok)) + if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok)) return true; } + return false; } @@ -298,7 +337,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, ports_exhausted: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: - inet_get_local_port_range(net, &low, &high); + inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ if (high - low < 4) attempt_half = 0; @@ -314,7 +353,7 @@ other_half_scan: if (likely(remaining > 1)) remaining &= ~1U; - offset = prandom_u32_max(remaining); + offset = get_random_u32_below(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ @@ -1083,8 +1122,7 @@ static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, if (!icsk->icsk_ulp_ops) return; - if (icsk->icsk_ulp_ops->clone) - icsk->icsk_ulp_ops->clone(req, newsk, priority); + icsk->icsk_ulp_ops->clone(req, newsk, priority); } /** @@ -1160,8 +1198,6 @@ void inet_csk_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); - sk_refcnt_debug_release(sk); - this_cpu_dec(*sk->sk_prot->orphan_count); sock_put(sk); @@ -1182,20 +1218,31 @@ void inet_csk_prepare_forced_close(struct sock *sk) } EXPORT_SYMBOL(inet_csk_prepare_forced_close); +static int inet_ulp_can_listen(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone) + return -EINVAL; + + return 0; +} + int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err; + err = inet_ulp_can_listen(sk); + if (unlikely(err)) + return err; + reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); - if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT) - sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). 
* It is OK, because this socket enters to hash table only diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3cec471a2cd2..e41fdc38ce19 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -116,6 +116,7 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, #endif tb->rcv_saddr = sk->sk_rcv_saddr; INIT_HLIST_HEAD(&tb->owners); + INIT_HLIST_HEAD(&tb->deathrow); hlist_add_head(&tb->node, &head->chain); } @@ -137,7 +138,7 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } @@ -649,8 +650,20 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); - ret = sk_nulls_del_node_init_rcu(osk); - } else if (found_dup_sk) { + ret = sk_hashed(osk); + if (ret) { + /* Before deleting the node, we insert a new one to make + * sure that the look-up-sk process would not miss either + * of them and that at least one node would exist in ehash + * table all the time. Otherwise there's a tiny chance + * that lookup process could find nothing in ehash table. + */ + __sk_nulls_add_node_tail_rcu(sk, list); + sk_nulls_del_node_init_rcu(osk); + } + goto unlock; + } + if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; @@ -659,6 +672,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ret) __sk_nulls_add_node_rcu(sk, list); +unlock: spin_unlock(lock); return ret; @@ -994,17 +1008,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 index; if (port) { - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL, NULL); - spin_unlock_bh(&head->lock); - return 0; - } - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ + local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; @@ -1012,7 +1016,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, l3mdev = inet_sk_bound_l3mdev(sk); - inet_get_local_port_range(net, &low, &high); + inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; if (likely(remaining > 1)) @@ -1097,21 +1101,22 @@ ok: * on low contention the randomness is maximal and on high contention * it may be inexistent. 
*/ - i = max_t(int, i, prandom_u32_max(8) * 2); + i = max_t(int, i, get_random_u32_below(8) * 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); - spin_unlock(&head2->lock); - if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); + + spin_unlock(&head2->lock); spin_unlock(&head->lock); + if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 66fc940f9521..40052414c7c7 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -29,6 +29,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { + struct inet_bind2_bucket *tb2 = tw->tw_tb2; struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) @@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + + __hlist_del(&tw->tw_bind2_node); + tw->tw_tb2 = NULL; + inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); + __sock_put((struct sock *)tw); } @@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); - struct inet_bind_hashbucket *bhead; + struct inet_bind_hashbucket *bhead, *bhead2; spin_lock(lock); sk_nulls_del_node_init_rcu((struct sock *)tw); @@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; + bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw, + twsk_net(tw), tw->tw_num); spin_lock(&bhead->lock); + spin_lock(&bhead2->lock); inet_twsk_bind_unhash(tw, hashinfo); + spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); refcount_dec(&tw->tw_dr->tw_refcount); @@ -67,9 +77,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; twsk_destructor((struct sock *)tw); -#ifdef SOCK_REFCNT_DEBUG - pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); -#endif kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } @@ -81,10 +88,10 @@ void inet_twsk_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(inet_twsk_put); -static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, - struct hlist_nulls_head *list) +static void inet_twsk_add_node_tail_rcu(struct inet_timewait_sock *tw, + struct hlist_nulls_head *list) { - hlist_nulls_add_head_rcu(&tw->tw_node, list); + hlist_nulls_add_tail_rcu(&tw->tw_node, list); } static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, @@ -93,6 +100,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, hlist_add_head(&tw->tw_bind_node, list); } +static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_bind2_node, list); +} + /* * Enter the time wait state. This is called with locally disabled BH. 
* Essentially we whip up a timewait bucket, copy the relevant info into it @@ -105,22 +118,33 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - struct inet_bind_hashbucket *bhead; + struct inet_bind_hashbucket *bhead, *bhead2; + /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; + bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num); + spin_lock(&bhead->lock); + spin_lock(&bhead2->lock); + tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + + tw->tw_tb2 = icsk->icsk_bind2_hash; + WARN_ON(!icsk->icsk_bind2_hash); + inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow); + + spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); spin_lock(lock); - inet_twsk_add_node_rcu(tw, &ehead->chain); + inet_twsk_add_node_tail_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a4ccef3e6935..ffff46cdcb58 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1492,24 +1492,6 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if (t->erspan_ver <= 2) { - if (t->erspan_ver != 0 && !t->collect_md) - o_flags |= TUNNEL_KEY; - - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) - goto nla_put_failure; - - if (t->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) - goto nla_put_failure; - } else if (t->erspan_ver == 2) { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) - goto nla_put_failure; - } - } - if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || @@ -1550,6 +1532,34 @@ nla_put_failure: return -EMSGSIZE; } +static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + if (t->erspan_ver <= 2) { + if (t->erspan_ver != 0 && !t->collect_md) + t->parms.o_flags |= TUNNEL_KEY; + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } + } + + return ipgre_fill_info(skb, dev); + +nla_put_failure: + return -EMSGSIZE; +} + static void erspan_setup(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1628,7 +1638,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { .changelink = erspan_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, - .fill_info = ipgre_fill_info, + .fill_info = erspan_fill_info, .get_link_net = ip_tunnel_get_link_net, }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e880ce77322a..fe9ead9ee863 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c 
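/* [Editor's note, not part of the diff] The ip_input.c hunk that follows and
 * the ip_output.c hunk after it stop treating iph->tot_len as authoritative
 * and go through iph_totlen()/iph_set_totlen() instead.  This is the IPv4
 * "BIG TCP" convention: for GSO packets larger than 64KiB the 16-bit tot_len
 * field cannot hold the real length, so it is written as 0 on output and
 * readers fall back to the skb length.  A rough sketch of the idea, using
 * hypothetical example_* names rather than the exact kernel helpers:
 */
static inline void example_set_totlen(struct iphdr *iph, unsigned int len)
{
	/* lengths above 64KiB cannot be encoded; 0 means "look at skb->len" */
	iph->tot_len = len <= 0xFFFF ? htons(len) : 0;
}

static inline u32 example_totlen(const struct sk_buff *skb, const struct iphdr *iph)
{
	u32 len = ntohs(iph->tot_len);

	/* a zero tot_len on an oversized GSO skb: use the true skb length */
	return len ? len : skb->len;
}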
@@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; - len = ntohs(iph->tot_len); + len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 922c87ef1ab5..4e4e308c3230 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); - iph->tot_len = htons(skb->len); + iph_set_totlen(iph, skb->len); ip_send_check(iph); /* if egress device is enslaved to an L3 master device pass the diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 9f92ae35bb01..b511ff0adc0a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -923,6 +923,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_CHECKSUM: case IP_RECVFRAGSIZE: case IP_RECVERR_RFC4884: + case IP_LOCAL_PORT_RANGE: if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; @@ -1365,6 +1366,20 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(inet->min_ttl, val); break; + case IP_LOCAL_PORT_RANGE: + { + const __u16 lo = val; + const __u16 hi = val >> 16; + + if (optlen != sizeof(__u32)) + goto e_inval; + if (lo != 0 && hi != 0 && lo > hi) + goto e_inval; + + inet->local_port_range.lo = lo; + inet->local_port_range.hi = hi; + break; + } default: err = -ENOPROTOOPT; break; @@ -1743,6 +1758,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MINTTL: val = inet->min_ttl; break; + case IP_LOCAL_PORT_RANGE: + val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; + break; default: sockopt_release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b58df3c1bf7d..eec1f6df80d8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -412,7 +412,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt) { - del_timer_sync(&mrt->ipmr_expire_timer); + timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 7fcfdfd8f9de..0e3ee1532848 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/netlink.h> +#include <linux/nospec.h> #include <linux/rtnetlink.h> #include <linux/types.h> #include <net/ip.h> @@ -25,6 +26,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, return -EINVAL; } + type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index aab384126f61..f71a7e9a7de6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -259,20 +259,6 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. 
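/* [Editor's example, not part of the diff] The IP_LOCAL_PORT_RANGE handling
 * added to ip_sockglue.c above takes a single u32: the low 16 bits are the
 * lower bound and the high 16 bits the upper bound of the per-socket
 * ephemeral port range; a zero half leaves that bound at the netns default,
 * and each bound is only honoured if it lies inside the ip_local_port_range
 * sysctl window (see inet_sk_get_local_port_range() earlier in this diff).
 * Minimal userspace sketch, assuming the uapi headers from this series:
 */
#include <stdint.h>
#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IP_LOCAL_PORT_RANGE
#define IP_LOCAL_PORT_RANGE 51	/* value in the uapi headers of this series */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	uint32_t range = 40000u | (40999u << 16);	/* lo | hi << 16 */

	if (fd < 0)
		return 1;
	if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range)))
		perror("setsockopt(IP_LOCAL_PORT_RANGE)");
	/* connect() will now pick its source port from [40000, 40999] */
	return 0;
}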
-config IP_NF_TARGET_CLUSTERIP - tristate "CLUSTERIP target support" - depends on IP_NF_MANGLE - depends on NF_CONNTRACK - depends on NETFILTER_ADVANCED - select NF_CONNTRACK_MARK - select NETFILTER_FAMILY_ARP - help - The CLUSTERIP target allows you to build load-balancing clusters of - network servers without having a dedicated load-balancing - router/server/switch. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_TARGET_ECN tristate "ECN target support" depends on IP_NF_MANGLE diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 93bad1184251..5a26f9de1ab9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -39,7 +39,6 @@ obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o # targets -obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c deleted file mode 100644 index b3cc416ed292..000000000000 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ /dev/null @@ -1,929 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Cluster IP hashmark target - * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> - * based on ideas of Fabio Olive Leite <olive@unixforge.org> - * - * Development of this code funded by SuSE Linux AG, https://www.suse.com/ - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/jhash.h> -#include <linux/bitops.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmp.h> -#include <linux/if_arp.h> -#include <linux/seq_file.h> -#include <linux/refcount.h> -#include <linux/netfilter_arp.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/checksum.h> -#include <net/ip.h> - -#define CLUSTERIP_VERSION "0.8" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); - -struct clusterip_config { - struct list_head list; /* list of all configs */ - refcount_t refcount; /* reference count */ - refcount_t entries; /* number of entries/rules - * referencing us */ - - __be32 clusterip; /* the IP address */ - u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ - int ifindex; /* device ifindex */ - u_int16_t num_total_nodes; /* total number of nodes */ - unsigned long local_nodes; /* node number array */ - -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *pde; /* proc dir entry */ -#endif - enum clusterip_hashmode hash_mode; /* which hashing mode */ - u_int32_t hash_initval; /* hash initialization */ - struct rcu_head rcu; /* for call_rcu */ - struct net *net; /* netns for pernet list */ - char ifname[IFNAMSIZ]; /* device ifname */ -}; - -#ifdef CONFIG_PROC_FS -static const struct proc_ops clusterip_proc_ops; -#endif - -struct clusterip_net { - struct list_head configs; - /* lock protects the configs list */ - spinlock_t lock; - - bool clusterip_deprecated_warning; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *procdir; - /* mutex protects the config->pde*/ - struct mutex mutex; -#endif - unsigned int hook_users; -}; 
- -static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); - -static const struct nf_hook_ops cip_arp_ops = { - .hook = clusterip_arp_mangle, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_OUT, - .priority = -1 -}; - -static unsigned int clusterip_net_id __read_mostly; -static inline struct clusterip_net *clusterip_pernet(struct net *net) -{ - return net_generic(net, clusterip_net_id); -} - -static inline void -clusterip_config_get(struct clusterip_config *c) -{ - refcount_inc(&c->refcount); -} - -static void clusterip_config_rcu_free(struct rcu_head *head) -{ - struct clusterip_config *config; - struct net_device *dev; - - config = container_of(head, struct clusterip_config, rcu); - dev = dev_get_by_name(config->net, config->ifname); - if (dev) { - dev_mc_del(dev, config->clustermac); - dev_put(dev); - } - kfree(config); -} - -static inline void -clusterip_config_put(struct clusterip_config *c) -{ - if (refcount_dec_and_test(&c->refcount)) - call_rcu(&c->rcu, clusterip_config_rcu_free); -} - -/* decrease the count of entries using/referencing this config. If last - * entry(rule) is removed, remove the config from lists, but don't free it - * yet, since proc-files could still be holding references */ -static inline void -clusterip_config_entry_put(struct clusterip_config *c) -{ - struct clusterip_net *cn = clusterip_pernet(c->net); - - local_bh_disable(); - if (refcount_dec_and_lock(&c->entries, &cn->lock)) { - list_del_rcu(&c->list); - spin_unlock(&cn->lock); - local_bh_enable(); - /* In case anyone still accesses the file, the open/close - * functions are also incrementing the refcount on their own, - * so it's safe to remove the entry even if it's in use. */ -#ifdef CONFIG_PROC_FS - mutex_lock(&cn->mutex); - if (cn->procdir) - proc_remove(c->pde); - mutex_unlock(&cn->mutex); -#endif - return; - } - local_bh_enable(); -} - -static struct clusterip_config * -__clusterip_config_find(struct net *net, __be32 clusterip) -{ - struct clusterip_config *c; - struct clusterip_net *cn = clusterip_pernet(net); - - list_for_each_entry_rcu(c, &cn->configs, list) { - if (c->clusterip == clusterip) - return c; - } - - return NULL; -} - -static inline struct clusterip_config * -clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) -{ - struct clusterip_config *c; - - rcu_read_lock_bh(); - c = __clusterip_config_find(net, clusterip); - if (c) { -#ifdef CONFIG_PROC_FS - if (!c->pde) - c = NULL; - else -#endif - if (unlikely(!refcount_inc_not_zero(&c->refcount))) - c = NULL; - else if (entry) { - if (unlikely(!refcount_inc_not_zero(&c->entries))) { - clusterip_config_put(c); - c = NULL; - } - } - } - rcu_read_unlock_bh(); - - return c; -} - -static void -clusterip_config_init_nodelist(struct clusterip_config *c, - const struct ipt_clusterip_tgt_info *i) -{ - int n; - - for (n = 0; n < i->num_local_nodes; n++) - set_bit(i->local_nodes[n] - 1, &c->local_nodes); -} - -static int -clusterip_netdev_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - struct clusterip_net *cn = clusterip_pernet(net); - struct clusterip_config *c; - - spin_lock_bh(&cn->lock); - list_for_each_entry_rcu(c, &cn->configs, list) { - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == c->ifindex) { - 
dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } else if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - } - } - spin_unlock_bh(&cn->lock); - - return NOTIFY_DONE; -} - -static struct clusterip_config * -clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, - __be32 ip, const char *iniface) -{ - struct clusterip_net *cn = clusterip_pernet(net); - struct clusterip_config *c; - struct net_device *dev; - int err; - - if (iniface[0] == '\0') { - pr_info("Please specify an interface name\n"); - return ERR_PTR(-EINVAL); - } - - c = kzalloc(sizeof(*c), GFP_ATOMIC); - if (!c) - return ERR_PTR(-ENOMEM); - - dev = dev_get_by_name(net, iniface); - if (!dev) { - pr_info("no such interface %s\n", iniface); - kfree(c); - return ERR_PTR(-ENOENT); - } - c->ifindex = dev->ifindex; - strcpy(c->ifname, dev->name); - memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); - dev_mc_add(dev, c->clustermac); - dev_put(dev); - - c->clusterip = ip; - c->num_total_nodes = i->num_total_nodes; - clusterip_config_init_nodelist(c, i); - c->hash_mode = i->hash_mode; - c->hash_initval = i->hash_initval; - c->net = net; - refcount_set(&c->refcount, 1); - - spin_lock_bh(&cn->lock); - if (__clusterip_config_find(net, ip)) { - err = -EBUSY; - goto out_config_put; - } - - list_add_rcu(&c->list, &cn->configs); - spin_unlock_bh(&cn->lock); - -#ifdef CONFIG_PROC_FS - { - char buffer[16]; - - /* create proc dir entry */ - sprintf(buffer, "%pI4", &ip); - mutex_lock(&cn->mutex); - c->pde = proc_create_data(buffer, 0600, - cn->procdir, - &clusterip_proc_ops, c); - mutex_unlock(&cn->mutex); - if (!c->pde) { - err = -ENOMEM; - goto err; - } - } -#endif - - refcount_set(&c->entries, 1); - return c; - -#ifdef CONFIG_PROC_FS -err: -#endif - spin_lock_bh(&cn->lock); - list_del_rcu(&c->list); -out_config_put: - spin_unlock_bh(&cn->lock); - clusterip_config_put(c); - return ERR_PTR(err); -} - -#ifdef CONFIG_PROC_FS -static int -clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) -{ - - if (nodenum == 0 || - nodenum > c->num_total_nodes) - return 1; - - /* check if we already have this number in our bitfield */ - if (test_and_set_bit(nodenum - 1, &c->local_nodes)) - return 1; - - return 0; -} - -static bool -clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) -{ - if (nodenum == 0 || - nodenum > c->num_total_nodes) - return true; - - if (test_and_clear_bit(nodenum - 1, &c->local_nodes)) - return false; - - return true; -} -#endif - -static inline u_int32_t -clusterip_hashfn(const struct sk_buff *skb, - const struct clusterip_config *config) -{ - const struct iphdr *iph = ip_hdr(skb); - unsigned long hashval; - u_int16_t sport = 0, dport = 0; - int poff; - - poff = proto_ports_offset(iph->protocol); - if (poff >= 0) { - const u_int16_t *ports; - u16 _ports[2]; - - ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); - if (ports) { - sport = ports[0]; - dport = ports[1]; - } - } else { - net_info_ratelimited("unknown protocol %u\n", iph->protocol); - } - - switch (config->hash_mode) { - case CLUSTERIP_HASHMODE_SIP: - hashval = jhash_1word(ntohl(iph->saddr), - config->hash_initval); - break; - case CLUSTERIP_HASHMODE_SIP_SPT: - hashval = jhash_2words(ntohl(iph->saddr), sport, - config->hash_initval); - break; - case CLUSTERIP_HASHMODE_SIP_SPT_DPT: - hashval = jhash_3words(ntohl(iph->saddr), 
sport, dport, - config->hash_initval); - break; - default: - /* to make gcc happy */ - hashval = 0; - /* This cannot happen, unless the check function wasn't called - * at rule load time */ - pr_info("unknown mode %u\n", config->hash_mode); - BUG(); - break; - } - - /* node numbers are 1..n, not 0..n */ - return reciprocal_scale(hashval, config->num_total_nodes) + 1; -} - -static inline int -clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) -{ - return test_bit(hash - 1, &config->local_nodes); -} - -/*********************************************************************** - * IPTABLES TARGET - ***********************************************************************/ - -static unsigned int -clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - u_int32_t hash; - - /* don't need to clusterip_config_get() here, since refcount - * is only decremented by destroy() - and ip_tables guarantees - * that the ->target() function isn't called after ->destroy() */ - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_DROP; - - /* special case: ICMP error handling. conntrack distinguishes between - * error messages (RELATED) and information requests (see below) */ - if (ip_hdr(skb)->protocol == IPPROTO_ICMP && - (ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)) - return XT_CONTINUE; - - /* nf_conntrack_proto_icmp guarantees us that we only have ICMP_ECHO, - * TIMESTAMP, INFO_REQUEST or ICMP_ADDRESS type icmp packets from here - * on, which all have an ID field [relevant for hashing]. */ - - hash = clusterip_hashfn(skb, cipinfo->config); - - switch (ctinfo) { - case IP_CT_NEW: - WRITE_ONCE(ct->mark, hash); - break; - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - /* FIXME: we don't handle expectations at the moment. - * They can arrive on a different node than - * the master connection (e.g. FTP passive mode) */ - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - default: /* Prevent gcc warnings */ - break; - } - -#ifdef DEBUG - nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); -#endif - pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); - if (!clusterip_responsible(cipinfo->config, hash)) { - pr_debug("not responsible\n"); - return NF_DROP; - } - pr_debug("responsible\n"); - - /* despite being received via linklayer multicast, this is - * actually a unicast IP packet. 
TCP doesn't like PACKET_MULTICAST */ - skb->pkt_type = PACKET_HOST; - - return XT_CONTINUE; -} - -static int clusterip_tg_check(const struct xt_tgchk_param *par) -{ - struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct clusterip_net *cn = clusterip_pernet(par->net); - const struct ipt_entry *e = par->entryinfo; - struct clusterip_config *config; - int ret, i; - - if (par->nft_compat) { - pr_err("cannot use CLUSTERIP target from nftables compat\n"); - return -EOPNOTSUPP; - } - - if (cn->hook_users == UINT_MAX) - return -EOVERFLOW; - - if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && - cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && - cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { - pr_info("unknown mode %u\n", cipinfo->hash_mode); - return -EINVAL; - - } - if (e->ip.dmsk.s_addr != htonl(0xffffffff) || - e->ip.dst.s_addr == 0) { - pr_info("Please specify destination IP\n"); - return -EINVAL; - } - if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) { - pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes); - return -EINVAL; - } - for (i = 0; i < cipinfo->num_local_nodes; i++) { - if (cipinfo->local_nodes[i] - 1 >= - sizeof(config->local_nodes) * 8) { - pr_info("bad local_nodes[%d] %u\n", - i, cipinfo->local_nodes[i]); - return -EINVAL; - } - } - - config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); - if (!config) { - if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { - pr_info("no config found for %pI4, need 'new'\n", - &e->ip.dst.s_addr); - return -EINVAL; - } else { - config = clusterip_config_init(par->net, cipinfo, - e->ip.dst.s_addr, - e->ip.iniface); - if (IS_ERR(config)) - return PTR_ERR(config); - } - } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) { - clusterip_config_entry_put(config); - clusterip_config_put(config); - return -EINVAL; - } - - ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) { - pr_info("cannot load conntrack support for proto=%u\n", - par->family); - clusterip_config_entry_put(config); - clusterip_config_put(config); - return ret; - } - - if (cn->hook_users == 0) { - ret = nf_register_net_hook(par->net, &cip_arp_ops); - - if (ret < 0) { - clusterip_config_entry_put(config); - clusterip_config_put(config); - nf_ct_netns_put(par->net, par->family); - return ret; - } - } - - cn->hook_users++; - - if (!cn->clusterip_deprecated_warning) { - pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " - "use xt_cluster instead\n"); - cn->clusterip_deprecated_warning = true; - } - - cipinfo->config = config; - return ret; -} - -/* drop reference count of cluster config when rule is deleted */ -static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) -{ - const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct clusterip_net *cn = clusterip_pernet(par->net); - - /* if no more entries are referencing the config, remove it - * from the list and destroy the proc entry */ - clusterip_config_entry_put(cipinfo->config); - - clusterip_config_put(cipinfo->config); - - nf_ct_netns_put(par->net, par->family); - cn->hook_users--; - - if (cn->hook_users == 0) - nf_unregister_net_hook(par->net, &cip_arp_ops); -} - -#ifdef CONFIG_NETFILTER_XTABLES_COMPAT -struct compat_ipt_clusterip_tgt_info -{ - u_int32_t flags; - u_int8_t clustermac[6]; - u_int16_t num_total_nodes; - u_int16_t num_local_nodes; - u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; - u_int32_t hash_mode; - u_int32_t hash_initval; - compat_uptr_t config; -}; -#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ - 
-static struct xt_target clusterip_tg_reg __read_mostly = { - .name = "CLUSTERIP", - .family = NFPROTO_IPV4, - .target = clusterip_tg, - .checkentry = clusterip_tg_check, - .destroy = clusterip_tg_destroy, - .targetsize = sizeof(struct ipt_clusterip_tgt_info), - .usersize = offsetof(struct ipt_clusterip_tgt_info, config), -#ifdef CONFIG_NETFILTER_XTABLES_COMPAT - .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), -#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ - .me = THIS_MODULE -}; - - -/*********************************************************************** - * ARP MANGLING CODE - ***********************************************************************/ - -/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ -struct arp_payload { - u_int8_t src_hw[ETH_ALEN]; - __be32 src_ip; - u_int8_t dst_hw[ETH_ALEN]; - __be32 dst_ip; -} __packed; - -#ifdef DEBUG -static void arp_print(struct arp_payload *payload) -{ -#define HBUFFERLEN 30 - char hbuffer[HBUFFERLEN]; - int j, k; - - for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { - hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); - hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); - hbuffer[k++] = ':'; - } - hbuffer[--k] = '\0'; - - pr_debug("src %pI4@%s, dst %pI4\n", - &payload->src_ip, hbuffer, &payload->dst_ip); -} -#endif - -static unsigned int -clusterip_arp_mangle(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct arphdr *arp = arp_hdr(skb); - struct arp_payload *payload; - struct clusterip_config *c; - struct net *net = state->net; - - /* we don't care about non-ethernet and non-ipv4 ARP */ - if (arp->ar_hrd != htons(ARPHRD_ETHER) || - arp->ar_pro != htons(ETH_P_IP) || - arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) - return NF_ACCEPT; - - /* we only want to mangle arp requests and replies */ - if (arp->ar_op != htons(ARPOP_REPLY) && - arp->ar_op != htons(ARPOP_REQUEST)) - return NF_ACCEPT; - - payload = (void *)(arp+1); - - /* if there is no clusterip configuration for the arp reply's - * source ip, we don't want to mangle it */ - c = clusterip_config_find_get(net, payload->src_ip, 0); - if (!c) - return NF_ACCEPT; - - /* normally the linux kernel always replies to arp queries of - * addresses on different interfacs. 
However, in the CLUSTERIP case - * this wouldn't work, since we didn't subscribe the mcast group on - * other interfaces */ - if (c->ifindex != state->out->ifindex) { - pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n", - c->ifindex, state->out->ifindex); - clusterip_config_put(c); - return NF_ACCEPT; - } - - /* mangle reply hardware address */ - memcpy(payload->src_hw, c->clustermac, arp->ar_hln); - -#ifdef DEBUG - pr_debug("mangled arp reply: "); - arp_print(payload); -#endif - - clusterip_config_put(c); - - return NF_ACCEPT; -} - -/*********************************************************************** - * PROC DIR HANDLING - ***********************************************************************/ - -#ifdef CONFIG_PROC_FS - -struct clusterip_seq_position { - unsigned int pos; /* position */ - unsigned int weight; /* number of bits set == size */ - unsigned int bit; /* current bit */ - unsigned long val; /* current value */ -}; - -static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) -{ - struct clusterip_config *c = s->private; - unsigned int weight; - u_int32_t local_nodes; - struct clusterip_seq_position *idx; - - /* FIXME: possible race */ - local_nodes = c->local_nodes; - weight = hweight32(local_nodes); - if (*pos >= weight) - return NULL; - - idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL); - if (!idx) - return ERR_PTR(-ENOMEM); - - idx->pos = *pos; - idx->weight = weight; - idx->bit = ffs(local_nodes); - idx->val = local_nodes; - clear_bit(idx->bit - 1, &idx->val); - - return idx; -} - -static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct clusterip_seq_position *idx = v; - - *pos = ++idx->pos; - if (*pos >= idx->weight) { - kfree(v); - return NULL; - } - idx->bit = ffs(idx->val); - clear_bit(idx->bit - 1, &idx->val); - return idx; -} - -static void clusterip_seq_stop(struct seq_file *s, void *v) -{ - if (!IS_ERR(v)) - kfree(v); -} - -static int clusterip_seq_show(struct seq_file *s, void *v) -{ - struct clusterip_seq_position *idx = v; - - if (idx->pos != 0) - seq_putc(s, ','); - - seq_printf(s, "%u", idx->bit); - - if (idx->pos == idx->weight - 1) - seq_putc(s, '\n'); - - return 0; -} - -static const struct seq_operations clusterip_seq_ops = { - .start = clusterip_seq_start, - .next = clusterip_seq_next, - .stop = clusterip_seq_stop, - .show = clusterip_seq_show, -}; - -static int clusterip_proc_open(struct inode *inode, struct file *file) -{ - int ret = seq_open(file, &clusterip_seq_ops); - - if (!ret) { - struct seq_file *sf = file->private_data; - struct clusterip_config *c = pde_data(inode); - - sf->private = c; - - clusterip_config_get(c); - } - - return ret; -} - -static int clusterip_proc_release(struct inode *inode, struct file *file) -{ - struct clusterip_config *c = pde_data(inode); - int ret; - - ret = seq_release(inode, file); - - if (!ret) - clusterip_config_put(c); - - return ret; -} - -static ssize_t clusterip_proc_write(struct file *file, const char __user *input, - size_t size, loff_t *ofs) -{ - struct clusterip_config *c = pde_data(file_inode(file)); -#define PROC_WRITELEN 10 - char buffer[PROC_WRITELEN+1]; - unsigned long nodenum; - int rc; - - if (size > PROC_WRITELEN) - return -EIO; - if (copy_from_user(buffer, input, size)) - return -EFAULT; - buffer[size] = 0; - - if (*buffer == '+') { - rc = kstrtoul(buffer+1, 10, &nodenum); - if (rc) - return rc; - if (clusterip_add_node(c, nodenum)) - return -ENOMEM; - } else if (*buffer == '-') { - rc = kstrtoul(buffer+1, 10, 
&nodenum); - if (rc) - return rc; - if (clusterip_del_node(c, nodenum)) - return -ENOENT; - } else - return -EIO; - - return size; -} - -static const struct proc_ops clusterip_proc_ops = { - .proc_open = clusterip_proc_open, - .proc_read = seq_read, - .proc_write = clusterip_proc_write, - .proc_lseek = seq_lseek, - .proc_release = clusterip_proc_release, -}; - -#endif /* CONFIG_PROC_FS */ - -static int clusterip_net_init(struct net *net) -{ - struct clusterip_net *cn = clusterip_pernet(net); - - INIT_LIST_HEAD(&cn->configs); - - spin_lock_init(&cn->lock); - -#ifdef CONFIG_PROC_FS - cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); - if (!cn->procdir) { - pr_err("Unable to proc dir entry\n"); - return -ENOMEM; - } - mutex_init(&cn->mutex); -#endif /* CONFIG_PROC_FS */ - - return 0; -} - -static void clusterip_net_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - struct clusterip_net *cn = clusterip_pernet(net); - - mutex_lock(&cn->mutex); - proc_remove(cn->procdir); - cn->procdir = NULL; - mutex_unlock(&cn->mutex); -#endif -} - -static struct pernet_operations clusterip_net_ops = { - .init = clusterip_net_init, - .exit = clusterip_net_exit, - .id = &clusterip_net_id, - .size = sizeof(struct clusterip_net), -}; - -static struct notifier_block cip_netdev_notifier = { - .notifier_call = clusterip_netdev_event -}; - -static int __init clusterip_tg_init(void) -{ - int ret; - - ret = register_pernet_subsys(&clusterip_net_ops); - if (ret < 0) - return ret; - - ret = xt_register_target(&clusterip_tg_reg); - if (ret < 0) - goto cleanup_subsys; - - ret = register_netdevice_notifier(&cip_netdev_notifier); - if (ret < 0) - goto unregister_target; - - pr_info("ClusterIP Version %s loaded successfully\n", - CLUSTERIP_VERSION); - - return 0; - -unregister_target: - xt_unregister_target(&clusterip_tg_reg); -cleanup_subsys: - unregister_pernet_subsys(&clusterip_net_ops); - return ret; -} - -static void __exit clusterip_tg_exit(void) -{ - pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); - - unregister_netdevice_notifier(&cip_netdev_notifier); - xt_unregister_target(&clusterip_tg_reg); - unregister_pernet_subsys(&clusterip_net_ops); - - /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ - rcu_barrier(); -} - -module_init(clusterip_tg_init); -module_exit(clusterip_tg_exit); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index d640adcaf1b1..f33aeab9424f 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -280,6 +280,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, goto free_nskb; nf_ct_attach(nskb, oldskb); + nf_ct_set_closing(skb_nfct(oldskb)); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip_local_out for bridged traffic, the MAC source on diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index bb9854c2b7a1..409ec2a1f95b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -49,6 +49,11 @@ #include <net/transp_v6.h> #endif +#define ping_portaddr_for_each_entry(__sk, node, list) \ + hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) +#define ping_portaddr_for_each_entry_rcu(__sk, node, list) \ + hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) + struct ping_table { struct hlist_nulls_head hash[PING_HTABLE_SIZE]; spinlock_t lock; @@ -192,7 +197,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) return NULL; } - ping_portaddr_for_each_entry(sk, hnode, hslot) { + 
ping_portaddr_for_each_entry_rcu(sk, hnode, hslot) { isk = inet_sk(sk); pr_debug("iterate\n"); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f88daace9de3..eaf1d3113b62 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -353,7 +353,7 @@ static void icmp_put(struct seq_file *seq) seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); - seq_puts(seq, " OutMsgs OutErrors"); + seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", @@ -363,9 +363,11 @@ static void icmp_put(struct seq_file *seq) for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); - seq_printf(seq, " %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 006c1f0ed8b4..94df935ee0c5 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ int raw_hash_sk(struct sock *sk) struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_nulls_head *hlist; - hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; + hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); __sk_nulls_add_node_rcu(sk, hlist); @@ -160,9 +160,9 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) +static int raw_v4_input(struct net *net, struct sk_buff *skb, + const struct iphdr *iph, int hash) { - struct net *net = dev_net(skb->dev); struct hlist_nulls_head *hlist; struct hlist_nulls_node *hnode; int sdif = inet_sdif(skb); @@ -193,9 +193,10 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) int raw_local_deliver(struct sk_buff *skb, int protocol) { - int hash = protocol & (RAW_HTABLE_SIZE - 1); + struct net *net = dev_net(skb->dev); - return raw_v4_input(skb, ip_hdr(skb), hash); + return raw_v4_input(net, skb, ip_hdr(skb), + raw_hashfunc(net, protocol)); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) @@ -271,7 +272,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) struct sock *sk; int hash; - hash = protocol & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, protocol); hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); @@ -287,11 +288,13 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason; + /* Charge it to the socket. 
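/* [Editor's note, not part of the diff] Together with the icmp.c hunks
 * earlier (ICMP_MIB_RATELIMITGLOBAL / ICMP_MIB_RATELIMITHOST), the proc.c
 * change above adds two columns to the Icmp line of /proc/net/snmp, counting
 * messages suppressed by the global and by the per-host ICMP rate limiter.
 * Abridged, the header row becomes:
 *
 *   Icmp: InMsgs InErrors InCsumErrors In<type>... OutMsgs OutErrors
 *         OutRateLimitGlobal OutRateLimitHost Out<type>...
 */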
*/ ipv4_pktinfo_prepare(sk, skb); - if (sock_queue_rcv_skb(sk, skb) < 0) { - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { + kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -302,7 +305,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cd1fa9f70f1a..de6e3515ab4f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -471,7 +471,7 @@ static u32 ip_idents_reserve(u32 hash, int segs) old = READ_ONCE(*p_tstamp); if (old != now && cmpxchg(p_tstamp, old, now) == old) - delta = prandom_u32_max(now - old); + delta = get_random_u32_below(now - old); /* If UBSAN reports an error there, please make sure your compiler * supports -fno-strict-overflow before reporting it that was a bug @@ -689,7 +689,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, } else { /* Randomize max depth to avoid some side channels attacks. */ int max_depth = FNHE_RECLAIM_DEPTH + - prandom_u32_max(FNHE_RECLAIM_DEPTH); + get_random_u32_below(FNHE_RECLAIM_DEPTH); while (depth > max_depth) { fnhe_remove_oldest(hash); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 942d2dfa1115..26fb97d1d4d9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -288,12 +288,11 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct tcp_request_sock *treq; struct request_sock *req; -#ifdef CONFIG_MPTCP if (sk_is_mptcp(sk)) - ops = &mptcp_subflow_request_sock_ops; -#endif + req = mptcp_subflow_reqsk_alloc(ops, sk, false); + else + req = inet_reqsk_alloc(ops, sk, false); - req = inet_reqsk_alloc(ops, sk, false); if (!req) return NULL; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 001947136b0a..33f559f491c8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -435,6 +435,7 @@ void tcp_init_sock(struct sock *sk) /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; + tp->rate_app_limited = 1; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. @@ -2000,7 +2001,7 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(READ, (void __user *)copy_address, + err = import_single_range(ITER_DEST, (void __user *)copy_address, inq, &iov, &msg.msg_iter); if (err) return err; @@ -2034,7 +2035,7 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(READ, (void __user *)copy_address, + err = import_single_range(ITER_DEST, (void __user *)copy_address, copylen, &iov, &msg.msg_iter); if (err) return err; @@ -3178,6 +3179,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. 
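/* [Editor's note, not part of the diff] The prandom_u32_max() ->
 * get_random_u32_below() conversions scattered through this series (icmp,
 * igmp, inet_connection_sock, inet_hashtables, route, tcp_bbr, tcp_plb) are
 * a like-for-like change: both helpers return a uniform value in [0, ceil).
 * The related get_random_u32_inclusive(lo, hi) used in the tcp_input.c
 * challenge-ACK hunk further down returns a value in [lo, hi] with both ends
 * included, which is why "half + prandom_u32_max(ack_limit)" becomes
 * get_random_u32_inclusive(half, ack_limit + half - 1).  Typical use, shown
 * as a small sketch with a hypothetical helper:
 */
#include <linux/random.h>
#include <linux/timer.h>

static void example_rearm_jittered(struct timer_list *t, unsigned long max_delay)
{
	/* delay drawn uniformly from [0, max_delay) to de-synchronize peers */
	mod_timer(t, jiffies + get_random_u32_below(max_delay));
}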
*/ tp->app_limited = ~0U; + tp->rate_app_limited = 1; tp->rack.mstamp = 0; tp->rack.advanced = 0; tp->rack.reo_wnd_steps = 1; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 54eec33c6e1c..146792cd26fe 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -295,7 +295,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) } /* override sysctl_tcp_min_tso_segs */ -static u32 bbr_min_tso_segs(struct sock *sk) +__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) { return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } @@ -328,7 +328,7 @@ static void bbr_save_cwnd(struct sock *sk) bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp)); } -static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +__bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -618,7 +618,7 @@ static void bbr_reset_probe_bw_mode(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_PROBE_BW; - bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); + bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ } @@ -1023,7 +1023,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_update_gains(sk); } -static void bbr_main(struct sock *sk, const struct rate_sample *rs) +__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) { struct bbr *bbr = inet_csk_ca(sk); u32 bw; @@ -1035,7 +1035,7 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs) bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); } -static void bbr_init(struct sock *sk) +__bpf_kfunc static void bbr_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -1077,7 +1077,7 @@ static void bbr_init(struct sock *sk) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); } -static u32 bbr_sndbuf_expand(struct sock *sk) +__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) { /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ return 3; @@ -1086,7 +1086,7 @@ static u32 bbr_sndbuf_expand(struct sock *sk) /* In theory BBR does not need to undo the cwnd since it does not * always reduce cwnd on losses (see bbr_main()). Keep it for now. */ -static u32 bbr_undo_cwnd(struct sock *sk) +__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) { struct bbr *bbr = inet_csk_ca(sk); @@ -1097,7 +1097,7 @@ static u32 bbr_undo_cwnd(struct sock *sk) } /* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ -static u32 bbr_ssthresh(struct sock *sk) +__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) { bbr_save_cwnd(sk); return tcp_sk(sk)->snd_ssthresh; @@ -1125,7 +1125,7 @@ static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, return 0; } -static void bbr_set_state(struct sock *sk, u8 new_state) +__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) { struct bbr *bbr = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index cf9c3e8f7ccb..cf26d65ca389 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -6,6 +6,7 @@ #include <linux/bpf.h> #include <linux/init.h> #include <linux/wait.h> +#include <linux/util_macros.h> #include <net/inet_common.h> #include <net/tls.h> @@ -45,8 +46,11 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, tmp->sg.end = i; if (apply) { apply_bytes -= size; - if (!apply_bytes) + if (!apply_bytes) { + if (sge->length) + sk_msg_iter_var_prev(i); break; + } } } while (i != msg->sg.end); @@ -131,10 +135,9 @@ static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, return ret; } -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, - u32 bytes, int flags) +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags) { - bool ingress = sk_msg_to_ingress(msg); struct sk_psock *psock = sk_psock_get(sk); int ret; @@ -276,10 +279,10 @@ msg_bytes_ready: static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = sk_msg_full(msg); + bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; - u32 eval = __SK_NONE; + u32 eval; int ret; more_data: @@ -310,6 +313,7 @@ more_data: tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; + eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: @@ -321,6 +325,7 @@ more_data: sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { @@ -337,7 +342,8 @@ more_data: release_sock(sk); origsize = msg->sg.size; - ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) @@ -634,10 +640,9 @@ EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); */ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; struct proto *prot = newsk->sk_prot; - if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) + if (is_insidevar(prot, tcp_bpf_prots)) newsk->sk_prot = sk->sk_prot_creator; } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d3cae40749e8..db8b4b488c31 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -403,7 +403,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. 
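/* [Editor's note, not part of the diff] The __bpf_kfunc annotations applied
 * below to tcp_slow_start(), tcp_cong_avoid_ai(), the reno helpers and the
 * cubic/dctcp callbacks (and, above, to the bbr ones) mark functions exposed
 * to BPF as kfuncs: the tag (roughly __used noinline) keeps the compiler
 * from inlining or discarding the otherwise-static symbols, so they retain
 * BTF and remain callable from BPF struct_ops TCP congestion controls.
 * Hypothetical example of the pattern:
 */
__bpf_kfunc static u32 example_ssthresh(struct sock *sk)
{
	/* halve the congestion window, never below two segments */
	return max(tcp_snd_cwnd(tcp_sk(sk)) >> 1, 2U);
}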
*/ -u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) +__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); @@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), * for every packet that was ACKed. */ -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) +__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { @@ -443,7 +443,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) +__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -462,7 +462,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ -u32 tcp_reno_ssthresh(struct sock *sk) +__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -470,7 +470,7 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -u32 tcp_reno_undo_cwnd(struct sock *sk) +__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 768c10c1f649..0fd78ecb67e7 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -126,7 +126,7 @@ static inline void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } -static void cubictcp_init(struct sock *sk) +__bpf_kfunc static void cubictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -139,7 +139,7 @@ static void cubictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); @@ -321,7 +321,7 @@ tcp_friendliness: ca->cnt = max(ca->cnt, 2U); } -static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) +__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -338,7 +338,7 @@ static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_cong_avoid_ai(tp, ca->cnt, acked); } -static u32 cubictcp_recalc_ssthresh(struct sock *sk) +__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -355,7 +355,7 @@ static u32 cubictcp_recalc_ssthresh(struct sock *sk) return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); } -static void cubictcp_state(struct sock *sk, u8 new_state) +__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -445,7 +445,7 @@ static void hystart_update(struct sock *sk, u32 delay) } } -static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) +__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 
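The __bpf_kfunc annotations on these congestion-control helpers (and on the BBR, CUBIC and DCTCP callbacks in the neighbouring hunks) mark them as BPF kfuncs; they are registered for tcp_congestion_ops struct_ops programs elsewhere in the tree, so a BPF congestion control can fall back to the stock implementations. A rough sketch of such a program, assuming the usual vmlinux.h/libbpf build setup; the sample_* identifiers and the "bpf_sample" algorithm name are illustrative only:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* kernel helpers exposed as kfuncs by the hunks above */
extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
extern __u32 tcp_reno_ssthresh(struct sock *sk) __ksym;
extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;

SEC("struct_ops/sample_cong_avoid")
void BPF_PROG(sample_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
	/* defer to the kernel's Reno additive increase */
	tcp_reno_cong_avoid(sk, ack, acked);
}

SEC("struct_ops/sample_ssthresh")
__u32 BPF_PROG(sample_ssthresh, struct sock *sk)
{
	return tcp_reno_ssthresh(sk);
}

SEC("struct_ops/sample_undo_cwnd")
__u32 BPF_PROG(sample_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

SEC(".struct_ops")
struct tcp_congestion_ops sample_ops = {
	.cong_avoid	= (void *)sample_cong_avoid,
	.ssthresh	= (void *)sample_ssthresh,
	.undo_cwnd	= (void *)sample_undo_cwnd,
	.name		= "bpf_sample",
};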
e0a2ca7456ff..bb23bb5b387a 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -75,7 +75,7 @@ static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) ca->old_delivered_ce = tp->delivered_ce; } -static void dctcp_init(struct sock *sk) +__bpf_kfunc static void dctcp_init(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -104,7 +104,7 @@ static void dctcp_init(struct sock *sk) INET_ECN_dontxmit(sk); } -static u32 dctcp_ssthresh(struct sock *sk) +__bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -113,7 +113,7 @@ static u32 dctcp_ssthresh(struct sock *sk) return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U); } -static void dctcp_update_alpha(struct sock *sk, u32 flags) +__bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); @@ -169,7 +169,7 @@ static void dctcp_react_to_loss(struct sock *sk) tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U); } -static void dctcp_state(struct sock *sk, u8 new_state) +__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Recovery && new_state != inet_csk(sk)->icsk_ca_state) @@ -179,7 +179,7 @@ static void dctcp_state(struct sock *sk, u8 new_state) */ } -static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +__bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { struct dctcp *ca = inet_csk_ca(sk); @@ -229,7 +229,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, return 0; } -static u32 dctcp_cwnd_undo(struct sock *sk) +__bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk) { const struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1efacbe948da..cc072d2cfcd8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3646,7 +3646,8 @@ static void tcp_send_challenge_ack(struct sock *sk) u32 half = (ack_limit + 1) >> 1; WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now); - WRITE_ONCE(net->ipv4.tcp_challenge_count, half + prandom_u32_max(ack_limit)); + WRITE_ONCE(net->ipv4.tcp_challenge_count, + get_random_u32_inclusive(half, ack_limit + half - 1)); } count = READ_ONCE(net->ipv4.tcp_challenge_count); if (count > 0) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8320d0ecb13a..ea370afa70ed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2102,6 +2102,7 @@ process: /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c index bb1a08fda113..4bcf7eff95e3 100644 --- a/net/ipv4/tcp_plb.c +++ b/net/ipv4/tcp_plb.c @@ -97,7 +97,7 @@ void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) return; pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; - pause += prandom_u32_max(pause); + pause += get_random_u32_below(pause); plb->pause_until = tcp_jiffies32 + pause; /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 9ae50b1bd844..2aa442128630 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -139,6 +139,10 @@ static int __tcp_set_ulp(struct sock *sk, const struct 
tcp_ulp_ops *ulp_ops) if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); + err = -ENOTCONN; + if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN) + goto out_err; + err = ulp_ops->init(sk); if (err) goto out_err; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9592fe3e444a..c605d171eb2d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -248,7 +248,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, int low, high, remaining; unsigned int rand; - inet_get_local_port_range(net, &low, &high); + inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; rand = get_random_u32(); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index aedde65e2268..1f01e15ca24f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -387,7 +387,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) return __udp_gso_segment(skb, features, false); mss = skb_shinfo(skb)->gso_size; diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 8242c8947340..5f8104cf082d 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -176,6 +176,7 @@ EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); + synchronize_rcu(); kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9c3f5202a97b..faa47f9ea73a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -104,7 +104,7 @@ static inline u32 cstamp_delta(unsigned long cstamp) static inline s32 rfc3315_s14_backoff_init(s32 irt) { /* multiply 'initial retransmission time' by 0.9 .. 1.1 */ - u64 tmp = (900000 + prandom_u32_max(200001)) * (u64)irt; + u64 tmp = get_random_u32_inclusive(900000, 1100000) * (u64)irt; do_div(tmp, 1000000); return (s32)tmp; } @@ -112,11 +112,11 @@ static inline s32 rfc3315_s14_backoff_init(s32 irt) static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) { /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ - u64 tmp = (1900000 + prandom_u32_max(200001)) * (u64)rt; + u64 tmp = get_random_u32_inclusive(1900000, 2100000) * (u64)rt; do_div(tmp, 1000000); if ((s32)tmp > mrt) { /* multiply 'maximum retransmission time' by 0.9 .. 
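The synchronize_rcu() added to udp_tunnel_sock_release() above exists because encapsulation receive handlers dereference sk_user_data under RCU: after the pointer is cleared, a grace period has to elapse before the socket is shut down and released, or a concurrent receiver could still be using the old value. The reader side being protected typically has the following shape (a generic sketch, not code from this patch; the example_* names are made up, and example_tunnel stands for whatever per-tunnel state was attached to the socket):

static int example_encap_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct example_tunnel *tun;

	/* the UDP receive path already runs inside an RCU read-side section */
	tun = rcu_dereference_sk_user_data(sk);
	if (!tun) {		/* tunnel already torn down */
		kfree_skb(skb);
		return 0;
	}
	/* ... hand the skb to the tunnel ... */
	return 0;
}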
1.1 */ - tmp = (900000 + prandom_u32_max(200001)) * (u64)mrt; + tmp = get_random_u32_inclusive(900000, 1100000) * (u64)mrt; do_div(tmp, 1000000); } return (s32)tmp; @@ -3127,17 +3127,17 @@ static void add_v4_addrs(struct inet6_dev *idev) offset = sizeof(struct in6_addr) - 4; memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); - if (idev->dev->flags&IFF_POINTOPOINT) { + if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { + scope = IPV6_ADDR_COMPATv4; + plen = 96; + pflags |= RTF_NONEXTHOP; + } else { if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE) return; addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; plen = 64; - } else { - scope = IPV6_ADDR_COMPATv4; - plen = 96; - pflags |= RTF_NONEXTHOP; } if (addr.s6_addr32[3]) { @@ -3320,7 +3320,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) return; /* no link local addresses on devices flagged as slaves */ - if (idev->dev->flags & IFF_SLAVE) + if (idev->dev->priv_flags & IFF_NO_ADDRCONF) return; ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); @@ -3447,6 +3447,30 @@ static void addrconf_gre_config(struct net_device *dev) } #endif +static void addrconf_init_auto_addrs(struct net_device *dev) +{ + switch (dev->type) { +#if IS_ENABLED(CONFIG_IPV6_SIT) + case ARPHRD_SIT: + addrconf_sit_config(dev); + break; +#endif +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) + case ARPHRD_IP6GRE: + case ARPHRD_IPGRE: + addrconf_gre_config(dev); + break; +#endif + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + default: + addrconf_dev_config(dev); + break; + } +} + static int fixup_permanent_addr(struct net *net, struct inet6_dev *idev, struct inet6_ifaddr *ifp) @@ -3560,7 +3584,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev && idev->cnf.disable_ipv6) break; - if (dev->flags & IFF_SLAVE) { + if (dev->priv_flags & IFF_NO_ADDRCONF) { if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP && dev->flags & IFF_MULTICAST) ipv6_mc_up(idev); @@ -3615,26 +3639,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } - switch (dev->type) { -#if IS_ENABLED(CONFIG_IPV6_SIT) - case ARPHRD_SIT: - addrconf_sit_config(dev); - break; -#endif -#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) - case ARPHRD_IP6GRE: - case ARPHRD_IPGRE: - addrconf_gre_config(dev); - break; -#endif - case ARPHRD_LOOPBACK: - init_loopback(dev); - break; - - default: - addrconf_dev_config(dev); - break; - } + addrconf_init_auto_addrs(dev); if (!IS_ERR_OR_NULL(idev)) { if (run_pending) @@ -3967,7 +3972,7 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = prandom_u32_max(idev->cnf.rtr_solicit_delay ?: 1); + rand_num = get_random_u32_below(idev->cnf.rtr_solicit_delay ? 
: 1); nonce = 0; if (idev->cnf.enhanced_dad || @@ -6397,7 +6402,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + addrconf_init_auto_addrs(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; @@ -6408,7 +6413,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev && idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + addrconf_init_auto_addrs(idev->dev); } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fee9163382c2..38689bedfce7 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -222,6 +222,7 @@ lookup_protocol: np->pmtudisc = IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. @@ -238,16 +239,6 @@ lookup_protocol: inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; - /* - * Increment only the relevant sk_prot->socks debug field, this changes - * the previous behaviour of incrementing both the equivalent to - * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. - * - * This allows better debug granularity as we'll know exactly how many - * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 - * transport protocol socks. -acme - */ - sk_refcnt_debug_inc(sk); if (inet->inet_num) { /* It assumes that any protocol which allows diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 7c7155b48f17..9b6818453afe 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -42,24 +42,29 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); + int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; - fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; - fl6->flowlabel = np->flow_label; + fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; - if (!fl6->flowi6_oif) - fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + if (!oif) + oif = np->sticky_pktinfo.ipi6_ifindex; - if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) - fl6->flowi6_oif = np->mcast_oif; + if (!oif) { + if (ipv6_addr_is_multicast(&fl6->daddr)) + oif = np->mcast_oif; + else + oif = np->ucast_oif; + } + fl6->flowi6_oif = oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9d92d51c4757..1f53f2a74480 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -183,6 +183,7 @@ static bool icmpv6_global_allow(struct net *net, int type) if (icmp_global_allow()) return true; + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -224,6 +225,9 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (peer) inet_putpeer(peer); } + if (!res) + __ICMP6_INC_STATS(net, ip6_dst_idev(dst), + ICMP6_MIB_RATELIMITHOST); dst_release(dst); return res; } @@ -328,7 +332,6 @@ static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt { 
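In the ip6_datagram_flow_key_init() hunk above, the flow key now carries ip6_make_flowinfo(np->tclass, np->flow_label) instead of the bare flow label, so policy routing that matches on the DS field also takes effect for connected datagram sockets; the oif selection is likewise extended to fall back to ucast_oif. The helper simply packs the 8-bit traffic class directly above the 20-bit flow label; roughly (hypothetical name, paraphrasing the inline from include/net/ipv6.h):

static inline __be32 example_make_flowinfo(unsigned int tclass, __be32 flowlabel)
{
	/* traffic class occupies the 8 bits above the 20-bit flow label */
	return htonl(tclass << 20) | flowlabel;
}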
struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hao *hao; - struct in6_addr tmp; int off; if (opt->dsthao) { @@ -336,9 +339,7 @@ static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); - tmp = iph->saddr; - iph->saddr = hao->addr; - hao->addr = tmp; + swap(iph->saddr, hao->addr); } } } @@ -704,7 +705,7 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, } EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); -static void icmpv6_echo_reply(struct sk_buff *skb) +static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sock *sk; @@ -718,18 +719,19 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); + SKB_DR(reason); bool acast; u8 type; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) - return; + return reason; saddr = &ipv6_hdr(skb)->daddr; acast = ipv6_anycast_destination(skb_dst(skb), saddr); if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast) - return; + return reason; if (!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) @@ -803,6 +805,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); + reason = SKB_CONSUMED; } out_dst_release: dst_release(dst); @@ -810,18 +813,22 @@ out: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); + return reason; } -void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) +enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, + u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); const struct inet6_protocol *ipprot; + enum skb_drop_reason reason; int inner_offset; __be16 frag_off; u8 nexthdr; - struct net *net = dev_net(skb->dev); - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr)); + if (reason != SKB_NOT_DROPPED_YET) goto out; seg6_icmp_srh(skb, opt); @@ -831,14 +838,17 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) /* now skip over extension headers */ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (inner_offset < 0) + if (inner_offset < 0) { + SKB_DR_SET(reason, IPV6_BAD_EXTHDR); goto out; + } } else { inner_offset = sizeof(struct ipv6hdr); } /* Checkin header including 8 bytes of inner protocol header. */ - if (!pskb_may_pull(skb, inner_offset+8)) + reason = pskb_may_pull_reason(skb, inner_offset + 8); + if (reason != SKB_NOT_DROPPED_YET) goto out; /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. 
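SKB_DR() and SKB_DR_SET(), which this conversion leans on, are shorthands from the drop-reason header: the former declares an enum skb_drop_reason variable preset to SKB_DROP_REASON_NOT_SPECIFIED, the latter assigns one of the SKB_DROP_REASON_* constants without spelling out the prefix; SKB_CONSUMED is the sentinel meaning the packet was handled rather than dropped. Roughly:

#define SKB_DR(name)						\
	enum skb_drop_reason name = SKB_DROP_REASON_NOT_SPECIFIED
#define SKB_DR_SET(name, reason)				\
	((name) = SKB_DROP_REASON_##reason)

/* usage, as in the hunks above and below */
SKB_DR(reason);				/* reason = SKB_DROP_REASON_NOT_SPECIFIED   */
SKB_DR_SET(reason, IPV6_BAD_EXTHDR);	/* reason = SKB_DROP_REASON_IPV6_BAD_EXTHDR */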
@@ -853,10 +863,11 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) ipprot->err_handler(skb, opt, type, code, inner_offset, info); raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); - return; + return SKB_CONSUMED; out: __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return reason; } /* @@ -921,12 +932,12 @@ static int icmpv6_rcv(struct sk_buff *skb) switch (type) { case ICMPV6_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) - icmpv6_echo_reply(skb); + reason = icmpv6_echo_reply(skb); break; case ICMPV6_EXT_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all && READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe)) - icmpv6_echo_reply(skb); + reason = icmpv6_echo_reply(skb); break; case ICMPV6_ECHO_REPLY: @@ -952,7 +963,8 @@ static int icmpv6_rcv(struct sk_buff *skb) case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: - icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + reason = icmpv6_notify(skb, type, hdr->icmp6_code, + hdr->icmp6_mtu); break; case NDISC_ROUTER_SOLICITATION: @@ -960,7 +972,7 @@ static int icmpv6_rcv(struct sk_buff *skb) case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: - ndisc_rcv(skb); + reason = ndisc_rcv(skb); break; case ICMPV6_MGM_QUERY: @@ -994,7 +1006,8 @@ static int icmpv6_rcv(struct sk_buff *skb) * must pass to upper level */ - icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + reason = icmpv6_notify(skb, type, hdr->icmp6_code, + hdr->icmp6_mtu); } /* until the v6 path can be better sorted assume failure and diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 3ee345672849..00dc2e3b0184 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -77,7 +77,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; - int proto, nexthdr; + int proto, err; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; @@ -87,28 +87,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, bool gso_partial; skb_reset_network_header(skb); - nexthdr = ipv6_has_hopopt_jumbo(skb); - if (nexthdr) { - const int hophdr_len = sizeof(struct hop_jumbo_hdr); - int err; - - err = skb_cow_head(skb, 0); - if (err < 0) - return ERR_PTR(err); - - /* remove the HBH header. - * Layout: [Ethernet header][IPv6 header][HBH][TCP header] - */ - memmove(skb_mac_header(skb) + hophdr_len, - skb_mac_header(skb), - ETH_HLEN + sizeof(struct ipv6hdr)); - skb->data += hophdr_len; - skb->len -= hophdr_len; - skb->network_header += hophdr_len; - skb->mac_header += hophdr_len; - ipv6h = (struct ipv6hdr *)skb->data; - ipv6h->nexthdr = nexthdr; - } + err = ipv6_hopopt_jumbo_remove(skb); + if (err) + return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e19507614f64..c314fdde0097 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -547,7 +547,20 @@ int ip6_forward(struct sk_buff *skb) pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { - hdr->hop_limit--; + /* It's tempting to decrease the hop limit + * here by 1, as we do at the end of the + * function too. + * + * But that would be incorrect, as proxying is + * not forwarding. 
The ip6_input function + * will handle this packet locally, and it + * depends on the hop limit being unchanged. + * + * One example is the NDP hop limit, that + * always has to stay 255, but other would be + * similar checks around RA packets, where the + * user can even change the desired limit. + */ return ip6_input(skb); } else if (proxied < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); @@ -920,6 +933,9 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (err < 0) goto fail; + /* We prevent @rt from being freed. */ + rcu_read_lock(); + for (;;) { /* Prepare header of the next frame, * before previous one went down. */ @@ -943,6 +959,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGOKS); + rcu_read_unlock(); return 0; } @@ -950,6 +967,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); + rcu_read_unlock(); return err; slow_path_clean: diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 23e766597f36..51cf37abd142 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -392,7 +392,7 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr_table *mrt) { - del_timer_sync(&mrt->ipmr_expire_timer); + timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 9ce51680290b..2917dd8d198c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -464,13 +464,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); - /* - * Sock is moving from IPv6 to IPv4 (sk_prot), so - * remove it from the refcnt debug socks count in the - * original family... - */ - sk_refcnt_debug_dec(sk); - if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -507,11 +500,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, inet6_cleanup_sock(sk); - /* - * ... and add it to the refcnt debug socks count - * in the new family. 
-acme - */ - sk_refcnt_debug_inc(sk); module_put(THIS_MODULE); retv = 0; break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 7860383295d8..1c02160cf7a4 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1050,7 +1050,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, /* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { - unsigned long tv = prandom_u32_max(idev->mc_maxdelay); + unsigned long tv = get_random_u32_below(idev->mc_maxdelay); idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) @@ -1068,7 +1068,7 @@ static void mld_gq_stop_work(struct inet6_dev *idev) /* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { - unsigned long tv = prandom_u32_max(delay); + unsigned long tv = get_random_u32_below(delay); if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); @@ -1085,7 +1085,7 @@ static void mld_ifc_stop_work(struct inet6_dev *idev) /* called with mc_lock */ static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { - unsigned long tv = prandom_u32_max(delay); + unsigned long tv = get_random_u32_below(delay); if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); @@ -1130,7 +1130,7 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) } if (delay >= resptime) - delay = prandom_u32_max(resptime); + delay = get_random_u32_below(resptime); if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); @@ -2574,7 +2574,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = prandom_u32_max(unsolicited_report_interval(ma->idev)); + delay = get_random_u32_below(unsolicited_report_interval(ma->idev)); if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3a553494ff16..c4be62c99f73 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -783,7 +783,7 @@ void ndisc_update(const struct net_device *dev, struct neighbour *neigh, ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } -static void ndisc_recv_ns(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; @@ -797,18 +797,17 @@ static void ndisc_recv_ns(struct sk_buff *skb) struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); - bool inc; int is_router = -1; + SKB_DR(reason); u64 nonce = 0; + bool inc; - if (skb->len < sizeof(struct nd_msg)) { - ND_PRINTK(2, warn, "NS: packet too short\n"); - return; - } + if (skb->len < sizeof(struct nd_msg)) + return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NS: multicast target address\n"); - return; + return reason; } /* @@ -817,20 +816,18 @@ static void ndisc_recv_ns(struct sk_buff *skb) */ if (dad && !ipv6_addr_is_solict_mult(daddr)) { ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); - return; + return reason; } - if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, warn, "NS: invalid ND options\n"); - return; - } + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); 
if (!lladdr) { ND_PRINTK(2, warn, "NS: invalid link-layer address length\n"); - return; + return reason; } /* RFC2461 7.1.1: @@ -841,7 +838,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) if (dad) { ND_PRINTK(2, warn, "NS: bad DAD packet (link-layer address option)\n"); - return; + return reason; } } if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) @@ -869,7 +866,7 @@ have_ifp: * so fail our DAD process */ addrconf_dad_failure(skb, ifp); - return; + return reason; } else { /* * This is not a dad solicitation. @@ -901,7 +898,7 @@ have_ifp: idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? */ - return; + return reason; } if (ipv6_chk_acast_addr(net, dev, &msg->target) || @@ -924,8 +921,10 @@ have_ifp: pneigh_enqueue(&nd_tbl, idev->nd_parms, n); goto out; } - } else + } else { + SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST); goto out; + } } if (is_router < 0) @@ -958,6 +957,7 @@ have_ifp: true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); + reason = SKB_CONSUMED; } out: @@ -965,6 +965,7 @@ out: in6_ifa_put(ifp); else in6_dev_put(idev); + return reason; } static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) @@ -986,7 +987,7 @@ static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) } } -static void ndisc_recv_na(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; @@ -999,22 +1000,21 @@ static void ndisc_recv_na(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; + SKB_DR(reason); u8 new_state; - if (skb->len < sizeof(struct nd_msg)) { - ND_PRINTK(2, warn, "NA: packet too short\n"); - return; - } + if (skb->len < sizeof(struct nd_msg)) + return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NA: target address is multicast\n"); - return; + return reason; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); - return; + return reason; } /* For some 802.11 wireless deployments (and possibly other networks), @@ -1024,18 +1024,17 @@ static void ndisc_recv_na(struct sk_buff *skb) */ if (!msg->icmph.icmp6_solicited && idev && idev->cnf.drop_unsolicited_na) - return; + return reason; + + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; - if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, warn, "NS: invalid ND option\n"); - return; - } if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NA: invalid link-layer address length\n"); - return; + return reason; } } ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); @@ -1043,7 +1042,7 @@ static void ndisc_recv_na(struct sk_buff *skb) if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { addrconf_dad_failure(skb, ifp); - return; + return reason; } /* What should we make now? 
The advertisement is invalid, but ndisc specs say nothing @@ -1059,7 +1058,7 @@ static void ndisc_recv_na(struct sk_buff *skb) "NA: %pM advertised our address %pI6c on %s!\n", eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); - return; + return reason; } neigh = neigh_lookup(&nd_tbl, &msg->target, dev); @@ -1120,13 +1119,14 @@ static void ndisc_recv_na(struct sk_buff *skb) */ rt6_clean_tohost(dev_net(dev), saddr); } - + reason = SKB_CONSUMED; out: neigh_release(neigh); } + return reason; } -static void ndisc_recv_rs(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) { struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); unsigned long ndoptlen = skb->len - sizeof(*rs_msg); @@ -1135,14 +1135,15 @@ static void ndisc_recv_rs(struct sk_buff *skb) const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; + SKB_DR(reason); if (skb->len < sizeof(*rs_msg)) - return; + return SKB_DROP_REASON_PKT_TOO_SMALL; idev = __in6_dev_get(skb->dev); if (!idev) { ND_PRINTK(1, err, "RS: can't find in6 device\n"); - return; + return reason; } /* Don't accept RS if we're not in router mode */ @@ -1157,10 +1158,8 @@ static void ndisc_recv_rs(struct sk_buff *skb) goto out; /* Parse ND options */ - if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); - goto out; - } + if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, @@ -1177,9 +1176,10 @@ static void ndisc_recv_rs(struct sk_buff *skb) NEIGH_UPDATE_F_OVERRIDE_ISROUTER, NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); + reason = SKB_CONSUMED; } out: - return; + return reason; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) @@ -1228,20 +1228,21 @@ errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } -static void ndisc_router_discovery(struct sk_buff *skb) +static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); + bool send_ifinfo_notify = false; struct neighbour *neigh = NULL; - struct inet6_dev *in6_dev; + struct ndisc_options ndopts; struct fib6_info *rt = NULL; + struct inet6_dev *in6_dev; u32 defrtr_usr_metric; + unsigned int pref = 0; + __u32 old_if_flags; struct net *net; + SKB_DR(reason); int lifetime; - struct ndisc_options ndopts; int optlen; - unsigned int pref = 0; - __u32 old_if_flags; - bool send_ifinfo_notify = false; __u8 *opt = (__u8 *)(ra_msg + 1); @@ -1253,17 +1254,15 @@ static void ndisc_router_discovery(struct sk_buff *skb) __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); - return; - } - if (optlen < 0) { - ND_PRINTK(2, warn, "RA: packet too short\n"); - return; + return reason; } + if (optlen < 0) + return SKB_DROP_REASON_PKT_TOO_SMALL; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); - return; + return reason; } #endif @@ -1275,13 +1274,11 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", skb->dev->name); - return; + return reason; } - if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) 
{ - ND_PRINTK(2, warn, "RA: invalid ND options\n"); - return; - } + if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, @@ -1362,7 +1359,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); - return; + return reason; } } /* Set default route metric as specified by user */ @@ -1387,7 +1384,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(0, err, "RA: %s failed to add default route\n", __func__); - return; + return reason; } neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, @@ -1398,7 +1395,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); - return; + return reason; } neigh->flags |= NTF_ROUTER; } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) { @@ -1485,6 +1482,7 @@ skip_linkparms: NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER, NDISC_ROUTER_ADVERTISEMENT, &ndopts); + reason = SKB_CONSUMED; } if (!ipv6_accept_ra(in6_dev)) { @@ -1595,15 +1593,17 @@ out: fib6_info_release(rt); if (neigh) neigh_release(neigh); + return reason; } -static void ndisc_redirect_rcv(struct sk_buff *skb) +static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) { - u8 *hdr; - struct ndisc_options ndopts; struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct rd_msg, opt)); + struct ndisc_options ndopts; + SKB_DR(reason); + u8 *hdr; #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { @@ -1611,31 +1611,31 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) case NDISC_NODETYPE_NODEFAULT: ND_PRINTK(2, warn, "Redirect: from host or unauthorized router\n"); - return; + return reason; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: source address is not link-local\n"); - return; + return reason; } if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) - return; + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), skb->dev->ifindex); - return; + return reason; } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; if (!pskb_pull(skb, hdr - skb_transport_header(skb))) - return; + return SKB_DROP_REASON_PKT_TOO_SMALL; - icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); + return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, @@ -1781,8 +1781,9 @@ release: static void pndisc_redo(struct sk_buff *skb) { - ndisc_recv_ns(skb); - kfree_skb(skb); + enum skb_drop_reason reason = ndisc_recv_ns(skb); + + kfree_skb_reason(skb, reason); } static int ndisc_is_multicast(const void *pkey) @@ -1804,15 +1805,16 @@ static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) return false; } -int ndisc_rcv(struct sk_buff *skb) +enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; + SKB_DR(reason); if (ndisc_suppress_frag_ndisc(skb)) - return 0; + return SKB_DROP_REASON_IPV6_NDISC_FRAG; if (skb_linearize(skb)) - return 0; + return SKB_DROP_REASON_NOMEM; msg = (struct nd_msg *)skb_transport_header(skb); @@ -1821,39 +1823,39 @@ int ndisc_rcv(struct sk_buff *skb) if (ipv6_hdr(skb)->hop_limit != 255) { ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); - return 0; + return 
SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); - return 0; + return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - ndisc_recv_ns(skb); + reason = ndisc_recv_ns(skb); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: - ndisc_recv_na(skb); + reason = ndisc_recv_na(skb); break; case NDISC_ROUTER_SOLICITATION: - ndisc_recv_rs(skb); + reason = ndisc_recv_rs(skb); break; case NDISC_ROUTER_ADVERTISEMENT: - ndisc_router_discovery(skb); + reason = ndisc_router_discovery(skb); break; case NDISC_REDIRECT: - ndisc_redirect_rcv(skb); + reason = ndisc_redirect_rcv(skb); break; } - return 0; + return reason; } static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index f61d4f18e1cf..58ccdb08c0fd 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -345,6 +345,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); nf_ct_attach(nskb, oldskb); + nf_ct_set_closing(skb_nfct(oldskb)); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 2685c3f15e9d..b5205311f372 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -15,13 +15,7 @@ static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { - u32 id; - - do { - id = get_random_u32(); - } while (!id); - - return id; + return get_random_u32_above(0); } /* This function exists only for tap drivers that must support broken diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index d6306aa46bb1..e20b3705c2d2 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -94,6 +94,7 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), + SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a06a9f847db5..bac9ba747bde 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -152,7 +152,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; - hash = nexthdr & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_nulls_for_each(sk, hnode, hlist) { @@ -338,7 +338,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, struct sock *sk; int hash; - hash = nexthdr & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_nulls_for_each(sk, hnode, hlist) { @@ -355,17 +355,19 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason; + if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. 
*/ skb_dst_drop(skb); - if (sock_queue_rcv_skb(sk, skb) < 0) { - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { + kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -386,7 +388,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -410,7 +412,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet->hdrincl) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } @@ -505,6 +507,7 @@ csum_copy_err: static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { + struct ipv6_txoptions *opt; struct sk_buff *skb; int err = 0; int offset; @@ -522,6 +525,9 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; + opt = inet6_sk(sk)->cork.opt; + total_len -= opt ? opt->opt_flen : 0; + if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2f355f0ec32a..c180c2ef17c5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -91,7 +91,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static int ip6_dst_gc(struct dst_ops *ops); +static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -1713,7 +1713,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, net->ipv6.rt6_stats->fib_rt_cache++; /* Randomize max depth to avoid some side channels attacks. 
*/ - max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH); + max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH); while (bucket->depth > max_depth) rt6_exception_remove_oldest(bucket); @@ -2593,9 +2593,10 @@ INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags_noref(struct net *net, - const struct sock *sk, - struct flowi6 *fl6, int flags) +static struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) { bool any_src; @@ -2624,7 +2625,6 @@ struct dst_entry *ip6_route_output_flags_noref(struct net *net, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } -EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, @@ -3284,23 +3284,17 @@ out: return dst; } -static int ip6_dst_gc(struct dst_ops *ops) +static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; unsigned int val; int entries; - entries = dst_entries_get_fast(ops); - if (entries > rt_max_size) - entries = dst_entries_get_slow(ops); - - if (time_after(rt_last_gc + rt_min_interval, jiffies) && - entries <= rt_max_size) + if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); @@ -3310,7 +3304,6 @@ static int ip6_dst_gc(struct dst_ops *ops) out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); - return entries > rt_max_size; } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, @@ -6512,7 +6505,7 @@ static int __net_init ip6_route_net_init(struct net *net) #endif net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index ff691d9f4a04..b1c028df686e 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -13,7 +13,7 @@ #include <net/rpl.h> struct rpl_iptunnel_encap { - struct ipv6_rpl_sr_hdr srh[0]; + DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); }; struct rpl_lwt { diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 487f8e98deaa..dd433cc265c8 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -109,8 +109,15 @@ struct bpf_lwt_prog { #define next_csid_chk_lcnode_fn_bits(flen) \ next_csid_chk_lcblock_bits(flen) +#define SEG6_F_LOCAL_FLV_OP(flvname) BIT(SEG6_LOCAL_FLV_OP_##flvname) +#define SEG6_F_LOCAL_FLV_PSP SEG6_F_LOCAL_FLV_OP(PSP) + +/* Supported RFC8986 Flavor operations are reported in this bitmask */ +#define SEG6_LOCAL_FLV8986_SUPP_OPS SEG6_F_LOCAL_FLV_PSP + /* Supported Flavor operations are reported in this bitmask */ -#define SEG6_LOCAL_FLV_SUPP_OPS (BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID)) +#define SEG6_LOCAL_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_OP(NEXT_CSID) | \ + SEG6_LOCAL_FLV8986_SUPP_OPS) struct 
seg6_flavors_info { /* Flavor operations */ @@ -364,6 +371,14 @@ static void seg6_next_csid_advance_arg(struct in6_addr *addr, memset(&addr->s6_addr[16 - fnc_octects], 0x00, fnc_octects); } +static int input_action_end_finish(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); +} + static int input_action_end_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -375,9 +390,7 @@ static int input_action_end_core(struct sk_buff *skb, advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - seg6_lookup_nexthop(skb, NULL, 0); - - return dst_input(skb); + return input_action_end_finish(skb, slwt); drop: kfree_skb(skb); @@ -395,9 +408,7 @@ static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) /* update DA */ seg6_next_csid_advance_arg(daddr, finfo); - seg6_lookup_nexthop(skb, NULL, 0); - - return dst_input(skb); + return input_action_end_finish(skb, slwt); } static bool seg6_next_csid_enabled(__u32 fops) @@ -405,15 +416,331 @@ static bool seg6_next_csid_enabled(__u32 fops) return fops & BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID); } +/* We describe the packet state in relation to the absence/presence of the SRH + * and the Segment Left (SL) field. + * For our purposes, it is not necessary to record the exact value of the SL + * when the SID List consists of two or more segments. + */ +enum seg6_local_pktinfo { + /* the order really matters! */ + SEG6_LOCAL_PKTINFO_NOHDR = 0, + SEG6_LOCAL_PKTINFO_SL_ZERO, + SEG6_LOCAL_PKTINFO_SL_ONE, + SEG6_LOCAL_PKTINFO_SL_MORE, + __SEG6_LOCAL_PKTINFO_MAX, +}; + +#define SEG6_LOCAL_PKTINFO_MAX (__SEG6_LOCAL_PKTINFO_MAX - 1) + +static enum seg6_local_pktinfo seg6_get_srh_pktinfo(struct ipv6_sr_hdr *srh) +{ + __u8 sgl; + + if (!srh) + return SEG6_LOCAL_PKTINFO_NOHDR; + + sgl = srh->segments_left; + if (sgl < 2) + return SEG6_LOCAL_PKTINFO_SL_ZERO + sgl; + + return SEG6_LOCAL_PKTINFO_SL_MORE; +} + +enum seg6_local_flv_action { + SEG6_LOCAL_FLV_ACT_UNSPEC = 0, + SEG6_LOCAL_FLV_ACT_END, + SEG6_LOCAL_FLV_ACT_PSP, + SEG6_LOCAL_FLV_ACT_USP, + SEG6_LOCAL_FLV_ACT_USD, + __SEG6_LOCAL_FLV_ACT_MAX +}; + +#define SEG6_LOCAL_FLV_ACT_MAX (__SEG6_LOCAL_FLV_ACT_MAX - 1) + +/* The action table for RFC8986 flavors (see the flv8986_act_tbl below) + * contains the actions (i.e. processing operations) to be applied on packets + * when flavors are configured for an End* behavior. + * By combining the pkinfo data and from the flavors mask, the macro + * computes the index used to access the elements (actions) stored in the + * action table. The index is structured as follows: + * + * index + * _______________/\________________ + * / \ + * +----------------+----------------+ + * | pf | afm | + * +----------------+----------------+ + * ph-1 ... p1 p0 fk-1 ... f1 f0 + * MSB LSB + * + * where: + * - 'afm' (adjusted flavor mask) is the mask containing a combination of the + * RFC8986 flavors currently supported. 'afm' corresponds to the @fm + * argument of the macro whose value is righ-shifted by 1 bit. By doing so, + * we discard the SEG6_LOCAL_FLV_OP_UNSPEC flag (bit 0 in @fm) which is + * never used here; + * - 'pf' encodes the packet info (pktinfo) regarding the presence/absence of + * the SRH, SL = 0, etc. 'pf' is set with the value of @pf provided as + * argument to the macro. 
+ */ +#define flv8986_act_tbl_idx(pf, fm) \ + ((((pf) << bits_per(SEG6_LOCAL_FLV8986_SUPP_OPS)) | \ + ((fm) & SEG6_LOCAL_FLV8986_SUPP_OPS)) >> SEG6_LOCAL_FLV_OP_PSP) + +/* We compute the size of the action table by considering the RFC8986 flavors + * actually supported by the kernel. In this way, the size is automatically + * adjusted when new flavors are supported. + */ +#define FLV8986_ACT_TBL_SIZE \ + roundup_pow_of_two(flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_MAX, \ + SEG6_LOCAL_FLV8986_SUPP_OPS)) + +/* tbl_cfg(act, pf, fm) macro is used to easily configure the action + * table; it accepts 3 arguments: + * i) @act, the suffix from SEG6_LOCAL_FLV_ACT_{act} representing + * the action that should be applied on the packet; + * ii) @pf, the suffix from SEG6_LOCAL_PKTINFO_{pf} reporting the packet + * info about the lack/presence of SRH, SRH with SL = 0, etc; + * iii) @fm, the mask of flavors. + */ +#define tbl_cfg(act, pf, fm) \ + [flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_##pf, \ + (fm))] = SEG6_LOCAL_FLV_ACT_##act + +/* shorthand for improving readability */ +#define F_PSP SEG6_F_LOCAL_FLV_PSP + +/* The table contains, for each combination of the pktinfo data and + * flavors, the action that should be taken on a packet (e.g. + * "standard" Endpoint processing, Penultimate Segment Pop, etc). + * + * By default, table entries not explicitly configured are initialized with the + * SEG6_LOCAL_FLV_ACT_UNSPEC action, which generally has the effect of + * discarding the processed packet. + */ +static const u8 flv8986_act_tbl[FLV8986_ACT_TBL_SIZE] = { + /* PSP variant for packet where SRH with SL = 1 */ + tbl_cfg(PSP, SL_ONE, F_PSP), + /* End for packet where the SRH with SL > 1*/ + tbl_cfg(END, SL_MORE, F_PSP), +}; + +#undef F_PSP +#undef tbl_cfg + +/* For each flavor defined in RFC8986 (or a combination of them) an action is + * performed on the packet. The specific action depends on: + * - info extracted from the packet (i.e. pktinfo data) regarding the + * lack/presence of the SRH, and if the SRH is available, on the value of + * Segment Left field; + * - the mask of flavors configured for the specific SRv6 End* behavior. + * + * The function combines both the pkinfo and the flavors mask to evaluate the + * corresponding action to be taken on the packet. 
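Putting the table and the index macro together: with only the PSP flavor configured, the two entries populated above resolve to the PSP action when Segments Left is 1 and to plain End processing when more segments remain, while every other packet state falls back to SEG6_LOCAL_FLV_ACT_UNSPEC and is dropped. A condensed, hypothetical illustration (example_psp_decision() is not part of the patch, and the real caller, end_flv8986_core() below, additionally bounds-checks the computed index):

static int example_psp_decision(enum seg6_local_pktinfo pinfo)
{
	__u32 flvmask = SEG6_F_LOCAL_FLV_PSP;	/* only PSP enabled */

	switch (flv8986_act_tbl[flv8986_act_tbl_idx(pinfo, flvmask)]) {
	case SEG6_LOCAL_FLV_ACT_PSP:	/* SRH present, SL == 1 */
	case SEG6_LOCAL_FLV_ACT_END:	/* SRH present, SL > 1  */
		return 0;		/* keep processing      */
	default:			/* no SRH, or SL == 0   */
		return -EINVAL;		/* drop                 */
	}
}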
+ */ +static enum seg6_local_flv_action +seg6_local_flv8986_act_lookup(enum seg6_local_pktinfo pinfo, __u32 flvmask) +{ + unsigned long index; + + /* check if the provided mask of flavors is supported */ + if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS)) + return SEG6_LOCAL_FLV_ACT_UNSPEC; + + index = flv8986_act_tbl_idx(pinfo, flvmask); + if (unlikely(index >= FLV8986_ACT_TBL_SIZE)) + return SEG6_LOCAL_FLV_ACT_UNSPEC; + + return flv8986_act_tbl[index]; +} + +/* skb->data must be aligned with skb->network_header */ +static bool seg6_pop_srh(struct sk_buff *skb, int srhoff) +{ + struct ipv6_sr_hdr *srh; + struct ipv6hdr *iph; + __u8 srh_nexthdr; + int thoff = -1; + int srhlen; + int nhlen; + + if (unlikely(srhoff < sizeof(*iph) || + !pskb_may_pull(skb, srhoff + sizeof(*srh)))) + return false; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srhlen = ipv6_optlen(srh); + + /* we are about to mangle the pkt, let's check if we can write on it */ + if (unlikely(skb_ensure_writable(skb, srhoff + srhlen))) + return false; + + /* skb_ensure_writable() may change skb pointers; evaluate srh again */ + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_nexthdr = srh->nexthdr; + + if (unlikely(!skb_transport_header_was_set(skb))) + goto pull; + + nhlen = skb_network_header_len(skb); + /* we have to deal with the transport header: it could be set before + * the SRH, after the SRH, or within it (which is considered wrong, + * however). + */ + if (likely(nhlen <= srhoff)) + thoff = nhlen; + else if (nhlen >= srhoff + srhlen) + /* transport_header is set after the SRH */ + thoff = nhlen - srhlen; + else + /* transport_header falls inside the SRH; hence, we can't + * restore the transport_header pointer properly after + * SRH removing operation. + */ + return false; +pull: + /* we need to pop the SRH: + * 1) first of all, we pull out everything from IPv6 header up to SRH + * (included) evaluating also the rcsum; + * 2) we overwrite (and then remove) the SRH by properly moving the + * IPv6 along with any extension header that precedes the SRH; + * 3) At the end, we push back the pulled headers (except for SRH, + * obviously). + */ + skb_pull_rcsum(skb, srhoff + srhlen); + memmove(skb_network_header(skb) + srhlen, skb_network_header(skb), + srhoff); + skb_push(skb, srhoff); + + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + if (likely(thoff >= 0)) + skb_set_transport_header(skb, thoff); + + iph = ipv6_hdr(skb); + if (iph->nexthdr == NEXTHDR_ROUTING) { + iph->nexthdr = srh_nexthdr; + } else { + /* we must look for the extension header (EXTH, for short) that + * immediately precedes the SRH we have just removed. + * Then, we update the value of the EXTH nexthdr with the one + * contained in the SRH nexthdr. 
+ */ + unsigned int off = sizeof(*iph); + struct ipv6_opt_hdr *hp, _hdr; + __u8 nexthdr = iph->nexthdr; + + for (;;) { + if (unlikely(!ipv6_ext_hdr(nexthdr) || + nexthdr == NEXTHDR_NONE)) + return false; + + hp = skb_header_pointer(skb, off, sizeof(_hdr), &_hdr); + if (unlikely(!hp)) + return false; + + if (hp->nexthdr == NEXTHDR_ROUTING) { + hp->nexthdr = srh_nexthdr; + break; + } + + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + fallthrough; + case NEXTHDR_AUTH: + /* we expect SRH before FRAG and AUTH */ + return false; + default: + off += ipv6_optlen(hp); + break; + } + + nexthdr = hp->nexthdr; + } + } + + iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + skb_postpush_rcsum(skb, iph, srhoff); + + return true; +} + +/* process the packet on the basis of the RFC8986 flavors set for the given + * SRv6 End behavior instance. + */ +static int end_flv8986_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + enum seg6_local_flv_action action; + enum seg6_local_pktinfo pinfo; + struct ipv6_sr_hdr *srh; + __u32 flvmask; + int srhoff; + + srh = seg6_get_srh(skb, 0); + srhoff = srh ? ((unsigned char *)srh - skb->data) : 0; + pinfo = seg6_get_srh_pktinfo(srh); +#ifdef CONFIG_IPV6_SEG6_HMAC + if (srh && !seg6_hmac_validate_skb(skb)) + goto drop; +#endif + flvmask = finfo->flv_ops; + if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS)) { + pr_warn_once("seg6local: invalid RFC8986 flavors\n"); + goto drop; + } + + /* retrieve the action triggered by the combination of pktinfo data and + * the flavors mask. + */ + action = seg6_local_flv8986_act_lookup(pinfo, flvmask); + switch (action) { + case SEG6_LOCAL_FLV_ACT_END: + /* process the packet as the "standard" End behavior */ + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + break; + case SEG6_LOCAL_FLV_ACT_PSP: + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + if (unlikely(!seg6_pop_srh(skb, srhoff))) + goto drop; + break; + case SEG6_LOCAL_FLV_ACT_UNSPEC: + fallthrough; + default: + /* by default, we drop the packet since we could not find a + * suitable action. + */ + goto drop; + } + + return input_action_end_finish(skb, slwt); + +drop: + kfree_skb(skb); + return -EINVAL; +} + /* regular endpoint function */ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) { const struct seg6_flavors_info *finfo = &slwt->flv_info; + __u32 fops = finfo->flv_ops; - if (seg6_next_csid_enabled(finfo->flv_ops)) + if (!fops) + return input_action_end_core(skb, slwt); + + /* check for the presence of NEXT-C-SID since it applies first */ + if (seg6_next_csid_enabled(fops)) return end_next_csid_core(skb, slwt); - return input_action_end_core(skb, slwt); + /* the specific processing function to be performed on the packet + * depends on the combination of flavors defined in RFC8986 and some + * information extracted from the packet, e.g. presence/absence of SRH, + * Segment Left = 0, etc. + */ + return end_flv8986_core(skb, slwt); } /* regular endpoint, and forward to specified nexthop */ @@ -2300,6 +2627,13 @@ int __init seg6_local_init(void) BUILD_BUG_ON(next_csid_chk_lcblock_bits(SEG6_LOCAL_LCBLOCK_DBITS)); BUILD_BUG_ON(next_csid_chk_lcnode_fn_bits(SEG6_LOCAL_LCNODE_FN_DBITS)); + /* To be memory efficient, we use 'u8' to represent the different + * actions related to RFC8986 flavors. If the kernel build stops here, + * it means that it is not possible to correctly encode these actions + * with the data type chosen for the action table. 
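On the wire, the effect of the PSP flavor implemented above is that the penultimate SRv6 endpoint both advances the destination address to the final SID and strips the routing header, so the last hop receives a plain IPv6 packet. For a two-segment path S1 (PSP-enabled End) -> S2 (final destination), roughly:

/*
 * arriving at S1:         IPv6(daddr = S1) | SRH(segments = [S2, S1], SL = 1) | payload
 * after End + PSP at S1:  IPv6(daddr = S2) | payload
 *
 * with SL > 1 the same instance behaves as a plain End and keeps the SRH;
 * with no SRH or SL == 0 the packet is dropped, per the action table above.
 */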
+ */ + BUILD_BUG_ON(SEG6_LOCAL_FLV_ACT_MAX > (typeof(flv8986_act_tbl[0]))~0U); + return lwtunnel_encap_add_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 11b736a76bd7..1bf93b61aa06 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -272,6 +272,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; + fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; @@ -1387,14 +1388,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_mask(sk, GFP_ATOMIC)); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) { + if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); - skb_set_owner_r(newnp->pktoptions, newsk); - } } } else { if (!req_unhash && found_dup_sk) { @@ -1466,7 +1464,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + opt_skb = skb_clone_and_charge_r(skb, sk); reason = SKB_DROP_REASON_NOT_SPECIFIED; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1552,7 +1550,6 @@ ipv6_pktoptions: if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { @@ -1708,8 +1705,9 @@ process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ - if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { + if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e0e10f6bcdc1..c39c1e32f980 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -42,7 +42,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) return __udp_gso_segment(skb, features, true); mss = skb_shinfo(skb)->gso_size; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 890a2423f559..cfe828bd7fc6 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -28,6 +28,7 @@ #include <net/netns/generic.h> #include <net/sock.h> #include <uapi/linux/kcm.h> +#include <trace/events/sock.h> unsigned int kcm_net_id; @@ -349,6 +350,8 @@ static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; diff --git a/net/key/af_key.c b/net/key/af_key.c index 2bdbcec781cd..a815f5ab4c49 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1261,7 +1261,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_x_nat_t_type* n_type; struct 
xfrm_encap_tmpl *natt; - x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL); + x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); if (!x->encap) { err = -ENOMEM; goto out; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 9a1415fe3fa7..03608d3ded4b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -104,9 +104,9 @@ static struct workqueue_struct *l2tp_wq; /* per-net private data for this module */ static unsigned int l2tp_net_id; struct l2tp_net { - struct list_head l2tp_tunnel_list; - /* Lock for write access to l2tp_tunnel_list */ - spinlock_t l2tp_tunnel_list_lock; + /* Lock for write access to l2tp_tunnel_idr */ + spinlock_t l2tp_tunnel_idr_lock; + struct idr l2tp_tunnel_idr; struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2]; /* Lock for write access to l2tp_session_hlist */ spinlock_t l2tp_session_hlist_lock; @@ -208,13 +208,10 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) struct l2tp_tunnel *tunnel; rcu_read_lock_bh(); - list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (tunnel->tunnel_id == tunnel_id && - refcount_inc_not_zero(&tunnel->ref_count)) { - rcu_read_unlock_bh(); - - return tunnel; - } + tunnel = idr_find(&pn->l2tp_tunnel_idr, tunnel_id); + if (tunnel && refcount_inc_not_zero(&tunnel->ref_count)) { + rcu_read_unlock_bh(); + return tunnel; } rcu_read_unlock_bh(); @@ -224,13 +221,14 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_get); struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth) { - const struct l2tp_net *pn = l2tp_pernet(net); + struct l2tp_net *pn = l2tp_pernet(net); + unsigned long tunnel_id, tmp; struct l2tp_tunnel *tunnel; int count = 0; rcu_read_lock_bh(); - list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (++count > nth && + idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { + if (tunnel && ++count > nth && refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; @@ -1043,7 +1041,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); nf_reset_ct(skb); - bh_lock_sock(sk); + bh_lock_sock_nested(sk); if (sock_owned_by_user(sk)) { kfree_skb(skb); ret = NET_XMIT_DROP; @@ -1227,6 +1225,15 @@ static void l2tp_udp_encap_destroy(struct sock *sk) l2tp_tunnel_delete(tunnel); } +static void l2tp_tunnel_remove(struct net *net, struct l2tp_tunnel *tunnel) +{ + struct l2tp_net *pn = l2tp_pernet(net); + + spin_lock_bh(&pn->l2tp_tunnel_idr_lock); + idr_remove(&pn->l2tp_tunnel_idr, tunnel->tunnel_id); + spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); +} + /* Workqueue tunnel deletion function */ static void l2tp_tunnel_del_work(struct work_struct *work) { @@ -1234,7 +1241,6 @@ static void l2tp_tunnel_del_work(struct work_struct *work) del_work); struct sock *sk = tunnel->sock; struct socket *sock = sk->sk_socket; - struct l2tp_net *pn; l2tp_tunnel_closeall(tunnel); @@ -1248,12 +1254,7 @@ static void l2tp_tunnel_del_work(struct work_struct *work) } } - /* Remove the tunnel struct from the tunnel list */ - pn = l2tp_pernet(tunnel->l2tp_net); - spin_lock_bh(&pn->l2tp_tunnel_list_lock); - list_del_rcu(&tunnel->list); - spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - + l2tp_tunnel_remove(tunnel->l2tp_net, tunnel); /* drop initial ref */ l2tp_tunnel_dec_refcount(tunnel); @@ -1384,8 +1385,6 @@ out: return err; } -static struct lock_class_key l2tp_socket_class; - int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 
peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) { @@ -1455,12 +1454,19 @@ static int l2tp_validate_socket(const struct sock *sk, const struct net *net, int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg) { - struct l2tp_tunnel *tunnel_walk; - struct l2tp_net *pn; + struct l2tp_net *pn = l2tp_pernet(net); + u32 tunnel_id = tunnel->tunnel_id; struct socket *sock; struct sock *sk; int ret; + spin_lock_bh(&pn->l2tp_tunnel_idr_lock); + ret = idr_alloc_u32(&pn->l2tp_tunnel_idr, NULL, &tunnel_id, tunnel_id, + GFP_ATOMIC); + spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); + if (ret) + return ret == -ENOSPC ? -EEXIST : ret; + if (tunnel->fd < 0) { ret = l2tp_tunnel_sock_create(net, tunnel->tunnel_id, tunnel->peer_tunnel_id, cfg, @@ -1474,6 +1480,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, } sk = sock->sk; + lock_sock(sk); write_lock_bh(&sk->sk_callback_lock); ret = l2tp_validate_socket(sk, net, tunnel->encap); if (ret < 0) @@ -1481,24 +1488,6 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, rcu_assign_sk_user_data(sk, tunnel); write_unlock_bh(&sk->sk_callback_lock); - tunnel->l2tp_net = net; - pn = l2tp_pernet(net); - - sock_hold(sk); - tunnel->sock = sk; - - spin_lock_bh(&pn->l2tp_tunnel_list_lock); - list_for_each_entry(tunnel_walk, &pn->l2tp_tunnel_list, list) { - if (tunnel_walk->tunnel_id == tunnel->tunnel_id) { - spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - sock_put(sk); - ret = -EEXIST; - goto err_sock; - } - } - list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); - spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { struct udp_tunnel_sock_cfg udp_cfg = { .sk_user_data = tunnel, @@ -1512,9 +1501,16 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, tunnel->old_sk_destruct = sk->sk_destruct; sk->sk_destruct = &l2tp_tunnel_destruct; - lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, - "l2tp_sock"); sk->sk_allocation = GFP_ATOMIC; + release_sock(sk); + + sock_hold(sk); + tunnel->sock = sk; + tunnel->l2tp_net = net; + + spin_lock_bh(&pn->l2tp_tunnel_idr_lock); + idr_replace(&pn->l2tp_tunnel_idr, tunnel, tunnel->tunnel_id); + spin_unlock_bh(&pn->l2tp_tunnel_idr_lock); trace_register_tunnel(tunnel); @@ -1523,17 +1519,16 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, return 0; -err_sock: - write_lock_bh(&sk->sk_callback_lock); - rcu_assign_sk_user_data(sk, NULL); err_inval_sock: write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); if (tunnel->fd < 0) sock_release(sock); else sockfd_put(sock); err: + l2tp_tunnel_remove(net, tunnel); return ret; } EXPORT_SYMBOL_GPL(l2tp_tunnel_register); @@ -1647,8 +1642,8 @@ static __net_init int l2tp_init_net(struct net *net) struct l2tp_net *pn = net_generic(net, l2tp_net_id); int hash; - INIT_LIST_HEAD(&pn->l2tp_tunnel_list); - spin_lock_init(&pn->l2tp_tunnel_list_lock); + idr_init(&pn->l2tp_tunnel_idr); + spin_lock_init(&pn->l2tp_tunnel_idr_lock); for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) INIT_HLIST_HEAD(&pn->l2tp_session_hlist[hash]); @@ -1662,11 +1657,13 @@ static __net_exit void l2tp_exit_net(struct net *net) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel = NULL; + unsigned long tunnel_id, tmp; int hash; rcu_read_lock_bh(); - list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - l2tp_tunnel_delete(tunnel); + idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { + if (tunnel) 
+ l2tp_tunnel_delete(tunnel); } rcu_read_unlock(); @@ -1676,6 +1673,7 @@ static __net_exit void l2tp_exit_net(struct net *net) for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) WARN_ON_ONCE(!hlist_empty(&pn->l2tp_session_hlist[hash])); + idr_destroy(&pn->l2tp_tunnel_idr); } static struct pernet_operations l2tp_net_ops = { diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 9c40f8d3bce8..f9514bacbd4a 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -491,7 +491,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) { struct tid_ampdu_tx *tid_tx; struct ieee80211_local *local = sta->local; - struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_sub_if_data *sdata; struct ieee80211_ampdu_params params = { .sta = &sta->sta, .action = IEEE80211_AMPDU_TX_START, @@ -511,8 +511,6 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) */ clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state); - ieee80211_agg_stop_txq(sta, tid); - /* * Make sure no packets are being processed. This ensures that * we have a valid starting sequence number and that in-flight * frames have been flushed out and no packets for this TID * will be sent afterwards. */ synchronize_net(); + sdata = sta->sdata; params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, &params); tid_tx->ssn = params.ssn; @@ -534,6 +533,9 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) */ set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state); } else if (ret) { + if (!sdata) + return; + ht_dbg(sdata, "BA request denied - HW unavailable for %pM tid %d\n", sta->sta.addr, tid); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 8f9a2ab502b3..8eb342300868 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -147,6 +147,7 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; + link_conf->bssid_indicator = 0; if (sdata->vif.type != NL80211_IFTYPE_AP || !params.tx_wdev) return -EINVAL; @@ -1219,7 +1220,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_local *local = sdata->local; struct beacon_data *old; struct ieee80211_sub_if_data *vlan; - u32 changed = BSS_CHANGED_BEACON_INT | + u64 changed = BSS_CHANGED_BEACON_INT | BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_BEACON | BSS_CHANGED_P2P_PS | @@ -1251,6 +1252,21 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, prev_beacon_int = link_conf->beacon_int; link_conf->beacon_int = params->beacon_interval; + if (params->vht_cap) { + link_conf->vht_su_beamformer = + params->vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); + link_conf->vht_su_beamformee = + params->vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); + link_conf->vht_mu_beamformer = + params->vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE); + link_conf->vht_mu_beamformee = + params->vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); + } + if (params->he_cap && params->he_oper) { link_conf->he_support = true; link_conf->htc_trig_based_pkt_ext = @@ -1265,6 +1281,26 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, changed |= BSS_CHANGED_HE_BSS_COLOR; } + if (params->he_cap) { + link_conf->he_su_beamformer = + params->he_cap->phy_cap_info[3] & + 
IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER; + link_conf->he_su_beamformee = + params->he_cap->phy_cap_info[4] & + IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE; + link_conf->he_mu_beamformer = + params->he_cap->phy_cap_info[4] & + IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER; + link_conf->he_full_ul_mumimo = + params->he_cap->phy_cap_info[2] & + IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO; + } + + if (params->eht_cap) { + link_conf->eht_puncturing = params->punct_bitmap; + changed |= BSS_CHANGED_EHT_PUNCTURING; + } + if (sdata->vif.type == NL80211_IFTYPE_AP && params->mbssid_config.tx_wdev) { err = ieee80211_set_ap_mbssid_options(sdata, @@ -1511,6 +1547,12 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, kfree(link_conf->ftmr_params); link_conf->ftmr_params = NULL; + sdata->vif.mbssid_tx_vif = NULL; + link_conf->bssid_index = 0; + link_conf->nontransmitted = false; + link_conf->ema_ap = false; + link_conf->bssid_indicator = 0; + __sta_info_flush(sdata, true); ieee80211_free_keys(sdata, true); @@ -2727,7 +2769,7 @@ static int ieee80211_scan(struct wiphy *wiphy, * If the scan has been forced (and the driver supports * forcing), don't care about being beaconing already. * This will create problems to the attached stations (e.g. all - * the frames sent while scanning on other channel will be + * the frames sent while scanning on other channel will be * lost) */ if (sdata->deflink.u.ap.beacon && @@ -3509,6 +3551,12 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); + if (sdata->vif.bss_conf.eht_puncturing != sdata->vif.bss_conf.csa_punct_bitmap) { + sdata->vif.bss_conf.eht_puncturing = + sdata->vif.bss_conf.csa_punct_bitmap; + changed |= BSS_CHANGED_EHT_PUNCTURING; + } + /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. 
once reservation is complete it will re-schedule the @@ -3550,7 +3598,8 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) if (err) return err; - cfg80211_ch_switch_notify(sdata->dev, &sdata->deflink.csa_chandef, 0); + cfg80211_ch_switch_notify(sdata->dev, &sdata->deflink.csa_chandef, 0, + sdata->vif.bss_conf.eht_puncturing); return 0; } @@ -3812,9 +3861,13 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } + if (params->punct_bitmap && !sdata->vif.bss_conf.eht_support) + goto out; + sdata->deflink.csa_chandef = params->chandef; sdata->deflink.csa_block_tx = params->block_tx; sdata->vif.bss_conf.csa_active = true; + sdata->vif.bss_conf.csa_punct_bitmap = params->punct_bitmap; if (sdata->deflink.csa_block_tx) ieee80211_stop_vif_queues(local, sdata, @@ -3822,7 +3875,8 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, cfg80211_ch_switch_started_notify(sdata->dev, &sdata->deflink.csa_chandef, 0, - params->count, params->block_tx); + params->count, params->block_tx, + sdata->vif.bss_conf.csa_punct_bitmap); if (changed) { ieee80211_link_info_change_notify(sdata, &sdata->deflink, @@ -4134,7 +4188,7 @@ static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; int ret; - u32 changed = 0; + u64 changed = 0; link = sdata_dereference(sdata->link[link_id], sdata); @@ -4615,6 +4669,19 @@ unlock: sdata_unlock(sdata); } +void ieee80211_color_collision_detection_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct ieee80211_link_data *link = + container_of(delayed_work, struct ieee80211_link_data, + color_collision_detect_work); + struct ieee80211_sub_if_data *sdata = link->sdata; + + sdata_lock(sdata); + cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap); + sdata_unlock(sdata); +} + void ieee80211_color_change_finish(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); @@ -4625,17 +4692,27 @@ void ieee80211_color_change_finish(struct ieee80211_vif *vif) EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); void -ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif, +ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, u64 color_bitmap, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link = &sdata->deflink; if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) return; - cfg80211_obss_color_collision_notify(sdata->dev, color_bitmap, gfp); + if (delayed_work_pending(&link->color_collision_detect_work)) + return; + + link->color_bitmap = color_bitmap; + /* queue the color collision detection event every 500 ms in order to + * avoid sending too much netlink messages to userspace. 
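(Side note on the hunk above, not from the patch: because the handler returns early whenever the delayed work is already pending, at most one OBSS color collision event is reported to userspace per 500 ms window per link; bitmaps observed while the work is pending are not merged into link->color_bitmap but are dropped until the pending work has run.)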
+ */ + ieee80211_queue_delayed_work(&sdata->local->hw, + &link->color_collision_detect_work, + msecs_to_jiffies(500)); } -EXPORT_SYMBOL_GPL(ieeee80211_obss_color_collision_notify); +EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify); static int ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e72cf0749d49..dbc34fbe7c8f 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1916,7 +1916,7 @@ int ieee80211_link_use_reserved_context(struct ieee80211_link_data *link) int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef, - u32 *changed) + u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index c87e1137e5da..0bac9af3ca96 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -603,8 +603,6 @@ IEEE80211_IF_FILE(fwded_mcast, u.mesh.mshstats.fwded_mcast, DEC); IEEE80211_IF_FILE(fwded_unicast, u.mesh.mshstats.fwded_unicast, DEC); IEEE80211_IF_FILE(fwded_frames, u.mesh.mshstats.fwded_frames, DEC); IEEE80211_IF_FILE(dropped_frames_ttl, u.mesh.mshstats.dropped_frames_ttl, DEC); -IEEE80211_IF_FILE(dropped_frames_congestion, - u.mesh.mshstats.dropped_frames_congestion, DEC); IEEE80211_IF_FILE(dropped_frames_no_route, u.mesh.mshstats.dropped_frames_no_route, DEC); @@ -740,7 +738,6 @@ static void add_mesh_stats(struct ieee80211_sub_if_data *sdata) MESHSTATS_ADD(fwded_frames); MESHSTATS_ADD(dropped_frames_ttl); MESHSTATS_ADD(dropped_frames_no_route); - MESHSTATS_ADD(dropped_frames_congestion); #undef MESHSTATS_ADD } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 7a3d7893e19d..f1914bf39f0e 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -167,7 +167,7 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, continue; txqi = to_txq_info(sta->sta.txq[i]); p += scnprintf(p, bufsz + buf - p, - "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n", + "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s%s)\n", txqi->txq.tid, txqi->txq.ac, txqi->tin.backlog_bytes, @@ -182,7 +182,8 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, txqi->flags, test_bit(IEEE80211_TXQ_STOP, &txqi->flags) ? "STOP" : "RUN", test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags) ? " AMPDU" : "", - test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags) ? " NO-AMSDU" : ""); + test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags) ? " NO-AMSDU" : "", + test_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ? 
" DIRTY" : ""); } rcu_read_unlock(); diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index d737db4e07e2..cfb09e4aed4d 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -392,6 +392,9 @@ int drv_ampdu_action(struct ieee80211_local *local, might_sleep(); + if (!sdata) + return -EIO; + sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 809bad53e15b..5d13a3dfd366 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1199,7 +1199,7 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local, /* In reconfig don't transmit now, but mark for waking later */ if (local->in_reconfig) { - set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txq->flags); + set_bit(IEEE80211_TXQ_DIRTY, &txq->flags); return; } diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 83bc41346ae7..5315ab750280 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -391,6 +391,37 @@ void ieee80211_ba_session_work(struct work_struct *work) tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; if (!blocked && tid_tx) { + struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); + struct ieee80211_sub_if_data *sdata = + vif_to_sdata(txqi->txq.vif); + struct fq *fq = &sdata->local->fq; + + spin_lock_bh(&fq->lock); + + /* Allow only frags to be dequeued */ + set_bit(IEEE80211_TXQ_STOP, &txqi->flags); + + if (!skb_queue_empty(&txqi->frags)) { + /* Fragmented Tx is ongoing, wait for it to + * finish. Reschedule worker to retry later. + */ + + spin_unlock_bh(&fq->lock); + spin_unlock_bh(&sta->lock); + + /* Give the task working on the txq a chance + * to send out the queued frags + */ + synchronize_net(); + + mutex_unlock(&sta->ampdu_mlme.mtx); + + ieee80211_queue_work(&sdata->local->hw, work); + return; + } + + spin_unlock_bh(&fq->lock); + /* * Assign it over to the normal tid_tx array * where it "goes live". 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 63ff0d2524b6..ecc232eb1ee8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -327,7 +327,6 @@ struct mesh_stats { __u32 fwded_frames; /* Mesh total forwarded frames */ __u32 dropped_frames_ttl; /* Not transmitted since mesh_ttl == 0*/ __u32 dropped_frames_no_route; /* Not transmitted, no route found */ - __u32 dropped_frames_congestion;/* Not forwarded due to congestion */ }; #define PREQ_Q_F_START 0x1 @@ -838,7 +837,7 @@ enum txq_info_flags { IEEE80211_TXQ_STOP, IEEE80211_TXQ_AMPDU, IEEE80211_TXQ_NO_AMSDU, - IEEE80211_TXQ_STOP_NETIF_TX, + IEEE80211_TXQ_DIRTY, }; /** @@ -974,6 +973,8 @@ struct ieee80211_link_data { struct cfg80211_chan_def csa_chandef; struct work_struct color_change_finalize_work; + struct delayed_work color_collision_detect_work; + u64 color_bitmap; /* context reservation -- protected with chanctx_mtx */ struct ieee80211_chanctx *reserved_chanctx; @@ -1929,6 +1930,7 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, /* color change handling */ void ieee80211_color_change_finalize_work(struct work_struct *work); +void ieee80211_color_collision_detection_work(struct work_struct *work); /* interface handling */ #define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ @@ -2478,7 +2480,7 @@ int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link); int __must_check ieee80211_link_change_bandwidth(struct ieee80211_link_data *link, const struct cfg80211_chan_def *chandef, - u32 *changed); + u64 *changed); void ieee80211_link_release_channel(struct ieee80211_link_data *link); void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link); void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d49a5906a943..23ed13f15067 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -364,7 +364,9 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, /* No support for VLAN with MLO yet */ if (iftype == NL80211_IFTYPE_AP_VLAN && - nsdata->wdev.use_4addr) + sdata->wdev.use_4addr && + nsdata->vif.type == NL80211_IFTYPE_AP && + nsdata->vif.valid_links) return -EOPNOTSUPP; /* @@ -2195,7 +2197,6 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ret = cfg80211_register_netdevice(ndev); if (ret) { - ieee80211_if_free(ndev); free_netdev(ndev); return ret; } diff --git a/net/mac80211/link.c b/net/mac80211/link.c index d1f5a9f7c647..8c8869cc1fb4 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -39,6 +39,8 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_csa_finalize_work); INIT_WORK(&link->color_change_finalize_work, ieee80211_color_change_finalize_work); + INIT_DELAYED_WORK(&link->color_collision_detect_work, + ieee80211_color_collision_detection_work); INIT_LIST_HEAD(&link->assigned_chanctx_list); INIT_LIST_HEAD(&link->reserved_chanctx_list); INIT_DELAYED_WORK(&link->dfs_cac_timer_work, @@ -66,6 +68,7 @@ void ieee80211_link_stop(struct ieee80211_link_data *link) if (link->sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_mgd_stop_link(link); + cancel_delayed_work_sync(&link->color_collision_detect_work); ieee80211_link_release_channel(link); } diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 69d5e1ec6ede..3b81e6df3f34 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -512,7 +512,7 @@ static 
void mesh_path_free_rcu(struct mesh_table *tbl, mpath->flags |= MESH_PATH_RESOLVING | MESH_PATH_DELETED; mesh_gate_del(tbl, mpath); spin_unlock_bh(&mpath->state_lock); - del_timer_sync(&mpath->timer); + timer_shutdown_sync(&mpath->timer); atomic_dec(&sdata->u.mesh.mpaths); atomic_dec(&tbl->entries); mesh_path_flush_pending(mpath); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0aee2392dd29..60792dfabc9d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2022 Intel Corporation + * Copyright (C) 2018 - 2023 Intel Corporation */ #include <linux/delay.h> @@ -89,6 +89,80 @@ MODULE_PARM_DESC(probe_wait_ms, #define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 /* + * Extract from the given disabled subchannel bitmap (raw format + * from the EHT Operation Element) the bits for the subchannel + * we're using right now. + */ +static u16 +ieee80211_extract_dis_subch_bmap(const struct ieee80211_eht_operation *eht_oper, + struct cfg80211_chan_def *chandef, u16 bitmap) +{ + struct ieee80211_eht_operation_info *info = (void *)eht_oper->optional; + struct cfg80211_chan_def ap_chandef = *chandef; + u32 ap_center_freq, local_center_freq; + u32 ap_bw, local_bw; + int ap_start_freq, local_start_freq; + u16 shift, mask; + + if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) || + !(eht_oper->params & + IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) + return 0; + + /* set 160/320 supported to get the full AP definition */ + ieee80211_chandef_eht_oper(eht_oper, true, true, &ap_chandef); + ap_center_freq = ap_chandef.center_freq1; + ap_bw = 20 * BIT(u8_get_bits(info->control, + IEEE80211_EHT_OPER_CHAN_WIDTH)); + ap_start_freq = ap_center_freq - ap_bw / 2; + local_center_freq = chandef->center_freq1; + local_bw = 20 * BIT(ieee80211_chan_width_to_rx_bw(chandef->width)); + local_start_freq = local_center_freq - local_bw / 2; + shift = (local_start_freq - ap_start_freq) / 20; + mask = BIT(local_bw / 20) - 1; + + return (bitmap >> shift) & mask; +} + +/* + * Handle the puncturing bitmap, possibly downgrading bandwidth to get a + * valid bitmap. 
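A rough standalone sketch of the shift/mask extraction performed by ieee80211_extract_dis_subch_bmap() above; the frequencies and bitmap below are invented example values, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* Extract the 20 MHz-granularity puncturing bits that overlap a narrower
 * local channel, mirroring the shift/mask arithmetic in the hunk above.
 */
static uint16_t extract_local_bits(uint16_t ap_bitmap, int ap_start_mhz,
				   int local_start_mhz, int local_bw_mhz)
{
	int shift = (local_start_mhz - ap_start_mhz) / 20;
	uint16_t mask = (uint16_t)((1u << (local_bw_mhz / 20)) - 1);

	return (uint16_t)((ap_bitmap >> shift) & mask);
}

int main(void)
{
	/* hypothetical numbers: the AP runs 160 MHz starting at 5955 MHz and
	 * punctures its 6th 20 MHz subchannel (bit 5); we operate the upper
	 * 80 MHz starting at 6035 MHz, so shift = 4, mask = 0xf, result 0x2.
	 */
	printf("0x%x\n", extract_local_bits(0x0020, 5955, 6035, 80));
	return 0;
}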
+ */ +static void +ieee80211_handle_puncturing_bitmap(struct ieee80211_link_data *link, + const struct ieee80211_eht_operation *eht_oper, + u16 bitmap, u64 *changed) +{ + struct cfg80211_chan_def *chandef = &link->conf->chandef; + u16 extracted; + u64 _changed = 0; + + if (!changed) + changed = &_changed; + + while (chandef->width > NL80211_CHAN_WIDTH_40) { + extracted = + ieee80211_extract_dis_subch_bmap(eht_oper, chandef, + bitmap); + + if (cfg80211_valid_disable_subchannel_bitmap(&bitmap, + chandef)) + break; + link->u.mgd.conn_flags |= + ieee80211_chandef_downgrade(chandef); + *changed |= BSS_CHANGED_BANDWIDTH; + } + + if (chandef->width <= NL80211_CHAN_WIDTH_40) + extracted = 0; + + if (link->conf->eht_puncturing != extracted) { + link->conf->eht_puncturing = extracted; + *changed |= BSS_CHANGED_EHT_PUNCTURING; + } +} + +/* * We can have multiple work items (and connection probing) * scheduling this timer, but we need to take care to only * reschedule it when it should fire _earlier_ than it was @@ -413,7 +487,7 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, const struct ieee80211_s1g_oper_ie *s1g_oper, - const u8 *bssid, u32 *changed) + const u8 *bssid, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; @@ -1704,7 +1778,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) return; } - cfg80211_ch_switch_notify(sdata->dev, &link->reserved_chandef, 0); + cfg80211_ch_switch_notify(sdata->dev, &link->reserved_chandef, 0, 0); } void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success) @@ -1914,7 +1988,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, mutex_unlock(&local->mtx); cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chandef, 0, - csa_ie.count, csa_ie.mode); + csa_ie.count, csa_ie.mode, 0); if (local->ops->channel_switch) { /* use driver's channel switch callback */ @@ -4145,6 +4219,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, link_sta); bss_conf->eht_support = link_sta->pub->eht_cap.has_eht; + *changed |= BSS_CHANGED_EHT_PUNCTURING; } else { bss_conf->eht_support = false; } @@ -5477,6 +5552,45 @@ static bool ieee80211_rx_our_beacon(const u8 *tx_bssid, return ether_addr_equal(tx_bssid, bss->transmitted_bss->bssid); } +static bool ieee80211_config_puncturing(struct ieee80211_link_data *link, + const struct ieee80211_eht_operation *eht_oper, + u64 *changed) +{ + u16 bitmap = 0, extracted; + + if ((eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) && + (eht_oper->params & + IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) { + const struct ieee80211_eht_operation_info *info = + (void *)eht_oper->optional; + const u8 *disable_subchannel_bitmap = info->optional; + + bitmap = get_unaligned_le16(disable_subchannel_bitmap); + } + + extracted = ieee80211_extract_dis_subch_bmap(eht_oper, + &link->conf->chandef, + bitmap); + + /* accept if there are no changes */ + if (!(*changed & BSS_CHANGED_BANDWIDTH) && + extracted == link->conf->eht_puncturing) + return true; + + if (!cfg80211_valid_disable_subchannel_bitmap(&bitmap, + &link->conf->chandef)) { + link_info(link, + "Got an invalid disable subchannel bitmap from AP %pM: bitmap = 0x%x, bw = 0x%x. 
disconnect\n", + link->u.mgd.bssid, + bitmap, + link->conf->chandef.width); + return false; + } + + ieee80211_handle_puncturing_bitmap(link, eht_oper, bitmap, changed); + return true; +} + static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) @@ -5494,7 +5608,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_channel *chan; struct link_sta_info *link_sta; struct sta_info *sta; - u32 changed = 0; + u64 changed = 0; bool erp_valid; u8 erp_value = 0; u32 ncrc = 0; @@ -5791,6 +5905,21 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, elems->pwr_constr_elem, elems->cisco_dtpc_elem); + if (elems->eht_operation && + !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT)) { + if (!ieee80211_config_puncturing(link, elems->eht_operation, + &changed)) { + ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, + WLAN_REASON_DEAUTH_LEAVING, + true, deauth_buf); + ieee80211_report_disconnect(sdata, deauth_buf, + sizeof(deauth_buf), true, + WLAN_REASON_DEAUTH_LEAVING, + false); + goto free; + } + } + ieee80211_link_info_change_notify(sdata, link, changed); free: kfree(elems); @@ -6892,9 +7021,12 @@ ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); } + link->conf->eht_puncturing = 0; + rcu_read_lock(); beacon_ies = rcu_dereference(cbss->beacon_ies); if (beacon_ies) { + const struct ieee80211_eht_operation *eht_oper; const struct element *elem; u8 dtim_count = 0; @@ -6923,6 +7055,31 @@ ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, link->conf->ema_ap = true; else link->conf->ema_ap = false; + + elem = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_OPERATION, + beacon_ies->data, beacon_ies->len); + eht_oper = (const void *)(elem->data + 1); + + if (elem && + ieee80211_eht_oper_size_ok((const void *)(elem->data + 1), + elem->datalen - 1) && + (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) && + (eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) { + const struct ieee80211_eht_operation_info *info = + (void *)eht_oper->optional; + const u8 *disable_subchannel_bitmap = info->optional; + u16 bitmap; + + bitmap = get_unaligned_le16(disable_subchannel_bitmap); + if (cfg80211_valid_disable_subchannel_bitmap(&bitmap, + &link->conf->chandef)) + ieee80211_handle_puncturing_bitmap(link, + eht_oper, + bitmap, + NULL); + else + conn_flags |= IEEE80211_CONN_DISABLE_EHT; + } } rcu_read_unlock(); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7e3ab6e1b28f..f7fdfe710951 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2720,6 +2720,174 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) } } +static ieee80211_rx_result +ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, + struct sk_buff *skb) +{ +#ifdef CONFIG_MAC80211_MESH + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct ieee80211_local *local = sdata->local; + uint16_t fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_DATA; + struct ieee80211_hdr hdr = { + .frame_control = cpu_to_le16(fc) + }; + struct ieee80211_hdr *fwd_hdr; + struct ieee80211s_hdr *mesh_hdr; + struct ieee80211_tx_info *info; + struct sk_buff *fwd_skb; + struct ethhdr *eth; + bool multicast; + int tailroom = 0; + int hdrlen, mesh_hdrlen; + u8 *qos; + + if (!ieee80211_vif_is_mesh(&sdata->vif)) + return RX_CONTINUE; + + if (!pskb_may_pull(skb, sizeof(*eth) + 6)) + return 
RX_DROP_MONITOR; + + mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(*eth)); + mesh_hdrlen = ieee80211_get_mesh_hdrlen(mesh_hdr); + + if (!pskb_may_pull(skb, sizeof(*eth) + mesh_hdrlen)) + return RX_DROP_MONITOR; + + eth = (struct ethhdr *)skb->data; + multicast = is_multicast_ether_addr(eth->h_dest); + + mesh_hdr = (struct ieee80211s_hdr *)(eth + 1); + if (!mesh_hdr->ttl) + return RX_DROP_MONITOR; + + /* frame is in RMC, don't forward */ + if (is_multicast_ether_addr(eth->h_dest) && + mesh_rmc_check(sdata, eth->h_source, mesh_hdr)) + return RX_DROP_MONITOR; + + /* Frame has reached destination. Don't forward */ + if (ether_addr_equal(sdata->vif.addr, eth->h_dest)) + goto rx_accept; + + if (!ifmsh->mshcfg.dot11MeshForwarding) { + if (is_multicast_ether_addr(eth->h_dest)) + goto rx_accept; + + return RX_DROP_MONITOR; + } + + /* forward packet */ + if (sdata->crypto_tx_tailroom_needed_cnt) + tailroom = IEEE80211_ENCRYPT_TAILROOM; + + if (!--mesh_hdr->ttl) { + if (multicast) + goto rx_accept; + + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl); + return RX_DROP_MONITOR; + } + + if (mesh_hdr->flags & MESH_FLAGS_AE) { + struct mesh_path *mppath; + char *proxied_addr; + + if (multicast) + proxied_addr = mesh_hdr->eaddr1; + else if ((mesh_hdr->flags & MESH_FLAGS_AE) == MESH_FLAGS_AE_A5_A6) + /* has_a4 already checked in ieee80211_rx_mesh_check */ + proxied_addr = mesh_hdr->eaddr2; + else + return RX_DROP_MONITOR; + + rcu_read_lock(); + mppath = mpp_path_lookup(sdata, proxied_addr); + if (!mppath) { + mpp_path_add(sdata, proxied_addr, eth->h_source); + } else { + spin_lock_bh(&mppath->state_lock); + if (!ether_addr_equal(mppath->mpp, eth->h_source)) + memcpy(mppath->mpp, eth->h_source, ETH_ALEN); + mppath->exp_time = jiffies; + spin_unlock_bh(&mppath->state_lock); + } + rcu_read_unlock(); + } + + skb_set_queue_mapping(skb, ieee802_1d_to_ac[skb->priority]); + + ieee80211_fill_mesh_addresses(&hdr, &hdr.frame_control, + eth->h_dest, eth->h_source); + hdrlen = ieee80211_hdrlen(hdr.frame_control); + if (multicast) { + int extra_head = sizeof(struct ieee80211_hdr) - sizeof(*eth); + + fwd_skb = skb_copy_expand(skb, local->tx_headroom + extra_head + + IEEE80211_ENCRYPT_HEADROOM, + tailroom, GFP_ATOMIC); + if (!fwd_skb) + goto rx_accept; + } else { + fwd_skb = skb; + skb = NULL; + + if (skb_cow_head(fwd_skb, hdrlen - sizeof(struct ethhdr))) + return RX_DROP_UNUSABLE; + } + + fwd_hdr = skb_push(fwd_skb, hdrlen - sizeof(struct ethhdr)); + memcpy(fwd_hdr, &hdr, hdrlen - 2); + qos = ieee80211_get_qos_ctl(fwd_hdr); + qos[0] = qos[1] = 0; + + skb_reset_mac_header(fwd_skb); + hdrlen += mesh_hdrlen; + if (ieee80211_get_8023_tunnel_proto(fwd_skb->data + hdrlen, + &fwd_skb->protocol)) + hdrlen += ETH_ALEN; + else + fwd_skb->protocol = htons(fwd_skb->len - hdrlen); + skb_set_network_header(fwd_skb, hdrlen); + + info = IEEE80211_SKB_CB(fwd_skb); + memset(info, 0, sizeof(*info)); + info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; + info->control.vif = &sdata->vif; + info->control.jiffies = jiffies; + if (multicast) { + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_mcast); + memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN); + /* update power mode indication when forwarding */ + ieee80211_mps_set_frame_flags(sdata, NULL, fwd_hdr); + } else if (!mesh_nexthop_lookup(sdata, fwd_skb)) { + /* mesh power mode flags updated in mesh_nexthop_lookup */ + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_unicast); + } else { + /* unable to resolve next hop */ + if (sta) + mesh_path_error_tx(sdata, 
ifmsh->mshcfg.element_ttl, + hdr.addr3, 0, + WLAN_REASON_MESH_PATH_NOFORWARD, + sta->sta.addr); + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_no_route); + kfree_skb(fwd_skb); + goto rx_accept; + } + + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames); + fwd_skb->dev = sdata->dev; + ieee80211_add_pending_skb(local, fwd_skb); + +rx_accept: + if (!skb) + return RX_QUEUED; + + ieee80211_strip_8023_mesh_hdr(skb); +#endif + + return RX_CONTINUE; +} + static ieee80211_rx_result debug_noinline __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) { @@ -2728,6 +2896,7 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; struct sk_buff_head frame_list; + static ieee80211_rx_result res; struct ethhdr ethhdr; const u8 *check_da = ethhdr.h_dest, *check_sa = ethhdr.h_source; @@ -2746,6 +2915,7 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) break; case NL80211_IFTYPE_MESH_POINT: check_sa = NULL; + check_da = NULL; break; default: break; @@ -2760,20 +2930,43 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) data_offset, true)) return RX_DROP_UNUSABLE; + if (rx->sta && rx->sta->amsdu_mesh_control < 0) { + bool valid_std = ieee80211_is_valid_amsdu(skb, true); + bool valid_nonstd = ieee80211_is_valid_amsdu(skb, false); + + if (valid_std && !valid_nonstd) + rx->sta->amsdu_mesh_control = 1; + else if (valid_nonstd && !valid_std) + rx->sta->amsdu_mesh_control = 0; + } + ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, rx->sdata->vif.type, rx->local->hw.extra_tx_headroom, - check_da, check_sa); + check_da, check_sa, + rx->sta->amsdu_mesh_control); while (!skb_queue_empty(&frame_list)) { rx->skb = __skb_dequeue(&frame_list); - if (!ieee80211_frame_allowed(rx, fc)) { - dev_kfree_skb(rx->skb); + res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb); + switch (res) { + case RX_QUEUED: continue; + case RX_CONTINUE: + break; + default: + goto free; } + if (!ieee80211_frame_allowed(rx, fc)) + goto free; + ieee80211_deliver_skb(rx); + continue; + +free: + dev_kfree_skb(rx->skb); } return RX_QUEUED; @@ -2806,6 +2999,8 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) if (!rx->sdata->u.mgd.use_4addr) return RX_DROP_UNUSABLE; break; + case NL80211_IFTYPE_MESH_POINT: + break; default: return RX_DROP_UNUSABLE; } @@ -2834,160 +3029,6 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) return __ieee80211_rx_h_amsdu(rx, 0); } -#ifdef CONFIG_MAC80211_MESH -static ieee80211_rx_result -ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) -{ - struct ieee80211_hdr *fwd_hdr, *hdr; - struct ieee80211_tx_info *info; - struct ieee80211s_hdr *mesh_hdr; - struct sk_buff *skb = rx->skb, *fwd_skb; - struct ieee80211_local *local = rx->local; - struct ieee80211_sub_if_data *sdata = rx->sdata; - struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - u16 ac, q, hdrlen; - int tailroom = 0; - - hdr = (struct ieee80211_hdr *) skb->data; - hdrlen = ieee80211_hdrlen(hdr->frame_control); - - /* make sure fixed part of mesh header is there, also checks skb len */ - if (!pskb_may_pull(rx->skb, hdrlen + 6)) - return RX_DROP_MONITOR; - - mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); - - /* make sure full mesh header is there, also checks skb len */ - if (!pskb_may_pull(rx->skb, - hdrlen + ieee80211_get_mesh_hdrlen(mesh_hdr))) - return RX_DROP_MONITOR; - - /* reload pointers */ - hdr = (struct ieee80211_hdr *) skb->data; - mesh_hdr = (struct 
ieee80211s_hdr *) (skb->data + hdrlen); - - if (ieee80211_drop_unencrypted(rx, hdr->frame_control)) { - int offset = hdrlen + ieee80211_get_mesh_hdrlen(mesh_hdr) + - sizeof(rfc1042_header); - __be16 ethertype; - - if (!ether_addr_equal(hdr->addr1, rx->sdata->vif.addr) || - skb_copy_bits(rx->skb, offset, &ethertype, 2) != 0 || - ethertype != rx->sdata->control_port_protocol) - return RX_DROP_MONITOR; - } - - /* frame is in RMC, don't forward */ - if (ieee80211_is_data(hdr->frame_control) && - is_multicast_ether_addr(hdr->addr1) && - mesh_rmc_check(rx->sdata, hdr->addr3, mesh_hdr)) - return RX_DROP_MONITOR; - - if (!ieee80211_is_data(hdr->frame_control)) - return RX_CONTINUE; - - if (!mesh_hdr->ttl) - return RX_DROP_MONITOR; - - if (mesh_hdr->flags & MESH_FLAGS_AE) { - struct mesh_path *mppath; - char *proxied_addr; - char *mpp_addr; - - if (is_multicast_ether_addr(hdr->addr1)) { - mpp_addr = hdr->addr3; - proxied_addr = mesh_hdr->eaddr1; - } else if ((mesh_hdr->flags & MESH_FLAGS_AE) == - MESH_FLAGS_AE_A5_A6) { - /* has_a4 already checked in ieee80211_rx_mesh_check */ - mpp_addr = hdr->addr4; - proxied_addr = mesh_hdr->eaddr2; - } else { - return RX_DROP_MONITOR; - } - - rcu_read_lock(); - mppath = mpp_path_lookup(sdata, proxied_addr); - if (!mppath) { - mpp_path_add(sdata, proxied_addr, mpp_addr); - } else { - spin_lock_bh(&mppath->state_lock); - if (!ether_addr_equal(mppath->mpp, mpp_addr)) - memcpy(mppath->mpp, mpp_addr, ETH_ALEN); - mppath->exp_time = jiffies; - spin_unlock_bh(&mppath->state_lock); - } - rcu_read_unlock(); - } - - /* Frame has reached destination. Don't forward */ - if (!is_multicast_ether_addr(hdr->addr1) && - ether_addr_equal(sdata->vif.addr, hdr->addr3)) - return RX_CONTINUE; - - ac = ieee802_1d_to_ac[skb->priority]; - q = sdata->vif.hw_queue[ac]; - if (ieee80211_queue_stopped(&local->hw, q)) { - IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion); - return RX_DROP_MONITOR; - } - skb_set_queue_mapping(skb, ac); - - if (!--mesh_hdr->ttl) { - if (!is_multicast_ether_addr(hdr->addr1)) - IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, - dropped_frames_ttl); - goto out; - } - - if (!ifmsh->mshcfg.dot11MeshForwarding) - goto out; - - if (sdata->crypto_tx_tailroom_needed_cnt) - tailroom = IEEE80211_ENCRYPT_TAILROOM; - - fwd_skb = skb_copy_expand(skb, local->tx_headroom + - IEEE80211_ENCRYPT_HEADROOM, - tailroom, GFP_ATOMIC); - if (!fwd_skb) - goto out; - - fwd_skb->dev = sdata->dev; - fwd_hdr = (struct ieee80211_hdr *) fwd_skb->data; - fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY); - info = IEEE80211_SKB_CB(fwd_skb); - memset(info, 0, sizeof(*info)); - info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; - info->control.vif = &rx->sdata->vif; - info->control.jiffies = jiffies; - if (is_multicast_ether_addr(fwd_hdr->addr1)) { - IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_mcast); - memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN); - /* update power mode indication when forwarding */ - ieee80211_mps_set_frame_flags(sdata, NULL, fwd_hdr); - } else if (!mesh_nexthop_lookup(sdata, fwd_skb)) { - /* mesh power mode flags updated in mesh_nexthop_lookup */ - IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_unicast); - } else { - /* unable to resolve next hop */ - mesh_path_error_tx(sdata, ifmsh->mshcfg.element_ttl, - fwd_hdr->addr3, 0, - WLAN_REASON_MESH_PATH_NOFORWARD, - fwd_hdr->addr2); - IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_no_route); - kfree_skb(fwd_skb); - return RX_DROP_MONITOR; - } - - IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames); - 
ieee80211_add_pending_skb(local, fwd_skb); - out: - if (is_multicast_ether_addr(hdr->addr1)) - return RX_CONTINUE; - return RX_DROP_MONITOR; -} -#endif - static ieee80211_rx_result debug_noinline ieee80211_rx_h_data(struct ieee80211_rx_data *rx) { @@ -2996,6 +3037,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) struct net_device *dev = sdata->dev; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; __le16 fc = hdr->frame_control; + static ieee80211_rx_result res; bool port_control; int err; @@ -3022,6 +3064,10 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) if (unlikely(err)) return RX_DROP_UNUSABLE; + res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb); + if (res != RX_CONTINUE) + return res; + if (!ieee80211_frame_allowed(rx, fc)) return RX_DROP_MONITOR; @@ -3219,9 +3265,9 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx) color = le32_get_bits(he_oper->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); if (color == bss_conf->he_bss_color.color) - ieeee80211_obss_color_collision_notify(&rx->sdata->vif, - BIT_ULL(color), - GFP_ATOMIC); + ieee80211_obss_color_collision_notify(&rx->sdata->vif, + BIT_ULL(color), + GFP_ATOMIC); } } @@ -3992,10 +4038,6 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, CALL_RXH(ieee80211_rx_h_defragment); CALL_RXH(ieee80211_rx_h_michael_mic_verify); /* must be after MMIC verify so header is counted in MPDU mic */ -#ifdef CONFIG_MAC80211_MESH - if (ieee80211_vif_is_mesh(&rx->sdata->vif)) - CALL_RXH(ieee80211_rx_h_mesh_fwding); -#endif CALL_RXH(ieee80211_rx_h_amsdu); CALL_RXH(ieee80211_rx_h_data); @@ -4049,6 +4091,52 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) #undef CALL_RXH } +static bool +ieee80211_rx_is_valid_sta_link_id(struct ieee80211_sta *sta, u8 link_id) +{ + return !!(sta->valid_links & BIT(link_id)); +} + +static bool ieee80211_rx_data_set_link(struct ieee80211_rx_data *rx, + u8 link_id) +{ + rx->link_id = link_id; + rx->link = rcu_dereference(rx->sdata->link[link_id]); + + if (!rx->sta) + return rx->link; + + if (!ieee80211_rx_is_valid_sta_link_id(&rx->sta->sta, link_id)) + return false; + + rx->link_sta = rcu_dereference(rx->sta->link[link_id]); + + return rx->link && rx->link_sta; +} + +static bool ieee80211_rx_data_set_sta(struct ieee80211_rx_data *rx, + struct sta_info *sta, int link_id) +{ + rx->link_id = link_id; + rx->sta = sta; + + if (sta) { + rx->local = sta->sdata->local; + if (!rx->sdata) + rx->sdata = sta->sdata; + rx->link_sta = &sta->deflink; + } else { + rx->link_sta = NULL; + } + + if (link_id < 0) + rx->link = &rx->sdata->deflink; + else if (!ieee80211_rx_data_set_link(rx, link_id)) + return false; + + return true; +} + /* * This function makes calls into the RX path, therefore * it has to be invoked under RCU read lock. 
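(Side note, not from the patch: the two helpers added above centralize the per-frame link resolution that used to be open-coded at each RX entry point. ieee80211_rx_data_set_sta() fills in sta, local and sdata and resolves the link, falling back to the default link when link_id is negative, while ieee80211_rx_data_set_link() also checks that the station really has that link and looks up the matching link_sta, returning false so callers can drop frames for a link that raced removal.)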
@@ -4057,16 +4145,19 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) { struct sk_buff_head frames; struct ieee80211_rx_data rx = { - .sta = sta, - .sdata = sta->sdata, - .local = sta->local, /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .link_id = -1, }; struct tid_ampdu_rx *tid_agg_rx; - u8 link_id; + int link_id = -1; + + /* FIXME: statistics won't be right with this */ + if (sta->sta.valid_links) + link_id = ffs(sta->sta.valid_links) - 1; + + if (!ieee80211_rx_data_set_sta(&rx, sta, link_id)) + return; tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) @@ -4086,10 +4177,6 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) }; drv_event_callback(rx.local, rx.sdata, &event); } - /* FIXME: statistics won't be right with this */ - link_id = sta->sta.valid_links ? ffs(sta->sta.valid_links) - 1 : 0; - rx.link = rcu_dereference(sta->sdata->link[link_id]); - rx.link_sta = rcu_dereference(sta->link[link_id]); ieee80211_rx_handlers(&rx, &frames); } @@ -4105,7 +4192,6 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .link_id = -1, }; int i, diff; @@ -4116,10 +4202,8 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, sta = container_of(pubsta, struct sta_info, sta); - rx.sta = sta; - rx.sdata = sta->sdata; - rx.link = &rx.sdata->deflink; - rx.local = sta->local; + if (!ieee80211_rx_data_set_sta(&rx, sta, -1)) + return; rcu_read_lock(); tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); @@ -4197,7 +4281,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) case NL80211_IFTYPE_STATION: if (!bssid && !sdata->u.mgd.use_4addr) return false; - if (ieee80211_is_robust_mgmt_frame(skb) && !rx->sta) + if (ieee80211_is_first_frag(hdr->seq_ctrl) && + ieee80211_is_robust_mgmt_frame(skb) && !rx->sta) return false; if (multicast) return true; @@ -4506,15 +4591,6 @@ void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->sta_mtx); } -static bool -ieee80211_rx_is_valid_sta_link_id(struct ieee80211_sta *sta, u8 link_id) -{ - if (!sta->mlo) - return false; - - return !!(sta->valid_links & BIT(link_id)); -} - static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, struct ieee80211_fast_rx *fast_rx, int orig_len) @@ -4625,7 +4701,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, struct sk_buff *skb = rx->skb; struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - struct sta_info *sta = rx->sta; int orig_len = skb->len; int hdrlen = ieee80211_hdrlen(hdr->frame_control); int snap_offs = hdrlen; @@ -4637,7 +4712,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, u8 da[ETH_ALEN]; u8 sa[ETH_ALEN]; } addrs __aligned(2); - struct link_sta_info *link_sta; struct ieee80211_sta_rx_stats *stats; /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write @@ -4740,18 +4814,10 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, drop: dev_kfree_skb(skb); - if (rx->link_id >= 0) { - link_sta = rcu_dereference(sta->link[rx->link_id]); - if (!link_sta) - return true; - } else { - link_sta = &sta->deflink; - } - if (fast_rx->uses_rss) - stats = this_cpu_ptr(link_sta->pcpu_rx_stats); + stats = this_cpu_ptr(rx->link_sta->pcpu_rx_stats); else - stats = &link_sta->rx_stats; + stats = 
&rx->link_sta->rx_stats; stats->dropped++; return true; @@ -4769,8 +4835,8 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_hdr *hdr = (void *)skb->data; - struct link_sta_info *link_sta = NULL; - struct ieee80211_link_data *link; + struct link_sta_info *link_sta = rx->link_sta; + struct ieee80211_link_data *link = rx->link; rx->skb = skb; @@ -4792,35 +4858,6 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, if (!ieee80211_accept_frame(rx)) return false; - if (rx->link_id >= 0) { - link = rcu_dereference(rx->sdata->link[rx->link_id]); - - /* we might race link removal */ - if (!link) - return true; - rx->link = link; - - if (rx->sta) { - rx->link_sta = - rcu_dereference(rx->sta->link[rx->link_id]); - if (!rx->link_sta) - return true; - } - } else { - if (rx->sta) - rx->link_sta = &rx->sta->deflink; - - rx->link = &sdata->deflink; - } - - if (unlikely(!is_multicast_ether_addr(hdr->addr1) && - rx->link_id >= 0 && rx->sta && rx->sta->sta.mlo)) { - link_sta = rcu_dereference(rx->sta->link[rx->link_id]); - - if (WARN_ON_ONCE(!link_sta)) - return true; - } - if (!consume) { struct skb_shared_hwtstamps *shwt; @@ -4838,9 +4875,13 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, */ shwt = skb_hwtstamps(rx->skb); shwt->hwtstamp = skb_hwtstamps(skb)->hwtstamp; + + /* Update the hdr pointer to the new skb for translation below */ + hdr = (struct ieee80211_hdr *)rx->skb->data; } - if (unlikely(link_sta)) { + if (unlikely(rx->sta && rx->sta->sta.mlo) && + is_unicast_ether_addr(hdr->addr1)) { /* translate to MLD addresses */ if (ether_addr_equal(link->conf->addr, hdr->addr1)) ether_addr_copy(hdr->addr1, rx->sdata->vif.addr); @@ -4870,6 +4911,8 @@ static void __ieee80211_rx_handle_8023(struct ieee80211_hw *hw, struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_fast_rx *fast_rx; struct ieee80211_rx_data rx; + struct sta_info *sta; + int link_id = -1; memset(&rx, 0, sizeof(rx)); rx.skb = skb; @@ -4886,12 +4929,8 @@ static void __ieee80211_rx_handle_8023(struct ieee80211_hw *hw, if (!pubsta) goto drop; - rx.sta = container_of(pubsta, struct sta_info, sta); - rx.sdata = rx.sta->sdata; - - if (status->link_valid && - !ieee80211_rx_is_valid_sta_link_id(pubsta, status->link_id)) - goto drop; + if (status->link_valid) + link_id = status->link_id; /* * TODO: Should the frame be dropped if the right link_id is not @@ -4900,19 +4939,9 @@ static void __ieee80211_rx_handle_8023(struct ieee80211_hw *hw, * link_id is used only for stats purpose and updating the stats on * the deflink is fine? 
*/ - if (status->link_valid) - rx.link_id = status->link_id; - - if (rx.link_id >= 0) { - struct ieee80211_link_data *link; - - link = rcu_dereference(rx.sdata->link[rx.link_id]); - if (!link) - goto drop; - rx.link = link; - } else { - rx.link = &rx.sdata->deflink; - } + sta = container_of(pubsta, struct sta_info, sta); + if (!ieee80211_rx_data_set_sta(&rx, sta, link_id)) + goto drop; fast_rx = rcu_dereference(rx.sta->fast_rx); if (!fast_rx) @@ -4930,6 +4959,8 @@ static bool ieee80211_rx_for_interface(struct ieee80211_rx_data *rx, { struct link_sta_info *link_sta; struct ieee80211_hdr *hdr = (void *)skb->data; + struct sta_info *sta; + int link_id = -1; /* * Look up link station first, in case there's a @@ -4939,24 +4970,19 @@ static bool ieee80211_rx_for_interface(struct ieee80211_rx_data *rx, */ link_sta = link_sta_info_get_bss(rx->sdata, hdr->addr2); if (link_sta) { - rx->sta = link_sta->sta; - rx->link_id = link_sta->link_id; + sta = link_sta->sta; + link_id = link_sta->link_id; } else { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - rx->sta = sta_info_get_bss(rx->sdata, hdr->addr2); - if (rx->sta) { - if (status->link_valid && - !ieee80211_rx_is_valid_sta_link_id(&rx->sta->sta, - status->link_id)) - return false; - - rx->link_id = status->link_valid ? status->link_id : -1; - } else { - rx->link_id = -1; - } + sta = sta_info_get_bss(rx->sdata, hdr->addr2); + if (status->link_valid) + link_id = status->link_id; } + if (!ieee80211_rx_data_set_sta(rx, sta, link_id)) + return false; + return ieee80211_prepare_and_rx_handle(rx, skb, consume); } @@ -5015,19 +5041,16 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (ieee80211_is_data(fc)) { struct sta_info *sta, *prev_sta; - u8 link_id = status->link_id; + int link_id = -1; - if (pubsta) { - rx.sta = container_of(pubsta, struct sta_info, sta); - rx.sdata = rx.sta->sdata; + if (status->link_valid) + link_id = status->link_id; - if (status->link_valid && - !ieee80211_rx_is_valid_sta_link_id(pubsta, link_id)) + if (pubsta) { + sta = container_of(pubsta, struct sta_info, sta); + if (!ieee80211_rx_data_set_sta(&rx, sta, link_id)) goto out; - if (status->link_valid) - rx.link_id = status->link_id; - /* * In MLO connection, fetch the link_id using addr2 * when the driver does not pass link_id in status. @@ -5045,7 +5068,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (!link_sta) goto out; - rx.link_id = link_sta->link_id; + ieee80211_rx_data_set_link(&rx, link_sta->link_id); } if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) @@ -5061,30 +5084,25 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, continue; } - if ((status->link_valid && - !ieee80211_rx_is_valid_sta_link_id(&prev_sta->sta, - link_id)) || - (!status->link_valid && prev_sta->sta.mlo)) + rx.sdata = prev_sta->sdata; + if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) + goto out; + + if (!status->link_valid && prev_sta->sta.mlo) continue; - rx.link_id = status->link_valid ? link_id : -1; - rx.sta = prev_sta; - rx.sdata = prev_sta->sdata; ieee80211_prepare_and_rx_handle(&rx, skb, false); prev_sta = sta; } if (prev_sta) { - if ((status->link_valid && - !ieee80211_rx_is_valid_sta_link_id(&prev_sta->sta, - link_id)) || - (!status->link_valid && prev_sta->sta.mlo)) + rx.sdata = prev_sta->sdata; + if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) goto out; - rx.link_id = status->link_valid ? 
link_id : -1; - rx.sta = prev_sta; - rx.sdata = prev_sta->sdata; + if (!status->link_valid && prev_sta->sta.mlo) + goto out; if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) return; @@ -5215,6 +5233,15 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->nss)) goto drop; break; + case RX_ENC_EHT: + if (WARN_ONCE(status->rate_idx > 15 || + !status->nss || + status->nss > 8 || + status->eht.gi > NL80211_RATE_INFO_EHT_GI_3_2, + "Rate marked as an EHT rate but data is invalid: MCS:%d, NSS:%d, GI:%d\n", + status->rate_idx, status->nss, status->eht.gi)) + goto drop; + break; default: WARN_ON_ONCE(1); fallthrough; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 04e0f132b1d9..7d68dbc872d7 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #include <linux/module.h> @@ -594,6 +594,9 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sta_state = IEEE80211_STA_NONE; + if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + sta->amsdu_mesh_control = -1; + /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; @@ -2406,12 +2409,19 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; + case STA_STATS_RATE_TYPE_EHT: + rinfo->flags = RATE_INFO_FLAGS_EHT_MCS; + rinfo->mcs = STA_STATS_GET(EHT_MCS, rate); + rinfo->nss = STA_STATS_GET(EHT_NSS, rate); + rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); + rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); + break; } } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { - u16 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); + u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 69820b551668..e8e482a82d77 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -622,6 +622,8 @@ struct link_sta_info { * taken from HT/VHT capabilities or VHT operating mode notification * @cparams: CoDel parameters for this station. 
* @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED) + * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer + * (-1: not yet known, 0: non-standard [without mesh header], 1: standard) * @fast_tx: TX fastpath information * @fast_rx: RX fastpath information * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to @@ -707,6 +709,7 @@ struct sta_info { struct codel_params cparams; u8 reserved_tid; + s8 amsdu_mesh_control; struct cfg80211_chan_def tdls_chandef; @@ -936,6 +939,7 @@ enum sta_stats_type { STA_STATS_RATE_TYPE_VHT, STA_STATS_RATE_TYPE_HE, STA_STATS_RATE_TYPE_S1G, + STA_STATS_RATE_TYPE_EHT, }; #define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) @@ -945,12 +949,16 @@ enum sta_stats_type { #define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) -#define STA_STATS_FIELD_BW GENMASK(11, 8) -#define STA_STATS_FIELD_SGI GENMASK(12, 12) -#define STA_STATS_FIELD_TYPE GENMASK(15, 13) -#define STA_STATS_FIELD_HE_RU GENMASK(18, 16) -#define STA_STATS_FIELD_HE_GI GENMASK(20, 19) -#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21) +#define STA_STATS_FIELD_EHT_MCS GENMASK( 3, 0) +#define STA_STATS_FIELD_EHT_NSS GENMASK( 7, 4) +#define STA_STATS_FIELD_BW GENMASK(12, 8) +#define STA_STATS_FIELD_SGI GENMASK(13, 13) +#define STA_STATS_FIELD_TYPE GENMASK(16, 14) +#define STA_STATS_FIELD_HE_RU GENMASK(19, 17) +#define STA_STATS_FIELD_HE_GI GENMASK(21, 20) +#define STA_STATS_FIELD_HE_DCM GENMASK(22, 22) +#define STA_STATS_FIELD_EHT_RU GENMASK(20, 17) +#define STA_STATS_FIELD_EHT_GI GENMASK(22, 21) #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) @@ -989,6 +997,13 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r |= STA_STATS_FIELD(HE_RU, s->he_ru); r |= STA_STATS_FIELD(HE_DCM, s->he_dcm); break; + case RX_ENC_EHT: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_EHT); + r |= STA_STATS_FIELD(EHT_NSS, s->nss); + r |= STA_STATS_FIELD(EHT_MCS, s->rate_idx); + r |= STA_STATS_FIELD(EHT_GI, s->eht.gi); + r |= STA_STATS_FIELD(EHT_RU, s->eht.ru); + break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2171cd1ca807..7699fb410670 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1129,7 +1129,6 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, struct sk_buff *purge_skb = NULL; if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { - info->flags |= IEEE80211_TX_CTL_AMPDU; reset_agg_timer = true; } else if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) { /* @@ -1161,7 +1160,6 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, if (!tid_tx) { /* do nothing, let packet pass through */ } else if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { - info->flags |= IEEE80211_TX_CTL_AMPDU; reset_agg_timer = true; } else { queued = true; @@ -3677,8 +3675,7 @@ static void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, info->band = fast_tx->band; info->control.vif = &sdata->vif; info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | - IEEE80211_TX_CTL_DONTFRAG | - (ampdu ? 
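
The sta_info.h hunk above shifts the BW/SGI/TYPE fields upward to make room for the wider EHT MCS/NSS/GI/RU fields in the packed last-rate word; as a visible consequence the encoded value no longer fits in 16 bits, which is why sta_set_rate_info_rx() now reads it as a u32. The mechanism is plain FIELD_PREP()/FIELD_GET() over GENMASK ranges. Below is a self-contained userspace approximation; the macro definitions and mask values are illustrative stand-ins, not the exact mac80211 layout.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's FIELD_PREP()/FIELD_GET(). */
#define FIELD_PREP(mask, val)	(((uint32_t)(val) << __builtin_ctz(mask)) & (mask))
#define FIELD_GET(mask, reg)	(((reg) & (mask)) >> __builtin_ctz(mask))

#define RATE_MCS	0x0000000fu	/* bits  3..0  */
#define RATE_NSS	0x000000f0u	/* bits  7..4  */
#define RATE_GI		0x00600000u	/* bits 22..21 */

int main(void)
{
	/* pack MCS 11, 2 spatial streams, GI index 1 into one word */
	uint32_t r = FIELD_PREP(RATE_MCS, 11) |
		     FIELD_PREP(RATE_NSS, 2) |
		     FIELD_PREP(RATE_GI, 1);

	printf("mcs=%u nss=%u gi=%u\n",
	       FIELD_GET(RATE_MCS, r),
	       FIELD_GET(RATE_NSS, r),
	       FIELD_GET(RATE_GI, r));
	return 0;
}

Because the highest field now ends at bit 22, any storage narrower than 32 bits would silently truncate the TYPE and EHT fields, hence the u16 to u32 change.
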
IEEE80211_TX_CTL_AMPDU : 0); + IEEE80211_TX_CTL_DONTFRAG; info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT | u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, IEEE80211_TX_CTRL_MLO_LINK); @@ -3783,6 +3780,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_tx_data tx; ieee80211_tx_result r; struct ieee80211_vif *vif = txq->vif; + int q = vif->hw_queue[txq->ac]; + bool q_stopped; WARN_ON_ONCE(softirq_count() == 0); @@ -3790,17 +3789,18 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, return NULL; begin: - spin_lock_bh(&fq->lock); - - if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags) || - test_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags)) - goto out; + spin_lock(&local->queue_stop_reason_lock); + q_stopped = local->queue_stop_reasons[q]; + spin_unlock(&local->queue_stop_reason_lock); - if (vif->txqs_stopped[txq->ac]) { - set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags); - goto out; + if (unlikely(q_stopped)) { + /* mark for waking later */ + set_bit(IEEE80211_TXQ_DIRTY, &txqi->flags); + return NULL; } + spin_lock_bh(&fq->lock); + /* Make sure fragments stay together. */ skb = __skb_dequeue(&txqi->frags); if (unlikely(skb)) { @@ -3810,6 +3810,9 @@ begin: IEEE80211_SKB_CB(skb)->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING; } else { + if (unlikely(test_bit(IEEE80211_TXQ_STOP, &txqi->flags))) + goto out; + skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); } @@ -3860,9 +3863,8 @@ begin: } if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags)) - info->flags |= IEEE80211_TX_CTL_AMPDU; - else - info->flags &= ~IEEE80211_TX_CTL_AMPDU; + info->flags |= (IEEE80211_TX_CTL_AMPDU | + IEEE80211_TX_CTL_DONTFRAG); if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { @@ -4432,7 +4434,7 @@ static void ieee80211_mlo_multicast_tx(struct net_device *dev, u32 ctrl_flags = IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX; if (hweight16(links) == 1) { - ctrl_flags |= u32_encode_bits(ffs(links) - 1, + ctrl_flags |= u32_encode_bits(__ffs(links), IEEE80211_TX_CTRL_MLO_LINK); __ieee80211_subif_start_xmit(skb, sdata->dev, 0, ctrl_flags, @@ -4596,8 +4598,6 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); - if (tid_tx) - info->flags |= IEEE80211_TX_CTL_AMPDU; info->hw_queue = sdata->vif.hw_queue[queue]; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 6f5407038459..1a28fe5cb614 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -292,22 +292,12 @@ static void wake_tx_push_queue(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_txq *queue) { - int q = sdata->vif.hw_queue[queue->ac]; struct ieee80211_tx_control control = { .sta = queue->sta, }; struct sk_buff *skb; - unsigned long flags; - bool q_stopped; while (1) { - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - q_stopped = local->queue_stop_reasons[q]; - spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - - if (q_stopped) - break; - skb = ieee80211_tx_dequeue(&local->hw, queue); if (!skb) break; @@ -347,8 +337,6 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) local_bh_disable(); spin_lock(&fq->lock); - sdata->vif.txqs_stopped[ac] = false; - if (!test_bit(SDATA_STATE_RUNNING, &sdata->state)) goto out; @@ -370,7 +358,7 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) if (ac != txq->ac) continue; - if 
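
The tx.c dequeue hunk above changes the stop handling: instead of per-vif txqs_stopped[] bookkeeping, ieee80211_tx_dequeue() looks at the per-hardware-queue stop-reason bitmap and, when the queue is stopped, only marks the TXQ dirty so the wake path knows to push it later. A toy userspace model of that contract follows; locking is omitted and the names (txq_dequeue, txq_wake, REASON_*) are illustrative, not the mac80211 API.

#include <stdbool.h>
#include <stdio.h>

enum { REASON_PS, REASON_FLUSH };

struct txq {
	unsigned long stop_reasons;	/* one bit per stop reason */
	bool dirty;			/* needs a push once unstopped */
	int pending;			/* queued frames */
};

static int txq_dequeue(struct txq *q)
{
	if (q->stop_reasons) {
		q->dirty = true;	/* mark for waking later */
		return -1;		/* hand nothing to the driver */
	}
	if (!q->pending)
		return -1;
	q->pending--;
	return 0;
}

static void txq_wake(struct txq *q, int reason)
{
	q->stop_reasons &= ~(1UL << reason);
	if (!q->stop_reasons && q->dirty) {
		q->dirty = false;
		while (txq_dequeue(q) == 0)	/* drain what accumulated */
			;
	}
}

int main(void)
{
	struct txq q = { .pending = 3 };

	q.stop_reasons |= 1UL << REASON_PS;
	txq_dequeue(&q);		/* stopped: only marks dirty */
	txq_wake(&q, REASON_PS);	/* drains the 3 pending frames */
	printf("pending=%d dirty=%d\n", q.pending, q.dirty);
	return 0;
}
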
(!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, + if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags)) continue; @@ -385,7 +373,7 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) txqi = to_txq_info(vif->txq); - if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags) || + if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) || (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) goto out; @@ -517,8 +505,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_sub_if_data *sdata; - int n_acs = IEEE80211_NUM_ACS; trace_stop_queue(local, queue, reason); @@ -530,29 +516,7 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, else local->q_stop_reasons[queue][reason]++; - if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue])) - return; - - if (local->hw.queues < IEEE80211_NUM_ACS) - n_acs = 1; - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - int ac; - - if (!sdata->dev) - continue; - - for (ac = 0; ac < n_acs; ac++) { - if (sdata->vif.hw_queue[ac] == queue || - sdata->vif.cab_queue == queue) { - spin_lock(&local->fq.lock); - sdata->vif.txqs_stopped[ac] = true; - spin_unlock(&local->fq.lock); - } - } - } - rcu_read_unlock(); + set_bit(reason, &local->queue_stop_reasons[queue]); } void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, @@ -868,19 +832,6 @@ static void __iterate_stations(struct ieee80211_local *local, } } -void ieee80211_iterate_stations(struct ieee80211_hw *hw, - void (*iterator)(void *data, - struct ieee80211_sta *sta), - void *data) -{ - struct ieee80211_local *local = hw_to_local(hw); - - mutex_lock(&local->sta_mtx); - __iterate_stations(local, iterator, data); - mutex_unlock(&local->sta_mtx); -} -EXPORT_SYMBOL_GPL(ieee80211_iterate_stations); - void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), @@ -4069,6 +4020,19 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, /* Fill cfg80211 rate info */ switch (status->encoding) { + case RX_ENC_EHT: + ri.flags |= RATE_INFO_FLAGS_EHT_MCS; + ri.mcs = status->rate_idx; + ri.nss = status->nss; + ri.eht_ru_alloc = status->eht.ru; + if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) + ri.flags |= RATE_INFO_FLAGS_SHORT_GI; + /* TODO/FIXME: is this right? 
handle other PPDUs */ + if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + mpdu_offset += 2; + ts += 36; + } + break; case RX_ENC_HE: ri.flags |= RATE_INFO_FLAGS_HE_MCS; ri.mcs = status->rate_idx; diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 803de5881485..c1250aa47808 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -625,7 +625,7 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, enum ieee80211_sta_rx_bandwidth new_bw; struct sta_opmode_info sta_opmode = {}; u32 changed = 0; - u8 nss; + u8 nss, cur_nss; /* ignore - no support for BF yet */ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) @@ -636,10 +636,25 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, nss += 1; if (link_sta->pub->rx_nss != nss) { - link_sta->pub->rx_nss = nss; - sta_opmode.rx_nss = nss; - changed |= IEEE80211_RC_NSS_CHANGED; - sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; + cur_nss = link_sta->pub->rx_nss; + /* Reset rx_nss and call ieee80211_sta_set_rx_nss() which + * will set the same to max nss value calculated based on capability. + */ + link_sta->pub->rx_nss = 0; + ieee80211_sta_set_rx_nss(link_sta); + /* Do not allow an nss change to rx_nss greater than max_nss + * negotiated and capped to APs capability during association. + */ + if (nss <= link_sta->pub->rx_nss) { + link_sta->pub->rx_nss = nss; + sta_opmode.rx_nss = nss; + changed |= IEEE80211_RC_NSS_CHANGED; + sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; + } else { + link_sta->pub->rx_nss = cur_nss; + pr_warn_ratelimited("Ignoring NSS change in VHT Operating Mode Notification from %pM with invalid nss %d", + link_sta->pub->addr, nss); + } } switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 9d59caeb74e0..c0e2da5072be 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -657,6 +657,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, sdata->dev = ndev; sdata->wpan_dev.wpan_phy = local->hw.phy; sdata->local = local; + INIT_LIST_HEAD(&sdata->wpan_dev.list); /* setup type-dependent data */ ret = ieee802154_setup_sdata(sdata, type); diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 2b0a80571097..da0628ee3c89 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -247,7 +247,6 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, ret = ieee802154_parse_frame_start(skb, &hdr); if (ret) { pr_debug("got invalid frame\n"); - kfree_skb(skb); return; } diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index fc9e728b6333..3150f3f0c872 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -544,9 +544,6 @@ static int mctp_sk_init(struct sock *sk) static void mctp_sk_close(struct sock *sk, long timeout) { - struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); - - del_timer_sync(&msk->key_expiry); sk_common_release(sk); } @@ -580,7 +577,19 @@ static void mctp_sk_unhash(struct sock *sk) spin_lock_irqsave(&key->lock, fl2); __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_CLOSED); } + sock_set_flag(sk, SOCK_DEAD); spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + /* Since there are no more tag allocations (we have removed all of the + * keys), stop any pending expiry events. 
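
The vht.c hunk above stops trusting the NSS carried in an Operating Mode Notification outright: the value is recomputed from the peer's capabilities first, and the notification is only applied when it does not exceed that maximum, otherwise the previous value is restored and a rate-limited warning is printed. A compact sketch of the same clamping rule, with cap_max_nss standing in for what ieee80211_sta_set_rx_nss() would recompute (names and types are illustrative):

#include <stdio.h>

struct peer {
	unsigned int rx_nss;		/* NSS currently in use */
	unsigned int cap_max_nss;	/* max NSS from association-time caps */
};

static int peer_apply_opmode_nss(struct peer *p, unsigned int nss)
{
	if (nss == 0 || nss > p->cap_max_nss) {
		/* out of range: keep the old value, just warn */
		fprintf(stderr, "ignoring invalid nss %u (max %u)\n",
			nss, p->cap_max_nss);
		return -1;
	}
	p->rx_nss = nss;
	return 0;
}

int main(void)
{
	struct peer p = { .rx_nss = 2, .cap_max_nss = 2 };

	peer_apply_opmode_nss(&p, 1);	/* accepted */
	peer_apply_opmode_nss(&p, 4);	/* rejected, rx_nss stays 1 */
	printf("rx_nss=%u\n", p.rx_nss);
	return 0;
}
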
the timer cannot be re-queued + * as the sk is no longer observable + */ + del_timer_sync(&msk->key_expiry); +} + +static void mctp_sk_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); } static struct proto mctp_proto = { @@ -619,6 +628,7 @@ static int mctp_pf_create(struct net *net, struct socket *sock, return -ENOMEM; sock_init_data(sock, sk); + sk->sk_destruct = mctp_sk_destruct; rc = 0; if (sk->sk_prot->init) diff --git a/net/mctp/device.c b/net/mctp/device.c index 99a3bda8852f..acb97b257428 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -429,12 +429,6 @@ static void mctp_unregister(struct net_device *dev) struct mctp_dev *mdev; mdev = mctp_dev_get_rtnl(dev); - if (mdev && !mctp_known(dev)) { - // Sanity check, should match what was set in mctp_register - netdev_warn(dev, "%s: BUG mctp_ptr set for unknown type %d", - __func__, dev->type); - return; - } if (!mdev) return; @@ -451,14 +445,8 @@ static int mctp_register(struct net_device *dev) struct mctp_dev *mdev; /* Already registered? */ - mdev = rtnl_dereference(dev->mctp_ptr); - - if (mdev) { - if (!mctp_known(dev)) - netdev_warn(dev, "%s: BUG mctp_ptr set for unknown type %d", - __func__, dev->type); + if (rtnl_dereference(dev->mctp_ptr)) return 0; - } /* only register specific types */ if (!mctp_known(dev)) diff --git a/net/mctp/route.c b/net/mctp/route.c index f9a80b82dc51..f51a05ec7162 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -147,6 +147,7 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, key->valid = true; spin_lock_init(&key->lock); refcount_set(&key->refs, 1); + sock_hold(key->sk); return key; } @@ -165,6 +166,7 @@ void mctp_key_unref(struct mctp_sk_key *key) mctp_dev_release_key(key->dev, key); spin_unlock_irqrestore(&key->lock, flags); + sock_put(key->sk); kfree(key); } @@ -177,6 +179,11 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) spin_lock_irqsave(&net->mctp.keys_lock, flags); + if (sock_flag(&msk->sk, SOCK_DEAD)) { + rc = -EINVAL; + goto out_unlock; + } + hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { if (mctp_key_match(tmp, key->local_addr, key->peer_addr, key->tag)) { @@ -198,6 +205,7 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) hlist_add_head(&key->sklist, &msk->keys); } +out_unlock: spin_unlock_irqrestore(&net->mctp.keys_lock, flags); return rc; @@ -315,8 +323,8 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) { + struct mctp_sk_key *key, *any_key = NULL; struct net *net = dev_net(skb->dev); - struct mctp_sk_key *key; struct mctp_sock *msk; struct mctp_hdr *mh; unsigned long f; @@ -361,13 +369,11 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * key for reassembly - we'll create a more specific * one for future packets if required (ie, !EOM). */ - key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); - if (key) { - msk = container_of(key->sk, + any_key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); + if (any_key) { + msk = container_of(any_key->sk, struct mctp_sock, sk); - spin_unlock_irqrestore(&key->lock, f); - mctp_key_unref(key); - key = NULL; + spin_unlock_irqrestore(&any_key->lock, f); } } @@ -419,14 +425,14 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * this function. 
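
Taken together, the mctp hunks above establish a simple lifetime rule: every key pins its socket (sock_hold() at allocation, sock_put() at the final key_unref()), and once the socket has been marked SOCK_DEAD in unhash, mctp_key_add() refuses new keys so the expiry timer cannot be re-armed. The following toy model shows only that ownership/ordering idea; the types, refcount handling and function names are illustrative, not the MCTP implementation.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct sk  { int refs; bool dead; };
struct key { struct sk *sk; int refs; };

static void sk_put(struct sk *sk)
{
	if (--sk->refs == 0) {
		printf("socket freed\n");
		free(sk);
	}
}

static struct key *key_alloc(struct sk *sk)
{
	struct key *k = malloc(sizeof(*k));

	if (!k)
		return NULL;
	k->sk = sk;
	k->refs = 1;
	sk->refs++;			/* key pins the socket (sock_hold) */
	return k;
}

static int key_add(struct key *k)
{
	return k->sk->dead ? -1 : 0;	/* unhash already ran: no new keys */
}

static void key_unref(struct key *k)
{
	if (--k->refs == 0) {
		sk_put(k->sk);		/* drop the pin (sock_put) */
		free(k);
	}
}

int main(void)
{
	struct sk *sk = malloc(sizeof(*sk));
	struct key *k;

	sk->refs = 1;
	sk->dead = false;
	k = key_alloc(sk);
	sk->dead = true;			/* socket is being unhashed */
	printf("key_add: %d\n", key_add(k));	/* -1: rejected */
	key_unref(k);
	sk_put(sk);				/* socket freed only now */
	return 0;
}
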
*/ rc = mctp_key_add(key, msk); - if (rc) { - kfree(key); - } else { + if (!rc) trace_mctp_key_acquire(key); - /* we don't need to release key->lock on exit */ - mctp_key_unref(key); - } + /* we don't need to release key->lock on exit, so + * clean up here and suppress the unlock via + * setting to NULL + */ + mctp_key_unref(key); key = NULL; } else { @@ -473,6 +479,8 @@ out_unlock: spin_unlock_irqrestore(&key->lock, f); mctp_key_unref(key); } + if (any_key) + mctp_key_unref(any_key); out: if (rc) kfree_skb(skb); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 35b5f806fdda..dc5165d3eec4 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1428,6 +1428,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, free: kfree(table); out: + mdev->sysctl = NULL; return -ENOBUFS; } @@ -1437,6 +1438,9 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev, struct net *net = dev_net(dev); struct ctl_table *table; + if (!mdev->sysctl) + return; + table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 5ded85e2c374..b30cea2fbf3f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1594,8 +1594,7 @@ mp_rst: TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); - MPTCP_INC_STATS(sock_net((const struct sock *)tp), - MPTCP_MIB_MPPRIOTX); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 45e2a48397b9..70f0ced3ca86 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -420,6 +420,31 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) } } +/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, + * otherwise allow any matching local/remote pair + */ +bool mptcp_pm_addr_families_match(const struct sock *sk, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *rem) +{ + bool mptcp_is_v4 = sk->sk_family == AF_INET; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); + bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); + + if (mptcp_is_v4) + return loc_is_v4 && rem_is_v4; + + if (ipv6_only_sock(sk)) + return !loc_is_v4 && !rem_is_v4; + + return loc_is_v4 == rem_is_v4; +#else + return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; +#endif +} + void mptcp_pm_data_reset(struct mptcp_sock *msk) { u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index eef69d0e44ec..56628b52d100 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -152,7 +152,6 @@ static struct mptcp_pm_addr_entry * select_local_address(const struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) { - const struct sock *sk = (const struct sock *)msk; struct mptcp_pm_addr_entry *entry, *ret = NULL; msk_owned_by_me(msk); @@ -165,16 +164,6 @@ select_local_address(const struct pm_nl_pernet *pernet, if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; - if (entry->addr.family != sk->sk_family) { -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - if ((entry->addr.family == AF_INET && - !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) || - (sk->sk_family == AF_INET && - !ipv6_addr_v4mapped(&entry->addr.addr6))) -#endif - continue; - } - ret = entry; break; } @@ -423,7 +412,9 @@ static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned /* Fill all the remote addresses into the 
array addrs[], * and return the array size. */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh, +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + bool fullmesh, struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); @@ -443,6 +434,9 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm if (deny_id0) return 0; + if (!mptcp_pm_addr_families_match(sk, local, &remote)) + return 0; + msk->pm.subflows++; addrs[i++] = remote; } else { @@ -453,6 +447,9 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm if (deny_id0 && !addrs[i].id) continue; + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) + continue; + if (!lookup_address_in_vec(addrs, i, &addrs[i]) && msk->pm.subflows < subflows_max) { msk->pm.subflows++; @@ -603,9 +600,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); msk->pm.local_addr_used++; - nr = fill_remote_addresses_vec(msk, fullmesh, addrs); - if (nr) - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs); + if (nr == 0) + continue; + spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); @@ -628,11 +627,11 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) * and return the array size. */ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, struct mptcp_addr_info *addrs) { struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info local; struct pm_nl_pernet *pernet; unsigned int subflows_max; int i = 0; @@ -645,15 +644,8 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; - if (entry->addr.family != sk->sk_family) { -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - if ((entry->addr.family == AF_INET && - !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) || - (sk->sk_family == AF_INET && - !ipv6_addr_v4mapped(&entry->addr.addr6))) -#endif - continue; - } + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; if (msk->pm.subflows < subflows_max) { msk->pm.subflows++; @@ -666,8 +658,18 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, * 'IPADDRANY' local address */ if (!i) { + struct mptcp_addr_info local; + memset(&local, 0, sizeof(local)); - local.family = msk->pm.remote.family; + local.family = +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + remote->family == AF_INET6 && + ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : +#endif + remote->family; + + if (!mptcp_pm_addr_families_match(sk, &local, remote)) + return 0; msk->pm.subflows++; addrs[i++] = local; @@ -706,7 +708,9 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) /* connect to the specified remote address, using whatever * local address the routing configuration will pick. 
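
The new mptcp_pm_addr_families_match() helper introduced in pm.c above is what both fill_remote_addresses_vec() and fill_local_addresses_vec() now rely on instead of the open-coded family checks. A standalone restatement of the rule, treating v4-mapped IPv6 addresses as IPv4 (address bytes elided, names simplified):

#include <stdbool.h>
#include <stdio.h>

enum family { FAM_INET, FAM_INET6 };

struct addr {
	enum family family;
	bool v4mapped;		/* ::ffff:a.b.c.d style IPv6 address */
};

/* An AF_INET socket needs both endpoints to be (effectively) IPv4, an
 * ipv6-only socket needs both to be real IPv6, and a dual-stack v6
 * socket only needs the two endpoints to agree with each other. */
static bool families_match(enum family sk_family, bool ipv6_only,
			   const struct addr *loc, const struct addr *rem)
{
	bool loc_v4 = loc->family == FAM_INET || loc->v4mapped;
	bool rem_v4 = rem->family == FAM_INET || rem->v4mapped;

	if (sk_family == FAM_INET)
		return loc_v4 && rem_v4;
	if (ipv6_only)
		return !loc_v4 && !rem_v4;
	return loc_v4 == rem_v4;
}

int main(void)
{
	struct addr v4     = { FAM_INET,  false };
	struct addr v6     = { FAM_INET6, false };
	struct addr mapped = { FAM_INET6, true  };

	printf("%d %d %d\n",
	       families_match(FAM_INET,  false, &v4, &v4),		/* 1 */
	       families_match(FAM_INET6, true,  &v6, &mapped),		/* 0 */
	       families_match(FAM_INET6, false, &mapped, &v4));	/* 1 */
	return 0;
}

Centralizing the check also covers the userspace PM path (mptcp_nl_cmd_sf_create) further down, so a mismatched local/remote pair is rejected before a subflow socket is created.
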
*/ - nr = fill_local_addresses_vec(msk, addrs); + nr = fill_local_addresses_vec(msk, &remote, addrs); + if (nr == 0) + return; msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || @@ -998,8 +1002,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, { int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; - struct mptcp_sock *msk; struct socket *ssock; + struct sock *newsk; int backlog = 1024; int err; @@ -1008,11 +1012,13 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; - msk = mptcp_sk(entry->lsk->sk); - if (!msk) + newsk = entry->lsk->sk; + if (!newsk) return -EINVAL; - ssock = __mptcp_nmpc_socket(msk); + lock_sock(newsk); + ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); + release_sock(newsk); if (!ssock) return -EINVAL; @@ -1143,7 +1149,7 @@ void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ss if (!tcp_rtx_and_write_queues_empty(ssk)) { subflow->stale = 1; __mptcp_retransmit_pending_data(sk); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE); + MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); } unlock_sock_fast(ssk, slow); @@ -1190,7 +1196,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) - return err; + return 0; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); @@ -1224,7 +1230,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], if (tb[MPTCP_PM_ADDR_ATTR_PORT]) addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); - return err; + return 0; } int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, @@ -1903,8 +1909,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) } if (token) - return mptcp_userspace_pm_set_flags(sock_net(skb->sk), - token, &addr, &remote, bkup); + return mptcp_userspace_pm_set_flags(net, token, &addr, &remote, bkup); spin_lock_bh(&pernet->lock); entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); @@ -2094,7 +2099,7 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) return; nla_put_failure: - kfree_skb(skb); + nlmsg_free(skb); } void mptcp_event_addr_announced(const struct sock *ssk, @@ -2151,7 +2156,7 @@ void mptcp_event_addr_announced(const struct sock *ssk, return; nla_put_failure: - kfree_skb(skb); + nlmsg_free(skb); } void mptcp_event_pm_listener(const struct sock *ssk, @@ -2203,7 +2208,7 @@ void mptcp_event_pm_listener(const struct sock *ssk, return; nla_put_failure: - kfree_skb(skb); + nlmsg_free(skb); } void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, @@ -2261,7 +2266,7 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, return; nla_put_failure: - kfree_skb(skb); + nlmsg_free(skb); } static const struct genl_small_ops mptcp_pm_ops[] = { diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 5cb65f0928f4..a02d3cbf2a1b 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -59,8 +59,8 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, */ e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); if (!e) { - spin_unlock_bh(&msk->pm.lock); - return -ENOMEM; + ret = -ENOMEM; + goto append_err; } *e = *entry; @@ -74,6 +74,7 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, ret = entry->addr.id; } +append_err: spin_unlock_bh(&msk->pm.lock); return ret; } @@ -156,6 +157,7 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) if 
(addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + err = -EINVAL; goto announce_err; } @@ -282,6 +284,7 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) if (addr_l.id == 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); + err = -EINVAL; goto create_err; } @@ -292,6 +295,13 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) } sk = (struct sock *)msk; + + if (!mptcp_pm_addr_families_match(sk, &addr_l, &addr_r)) { + GENL_SET_ERR_MSG(info, "families mismatch"); + err = -EINVAL; + goto create_err; + } + lock_sock(sk); err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); @@ -395,11 +405,13 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) if (addr_l.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); + err = -EINVAL; goto destroy_err; } if (!addr_l.port || !addr_r.port) { GENL_SET_ERR_MSG(info, "missing local or remote port"); + err = -EINVAL; goto destroy_err; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f6f93957275b..3ad9c46202fc 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -98,7 +98,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) struct socket *ssock; int err; - err = mptcp_subflow_create_socket(sk, &ssock); + err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock); if (err) return err; @@ -923,9 +923,8 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - sock_owned_by_me(sk); + msk_owned_by_me(msk); mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->data_avail)) @@ -1408,7 +1407,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) u64 linger_time; long tout = 0; - sock_owned_by_me(sk); + msk_owned_by_me(msk); if (__mptcp_check_fallback(msk)) { if (!msk->first) @@ -1662,6 +1661,8 @@ static void mptcp_set_nospace(struct sock *sk) set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); } +static int mptcp_disconnect(struct sock *sk, int flags); + static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, size_t len, int *copied_syn) { @@ -1672,9 +1673,9 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->connect_flags = O_NONBLOCK; - msk->is_sendmsg = 1; + msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); - msk->is_sendmsg = 0; + msk->fastopening = 0; msg->msg_flags = saved_flags; release_sock(ssk); @@ -1688,6 +1689,8 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh */ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; + } else if (ret && ret != -EINPROGRESS) { + mptcp_disconnect(sk, 0); } return ret; @@ -1886,7 +1889,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) u32 time, advmss = 1; u64 rtt_us, mstamp; - sock_owned_by_me(sk); + msk_owned_by_me(msk); if (copied <= 0) return; @@ -2213,7 +2216,7 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; - sock_owned_by_me((const struct sock *)msk); + msk_owned_by_me(msk); if (__mptcp_check_fallback(msk)) return NULL; @@ -2353,7 +2356,7 @@ static void __mptcp_close_ssk(struct 
sock *sk, struct sock *ssk, /* otherwise tcp will dispose of the ssk and subflow ctx */ if (ssk->sk_state == TCP_LISTEN) { tcp_set_state(ssk, TCP_CLOSE); - mptcp_subflow_queue_clean(ssk); + mptcp_subflow_queue_clean(sk, ssk); inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); } @@ -2720,8 +2723,8 @@ static int mptcp_init_sock(struct sock *sk) mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); - sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); - sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); + sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); + sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); return 0; } @@ -2872,7 +2875,6 @@ static void __mptcp_destroy_sock(struct sock *sk) sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); - sk_refcnt_debug_release(sk); sock_put(sk); } @@ -2888,15 +2890,23 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) return EPOLLIN | EPOLLRDNORM; } +static void mptcp_listen_inuse_dec(struct sock *sk) +{ + if (inet_sk_state_load(sk) == TCP_LISTEN) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +} + bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; + int subflows_alive = 0; sk->sk_shutdown = SHUTDOWN_MASK; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { + mptcp_listen_inuse_dec(sk); inet_sk_state_store(sk, TCP_CLOSE); goto cleanup; } @@ -2918,6 +2928,8 @@ cleanup: struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + subflows_alive += ssk->sk_state != TCP_CLOSE; + /* since the close timeout takes precedence on the fail one, * cancel the latter */ @@ -2933,6 +2945,12 @@ cleanup: } sock_orphan(sk); + /* all the subflows are closed, only timeout can change the msk + * state, let's not keep resources busy for no reasons + */ + if (subflows_alive == 0) + inet_sk_state_store(sk, TCP_CLOSE); + sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); if (msk->token) @@ -2989,6 +3007,15 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); + /* We are on the fastopen error path. We can't call straight into the + * subflows cleanup code due to lock nesting (we are already under + * msk->firstsocket lock). Do nothing and leave the cleanup to the + * caller. + */ + if (msk->fastopening) + return 0; + + mptcp_listen_inuse_dec(sk); inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); @@ -3532,7 +3559,7 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. 
*/ - if (msk->is_sendmsg) + if (msk->fastopening) err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); else err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); @@ -3627,12 +3654,13 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct sock *sk = sock->sk; struct socket *ssock; int err; pr_debug("msk=%p", msk); - lock_sock(sock->sk); + lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); if (!ssock) { err = -EINVAL; @@ -3640,18 +3668,20 @@ static int mptcp_listen(struct socket *sock, int backlog) } mptcp_token_destroy(msk); - inet_sk_state_store(sock->sk, TCP_LISTEN); - sock_set_flag(sock->sk, SOCK_RCU_FREE); + inet_sk_state_store(sk, TCP_LISTEN); + sock_set_flag(sk, SOCK_RCU_FREE); err = ssock->ops->listen(ssock, backlog); - inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); - if (!err) - mptcp_copy_inaddrs(sock->sk, ssock->sk); + inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + if (!err) { + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + mptcp_copy_inaddrs(sk, ssock->sk); + } mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); unlock: - release_sock(sock->sk); + release_sock(sk); return err; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 955fb3d88eb3..61fd8eabfca2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -295,7 +295,7 @@ struct mptcp_sock { u8 recvmsg_inq:1, cork:1, nodelay:1, - is_sendmsg:1; + fastopening:1; int connect_flags; struct work_struct work; struct sk_buff *ooo_last_skb; @@ -628,7 +628,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); -void mptcp_subflow_queue_clean(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); @@ -641,7 +641,8 @@ bool mptcp_addresses_equal(const struct mptcp_addr_info *a, /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); -int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); +int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, + struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); @@ -754,7 +755,7 @@ static inline void mptcp_token_init_request(struct request_sock *req) int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(struct request_sock *req); -int mptcp_token_new_connect(struct sock *sk); +int mptcp_token_new_connect(struct sock *ssk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); @@ -776,6 +777,9 @@ int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry); +bool mptcp_pm_addr_families_match(const struct sock *sk, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock 
*ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a47423ebb33a..8a9656248b0f 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -18,7 +18,7 @@ static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { - sock_owned_by_me((const struct sock *)msk); + msk_owned_by_me(msk); if (likely(!__mptcp_check_fallback(msk))) return NULL; @@ -740,7 +740,7 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, } release_sock(sk); - return err; + return 0; } static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, @@ -760,14 +760,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { + struct sock *sk = (struct sock *)msk; struct socket *sock; + int ret = -EINVAL; /* Limit to first subflow, before the connection establishment */ + lock_sock(sk); sock = __mptcp_nmpc_socket(msk); if (!sock) - return -EINVAL; + goto unlock; + + ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); - return tcp_setsockopt(sock->sk, level, optname, optval, optlen); +unlock: + release_sock(sk); + return ret; } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, @@ -1255,6 +1262,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; ssk->sk_incoming_cpu = sk->sk_incoming_cpu; + ssk->sk_ipv6only = sk->sk_ipv6only; __ip_sock_set_tos(ssk, inet_sk(sk)->tos); if (sk->sk_userlocks & tx_rx_locks) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 29904303f5c2..4ae1a7304cf0 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -26,6 +26,7 @@ #include "mib.h" #include <trace/events/mptcp.h> +#include <trace/events/sock.h> static void mptcp_subflow_ops_undo_override(struct sock *ssk); @@ -45,7 +46,6 @@ static void subflow_req_destructor(struct request_sock *req) sock_put((struct sock *)subflow_req->msk); mptcp_token_destroy_request(req); - tcp_request_sock_ops.destructor(req); } static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, @@ -590,7 +590,7 @@ static int subflow_v6_rebuild_header(struct sock *sk) } #endif -struct request_sock_ops mptcp_subflow_request_sock_ops; +static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -603,7 +603,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; - return tcp_conn_request(&mptcp_subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: @@ -611,7 +611,14 @@ drop: return 0; } +static void subflow_v4_req_destructor(struct request_sock *req) +{ + subflow_req_destructor(req); + tcp_request_sock_ops.destructor(req); +} + #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops 
subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; @@ -634,15 +641,36 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) return 0; } - return tcp_conn_request(&mptcp_subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } + +static void subflow_v6_req_destructor(struct request_sock *req) +{ + subflow_req_destructor(req); + tcp6_request_sock_ops.destructor(req); +} +#endif + +struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + if (ops->family == AF_INET) + ops = &mptcp_subflow_v4_request_sock_ops; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ops->family == AF_INET6) + ops = &mptcp_subflow_v6_request_sock_ops; #endif + return inet_reqsk_alloc(ops, sk_listener, attach_listener); +} +EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); + /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_options_received *mp_opt) @@ -1372,6 +1400,7 @@ void __mptcp_error_report(struct sock *sk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int err = sock_error(ssk); + int ssk_state; if (!err) continue; @@ -1382,7 +1411,14 @@ void __mptcp_error_report(struct sock *sk) if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) continue; - inet_sk_state_store(sk, inet_sk_state_load(ssk)); + /* We need to propagate only transition to CLOSE state. + * Orphaned socket will see such state change via + * subflow_sched_work_if_closed() and that path will properly + * destroy the msk as needed. 
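
The subflow.c changes above split the single shared request_sock_ops into per-family v4/v6 copies so each can carry its own slab name and destructor, and mptcp_subflow_reqsk_alloc() substitutes the right table based on ops->family. The dispatch shape reduced to plain C (struct members, AF_* values and names here are illustrative stand-ins):

#include <stdio.h>

#define AF_INET  2
#define AF_INET6 10

struct req_ops {
	int family;
	const char *slab_name;
	void (*destructor)(void);
};

static void v4_destructor(void) { puts("v4 destructor"); }
static void v6_destructor(void) { puts("v6 destructor"); }

static struct req_ops subflow_v4_ops = { AF_INET,  "request_sock_subflow_v4", v4_destructor };
static struct req_ops subflow_v6_ops = { AF_INET6, "request_sock_subflow_v6", v6_destructor };

/* Callers pass the generic TCP ops; the MPTCP layer swaps in the
 * per-family variant so the request is freed through the matching
 * destructor and slab. */
static const struct req_ops *pick_subflow_ops(const struct req_ops *ops)
{
	if (ops->family == AF_INET)
		return &subflow_v4_ops;
	if (ops->family == AF_INET6)
		return &subflow_v6_ops;
	return ops;
}

int main(void)
{
	struct req_ops tcp6_ops = { AF_INET6, "tcp6", NULL };
	const struct req_ops *ops = pick_subflow_ops(&tcp6_ops);

	printf("%s\n", ops->slab_name);	/* request_sock_subflow_v6 */
	ops->destructor();
	return 0;
}

This is also why the common subflow_req_destructor() no longer chains into tcp_request_sock_ops.destructor itself: the per-family wrappers (subflow_v4/v6_req_destructor) call the correct TCP destructor for their family.
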
+ */ + ssk_state = inet_sk_state_load(ssk); + if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) + inet_sk_state_store(sk, ssk_state); sk->sk_err = -err; /* This barrier is coupled with smp_rmb() in mptcp_poll() */ @@ -1411,6 +1447,8 @@ static void subflow_data_ready(struct sock *sk) struct sock *parent = subflow->conn; struct mptcp_sock *msk; + trace_sk_data_ready(sk); + msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { /* MPJ subflow are removed from accept queue before reaching here, @@ -1520,7 +1558,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (!mptcp_is_fully_established(sk)) goto err_out; - err = mptcp_subflow_create_socket(sk, &sf); + err = mptcp_subflow_create_socket(sk, loc->family, &sf); if (err) goto err_out; @@ -1633,7 +1671,9 @@ static void mptcp_subflow_ops_undo_override(struct sock *ssk) #endif ssk->sk_prot = &tcp_prot; } -int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) + +int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, + struct socket **new_sock) { struct mptcp_subflow_context *subflow; struct net *net = sock_net(sk); @@ -1646,12 +1686,11 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) if (unlikely(!sk->sk_socket)) return -EINVAL; - err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, - &sf); + err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) return err; - lock_sock(sf->sk); + lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); @@ -1764,7 +1803,7 @@ static void subflow_state_change(struct sock *sk) } } -void mptcp_subflow_queue_clean(struct sock *listener_ssk) +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct mptcp_sock *msk, *next, *head = NULL; @@ -1813,8 +1852,23 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk) do_cancel_work = __mptcp_close(sk, 0); release_sock(sk); - if (do_cancel_work) + if (do_cancel_work) { + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. 
+ */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, + SINGLE_DEPTH_NESTING, 0, _RET_IP_); + } sock_put(sk); } @@ -1963,7 +2017,6 @@ static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { static int subflow_ops_init(struct request_sock_ops *subflow_ops) { subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); - subflow_ops->slab_name = "request_sock_subflow"; subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, subflow_ops->obj_size, 0, @@ -1973,16 +2026,17 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) if (!subflow_ops->slab) return -ENOMEM; - subflow_ops->destructor = subflow_req_destructor; - return 0; } void __init mptcp_subflow_init(void) { - mptcp_subflow_request_sock_ops = tcp_request_sock_ops; - if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0) - panic("MPTCP: failed to init subflow request sock ops\n"); + mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; + mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; + mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; + + if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow v4 request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; @@ -1998,6 +2052,20 @@ void __init mptcp_subflow_init(void) tcp_prot_override.release_cb = tcp_release_cb_override; #if IS_ENABLED(CONFIG_MPTCP_IPV6) + /* In struct mptcp_subflow_request_sock, we assume the TCP request sock + * structures for v4 and v6 have the same size. It should not changed in + * the future but better to make sure to be warned if it is no longer + * the case. + */ + BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); + + mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; + mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; + mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; + + if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow v6 request sock ops\n"); + subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 65430f314a68..5bb924534387 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -134,7 +134,7 @@ int mptcp_token_new_request(struct request_sock *req) /** * mptcp_token_new_connect - create new key/idsn/token for subflow - * @sk: the socket that will initiate a connection + * @ssk: the socket that will initiate a connection * * This function is called when a new outgoing mptcp connection is * initiated. @@ -148,11 +148,12 @@ int mptcp_token_new_request(struct request_sock *req) * * returns 0 on success. 
*/ -int mptcp_token_new_connect(struct sock *sk) +int mptcp_token_new_connect(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); int retries = MPTCP_TOKEN_MAX_RETRIES; + struct sock *sk = subflow->conn; struct token_bucket *bucket; again: @@ -169,12 +170,13 @@ again: } pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); + ssk, subflow->local_key, subflow->token, subflow->idsn); WRITE_ONCE(msk->token, subflow->token); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); bucket->chain_len++; spin_unlock_bh(&bucket->lock); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } @@ -190,8 +192,10 @@ void mptcp_token_accept(struct mptcp_subflow_request_sock *req, struct mptcp_sock *msk) { struct mptcp_subflow_request_sock *pos; + struct sock *sk = (struct sock *)msk; struct token_bucket *bucket; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); bucket = token_bucket(req->token); spin_lock_bh(&bucket->lock); @@ -370,12 +374,14 @@ void mptcp_token_destroy_request(struct request_sock *req) */ void mptcp_token_destroy(struct mptcp_sock *msk) { + struct sock *sk = (struct sock *)msk; struct token_bucket *bucket; struct mptcp_sock *pos; if (sk_unhashed((struct sock *)msk)) return; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); bucket = token_bucket(msk->token); spin_lock_bh(&bucket->lock); pos = __token_lookup_msk(bucket, msk->token); diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index 5d984bec1cd8..0758865ab658 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -57,6 +57,9 @@ static struct mptcp_sock *build_msk(struct kunit *test) KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); refcount_set(&((struct sock *)msk)->sk_refcnt, 1); sock_net_set((struct sock *)msk, &init_net); + + /* be sure the token helpers can dereference sk->sk_prot */ + ((struct sock *)msk)->sk_prot = &tcp_prot; return msk; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0846bd75b1da..4d6737160857 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -189,6 +189,9 @@ config NF_CONNTRACK_LABELS to connection tracking entries. It can be used with xtables connlabel match and the nftables ct expression. 
+config NF_CONNTRACK_OVS + bool + config NF_CT_PROTO_DCCP bool 'DCCP protocol connection tracking support' depends on NETFILTER_ADVANCED @@ -459,6 +462,9 @@ config NF_NAT_REDIRECT config NF_NAT_MASQUERADE bool +config NF_NAT_OVS + bool + config NETFILTER_SYNPROXY tristate diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1d4db1943936..5ffef1cd6143 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -11,6 +11,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o @@ -59,6 +60,7 @@ obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o obj-$(CONFIG_NF_NAT) += nf_nat.o nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o +nf_nat-$(CONFIG_NF_NAT_OVS) += nf_nat_ovs.o ifeq ($(CONFIG_NF_NAT),m) nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o @@ -97,6 +99,12 @@ nf_tables-objs += nft_set_pipapo_avx2.o endif endif +ifdef CONFIG_NFT_CT +ifdef CONFIG_RETPOLINE +nf_tables-objs += nft_ct_fast.o +endif +endif + obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5a6705a0e4ec..b2fdbbed2b4b 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -702,6 +702,22 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); +void nf_ct_set_closing(struct nf_conntrack *nfct) +{ + const struct nf_ct_hook *ct_hook; + + if (!nfct) + return; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ct_hook->set_closing(nfct); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_set_closing); + bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 3c273483df23..b1ea054bb82c 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -30,7 +30,7 @@ config IP_SET_BITMAP_IP depends on IP_SET help This option adds the bitmap:ip set type support, by which one - can store IPv4 addresses (or network addresse) from a range. + can store IPv4 addresses (or network addresses) from a range. To compile it as a module, choose M here. If unsure, say N. 
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index a8ce04a4bb72..e4fa00abde6a 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -308,8 +308,8 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask); - hosts = 2 << (32 - netmask - 1); - elements = 2 << (netmask - mask_bits - 1); + hosts = 2U << (32 - netmask - 1); + elements = 2UL << (netmask - mask_bits - 1); } if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e7ba5b6dd2b7..46ebee9400da 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1698,9 +1698,10 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); retried = true; - } while (ret == -EAGAIN && - set->variant->resize && - (ret = set->variant->resize(set, retried)) == 0); + } while (ret == -ERANGE || + (ret == -EAGAIN && + set->variant->resize && + (ret = set->variant->resize(set, retried)) == 0)); if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) return 0; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7499192af586..7c2399541771 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -159,6 +159,17 @@ htable_size(u8 hbits) (SET_WITH_TIMEOUT(set) && \ ip_set_timeout_expired(ext_timeout(d, set))) +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) +static const union nf_inet_addr onesmask = { + .all[0] = 0xffffffff, + .all[1] = 0xffffffff, + .all[2] = 0xffffffff, + .all[3] = 0xffffffff +}; + +static const union nf_inet_addr zeromask = {}; +#endif + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -283,8 +294,9 @@ struct htype { u32 markmask; /* markmask value for mark mask to store */ #endif u8 bucketsize; /* max elements in an array block */ -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) u8 netmask; /* netmask value for subnets to store */ + union nf_inet_addr bitmask; /* stores bitmask */ #endif struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ @@ -459,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) /* Resizing changes htable_bits, so we ignore it */ return x->maxelem == y->maxelem && a->timeout == b->timeout && -#ifdef IP_SET_HASH_WITH_NETMASK - x->netmask == y->netmask && +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + nf_inet_addr_cmp(&x->bitmask, &y->bitmask) && #endif #ifdef IP_SET_HASH_WITH_MARKMASK x->markmask == y->markmask && @@ -1264,9 +1276,21 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_BITMASK + /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask + * and not bitmask. These two are mutually exclusive. 
*/ + if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) { + if (set->family == NFPROTO_IPV4) { + if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip)) + goto nla_put_failure; + } else if (set->family == NFPROTO_IPV6) { + if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6)) + goto nla_put_failure; + } + } +#endif #ifdef IP_SET_HASH_WITH_NETMASK - if (h->netmask != HOST_MASK && - nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif #ifdef IP_SET_HASH_WITH_MARKMASK @@ -1429,8 +1453,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u32 markmask; #endif u8 hbits; -#ifdef IP_SET_HASH_WITH_NETMASK - u8 netmask; +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + int ret __attribute__((unused)) = 0; + u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + union nf_inet_addr bitmask = onesmask; #endif size_t hsize; struct htype *h; @@ -1468,7 +1494,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_NETMASK - netmask = set->family == NFPROTO_IPV4 ? 32 : 128; if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); @@ -1476,6 +1501,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, (set->family == NFPROTO_IPV6 && netmask > 128) || netmask == 0) return -IPSET_ERR_INVALID_NETMASK; + + /* we convert netmask to bitmask and store it */ + if (set->family == NFPROTO_IPV4) + bitmask.ip = ip_set_netmask(netmask); + else + ip6_netmask(&bitmask, netmask); + } +#endif + +#ifdef IP_SET_HASH_WITH_BITMASK + if (tb[IPSET_ATTR_BITMASK]) { + /* bitmask and netmask do the same thing, allow only one of these options */ + if (tb[IPSET_ATTR_NETMASK]) + return -IPSET_ERR_BITMASK_NETMASK_EXCL; + + if (set->family == NFPROTO_IPV4) { + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip); + if (ret || !bitmask.ip) + return -IPSET_ERR_INVALID_NETMASK; + } else if (set->family == NFPROTO_IPV6) { + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask); + if (ret || ipv6_addr_any(&bitmask.in6)) + return -IPSET_ERR_INVALID_NETMASK; + } + + if (nf_inet_addr_cmp(&bitmask, &zeromask)) + return -IPSET_ERR_INVALID_NETMASK; } #endif @@ -1518,7 +1570,8 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, for (i = 0; i < ahash_numof_locks(hbits); i++) spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + h->bitmask = bitmask; h->netmask = netmask; #endif #ifdef IP_SET_HASH_WITH_MARKMASK diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 75d556d71652..c9f4e3859663 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -24,7 +24,8 @@ /* 2 Comments support */ /* 3 Forceadd support */ /* 4 skbinfo support */ -#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */ +/* 5 bucketsize, initval support */ +#define IPSET_TYPE_REV_MAX 6 /* bitmask support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -34,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ #define HTYPE hash_ip #define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -86,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, __be32 ip; ip4addrptr(skb, 
opt->flags & IPSET_DIM_ONE_SRC, &ip); - ip &= ip_set_netmask(h->netmask); + ip &= h->bitmask.ip; if (ip == 0) return -EINVAL; @@ -98,11 +100,11 @@ static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ip4 *h = set->data; + struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, hosts; + u32 ip = 0, ip_to = 0, hosts, i = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) @@ -119,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - ip &= ip_set_hostmask(h->netmask); + ip &= ntohl(h->bitmask.ip); e.ip = htonl(ip); if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; @@ -147,14 +149,14 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); - /* 64bit division is not allowed on 32bit */ - if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to;) { + for (; ip <= ip_to; i++) { e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; @@ -185,12 +187,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static void -hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip6_netmask(ip, prefix); -} - static bool hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { @@ -227,7 +223,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; @@ -266,7 +262,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; @@ -293,6 +289,7 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 153de3457423..a22ec1a6f6ec 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -97,11 +97,11 @@ static int hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipmark4 *h = set->data; + struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0; + u32 ip, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -148,13 +148,14 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to; ip++, i++) { e.ip = 
htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ipmark4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7303138e46be..e977b5a9c48d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -26,7 +26,8 @@ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ +/* 6 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -35,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -92,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -105,11 +112,11 @@ static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipport4 *h = set->data; + struct hash_ipport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -129,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -173,17 +184,18 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) @@ -253,12 +265,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -298,6 +315,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -356,6 +377,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 334fb1ad0e86..39a01934b153 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -108,11 +108,11 @@ static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportip4 *h = set->data; + struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -180,17 +180,18 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipportip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7df94f437f60..5c6de605a9fb 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -160,12 +160,12 @@ static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportnet4 *h = set->data; + struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -253,9 +253,6 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); @@ -282,9 +279,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; + if (i > IPSET_MAX_RANGE) { + hash_ipportnet4_data_next(&h->next, + &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 1422739d9aa2..ce0a9ce5a91f 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -136,11 +136,11 @@ static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_net4 *h = set->data; + struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, ipn, n = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -188,19 +188,16 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); - n++; - } while (ipn++ < ip_to); - - if (n > IPSET_MAX_RANGE) - return -ERANGE; if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_net4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 9810f5bf63f5..031073286236 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -202,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - 
u32 ip = 0, ip_to = 0, ipn, n = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -256,19 +256,16 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); - n++; - } while (ipn++ < ip_to); - - if (n > IPSET_MAX_RANGE) - return -ERANGE; if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_netiface4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3d09eefe998a..8fbe649c9dd3 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ /* 2 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ +/* 3 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -33,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net"); /* Type specific function prefix */ #define HTYPE hash_netnet #define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK #define IPSET_NET_COUNT 2 /* IPv4 variants */ @@ -153,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); - e.ip[0] &= ip_set_netmask(e.cidr[0]); - e.ip[1] &= ip_set_netmask(e.cidr[1]); + e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip); + e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -163,13 +166,12 @@ static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netnet4 *h = set->data; + struct hash_netnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn; - u64 n = 0, m = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -213,8 +215,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { - e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); - e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1])); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 
0 : ret; @@ -245,19 +247,6 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); - n++; - } while (ipn++ < ip_to); - ipn = ip2_from; - do { - ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); - m++; - } while (ipn++ < ip2_to); - - if (n*m > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); @@ -270,7 +259,12 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip[0] = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); do { + i++; e.ip[1] = htonl(ip2); + if (i > IPSET_MAX_RANGE) { + hash_netnet4_data_next(&h->next, &e); + return -ERANGE; + } ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) @@ -404,6 +398,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -414,6 +413,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + const struct hash_netnet6 *h = set->data; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -453,6 +453,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -IPSET_ERR_HASH_ELEM; + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -484,6 +489,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 09cf72eb37f8..d1a0628df4ef 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -154,12 +154,11 @@ static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netport4 *h = set->data; + struct hash_netport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn; - u64 n = 0; + u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -236,14 +235,6 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr); - n++; - } while (ipn++ < ip_to); - - if (n*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip); @@ -255,8 +246,12 @@ hash_netport4_uadt(struct ip_set *set, struct 
nlattr *tb[], e.ip = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_netport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 19bcdb3141f6..005a7ce87217 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -173,17 +173,26 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } +static u32 +hash_netportnet4_range_to_cidr(u32 from, u32 to, u8 *cidr) +{ + if (from == 0 && to == UINT_MAX) { + *cidr = 0; + return to; + } + return ip_set_range_to_cidr(from, to, cidr); +} + static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netportnet4 *h = set->data; + struct hash_netportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2, ipn; - u64 n = 0, m = 0; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; int ret; @@ -285,19 +294,6 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); - n++; - } while (ipn++ < ip_to); - ipn = ip2_from; - do { - ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); - m++; - } while (ipn++ < ip2_to); - - if (n*m*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); @@ -310,13 +306,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], do { e.ip[0] = htonl(ip); - ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); + ip = hash_netportnet4_range_to_cidr(ip, ip_to, &e.cidr[0]); for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip[1] = htonl(ip2); - ip2 = ip_set_range_to_cidr(ip2, ip2_to, - &e.cidr[1]); + if (i > IPSET_MAX_RANGE) { + hash_netportnet4_data_next(&h->next, + &e); + return -ERANGE; + } + ip2 = hash_netportnet4_range_to_cidr(ip2, + ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 5a67f7966574..e162636525cf 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -427,7 +427,7 @@ list_set_destroy(struct ip_set *set) struct set_elem *e, *n; if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); + timer_shutdown_sync(&map->gc); list_for_each_entry_safe(e, n, &map->members, list) { list_del(&e->list); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 51ad557a525b..2fcc26507d69 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -132,21 +132,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); 
u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -168,21 +168,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -200,17 +200,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -2448,6 +2448,10 @@ static void __exit ip_vs_cleanup(void) ip_vs_conn_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); + /* common rcu_barrier() used by: + * - ip_vs_control_cleanup() + */ + rcu_barrier(); pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4d62059a6021..2a5ed71c82c3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -49,8 +49,7 @@ MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. 
*/ -static DEFINE_MUTEX(__ip_vs_mutex); +DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ /* sysctl variables */ @@ -241,6 +240,47 @@ static void defense_work_handler(struct work_struct *work) } #endif +static void est_reload_work_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, est_reload_work.work); + int genid_done = atomic_read(&ipvs->est_genid_done); + unsigned long delay = HZ / 10; /* repeat startups after failure */ + bool repeat = false; + int genid; + int id; + + mutex_lock(&ipvs->est_mutex); + genid = atomic_read(&ipvs->est_genid); + for (id = 0; id < ipvs->est_kt_count; id++) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; + + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + goto unlock; + if (!kd) + continue; + /* New config ? Stop kthread tasks */ + if (genid != genid_done) + ip_vs_est_kthread_stop(kd); + if (!kd->task && !ip_vs_est_stopped(ipvs)) { + /* Do not start kthreads above 0 in calc phase */ + if ((!id || !ipvs->est_calc_phase) && + ip_vs_est_kthread_start(ipvs, kd) < 0) + repeat = true; + } + } + + atomic_set(&ipvs->est_genid_done, genid); + + if (repeat) + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, + delay); + +unlock: + mutex_unlock(&ipvs->est_mutex); +} + int ip_vs_use_count_inc(void) { @@ -471,7 +511,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) static void ip_vs_service_free(struct ip_vs_service *svc) { - free_percpu(svc->stats.cpustats); + ip_vs_stats_release(&svc->stats); kfree(svc); } @@ -483,17 +523,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head) ip_vs_service_free(svc); } -static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); - if (do_delay) - call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); - else - ip_vs_service_free(svc); + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } @@ -780,14 +817,22 @@ out: return dest; } +static void ip_vs_dest_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest *dest; + + dest = container_of(head, struct ip_vs_dest, rcu_head); + ip_vs_stats_release(&dest->stats); + ip_vs_dest_put_and_free(dest); +} + static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_dst_cache_reset(dest); - __ip_vs_svc_put(svc, false); - free_percpu(dest->stats.cpustats); - ip_vs_dest_put_and_free(dest); + __ip_vs_svc_put(svc); + call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* @@ -811,12 +856,22 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) } } +static void ip_vs_stats_rcu_free(struct rcu_head *head) +{ + struct ip_vs_stats_rcu *rs = container_of(head, + struct ip_vs_stats_rcu, + rcu_head); + + ip_vs_stats_release(&rs->s); + kfree(rs); +} + static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c - spin_lock_bh(&src->lock); + spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); @@ -826,7 +881,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) ip_vs_read_estimator(dst, src); - spin_unlock_bh(&src->lock); + spin_unlock(&src->lock); } static void @@ -847,7 +902,7 @@ ip_vs_export_stats_user(struct 
ip_vs_stats_user *dst, struct ip_vs_kstats *src) static void ip_vs_zero_stats(struct ip_vs_stats *stats) { - spin_lock_bh(&stats->lock); + spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ @@ -861,7 +916,48 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) ip_vs_zero_estimator(stats); - spin_unlock_bh(&stats->lock); + spin_unlock(&stats->lock); +} + +/* Allocate fields after kzalloc */ +int ip_vs_stats_init_alloc(struct ip_vs_stats *s) +{ + int i; + + spin_lock_init(&s->lock); + s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!s->cpustats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); + + u64_stats_init(&cs->syncp); + } + return 0; +} + +struct ip_vs_stats *ip_vs_stats_alloc(void) +{ + struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); + + if (s && ip_vs_stats_init_alloc(s) >= 0) + return s; + kfree(s); + return NULL; +} + +void ip_vs_stats_release(struct ip_vs_stats *stats) +{ + free_percpu(stats->cpustats); +} + +void ip_vs_stats_free(struct ip_vs_stats *stats) +{ + if (stats) { + ip_vs_stats_release(stats); + kfree(stats); + } } /* @@ -923,7 +1019,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); - __ip_vs_svc_put(old_svc, true); + __ip_vs_svc_put(old_svc); } } @@ -942,7 +1038,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock_bh(&dest->dst_lock); if (add) { - ip_vs_start_estimator(svc->ipvs, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); @@ -963,14 +1058,13 @@ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - unsigned int atype, i; + unsigned int atype; + int ret; EnterFunction(2); #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { - int ret; - atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -992,15 +1086,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (dest == NULL) return -ENOMEM; - dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!dest->stats.cpustats) + ret = ip_vs_stats_init_alloc(&dest->stats); + if (ret < 0) goto err_alloc; - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_dest_stats; - ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); - u64_stats_init(&ip_vs_dest_stats->syncp); - } + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err_stats; dest->af = udest->af; dest->protocol = svc->protocol; @@ -1017,15 +1109,17 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); LeaveFunction(2); return 0; +err_stats: + ip_vs_stats_release(&dest->stats); + err_alloc: kfree(dest); - return -ENOMEM; + return ret; } @@ -1087,14 +1181,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err; __ip_vs_update_dest(svc, dest, udest, 1); - ret = 0; } else { /* * Allocate and initialize the dest structure */ ret = ip_vs_new_dest(svc, udest); } + +err: LeaveFunction(2); return ret; @@ 
-1284,7 +1382,7 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0, i; + int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; @@ -1344,18 +1442,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ret = -ENOMEM; goto out_err; } - svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!svc->stats.cpustats) { - ret = -ENOMEM; + ret = ip_vs_stats_init_alloc(&svc->stats); + if (ret < 0) goto out_err; - } - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_stats; - ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); - u64_stats_init(&ip_vs_stats->syncp); - } - /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); @@ -1372,7 +1461,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ if (sched) { @@ -1382,6 +1470,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, sched = NULL; } + ret = ip_vs_start_estimator(ipvs, &svc->stats); + if (ret < 0) + goto out_err; + /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; @@ -1394,8 +1486,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (svc->pe && svc->pe->conn_out) atomic_inc(&ipvs->conn_out_counter); - ip_vs_start_estimator(ipvs, &svc->stats); - /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; @@ -1406,8 +1496,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_svc_hash(svc); *svc_p = svc; - /* Now there is a service - full throttle */ - ipvs->enable = 1; + + if (!ipvs->enable) { + /* Now there is a service - full throttle */ + ipvs->enable = 1; + + /* Start estimation for first time */ + ip_vs_est_reload_start(ipvs); + } + return 0; @@ -1571,7 +1668,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* * Free the service if nobody refers to it */ - __ip_vs_svc_put(svc, true); + __ip_vs_svc_put(svc); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1761,7 +1858,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } } - ip_vs_zero_stats(&ipvs->tot_stats); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } @@ -1843,6 +1940,148 @@ proc_do_sync_ports(struct ctl_table *table, int write, return rc; } +static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) +{ + struct netns_ipvs *ipvs = table->extra2; + cpumask_var_t *valp = table->data; + cpumask_var_t newmask; + int ret; + + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpulist_parse(buffer, newmask); + if (ret) + goto out; + + mutex_lock(&ipvs->est_mutex); + + if (!ipvs->est_cpulist_valid) { + if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { + ret = -ENOMEM; + goto unlock; + } + ipvs->est_cpulist_valid = 1; + } + cpumask_and(newmask, newmask, &current->cpus_mask); + cpumask_copy(*valp, newmask); + /* est_max_threads may depend on cpulist size */ + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + ipvs->est_calc_phase = 1; + ip_vs_est_reload_start(ipvs); + +unlock: + mutex_unlock(&ipvs->est_mutex); + +out: + free_cpumask_var(newmask); + return ret; +} + +static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, + size_t size) +{ + struct netns_ipvs
*ipvs = table->extra2; + cpumask_var_t *valp = table->data; + struct cpumask *mask; + int ret; + + mutex_lock(&ipvs->est_mutex); + + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + + mutex_unlock(&ipvs->est_mutex); + + return ret; +} + +static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + /* Ignore both read and write(append) if *ppos not 0 */ + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + if (write) { + /* proc_sys_call_handler() appends terminator */ + ret = ipvs_proc_est_cpumask_set(table, buffer); + if (ret >= 0) + *ppos += *lenp; + } else { + /* proc_sys_call_handler() allocates 1 byte for terminator */ + ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); + if (ret >= 0) { + *lenp = ret; + *ppos += *lenp; + ret = 0; + } + } + return ret; +} + +static int ipvs_proc_est_nice(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < MIN_NICE || val > MAX_NICE) { + ret = -EINVAL; + } else { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + } + return ret; +} + +static int ipvs_proc_run_estimation(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2017,7 +2256,19 @@ static struct ctl_table vs_vars[] = { .procname = "run_estimation", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = ipvs_proc_run_estimation, + }, + { + .procname = "est_cpulist", + .maxlen = NR_CPUS, /* unused */ + .mode = 0644, + .proc_handler = ipvs_proc_est_cpulist, + }, + { + .procname = "est_nice", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_est_nice, }, #ifdef CONFIG_IP_VS_DEBUG { @@ -2255,7 +2506,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); - ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, @@ -2279,7 +2530,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct 
ip_vs_kstats kstats; int i; @@ -2297,11 +2548,11 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) do { start = u64_stats_fetch_begin(&u->syncp); - conns = u->cnt.conns; - inpkts = u->cnt.inpkts; - outpkts = u->cnt.outpkts; - inbytes = u->cnt.inbytes; - outbytes = u->cnt.outbytes; + conns = u64_stats_read(&u->cnt.conns); + inpkts = u64_stats_read(&u->cnt.inpkts); + outpkts = u64_stats_read(&u->cnt.outpkts); + inbytes = u64_stats_read(&u->cnt.inbytes); + outbytes = u64_stats_read(&u->cnt.outbytes); } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", @@ -2590,6 +2841,11 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) break; case IP_VS_SO_SET_DELDEST: ret = ip_vs_del_dest(svc, &udest); + break; + default: + WARN_ON_ONCE(1); + ret = -EINVAL; + break; } out_unlock: @@ -4027,13 +4283,17 @@ static void ip_vs_genl_unregister(void) static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; - int idx; struct ctl_table *tbl; + int idx, ret; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); + ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); @@ -4094,31 +4354,44 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; + tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; + + ipvs->est_cpulist_valid = 0; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_cpulist; + + ipvs->sysctl_est_nice = IPVS_EST_NICE; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_nice; + #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) tbl[idx++].mode = 0444; #endif + ret = -ENOMEM; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); - if (ipvs->sysctl_hdr == NULL) { - if (!net_eq(net, &init_net)) - kfree(tbl); - return -ENOMEM; - } - ip_vs_start_estimator(ipvs, &ipvs->tot_stats); + if (!ipvs->sysctl_hdr) + goto err; ipvs->sysctl_tbl = tbl; + + ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); + if (ret < 0) + goto err; + /* Schedule defense work */ - INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); - /* Init delayed work for expiring no dest conn */ - INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, - expire_nodest_conn_handler); - return 0; + +err: + unregister_net_sysctl_table(ipvs->sysctl_hdr); + if (!net_eq(net, &init_net)) + kfree(tbl); + return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) @@ -4129,7 +4402,10 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + + if (ipvs->est_cpulist_valid) + free_cpumask_var(ipvs->sysctl_est_cpulist); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); @@ -4151,7 +4427,8 
@@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { - int i, idx; + int ret = -ENOMEM; + int idx; /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4164,18 +4441,14 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); - /* procfs stats */ - ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats.cpustats) - return -ENOMEM; - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ipvs_tot_stats; - ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); - u64_stats_init(&ipvs_tot_stats->syncp); - } + INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); - spin_lock_init(&ipvs->tot_stats.lock); + /* procfs stats */ + ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); + if (!ipvs->tot_stats) + goto out; + if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) + goto err_tot_stats; #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, @@ -4190,7 +4463,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) goto err_percpu; #endif - if (ip_vs_control_net_init_sysctl(ipvs)) + ret = ip_vs_control_net_init_sysctl(ipvs); + if (ret < 0) goto err; return 0; @@ -4207,20 +4481,26 @@ err_stats: err_vs: #endif - free_percpu(ipvs->tot_stats.cpustats); - return -ENOMEM; + ip_vs_stats_release(&ipvs->tot_stats->s); + +err_tot_stats: + kfree(ipvs->tot_stats); + +out: + return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); + cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); #endif - free_percpu(ipvs->tot_stats.cpustats); + call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) @@ -4280,5 +4560,6 @@ void ip_vs_control_cleanup(void) { EnterFunction(2); unregister_netdevice_notifier(&ip_vs_dst_notifier); + /* relying on common rcu_barrier() in ip_vs_cleanup() */ LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 9a1a7af6a186..c5970ba416ae 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -30,9 +30,6 @@ long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W @@ -47,68 +44,79 @@ to 32-bit values for conns, packets, bps, cps and pps. 
* A lot of code is taken from net/core/gen_estimator.c - */ - -/* - * Make a summary from each cpu + KEY POINTS: + - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled + - kthreads read the cpustats to update the estimators (svcs, dests, total) + - the states of estimators can be read (get stats) or modified (zero stats) + from processes + + KTHREADS: + - estimators are added initially to est_temp_list and later kthread 0 + distributes them to one or many kthreads for estimation + - kthread contexts are created and attached to array + - the kthread tasks are started when first service is added, before that + the total stats are not estimated + - when configuration (cpulist/nice) is changed, the tasks are restarted + by work (est_reload_work) + - kthread tasks are stopped while the cpulist is empty + - the kthread context holds lists with estimators (chains) which are + processed every 2 seconds + - as estimators can be added dynamically and in bursts, we try to spread + them to multiple chains which are estimated at different time + - on start, kthread 0 enters calculation phase to determine the chain limits + and the limit of estimators per kthread + - est_add_ktid: ktid where to add new ests, can point to empty slot where + we should add kt data */ -static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, - struct ip_vs_cpu_stats __percpu *stats) -{ - int i; - bool add = false; - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); - unsigned int start; - u64 conns, inpkts, outpkts, inbytes, outbytes; - if (add) { - do { - start = u64_stats_fetch_begin(&s->syncp); - conns = s->cnt.conns; - inpkts = s->cnt.inpkts; - outpkts = s->cnt.outpkts; - inbytes = s->cnt.inbytes; - outbytes = s->cnt.outbytes; - } while (u64_stats_fetch_retry(&s->syncp, start)); - sum->conns += conns; - sum->inpkts += inpkts; - sum->outpkts += outpkts; - sum->inbytes += inbytes; - sum->outbytes += outbytes; - } else { - add = true; - do { - start = u64_stats_fetch_begin(&s->syncp); - sum->conns = s->cnt.conns; - sum->inpkts = s->cnt.inpkts; - sum->outpkts = s->cnt.outpkts; - sum->inbytes = s->cnt.inbytes; - sum->outbytes = s->cnt.outbytes; - } while (u64_stats_fetch_retry(&s->syncp, start)); - } - } -} +static struct lock_class_key __ipvs_est_key; +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); -static void estimation_timer(struct timer_list *t) +static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; + struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; - struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); - if (!sysctl_run_estimation(ipvs)) - goto skip; + hlist_for_each_entry_rcu(e, chain, list) { + u64 conns, inpkts, outpkts, inbytes, outbytes; + u64 kconns = 0, kinpkts = 0, koutpkts = 0; + u64 kinbytes = 0, koutbytes = 0; + unsigned int start; + int i; + + if (kthread_should_stop()) + break; - spin_lock(&ipvs->est_lock); - list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); + for_each_possible_cpu(i) { + c = per_cpu_ptr(s->cpustats, i); + do { + start = u64_stats_fetch_begin(&c->syncp); + conns = u64_stats_read(&c->cnt.conns); + inpkts = u64_stats_read(&c->cnt.inpkts); + outpkts = u64_stats_read(&c->cnt.outpkts); + inbytes = u64_stats_read(&c->cnt.inbytes); + outbytes = u64_stats_read(&c->cnt.outbytes); + } while (u64_stats_fetch_retry(&c->syncp, start)); + kconns += conns; + kinpkts += inpkts; + 
koutpkts += outpkts; + kinbytes += inbytes; + koutbytes += outbytes; + } spin_lock(&s->lock); - ip_vs_read_cpu_stats(&s->kstats, s->cpustats); + + s->kstats.conns = kconns; + s->kstats.inpkts = kinpkts; + s->kstats.outpkts = koutpkts; + s->kstats.inbytes = kinbytes; + s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; @@ -133,30 +141,758 @@ static void estimation_timer(struct timer_list *t) e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } - spin_unlock(&ipvs->est_lock); +} -skip: - mod_timer(&ipvs->est_timer, jiffies + 2*HZ); +static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) +{ + struct ip_vs_est_tick_data *td; + int cid; + + rcu_read_lock(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + goto out; + for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { + if (kthread_should_stop()) + break; + ip_vs_chain_estimation(&td->chains[cid]); + cond_resched_rcu(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + break; + } + +out: + rcu_read_unlock(); } -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +static int ip_vs_estimation_kthread(void *data) { - struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_kt_data *kd = data; + struct netns_ipvs *ipvs = kd->ipvs; + int row = kd->est_row; + unsigned long now; + int id = kd->id; + long gap; + + if (id > 0) { + if (!ipvs->est_chain_max) + return 0; + } else { + if (!ipvs->est_chain_max) { + ipvs->est_calc_phase = 1; + /* commit est_calc_phase before reading est_genid */ + smp_mb(); + } + + /* kthread 0 will handle the calc phase */ + if (ipvs->est_calc_phase) + ip_vs_est_calc_phase(ipvs); + } + + while (1) { + if (!id && !hlist_empty(&ipvs->est_temp_list)) + ip_vs_est_drain_temp_list(ipvs); + set_current_state(TASK_IDLE); + if (kthread_should_stop()) + break; + + /* before estimation, check if we should sleep */ + now = jiffies; + gap = kd->est_timer - now; + if (gap > 0) { + if (gap > IPVS_EST_TICK) { + kd->est_timer = now - IPVS_EST_TICK; + gap = IPVS_EST_TICK; + } + schedule_timeout(gap); + } else { + __set_current_state(TASK_RUNNING); + if (gap < -8 * IPVS_EST_TICK) + kd->est_timer = now; + } + + if (kd->tick_len[row]) + ip_vs_tick_estimation(kd, row); + + row++; + if (row >= IPVS_EST_NTICKS) + row = 0; + WRITE_ONCE(kd->est_row, row); + kd->est_timer += IPVS_EST_TICK; + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* Schedule stop/start for kthread tasks */ +void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +{ + /* Ignore reloads before first service is added */ + if (!ipvs->enable) + return; + ip_vs_est_stopped_recalc(ipvs); + /* Bump the kthread configuration genid */ + atomic_inc(&ipvs->est_genid); + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); +} + +/* Start kthread task with current configuration */ +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + unsigned long now; + int ret = 0; + long gap; + + lockdep_assert_held(&ipvs->est_mutex); + + if (kd->task) + goto out; + now = jiffies; + gap = kd->est_timer - now; + /* Sync est_timer if task is starting later */ + if (abs(gap) > 4 * IPVS_EST_TICK) + kd->est_timer = now; + kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", + ipvs->gen, kd->id); + if (IS_ERR(kd->task)) { + ret = PTR_ERR(kd->task); + kd->task = NULL; + goto out; + } + + set_user_nice(kd->task, sysctl_est_nice(ipvs)); + set_cpus_allowed_ptr(kd->task, 
sysctl_est_cpulist(ipvs)); + + pr_info("starting estimator thread %d...\n", kd->id); + wake_up_process(kd->task); + +out: + return ret; +} + +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) +{ + if (kd->task) { + pr_info("stopping estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + kd->task = NULL; + } +} + +/* Apply parameters to kthread */ +static void ip_vs_est_set_params(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + kd->chain_max = ipvs->est_chain_max; + /* We are using single chain on RCU preemption */ + if (IPVS_EST_TICK_CHAINS == 1) + kd->chain_max *= IPVS_EST_CHAIN_FACTOR; + kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; + kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; +} + +/* Create and start estimation kthread in a free or new array slot */ +static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) +{ + struct ip_vs_est_kt_data *kd = NULL; + int id = ipvs->est_kt_count; + int ret = -ENOMEM; + void *arr = NULL; + int i; + + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + ipvs->enable && ipvs->est_max_threads) + return -EINVAL; + + mutex_lock(&ipvs->est_mutex); + + for (i = 0; i < id; i++) { + if (!ipvs->est_kt_arr[i]) + break; + } + if (i >= id) { + arr = krealloc_array(ipvs->est_kt_arr, id + 1, + sizeof(struct ip_vs_est_kt_data *), + GFP_KERNEL); + if (!arr) + goto out; + ipvs->est_kt_arr = arr; + } else { + id = i; + } - INIT_LIST_HEAD(&est->list); + kd = kzalloc(sizeof(*kd), GFP_KERNEL); + if (!kd) + goto out; + kd->ipvs = ipvs; + bitmap_fill(kd->avail, IPVS_EST_NTICKS); + kd->est_timer = jiffies; + kd->id = id; + ip_vs_est_set_params(ipvs, kd); + + /* Pre-allocate stats used in calc phase */ + if (!id && !kd->calc_stats) { + kd->calc_stats = ip_vs_stats_alloc(); + if (!kd->calc_stats) + goto out; + } + + /* Start kthread tasks only when services are present */ + if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { + ret = ip_vs_est_kthread_start(ipvs, kd); + if (ret < 0) + goto out; + } + + if (arr) + ipvs->est_kt_count++; + ipvs->est_kt_arr[id] = kd; + kd = NULL; + /* Use most recent kthread for new ests */ + ipvs->est_add_ktid = id; + ret = 0; + +out: + mutex_unlock(&ipvs->est_mutex); + if (kd) { + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } - spin_lock_bh(&ipvs->est_lock); - list_add(&est->list, &ipvs->est_list); - spin_unlock_bh(&ipvs->est_lock); + return ret; } +/* Select ktid where to add new ests: available, unused or new slot */ +static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) +{ + int ktid, best = ipvs->est_kt_count; + struct ip_vs_est_kt_data *kd; + + for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { + kd = ipvs->est_kt_arr[ktid]; + if (kd) { + if (kd->est_count < kd->est_max_count) { + best = ktid; + break; + } + } else if (ktid < best) { + best = ktid; + } + } + ipvs->est_add_ktid = best; +} + +/* Add estimator to current kthread (est_add_ktid) */ +static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, + struct ip_vs_estimator *est) +{ + struct ip_vs_est_kt_data *kd = NULL; + struct ip_vs_est_tick_data *td; + int ktid, row, crow, cid, ret; + int delay = est->ktrow; + + BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, + "Too many chains for ktcid"); + + if (ipvs->est_add_ktid < ipvs->est_kt_count) { + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + if (kd) + goto add_est; + } + + ret = ip_vs_est_add_kthread(ipvs); + if (ret < 0) + goto out; + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + +add_est: + ktid = kd->id; + /* For small number of estimators prefer to use few ticks, + * 
otherwise try to add into the last estimated row. + * est_row and add_row point after the row we should use + */ + if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) + crow = READ_ONCE(kd->est_row); + else + crow = kd->add_row; + crow += delay; + if (crow >= IPVS_EST_NTICKS) + crow -= IPVS_EST_NTICKS; + /* Assume initial delay ? */ + if (delay >= IPVS_EST_NTICKS - 1) { + /* Preserve initial delay or decrease it if no space in tick */ + row = crow; + if (crow < IPVS_EST_NTICKS - 1) { + crow++; + row = find_last_bit(kd->avail, crow); + } + if (row >= crow) + row = find_last_bit(kd->avail, IPVS_EST_NTICKS); + } else { + /* Preserve delay or increase it if no space in tick */ + row = IPVS_EST_NTICKS; + if (crow > 0) + row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow); + if (row >= IPVS_EST_NTICKS) + row = find_first_bit(kd->avail, IPVS_EST_NTICKS); + } + + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) { + td = kzalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + ret = -ENOMEM; + goto out; + } + rcu_assign_pointer(kd->ticks[row], td); + } + + cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS); + + kd->est_count++; + kd->tick_len[row]++; + if (!td->chain_len[cid]) + __set_bit(cid, td->present); + td->chain_len[cid]++; + est->ktid = ktid; + est->ktrow = row; + est->ktcid = cid; + hlist_add_head_rcu(&est->list, &td->chains[cid]); + + if (td->chain_len[cid] >= kd->chain_max) { + __set_bit(cid, td->full); + if (kd->tick_len[row] >= kd->tick_max) + __clear_bit(row, kd->avail); + } + + /* Update est_add_ktid to point to first available/empty kt slot */ + if (kd->est_count == kd->est_max_count) + ip_vs_est_update_ktid(ipvs); + + ret = 0; + +out: + return ret; +} + +/* Start estimation for stats */ +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + int ret; + + if (!ipvs->est_max_threads && ipvs->enable) + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + + est->ktid = -1; + est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ + + /* We prefer this code to be short, kthread 0 will requeue the + * estimator to available chain. If tasks are disabled, we + * will not allocate much memory, just for kt 0. + */ + ret = 0; + if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + ret = ip_vs_est_add_kthread(ipvs); + if (ret >= 0) + hlist_add_head(&est->list, &ipvs->est_temp_list); + else + INIT_HLIST_NODE(&est->list); + return ret; +} + +static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd) +{ + if (kd) { + if (kd->task) { + pr_info("stop unused estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + } + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } +} + +/* Unlink estimator from chain */ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + int ktid = est->ktid; + int row = est->ktrow; + int cid = est->ktcid; + + /* Failed to add to chain ? */ + if (hlist_unhashed(&est->list)) + return; + + /* On return, estimator can be freed, dequeue it now */ + + /* In est_temp_list ? 
*/ + if (ktid < 0) { + hlist_del(&est->list); + goto end_kt0; + } + + hlist_del_rcu(&est->list); + kd = ipvs->est_kt_arr[ktid]; + td = rcu_dereference_protected(kd->ticks[row], 1); + __clear_bit(cid, td->full); + td->chain_len[cid]--; + if (!td->chain_len[cid]) + __clear_bit(cid, td->present); + kd->tick_len[row]--; + __set_bit(row, kd->avail); + if (!kd->tick_len[row]) { + RCU_INIT_POINTER(kd->ticks[row], NULL); + kfree_rcu(td, rcu_head); + } + kd->est_count--; + if (kd->est_count) { + /* This kt slot can become available just now, prefer it */ + if (ktid < ipvs->est_add_ktid) + ipvs->est_add_ktid = ktid; + return; + } - spin_lock_bh(&ipvs->est_lock); - list_del(&est->list); - spin_unlock_bh(&ipvs->est_lock); + if (ktid > 0) { + mutex_lock(&ipvs->est_mutex); + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[ktid] = NULL; + if (ktid == ipvs->est_kt_count - 1) { + ipvs->est_kt_count--; + while (ipvs->est_kt_count > 1 && + !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) + ipvs->est_kt_count--; + } + mutex_unlock(&ipvs->est_mutex); + + /* This slot is now empty, prefer another available kt slot */ + if (ktid == ipvs->est_add_ktid) + ip_vs_est_update_ktid(ipvs); + } + +end_kt0: + /* kt 0 is freed after all other kthreads and chains are empty */ + if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { + kd = ipvs->est_kt_arr[0]; + if (!kd || !kd->est_count) { + mutex_lock(&ipvs->est_mutex); + if (kd) { + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[0] = NULL; + } + ipvs->est_kt_count--; + mutex_unlock(&ipvs->est_mutex); + ipvs->est_add_ktid = 0; + } + } +} + +/* Register all ests from est_temp_list to kthreads */ +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) +{ + struct ip_vs_estimator *est; + + while (1) { + int max = 16; + + mutex_lock(&__ip_vs_mutex); + + while (max-- > 0) { + est = hlist_entry_safe(ipvs->est_temp_list.first, + struct ip_vs_estimator, list); + if (est) { + if (kthread_should_stop()) + goto unlock; + hlist_del_init(&est->list); + if (ip_vs_enqueue_estimator(ipvs, est) >= 0) + continue; + est->ktid = -1; + hlist_add_head(&est->list, + &ipvs->est_temp_list); + /* Abort, some entries will not be estimated + * until next attempt + */ + } + goto unlock; + } + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + } + +unlock: + mutex_unlock(&__ip_vs_mutex); +} + +/* Calculate limits for all kthreads */ +static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) +{ + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + struct ip_vs_est_kt_data *kd; + struct hlist_head chain; + struct ip_vs_stats *s; + int cache_factor = 4; + int i, loops, ntest; + s32 min_est = 0; + ktime_t t1, t2; + int max = 8; + int ret = 1; + s64 diff; + u64 val; + + INIT_HLIST_HEAD(&chain); + mutex_lock(&__ip_vs_mutex); + kd = ipvs->est_kt_arr[0]; + mutex_unlock(&__ip_vs_mutex); + s = kd ? 
kd->calc_stats : NULL; + if (!s) + goto out; + hlist_add_head(&s->est.list, &chain); + + loops = 1; + /* Get best result from many tests */ + for (ntest = 0; ntest < 12; ntest++) { + if (!(ntest & 3)) { + /* Wait for cpufreq frequency transition */ + wait_event_idle_timeout(wq, kthread_should_stop(), + HZ / 50); + if (!ipvs->enable || kthread_should_stop()) + goto stop; + } + + local_bh_disable(); + rcu_read_lock(); + + /* Put stats in cache */ + ip_vs_chain_estimation(&chain); + + t1 = ktime_get(); + for (i = loops * cache_factor; i > 0; i--) + ip_vs_chain_estimation(&chain); + t2 = ktime_get(); + + rcu_read_unlock(); + local_bh_enable(); + + if (!ipvs->enable || kthread_should_stop()) + goto stop; + cond_resched(); + + diff = ktime_to_ns(ktime_sub(t2, t1)); + if (diff <= 1 * NSEC_PER_USEC) { + /* Do more loops on low time resolution */ + loops *= 2; + continue; + } + if (diff >= NSEC_PER_SEC) + continue; + val = diff; + do_div(val, loops); + if (!min_est || val < min_est) { + min_est = val; + /* goal: 95usec per chain */ + val = 95 * NSEC_PER_USEC; + if (val >= min_est) { + do_div(val, min_est); + max = (int)val; + } else { + max = 1; + } + } + } + +out: + if (s) + hlist_del_init(&s->est.list); + *chain_max = max; + return ret; + +stop: + ret = 0; + goto out; +} + +/* Calculate the parameters and apply them in context of kt #0 + * ECP: est_calc_phase + * ECM: est_chain_max + * ECP ECM Insert Chain enable Description + * --------------------------------------------------------------------------- + * 0 0 est_temp_list 0 create kt #0 context + * 0 0 est_temp_list 0->1 service added, start kthread #0 task + * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase + * 1 0 est_temp_list 1 kt #0: determine est_chain_max, + * stop tasks, move ests to est_temp_list + * and free kd for kthreads 1..last + * 1->0 0->N kt chains 1 ests can go to kthreads + * 0 N kt chains 1 drain est_temp_list, create new kthread + * contexts, start tasks, estimate + */ +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) +{ + int genid = atomic_read(&ipvs->est_genid); + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + struct ip_vs_estimator *est; + struct ip_vs_stats *stats; + int id, row, cid, delay; + bool last, last_td; + int chain_max; + int step; + + if (!ip_vs_est_calc_limits(ipvs, &chain_max)) + return; + + mutex_lock(&__ip_vs_mutex); + + /* Stop all other tasks, so that we can immediately move the + * estimators to est_temp_list without RCU grace period + */ + mutex_lock(&ipvs->est_mutex); + for (id = 1; id < ipvs->est_kt_count; id++) { + /* netns clean up started, abort */ + if (!ipvs->enable) + goto unlock2; + kd = ipvs->est_kt_arr[id]; + if (!kd) + continue; + ip_vs_est_kthread_stop(kd); + } + mutex_unlock(&ipvs->est_mutex); + + /* Move all estimators to est_temp_list but carefully, + * all estimators and kthread data can be released while + * we reschedule. Even for kthread 0. 
+ */ + step = 0; + + /* Order entries in est_temp_list in ascending delay, so now + * walk delay(desc), id(desc), cid(asc) + */ + delay = IPVS_EST_NTICKS; + +next_delay: + delay--; + if (delay < 0) + goto end_dequeue; + +last_kt: + /* Destroy contexts backwards */ + id = ipvs->est_kt_count; + +next_kt: + if (!ipvs->enable || kthread_should_stop()) + goto unlock; + id--; + if (id < 0) + goto next_delay; + kd = ipvs->est_kt_arr[id]; + if (!kd) + goto next_kt; + /* kt 0 can exist with empty chains */ + if (!id && kd->est_count <= 1) + goto next_delay; + + row = kd->est_row + delay; + if (row >= IPVS_EST_NTICKS) + row -= IPVS_EST_NTICKS; + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) + goto next_kt; + + cid = 0; + +walk_chain: + if (kthread_should_stop()) + goto unlock; + step++; + if (!(step & 63)) { + /* Give chance estimators to be added (to est_temp_list) + * and deleted (releasing kthread contexts) + */ + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + mutex_lock(&__ip_vs_mutex); + + /* Current kt released ? */ + if (id >= ipvs->est_kt_count) + goto last_kt; + if (kd != ipvs->est_kt_arr[id]) + goto next_kt; + /* Current td released ? */ + if (td != rcu_dereference_protected(kd->ticks[row], 1)) + goto next_kt; + /* No fatal changes on the current kd and td */ + } + est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, + list); + if (!est) { + cid++; + if (cid >= IPVS_EST_TICK_CHAINS) + goto next_kt; + goto walk_chain; + } + /* We can cheat and increase est_count to protect kt 0 context + * from release but we prefer to keep the last estimator + */ + last = kd->est_count <= 1; + /* Do not free kt #0 data */ + if (!id && last) + goto next_delay; + last_td = kd->tick_len[row] <= 1; + stats = container_of(est, struct ip_vs_stats, est); + ip_vs_stop_estimator(ipvs, stats); + /* Tasks are stopped, move without RCU grace period */ + est->ktid = -1; + est->ktrow = row - kd->est_row; + if (est->ktrow < 0) + est->ktrow += IPVS_EST_NTICKS; + hlist_add_head(&est->list, &ipvs->est_temp_list); + /* kd freed ? */ + if (last) + goto next_kt; + /* td freed ? */ + if (last_td) + goto next_kt; + goto walk_chain; + +end_dequeue: + /* All estimators removed while calculating ? 
*/ + if (!ipvs->est_kt_count) + goto unlock; + kd = ipvs->est_kt_arr[0]; + if (!kd) + goto unlock; + kd->add_row = kd->est_row; + ipvs->est_chain_max = chain_max; + ip_vs_est_set_params(ipvs, kd); + + pr_info("using max %d ests per chain, %d per kthread\n", + kd->chain_max, kd->est_max_count); + + /* Try to keep tot_stats in kt0, enqueue it early */ + if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && + ipvs->tot_stats->s.est.ktid == -1) { + hlist_del(&ipvs->tot_stats->s.est.list); + hlist_add_head(&ipvs->tot_stats->s.est.list, + &ipvs->est_temp_list); + } + + mutex_lock(&ipvs->est_mutex); + + /* We completed the calc phase, new calc phase not requested */ + if (genid == atomic_read(&ipvs->est_genid)) + ipvs->est_calc_phase = 0; + +unlock2: + mutex_unlock(&ipvs->est_mutex); + +unlock: + mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) @@ -191,14 +927,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { - INIT_LIST_HEAD(&ipvs->est_list); - spin_lock_init(&ipvs->est_lock); - timer_setup(&ipvs->est_timer, estimation_timer, 0); - mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + INIT_HLIST_HEAD(&ipvs->est_temp_list); + ipvs->est_kt_arr = NULL; + ipvs->est_max_threads = 0; + ipvs->est_calc_phase = 0; + ipvs->est_chain_max = 0; + ipvs->est_kt_count = 0; + ipvs->est_add_ktid = 0; + atomic_set(&ipvs->est_genid, 0); + atomic_set(&ipvs->est_genid_done, 0); + __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { - del_timer_sync(&ipvs->est_timer); + int i; + + for (i = 0; i < ipvs->est_kt_count; i++) + ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); + kfree(ipvs->est_kt_arr); + mutex_destroy(&ipvs->est_mutex); } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 7ac7473e3804..1b87214d385e 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -384,7 +384,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) struct ip_vs_lblc_table *tbl = svc->sched_data; /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); + timer_shutdown_sync(&tbl->periodic_timer); /* got to clean up table entries here */ ip_vs_lblc_flush(svc); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 77c323c36a88..ad8f5fea6d3a 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -547,7 +547,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) struct ip_vs_lblcr_table *tbl = svc->sched_data; /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); + timer_shutdown_sync(&tbl->periodic_timer); /* got to clean up table entries here */ ip_vs_lblcr_flush(svc); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index a56fd0b5a430..4963fec815da 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1617,7 +1617,7 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) EnterFunction(7); /* Receive a packet */ - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen); len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); if (len < 0) return len; diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c index f2579fc9c75b..3308e4cc740a 100644 --- 
a/net/netfilter/ipvs/ip_vs_twos.c +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -71,8 +71,8 @@ static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc, * from 0 to total_weight */ total_weight += 1; - rweight1 = prandom_u32_max(total_weight); - rweight2 = prandom_u32_max(total_weight); + rweight1 = get_random_u32_below(total_weight); + rweight2 = get_random_u32_below(total_weight); /* Pick two weighted servers */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 029171379884..80448885c3d7 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -994,7 +994,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) - *payload_len = ntohs(old_iph->tot_len); + *payload_len = skb_ip_totlen(skb); } /* Implement full-functionality option for ECN encapsulation */ diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 24002bc61e07..34913521c385 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -249,7 +249,7 @@ __diag_ignore_all("-Wmissing-prototypes", * @opts__sz - Length of the bpf_ct_opts structure * Must be NF_BPF_CT_OPTS_SZ (12) */ -struct nf_conn___init * +__bpf_kfunc struct nf_conn___init * bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) { @@ -283,7 +283,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts__sz - Length of the bpf_ct_opts structure * Must be NF_BPF_CT_OPTS_SZ (12) */ -struct nf_conn * +__bpf_kfunc struct nf_conn * bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) { @@ -316,7 +316,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts__sz - Length of the bpf_ct_opts structure * Must be NF_BPF_CT_OPTS_SZ (12) */ -struct nf_conn___init * +__bpf_kfunc struct nf_conn___init * bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) { @@ -351,7 +351,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts__sz - Length of the bpf_ct_opts structure * Must be NF_BPF_CT_OPTS_SZ (12) */ -struct nf_conn * +__bpf_kfunc struct nf_conn * bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) { @@ -376,7 +376,7 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, * @nfct - Pointer to referenced nf_conn___init object, obtained * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc. */ -struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) +__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) { struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; @@ -400,7 +400,7 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) * @nf_conn - Pointer to referenced nf_conn object, obtained using * bpf_xdp_ct_lookup or bpf_skb_ct_lookup. */ -void bpf_ct_release(struct nf_conn *nfct) +__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct) { if (!nfct) return; @@ -417,7 +417,7 @@ void bpf_ct_release(struct nf_conn *nfct) * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. * @timeout - Timeout in msecs. 
*/ -void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) +__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) { __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout)); } @@ -432,7 +432,7 @@ void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup. * @timeout - New timeout in msecs. */ -int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout) +__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout) { return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout)); } @@ -447,7 +447,7 @@ int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout) * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. * @status - New status value. */ -int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) +__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) { return nf_ct_change_status_common((struct nf_conn *)nfct, status); } @@ -462,7 +462,7 @@ int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup. * @status - New status value. */ -int bpf_ct_change_status(struct nf_conn *nfct, u32 status) +__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) { return nf_ct_change_status_common(nfct, status); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b96338b4bf36..70c4f892174e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -514,7 +514,6 @@ EXPORT_SYMBOL_GPL(nf_ct_get_id); static void clean_from_lists(struct nf_conn *ct) { - pr_debug("clean_from_lists(%p)\n", ct); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); @@ -582,7 +581,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - pr_debug("%s(%p)\n", __func__, ct); WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { @@ -603,7 +601,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) if (ct->master) nf_ct_put(ct->master); - pr_debug("%s: returning ct=%p to slab\n", __func__, ct); nf_conntrack_free(ct); } EXPORT_SYMBOL(nf_ct_destroy); @@ -786,8 +783,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - rcu_read_lock(); - h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { /* We have a candidate that matches the tuple we're interested @@ -799,7 +794,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, smp_acquire__after_ctrl_dep(); if (likely(nf_ct_key_equal(h, tuple, zone, net))) - goto found; + return h; /* TYPESAFE_BY_RCU recycled the candidate */ nf_ct_put(ct); @@ -807,8 +802,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, h = NULL; } -found: - rcu_read_unlock(); return h; } @@ -820,16 +813,21 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); struct nf_conntrack_tuple_hash *thash; + rcu_read_lock(); + thash = __nf_conntrack_find_get(net, zone, tuple, hash_conntrack_raw(tuple, zone_id, net)); if (thash) - return thash; + goto out_unlock; rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); if (rid != zone_id) - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, rid, net)); + thash = 
__nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + +out_unlock: + rcu_read_unlock(); return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -887,7 +885,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) zone = nf_ct_zone(ct); if (!nf_ct_ext_valid_pre(ct->ext)) { - NF_CT_STAT_INC(net, insert_failed); + NF_CT_STAT_INC_ATOMIC(net, insert_failed); return -ETIMEDOUT; } @@ -902,7 +900,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); - max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); + max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { @@ -934,7 +932,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) if (!nf_ct_ext_valid_post(ct->ext)) { nf_ct_kill(ct); - NF_CT_STAT_INC(net, drop); + NF_CT_STAT_INC_ATOMIC(net, drop); return -ETIMEDOUT; } @@ -1210,7 +1208,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } - pr_debug("Confirming conntrack %p\n", ct); /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from * user context, else we insert an already 'dead' hash, blocking @@ -1223,7 +1220,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } - max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); + max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ @@ -1271,7 +1268,7 @@ chaintoolong: */ if (!nf_ct_ext_valid_post(ct->ext)) { nf_ct_kill(ct); - NF_CT_STAT_INC(net, drop); + NF_CT_STAT_INC_ATOMIC(net, drop); return NF_DROP; } @@ -1374,9 +1371,6 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) - continue; - if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); continue; @@ -1446,11 +1440,14 @@ static bool gc_worker_skip_ct(const struct nf_conn *ct) static bool gc_worker_can_early_drop(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; + u8 protonum = nf_ct_protonum(ct); + if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP) + return false; if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; - l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(protonum); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; @@ -1507,7 +1504,8 @@ static void gc_worker(struct work_struct *work) if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { nf_ct_offload_timeout(tmp); - continue; + if (!nf_conntrack_max95) + continue; } if (expired_count > GC_SCAN_EXPIRED_MAX) { @@ -1721,10 +1719,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_zone tmp; struct nf_conntrack_net *cnet; - if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { - pr_debug("Can't invert tuple.\n"); + if (!nf_ct_invert_tuple(&repl_tuple, tuple)) return NULL; - } zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, @@ -1764,8 +1760,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, 
zone, tuple); if (exp) { - pr_debug("expectation arrives ct=%p exp=%p\n", - ct, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ @@ -1829,10 +1823,8 @@ resolve_normal_ct(struct nf_conn *tmpl, if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, - &tuple)) { - pr_debug("Can't get tuple\n"); + &tuple)) return 0; - } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); @@ -1864,17 +1856,15 @@ resolve_normal_ct(struct nf_conn *tmpl, if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { ctinfo = IP_CT_ESTABLISHED_REPLY; } else { + unsigned long status = READ_ONCE(ct->status); + /* Once we've had two way comms, always ESTABLISHED. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - pr_debug("normal packet for %p\n", ct); + if (likely(status & IPS_SEEN_REPLY)) ctinfo = IP_CT_ESTABLISHED; - } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - pr_debug("related packet for %p\n", ct); + else if (status & IPS_EXPECTED) ctinfo = IP_CT_RELATED; - } else { - pr_debug("new packet for %p\n", ct); + else ctinfo = IP_CT_NEW; - } } nf_ct_set(skb, ct, ctinfo); return 0; @@ -1988,7 +1978,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) /* rcu_read_lock()ed by nf_hook_thresh */ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { - pr_debug("not prepared to track yet or error occurred\n"); NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; @@ -2027,7 +2016,6 @@ repeat: if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ - pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_ct_put(ct); skb->_nfct = 0; /* Special case: TCP tracker reports an attempt to reopen a @@ -2066,7 +2054,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, /* Should be unconfirmed, so not in hash table yet */ WARN_ON(nf_ct_is_confirmed(ct)); - pr_debug("Altering reply tuple of %p to ", ct); nf_ct_dump_tuple(newreply); ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; @@ -2761,11 +2748,23 @@ err_cachep: return ret; } +static void nf_conntrack_set_closing(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = nf_ct_to_nf_conn(nfct); + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + nf_conntrack_tcp_set_closing(ct); + break; + } +} + static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, + .set_closing = nf_conntrack_set_closing, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 48ea6d0264b5..0c4db2f2ac43 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -242,104 +242,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); -/* 'skb' should already be pulled to nh_ofs. 
*/ -int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, u16 proto) -{ - const struct nf_conntrack_helper *helper; - const struct nf_conn_help *help; - unsigned int protoff; - int err; - - if (ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - if (helper->tuple.src.l3num != NFPROTO_UNSPEC && - helper->tuple.src.l3num != proto) - return NF_ACCEPT; - - switch (proto) { - case NFPROTO_IPV4: - protoff = ip_hdrlen(skb); - proto = ip_hdr(skb)->protocol; - break; - case NFPROTO_IPV6: { - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int ofs; - - ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - protoff = ofs; - proto = nexthdr; - break; - } - default: - WARN_ONCE(1, "helper invoked on non-IP family!"); - return NF_DROP; - } - - if (helper->tuple.dst.protonum != proto) - return NF_ACCEPT; - - err = helper->help(skb, protoff, ct, ctinfo); - if (err != NF_ACCEPT) - return err; - - /* Adjust seqs after helper. This is needed due to some helpers (e.g., - * FTP with NAT) adusting the TCP payload size when mangling IP - * addresses and/or port numbers in the text-based control connection. - */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) - return NF_DROP; - return NF_ACCEPT; -} -EXPORT_SYMBOL_GPL(nf_ct_helper); - -int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, - u8 proto, bool nat, struct nf_conntrack_helper **hp) -{ - struct nf_conntrack_helper *helper; - struct nf_conn_help *help; - int ret = 0; - - helper = nf_conntrack_helper_try_module_get(name, family, proto); - if (!helper) - return -EINVAL; - - help = nf_ct_helper_ext_add(ct, GFP_KERNEL); - if (!help) { - nf_conntrack_helper_put(helper); - return -ENOMEM; - } -#if IS_ENABLED(CONFIG_NF_NAT) - if (nat) { - ret = nf_nat_helper_try_module_get(name, family, proto); - if (ret) { - nf_conntrack_helper_put(helper); - return ret; - } - } -#endif - rcu_assign_pointer(help->helper, helper); - *hp = helper; - return ret; -} -EXPORT_SYMBOL_GPL(nf_ct_add_helper); - /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d71150a40fb0..308fc0023c7e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -328,8 +328,13 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_MARK -static int ctnetlink_dump_mark(struct sk_buff *skb, u32 mark) +static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) { + u32 mark = READ_ONCE(ct->mark); + + if (!mark) + return 0; + if (nla_put_be32(skb, CTA_MARK, htonl(mark))) goto nla_put_failure; return 0; @@ -543,7 +548,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb, static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_mark(skb, READ_ONCE(ct->mark)) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || @@ -722,7 +727,6 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) struct sk_buff *skb; unsigned int type; 
unsigned int flags = 0, group; - u32 mark; int err; if (events & (1 << IPCT_DESTROY)) { @@ -827,9 +831,8 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) } #ifdef CONFIG_NF_CONNTRACK_MARK - mark = READ_ONCE(ct->mark); - if ((events & (1 << IPCT_MARK) || mark) && - ctnetlink_dump_mark(skb, mark) < 0) + if (events & (1 << IPCT_MARK) && + ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; #endif nlmsg_end(skb, nlh); @@ -2671,7 +2674,6 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; - u32 mark; zone = nf_ct_zone(ct); @@ -2733,8 +2735,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK - mark = READ_ONCE(ct->mark); - if (mark && ctnetlink_dump_mark(skb, mark) < 0) + if (ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; #endif if (ctnetlink_dump_labels(skb, ct) < 0) @@ -3865,7 +3866,7 @@ static int __init ctnetlink_init(void) { int ret; - BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx)); + NL_ASSERT_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c new file mode 100644 index 000000000000..52b776bdf526 --- /dev/null +++ b/net/netfilter/nf_conntrack_ovs.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Support ct functions for openvswitch and used by OVS and TC conntrack. */ + +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/ipv6_frag.h> +#include <net/ip.h> + +/* 'skb' should already be pulled to nh_ofs. */ +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + unsigned int protoff; + int err; + + if (ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + if (helper->tuple.src.l3num != NFPROTO_UNSPEC && + helper->tuple.src.l3num != proto) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + proto = ip_hdr(skb)->protocol; + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + proto = nexthdr; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + if (helper->tuple.dst.protonum != proto) + return NF_ACCEPT; + + err = helper->help(skb, protoff, ct, ctinfo); + if (err != NF_ACCEPT) + return err; + + /* Adjust seqs after helper. This is needed due to some helpers (e.g., + * FTP with NAT) adusting the TCP payload size when mangling IP + * addresses and/or port numbers in the text-based control connection. 
+ */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) + return NF_DROP; + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_ct_helper); + +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + int ret = 0; + + helper = nf_conntrack_helper_try_module_get(name, family, proto); + if (!helper) + return -EINVAL; + + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (!help) { + nf_conntrack_helper_put(helper); + return -ENOMEM; + } +#if IS_ENABLED(CONFIG_NF_NAT) + if (nat) { + ret = nf_nat_helper_try_module_get(name, family, proto); + if (ret) { + nf_conntrack_helper_put(helper); + return ret; + } + } +#endif + rcu_assign_pointer(help->helper, helper); + *hp = helper; + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_add_helper); + +/* Trim the skb to the length specified by the IP/IPv6 header, + * removing any trailing lower-layer padding. This prepares the skb + * for higher-layer processing that assumes skb->len excludes padding + * (such as nf_ip_checksum). The caller needs to pull the skb to the + * network header, and ensure ip_hdr/ipv6_hdr points to valid data. + */ +int nf_ct_skb_network_trim(struct sk_buff *skb, int family) +{ + unsigned int len; + + switch (family) { + case NFPROTO_IPV4: + len = skb_ip_totlen(skb); + break; + case NFPROTO_IPV6: + len = sizeof(struct ipv6hdr) + + ntohs(ipv6_hdr(skb)->payload_len); + break; + default: + len = skb->len; + } + + return pskb_trim_rcsum(skb, len); +} +EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim); + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. + */ +int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, + u16 zone, u8 family, u8 *proto, u16 *mru) +{ + int err; + + if (family == NFPROTO_IPV4) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + local_bh_disable(); + err = ip_defrag(net, skb, user); + local_bh_enable(); + if (err) + return err; + + *mru = IPCB(skb)->frag_max_size; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + } else if (family == NFPROTO_IPV6) { + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + err = nf_ct_frag6_gather(net, skb, user); + if (err) { + if (err != -EINPROGRESS) + kfree_skb(skb); + return err; + } + + *proto = ipv6_hdr(skb)->nexthdr; + *mru = IP6CB(skb)->frag_max_size; +#endif + } else { + kfree_skb(skb); + return -EPFNOSUPPORT; + } + + skb_clear_hash(skb); + skb->ignore_df = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_handle_fragments); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 895b09cbd7cf..c928ff63b10e 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -121,17 +121,64 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) +static bool in_vrf_postrouting(const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (state->hook == NF_INET_POST_ROUTING && + netif_is_l3_master(state->out)) + return true; +#endif + return false; +} + +unsigned int nf_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) { const struct nf_conn_help 
*help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + bool seqadj_needed; + __be16 frag_off; + int start; + u8 pnum; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || in_vrf_postrouting(state)) + return NF_ACCEPT; help = nfct_help(ct); + + seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb); + if (!help && !seqadj_needed) + return nf_conntrack_confirm(skb); + + /* helper->help() do not expect ICMP packets */ + if (ctinfo == IP_CT_RELATED_REPLY) + return nf_conntrack_confirm(skb); + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; + case NFPROTO_IPV6: + pnum = ipv6_hdr(skb)->nexthdr; + start = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); + if (start < 0 || (frag_off & htons(~0x7)) != 0) + return nf_conntrack_confirm(skb); + + protoff = start; + break; + default: + return nf_conntrack_confirm(skb); + } + if (help) { const struct nf_conntrack_helper *helper; int ret; - /* rcu_read_lock()ed by nf_hook_thresh */ + /* rcu_read_lock()ed by nf_hook */ helper = rcu_dereference(help->helper); if (helper) { ret = helper->help(skb, @@ -142,12 +189,10 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } } - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } + if (seqadj_needed && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; } /* We've seen it coming out the other side: confirm it */ @@ -155,35 +200,6 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } EXPORT_SYMBOL_GPL(nf_confirm); -static bool in_vrf_postrouting(const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (state->hook == NF_INET_POST_ROUTING && - netif_is_l3_master(state->out)) - return true; -#endif - return false; -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - if (in_vrf_postrouting(state)) - return NF_ACCEPT; - - return nf_confirm(skb, - skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -230,13 +246,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = { .priority = NF_IP_PRI_CONNTRACK, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, @@ -268,16 +284,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) /* We only do TCP and SCTP at the moment: is there a better way? 
*/ if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + tuple.dst.protonum != IPPROTO_SCTP) return -ENOPROTOOPT; - } - if ((unsigned int)*len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); + if ((unsigned int)*len < sizeof(struct sockaddr_in)) return -EINVAL; - } h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { @@ -291,17 +302,12 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u3.ip; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); nf_ct_put(ct); if (copy_to_user(user, &sin, sizeof(sin)) != 0) return -EFAULT; else return 0; } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); return -ENOENT; } @@ -344,12 +350,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (!h) { - pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", - &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + if (!h) return -ENOENT; - } ct = nf_ct_tuplehash_to_ctrack(h); @@ -373,33 +375,6 @@ static struct nf_sockopt_ops so_getorigdst6 = { .owner = THIS_MODULE, }; -static unsigned int ipv6_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - if (in_vrf_postrouting(state)) - return NF_ACCEPT; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return nf_conntrack_confirm(skb); - } - - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -428,13 +403,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = { .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST - 1, diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 61e3b05cf02c..1020d67600a9 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -129,6 +129,56 @@ static void icmpv6_error_log(const struct sk_buff *skb, nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } +static noinline_for_stack int +nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) +{ + u8 hl = ipv6_hdr(skb)->hop_limit; + union nf_inet_addr outer_daddr; + union { + struct nd_opt_hdr nd_opt; + struct rd_msg rd_msg; + } tmp; + const struct nd_opt_hdr *nd_opt; + const struct rd_msg *rd_msg; + + rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); + if (!rd_msg) { + 
icmpv6_error_log(skb, state, "short redirect"); + return -NF_ACCEPT; + } + + if (rd_msg->icmph.icmp6_code != 0) + return NF_ACCEPT; + + if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); + return -NF_ACCEPT; + } + + dataoff += sizeof(*rd_msg); + + /* warning: rd_msg no longer usable after this call */ + nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); + if (!nd_opt || nd_opt->nd_opt_len == 0) { + icmpv6_error_log(skb, state, "redirect without options"); + return -NF_ACCEPT; + } + + /* We could call ndisc_parse_options(), but it would need + * skb_linearize() and a bit more work. + */ + if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) + return NF_ACCEPT; + + memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, + sizeof(outer_daddr.ip6)); + dataoff += 8; + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMPV6, &outer_daddr); +} + int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -159,6 +209,9 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, return NF_ACCEPT; } + if (icmp6h->icmp6_type == NDISC_REDIRECT) + return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); + /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 5a936334b517..91eacc9b0b98 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -27,22 +27,16 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. 
--RR - - And so for me for SCTP :D -Kiran */ - static const char *const sctp_conntrack_names[] = { - "NONE", - "CLOSED", - "COOKIE_WAIT", - "COOKIE_ECHOED", - "ESTABLISHED", - "SHUTDOWN_SENT", - "SHUTDOWN_RECD", - "SHUTDOWN_ACK_SENT", - "HEARTBEAT_SENT", - "HEARTBEAT_ACKED", + [SCTP_CONNTRACK_NONE] = "NONE", + [SCTP_CONNTRACK_CLOSED] = "CLOSED", + [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", + [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", + [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", + [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", + [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", + [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; #define SECS * HZ @@ -54,12 +48,11 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = 10 SECS, [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, - [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 @@ -73,7 +66,6 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT -#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED #define sIV SCTP_CONNTRACK_MAX /* @@ -90,15 +82,12 @@ COOKIE WAIT - We have seen an INIT chunk in the original direction, or als COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. -SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. -HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to - that of the HEARTBEAT chunk. Secondary connection is - established. */ /* TODO @@ -115,33 +104,33 @@ cookie echoed to closed. 
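/* Editor's note, not part of this patch: the sctp_conntracks table that
 * follows is indexed as [direction][chunk_index][current_state] and yields
 * the next conntrack state; dropping the HEARTBEAT_ACKED column is what
 * shrinks the per-row state dimension from 10 to 9 entries. A minimal,
 * self-contained sketch of the same lookup pattern, using hypothetical
 * names and only three states and two chunk types, is shown below.
 */
enum demo_state { DEMO_NONE, DEMO_CLOSED, DEMO_ESTABLISHED, DEMO_MAX };
enum demo_chunk { DEMO_INIT, DEMO_ABORT, DEMO_NCHUNKS };

static const unsigned char demo_transitions[2][DEMO_NCHUNKS][DEMO_MAX] = {
	{ /* ORIGINAL direction */
		[DEMO_INIT]  = { DEMO_CLOSED, DEMO_CLOSED, DEMO_ESTABLISHED },
		[DEMO_ABORT] = { DEMO_CLOSED, DEMO_CLOSED, DEMO_CLOSED },
	},
	{ /* REPLY direction: DEMO_MAX (like sIV) marks an invalid transition */
		[DEMO_INIT]  = { DEMO_MAX, DEMO_CLOSED, DEMO_ESTABLISHED },
		[DEMO_ABORT] = { DEMO_MAX, DEMO_CLOSED, DEMO_CLOSED },
	},
};

static inline unsigned char demo_new_state(int dir, enum demo_chunk chunk,
					    unsigned char cur_state)
{
	/* Mirrors sctp_new_state(): a plain table lookup, no branching */
	return demo_transitions[dir][chunk][cur_state];
}
/* End of editor's illustration; the patched table resumes below. */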
static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, -/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, -/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; @@ -153,6 +142,7 @@ static void sctp_print_conntrack(struct 
seq_file *s, struct nf_conn *ct) } #endif +/* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ @@ -163,7 +153,8 @@ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - unsigned long *map) + unsigned long *map, + const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; @@ -172,8 +163,6 @@ static int do_basic_checks(struct nf_conn *ct, flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); - if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) @@ -188,7 +177,9 @@ static int do_basic_checks(struct nf_conn *ct, sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { - pr_debug("Basic checks failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "%s failed. chunk num %d, type %d, len %d flag %d\n", + __func__, count, sch->type, sch->length, flag); return 1; } @@ -196,7 +187,6 @@ static int do_basic_checks(struct nf_conn *ct, set_bit(sch->type, map); } - pr_debug("Basic checks passed\n"); return count == 0; } @@ -206,64 +196,47 @@ static int sctp_new_state(enum ip_conntrack_dir dir, { int i; - pr_debug("Chunk type: %d\n", chunk_type); - switch (chunk_type) { case SCTP_CID_INIT: - pr_debug("SCTP_CID_INIT\n"); i = 0; break; case SCTP_CID_INIT_ACK: - pr_debug("SCTP_CID_INIT_ACK\n"); i = 1; break; case SCTP_CID_ABORT: - pr_debug("SCTP_CID_ABORT\n"); i = 2; break; case SCTP_CID_SHUTDOWN: - pr_debug("SCTP_CID_SHUTDOWN\n"); i = 3; break; case SCTP_CID_SHUTDOWN_ACK: - pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); i = 4; break; case SCTP_CID_ERROR: - pr_debug("SCTP_CID_ERROR\n"); i = 5; break; case SCTP_CID_COOKIE_ECHO: - pr_debug("SCTP_CID_COOKIE_ECHO\n"); i = 6; break; case SCTP_CID_COOKIE_ACK: - pr_debug("SCTP_CID_COOKIE_ACK\n"); i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: - pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; case SCTP_CID_HEARTBEAT: - pr_debug("SCTP_CID_HEARTBEAT"); i = 9; break; case SCTP_CID_HEARTBEAT_ACK: - pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ - pr_debug("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); + pr_debug("Unknown chunk type %d, Will stay in %s\n", + chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } - pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", - dir, sctp_conntrack_names[cur_state], chunk_type, - sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); - return sctp_conntracks[dir][i][cur_state]; } @@ -380,7 +353,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (sh == NULL) goto out; - if (do_basic_checks(ct, skb, dataoff, map) != 0) + if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { @@ -403,7 +376,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "verification tag check failed %x vs %x for dir %d", + sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } @@ -412,22 +387,29 @@ int 
nf_conntrack_sctp_packet(struct nf_conn *ct, for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { - /* Sec 8.5.1 (A) */ + /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { - /* Sec 8.5.1 (B) */ - if (sh->vtag != ct->proto.sctp.vtag[dir] && - sh->vtag != ct->proto.sctp.vtag[!dir]) + /* (B) vtag MUST match own vtag if T flag is unset OR + * MUST match peer's vtag if T flag is set + */ + if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[dir]) || + ((sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { - /* Sec 8.5.1 (C) */ - if (sh->vtag != ct->proto.sctp.vtag[dir] && - sh->vtag != ct->proto.sctp.vtag[!dir] && - sch->flags & SCTP_CHUNK_FLAG_T) + /* (C) vtag MUST match own vtag if T flag is unset OR + * MUST match peer's vtag if T flag is set + */ + if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[dir]) || + ((sch->flags & SCTP_CHUNK_FLAG_T) && + sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { - /* Sec 8.5.1 (D) */ + /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_HEARTBEAT) { @@ -471,9 +453,10 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { - pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " - "conntrack=%u\n", - dir, sch->type, old_state); + nf_ct_l4proto_log_invalid(skb, ct, state, + "Invalid, old_state %d, dir %d, type %d", + old_state, dir, sch->type); + goto out_unlock; } @@ -501,8 +484,12 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } ct->proto.sctp.state = new_state; - if (old_state != new_state) + if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + if (new_state == SCTP_CONNTRACK_ESTABLISHED && + !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } } spin_unlock_bh(&ct->lock); @@ -516,14 +503,6 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && - dir == IP_CT_DIR_REPLY && - new_state == SCTP_CONNTRACK_ESTABLISHED) { - pr_debug("Setting assured bit\n"); - set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_ASSURED, ct); - } - return NF_ACCEPT; out_unlock: diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 656631083177..4018acb1d674 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -911,6 +911,41 @@ static bool tcp_can_early_drop(const struct nf_conn *ct) return false; } +void nf_conntrack_tcp_set_closing(struct nf_conn *ct) +{ + enum tcp_conntrack old_state; + const unsigned int *timeouts; + u32 timeout; + + if (!nf_ct_is_confirmed(ct)) + return; + + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; + ct->proto.tcp.state = TCP_CONNTRACK_CLOSE; + + if (old_state == TCP_CONNTRACK_CLOSE || + test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_unlock_bh(&ct->lock); + return; + } + + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) { + const struct nf_tcp_net *tn; + + tn = nf_tcp_pernet(nf_ct_net(ct)); + timeouts = tn->timeouts; + } + + timeout = 
timeouts[TCP_CONNTRACK_CLOSE]; + WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); + + spin_unlock_bh(&ct->lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); +} + static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) { state->td_end = 0; @@ -930,7 +965,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); - struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; enum nf_ct_tcp_action res; @@ -954,7 +988,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; - tuple = &ct->tuplehash[dir].tuple; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: @@ -1068,6 +1101,13 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags |= IP_CT_EXP_CHALLENGE_ACK; } + + /* possible challenge ack reply to syn */ + if (old_state == TCP_CONNTRACK_SYN_SENT && + index == TCP_ACK_SET && + dir == IP_CT_DIR_REPLY) + ct->proto.tcp.last_ack = ntohl(th->ack_seq); + spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "packet (index %d) in dir %d ignored, state %s", @@ -1193,6 +1233,14 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, * segments we ignored. */ goto in_window; } + + /* Reset in response to a challenge-ack we let through earlier */ + if (old_state == TCP_CONNTRACK_SYN_SENT && + ct->proto.tcp.last_index == TCP_ACK_SET && + ct->proto.tcp.last_dir == IP_CT_DIR_REPLY && + ntohl(th->seq) == ct->proto.tcp.last_ack) + goto in_window; + break; default: /* Keep compilers happy. */ @@ -1217,13 +1265,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; - pr_debug("tcp_conntracks: "); - nf_ct_dump_tuple(tuple); - pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", - (th->syn ? 1 : 0), (th->ack ? 1 : 0), - (th->fin ? 1 : 0), (th->rst ? 1 : 0), - old_state, new_state); - ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3b516cffc779..0030fbe8885c 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -88,6 +88,7 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, const struct nf_hook_state *state) { unsigned int *timeouts; + unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; @@ -96,26 +97,27 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); - if (!nf_ct_is_confirmed(ct)) + status = READ_ONCE(ct->status); + if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + if (status & IPS_SEEN_REPLY) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; bool stream = false; /* Still active after two seconds? Extend timeout. 
*/ if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; - stream = true; + stream = (status & IPS_ASSURED) == 0; } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ - if (unlikely((ct->status & IPS_NAT_CLASH))) + if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index bca839ab1ae8..57f6724c99a7 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -275,7 +275,7 @@ static const char* l4proto_name(u16 proto) return "unknown"; } -static unsigned int +static void seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) { struct nf_conn_acct *acct; @@ -283,14 +283,12 @@ seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) acct = nf_conn_acct_find(ct); if (!acct) - return 0; + return; counter = acct->counter; seq_printf(s, "packets=%llu bytes=%llu ", (unsigned long long)atomic64_read(&counter[dir].packets), (unsigned long long)atomic64_read(&counter[dir].bytes)); - - return 0; } /* return 0 on success, 1 in case of error */ @@ -342,8 +340,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_has_overflowed(s)) goto release; - if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) - goto release; + seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL); if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); @@ -352,8 +349,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); - if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) - goto release; + seq_print_acct(s, ct, IP_CT_DIR_REPLY); if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[HW_OFFLOAD] "); @@ -601,7 +597,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, - NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, @@ -886,12 +881,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { - .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { @@ -1035,7 +1024,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, XASSIGN(SHUTDOWN_RECD, sn); XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); - XASSIGN(HEARTBEAT_ACKED, sn); #undef XASSIGN #endif } diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 81c26a96c30b..04bd0ed4d2ae 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -193,8 +193,11 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); + enum udp_conntrack state = + test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? 
+ UDP_CT_REPLIED : UDP_CT_UNREPLIED; - timeout = tn->timeouts[UDP_CT_REPLIED]; + timeout = tn->timeouts[state]; timeout -= tn->offload_timeout; } else { return; diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 0ccabf3fa6aa..9505f9d188ff 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -39,7 +39,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, } static int nf_flow_rule_route_inet(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index b350fe9d00b0..19efba1e51ef 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -421,6 +421,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (ret == NF_DROP) flow_offload_teardown(flow); break; + default: + WARN_ON_ONCE(1); + ret = NF_DROP; + break; } return ret; @@ -682,6 +686,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (ret == NF_DROP) flow_offload_teardown(flow); break; + default: + WARN_ON_ONCE(1); + ret = NF_DROP; + break; } return ret; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 00b522890d77..1c26f03fc661 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -383,12 +383,12 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, const __be32 *addr, const __be32 *mask) { struct flow_action_entry *entry; - int i, j; + int i; - for (i = 0, j = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32), j++) { + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) { entry = flow_action_entry_next(flow_rule); flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, - offset + i, &addr[j], mask); + offset + i * sizeof(u32), &addr[i], mask); } } @@ -679,7 +679,7 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, return 0; } -int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -704,7 +704,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); -int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -735,7 +735,7 @@ nf_flow_offload_rule_alloc(struct net *net, { const struct nf_flowtable *flowtable = offload->flowtable; const struct flow_offload_tuple *tuple, *other_tuple; - const struct flow_offload *flow = offload->flow; + struct flow_offload *flow = offload->flow; struct dst_entry *other_dst = NULL; struct nf_flow_rule *flow_rule; int err = -ENOMEM; @@ -895,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, ok_count += flow_offload_tuple_add(offload, flow_rule[0], FLOW_OFFLOAD_DIR_ORIGINAL); - ok_count += flow_offload_tuple_add(offload, flow_rule[1], - FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); if (ok_count == 0) return -ENOENT; @@ -926,7 +927,8 @@ static void flow_offload_work_del(struct 
flow_offload_work *offload) { clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); } @@ -946,7 +948,9 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) u64 lastused; flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); - flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, + &stats[1]); lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, @@ -997,13 +1001,13 @@ static void flow_offload_queue_work(struct flow_offload_work *offload) struct net *net = read_pnet(&offload->flowtable->net); if (offload->cmd == FLOW_CLS_REPLACE) { - NF_FLOW_TABLE_STAT_INC(net, count_wq_add); + NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_add); queue_work(nf_flow_offload_add_wq, &offload->work); } else if (offload->cmd == FLOW_CLS_DESTROY) { - NF_FLOW_TABLE_STAT_INC(net, count_wq_del); + NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_del); queue_work(nf_flow_offload_del_wq, &offload->work); } else { - NF_FLOW_TABLE_STAT_INC(net, count_wq_stats); + NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_stats); queue_work(nf_flow_offload_stats_wq, &offload->work); } } diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index cb894f0d63e9..c66689ad2b49 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -322,7 +322,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 0fa5a0bbb0ff..141ee7783223 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -30,9 +30,9 @@ __diag_ignore_all("-Wmissing-prototypes", * interpreted as select a random port. 
* @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST */ -int bpf_ct_set_nat_info(struct nf_conn___init *nfct, - union nf_inet_addr *addr, int port, - enum nf_nat_manip_type manip) +__bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, + union nf_inet_addr *addr, int port, + enum nf_nat_manip_type manip) { struct nf_conn *ct = (struct nf_conn *)nfct; u16 proto = nf_ct_l3num(ct); diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index a95a25196943..bf591e6af005 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -223,7 +223,7 @@ u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port) if (res != -EBUSY || (--attempts_left < 0)) break; - port = min + prandom_u32_max(range); + port = min + get_random_u32_below(range); } return 0; diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c new file mode 100644 index 000000000000..551abd2da614 --- /dev/null +++ b/net/netfilter/nf_nat_ovs.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Support nat functions for openvswitch and used by OVS and TC conntrack. */ + +#include <net/netfilter/nf_nat.h> + +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP. + */ +static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, + enum nf_nat_manip_type maniptype) +{ + __be16 proto = skb_protocol(skb, true); + int hooknum, err = NF_ACCEPT; + + /* See HOOK2MANIP(). */ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (proto == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto out; + } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto out; + } + } + /* Non-ICMP, fall thru to initialize if needed. */ + fallthrough; + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto out; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto out; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); + if (err == NF_ACCEPT) + *action |= BIT(maniptype); +out: + return err; +} + +int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, bool commit) +{ + enum nf_nat_manip_type maniptype; + int err, ct_action = *action; + + *action = 0; + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_DROP; /* Can't NAT. 
*/ + + if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && + (ctinfo != IP_CT_RELATED || commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (ct_action & BIT(NF_NAT_MANIP_SRC)) { + maniptype = NF_NAT_MANIP_SRC; + } else if (ct_action & BIT(NF_NAT_MANIP_DST)) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; + } + + err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, maniptype); + if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { + if (ct->status & IPS_SRC_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; + + err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, + maniptype); + } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + err = nf_ct_nat_execute(skb, ct, ctinfo, action, NULL, + NF_NAT_MANIP_SRC); + } + } + return err; +} +EXPORT_SYMBOL_GPL(nf_ct_nat); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6269b0d9977c..d73edbd4eec4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -465,8 +465,9 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, - struct nft_set *set) +static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + struct nft_set *set, + const struct nft_set_desc *desc) { struct nft_trans *trans; @@ -474,17 +475,28 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, if (trans == NULL) return -ENOMEM; - if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; + if (desc) { + nft_trans_set_update(trans) = true; + nft_trans_set_gc_int(trans) = desc->gc_int; + nft_trans_set_timeout(trans) = desc->timeout; + } nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } +static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + return __nft_trans_set_add(ctx, msg_type, set, NULL); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -1389,6 +1401,10 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(table)) { + if (PTR_ERR(table) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); } @@ -2627,6 +2643,10 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } @@ -2873,8 +2893,8 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, return -EINVAL; type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); - if (IS_ERR(type)) - return PTR_ERR(type); + if (!type) + return -ENOENT; if (!type->inner_ops) return 
-EOPNOTSUPP; @@ -3704,6 +3724,10 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } @@ -3717,6 +3741,10 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_HANDLE]) { rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { + if (PTR_ERR(rule) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } @@ -3780,8 +3808,7 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) static const struct nft_set_ops * nft_select_set_ops(const struct nft_ctx *ctx, const struct nlattr * const nla[], - const struct nft_set_desc *desc, - enum nft_set_policies policy) + const struct nft_set_desc *desc) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; @@ -3810,7 +3837,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (!ops->estimate(desc, flags, &est)) continue; - switch (policy) { + switch (desc->policy) { case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; @@ -4045,8 +4072,10 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { - struct nlmsghdr *nlh; + u64 timeout = READ_ONCE(set->timeout); + u32 gc_int = READ_ONCE(set->gc_int); u32 portid = ctx->portid; + struct nlmsghdr *nlh; struct nlattr *nest; u32 seq = ctx->seq; int i; @@ -4082,13 +4111,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype))) goto nla_put_failure; - if (set->timeout && + if (timeout && nla_put_be64(skb, NFTA_SET_TIMEOUT, - nf_jiffies64_to_msecs(set->timeout), + nf_jiffies64_to_msecs(timeout), NFTA_SET_PAD)) goto nla_put_failure; - if (set->gc_int && - nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) + if (gc_int && + nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int))) goto nla_put_failure; if (set->policy != NFT_SET_POL_PERFORMANCE) { @@ -4389,15 +4418,94 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, return err; } +static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr * const *nla, + struct nft_expr **exprs, int *num_exprs, + u32 flags) +{ + struct nft_expr *expr; + int err, i; + + if (nla[NFTA_SET_EXPR]) { + expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_expr_alloc; + } + exprs[0] = expr; + (*num_exprs)++; + } else if (nla[NFTA_SET_EXPRESSIONS]) { + struct nlattr *tmp; + int left; + + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_expr_alloc; + } + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_set_expr_alloc; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_expr_alloc; + } + expr = nft_set_elem_expr_alloc(ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_expr_alloc; + } + exprs[i++] = expr; + (*num_exprs)++; + } + } + + return 0; + +err_set_expr_alloc: + for 
(i = 0; i < *num_exprs; i++) + nft_expr_destroy(ctx, exprs[i]); + + return err; +} + +static bool nft_set_is_same(const struct nft_set *set, + const struct nft_set_desc *desc, + struct nft_expr *exprs[], u32 num_exprs, u32 flags) +{ + int i; + + if (set->ktype != desc->ktype || + set->dtype != desc->dtype || + set->flags != flags || + set->klen != desc->klen || + set->dlen != desc->dlen || + set->field_count != desc->field_count || + set->num_exprs != num_exprs) + return false; + + for (i = 0; i < desc->field_count; i++) { + if (set->field_len[i] != desc->field_len[i]) + return false; + } + + for (i = 0; i < num_exprs; i++) { + if (set->exprs[i]->ops != exprs[i]->ops) + return false; + } + + return true; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - u32 ktype, dtype, flags, policy, gc_int, objtype; struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; - struct nft_expr *expr = NULL; struct net *net = info->net; struct nft_set_desc desc; struct nft_table *table; @@ -4405,10 +4513,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, struct nft_set *set; struct nft_ctx ctx; size_t alloc_size; - u64 timeout; + int num_exprs = 0; char *name; int err, i; u16 udlen; + u32 flags; u64 size; if (nla[NFTA_SET_TABLE] == NULL || @@ -4419,10 +4528,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, memset(&desc, 0, sizeof(desc)); - ktype = NFT_DATA_VALUE; + desc.ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { - ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); - if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) + desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); + if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) return -EINVAL; } @@ -4447,17 +4556,17 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, return -EOPNOTSUPP; } - dtype = 0; + desc.dtype = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; - dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); - if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && - dtype != NFT_DATA_VERDICT) + desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); + if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && + desc.dtype != NFT_DATA_VERDICT) return -EINVAL; - if (dtype != NFT_DATA_VERDICT) { + if (desc.dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); @@ -4472,34 +4581,34 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_OBJECT)) return -EINVAL; - objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); - if (objtype == NFT_OBJECT_UNSPEC || - objtype > NFT_OBJECT_MAX) + desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); + if (desc.objtype == NFT_OBJECT_UNSPEC || + desc.objtype > NFT_OBJECT_MAX) return -EOPNOTSUPP; } else if (flags & NFT_SET_OBJECT) return -EINVAL; else - objtype = NFT_OBJECT_UNSPEC; + desc.objtype = NFT_OBJECT_UNSPEC; - timeout = 0; + desc.timeout = 0; if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; } - gc_int = 0; 
+ desc.gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } - policy = NFT_SET_POL_PERFORMANCE; + desc.policy = NFT_SET_POL_PERFORMANCE; if (nla[NFTA_SET_POLICY] != NULL) - policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); @@ -4531,6 +4640,8 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, return PTR_ERR(set); } } else { + struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {}; + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; @@ -4538,13 +4649,29 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return 0; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); + if (err < 0) + return err; + + err = 0; + if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); + err = -EEXIST; + } + + for (i = 0; i < num_exprs; i++) + nft_expr_destroy(&ctx, exprs[i]); + + if (err < 0) + return err; + + return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc); } if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(&ctx, nla, &desc, policy); + ops = nft_select_set_ops(&ctx, nla, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -4584,18 +4711,18 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, set->table = table; write_pnet(&set->net, net); set->ops = ops; - set->ktype = ktype; + set->ktype = desc.ktype; set->klen = desc.klen; - set->dtype = dtype; - set->objtype = objtype; + set->dtype = desc.dtype; + set->objtype = desc.objtype; set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; - set->policy = policy; + set->policy = desc.policy; set->udlen = udlen; set->udata = udata; - set->timeout = timeout; - set->gc_int = gc_int; + set->timeout = desc.timeout; + set->gc_int = desc.gc_int; set->field_count = desc.field_count; for (i = 0; i < desc.field_count; i++) @@ -4605,43 +4732,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) goto err_set_init; - if (nla[NFTA_SET_EXPR]) { - expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]); - if (IS_ERR(expr)) { - err = PTR_ERR(expr); - goto err_set_expr_alloc; - } - set->exprs[0] = expr; - set->num_exprs++; - } else if (nla[NFTA_SET_EXPRESSIONS]) { - struct nft_expr *expr; - struct nlattr *tmp; - int left; - - if (!(flags & NFT_SET_EXPR)) { - err = -EINVAL; - goto err_set_expr_alloc; - } - i = 0; - nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { - if (i == NFT_SET_EXPR_MAX) { - err = -E2BIG; - goto err_set_expr_alloc; - } - if (nla_type(tmp) != NFTA_LIST_ELEM) { - err = -EINVAL; - goto err_set_expr_alloc; - } - expr = nft_set_elem_expr_alloc(&ctx, set, tmp); - if (IS_ERR(expr)) { - err = PTR_ERR(expr); - goto err_set_expr_alloc; - } - set->exprs[i++] = expr; - set->num_exprs++; - } - } + err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags); + if (err < 0) + goto err_set_destroy; + set->num_exprs = num_exprs; set->handle = nf_tables_alloc_handle(table); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); @@ -4655,7 +4750,7 @@ static int 
nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); - +err_set_destroy: ops->destroy(set); err_set_init: kfree(set->name); @@ -4729,6 +4824,10 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(set)) { + if (PTR_ERR(set) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(set); } @@ -6008,7 +6107,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } else if (set->flags & NFT_SET_TIMEOUT && !(flags & NFT_SET_ELEM_INTERVAL_END)) { - timeout = set->timeout; + timeout = READ_ONCE(set->timeout); } expiration = 0; @@ -6109,7 +6208,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err_parse_key_end; - if (timeout != set->timeout) { + if (timeout != READ_ONCE(set->timeout)) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); if (err < 0) goto err_parse_key_end; @@ -6611,6 +6710,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); + if (err == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) + continue; + if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; @@ -6920,6 +7023,9 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, return -EOPNOTSUPP; type = __nft_obj_type_get(objtype); + if (WARN_ON_ONCE(!type)) + return -ENOENT; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); @@ -7255,6 +7361,10 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(obj)) { + if (PTR_ERR(obj) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); } @@ -7885,6 +7995,10 @@ static int nf_tables_delflowtable(struct sk_buff *skb, } if (IS_ERR(flowtable)) { + if (PTR_ERR(flowtable) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } @@ -8294,6 +8408,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, + [NFT_MSG_DESTROYTABLE] = { + .call = nf_tables_deltable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, [NFT_MSG_NEWCHAIN] = { .call = nf_tables_newchain, .type = NFNL_CB_BATCH, @@ -8312,6 +8432,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, + [NFT_MSG_DESTROYCHAIN] = { + .call = nf_tables_delchain, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_CHAIN_MAX, + .policy = nft_chain_policy, + }, [NFT_MSG_NEWRULE] = { .call = nf_tables_newrule, .type = NFNL_CB_BATCH, @@ -8336,6 +8462,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, + [NFT_MSG_DESTROYRULE] = { + .call = nf_tables_delrule, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, [NFT_MSG_NEWSET] = { .call = nf_tables_newset, .type = NFNL_CB_BATCH, @@ -8354,6 +8486,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_MAX, 
.policy = nft_set_policy, }, + [NFT_MSG_DESTROYSET] = { + .call = nf_tables_delset, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_MAX, + .policy = nft_set_policy, + }, [NFT_MSG_NEWSETELEM] = { .call = nf_tables_newsetelem, .type = NFNL_CB_BATCH, @@ -8372,6 +8510,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_DESTROYSETELEM] = { + .call = nf_tables_delsetelem, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, [NFT_MSG_GETGEN] = { .call = nf_tables_getgen, .type = NFNL_CB_RCU, @@ -8394,6 +8538,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, + [NFT_MSG_DESTROYOBJ] = { + .call = nf_tables_delobj, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, [NFT_MSG_GETOBJ_RESET] = { .call = nf_tables_getobj, .type = NFNL_CB_RCU, @@ -8418,6 +8568,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, + [NFT_MSG_DESTROYFLOWTABLE] = { + .call = nf_tables_delflowtable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, }; static int nf_tables_validate(struct net *net) @@ -8511,6 +8667,7 @@ static void nft_commit_release(struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: @@ -8518,23 +8675,29 @@ static void nft_commit_release(struct nft_trans *trans) kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: nf_tables_set_elem_destroy(&trans->ctx, nft_trans_elem_set(trans), nft_trans_elem(trans).priv); break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else @@ -8986,8 +9149,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: list_del_rcu(&trans->ctx.table->list); - nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + nf_tables_table_notify(&trans->ctx, trans->msg_type); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { @@ -9002,8 +9166,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: nft_chain_del(trans->ctx.chain); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); + nf_tables_chain_notify(&trans->ctx, trans->msg_type); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, trans->ctx.chain); @@ -9019,10 +9184,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: list_del_rcu(&nft_trans_rule(trans)->list); nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), - NFT_MSG_DELRULE); + 
trans->msg_type); nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); @@ -9031,22 +9197,29 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: - nft_clear(net, nft_trans_set(trans)); - /* This avoids hitting -EBUSY when deleting the table - * from the transaction. - */ - if (nft_set_is_anonymous(nft_trans_set(trans)) && - !list_empty(&nft_trans_set(trans)->bindings)) - trans->ctx.table->use--; + if (nft_trans_set_update(trans)) { + struct nft_set *set = nft_trans_set(trans); + WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans)); + WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans)); + } else { + nft_clear(net, nft_trans_set(trans)); + /* This avoids hitting -EBUSY when deleting the table + * from the transaction. + */ + if (nft_set_is_anonymous(nft_trans_set(trans)) && + !list_empty(&nft_trans_set(trans)->bindings)) + trans->ctx.table->use--; + } nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), - NFT_MSG_DELSET, GFP_KERNEL); + trans->msg_type, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; @@ -9058,11 +9231,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_DELSETELEM); + trans->msg_type); nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) { atomic_dec(&te->set->nelems); @@ -9084,9 +9258,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_del(nft_trans_obj(trans)); nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), - NFT_MSG_DELOBJ); + trans->msg_type); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { @@ -9108,11 +9283,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable_hooks(trans)); } else { @@ -9120,7 +9296,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); } @@ -9216,6 +9392,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: nft_clear(trans->ctx.net, trans->ctx.table); nft_trans_destroy(trans); break; @@ -9237,6 +9414,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: trans->ctx.table->use++; nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); @@ -9251,6 +9429,7 @@ static int __nf_tables_abort(struct net *net, enum 
nfnl_abort_action action) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: trans->ctx.chain->use++; nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); @@ -9260,6 +9439,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: + if (nft_trans_set_update(trans)) { + nft_trans_destroy(trans); + break; + } trans->ctx.table->use--; if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); @@ -9268,6 +9451,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_set(trans)); nft_trans_destroy(trans); @@ -9283,6 +9467,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; nft_setelem_data_activate(net, te->set, &te->elem); @@ -9302,6 +9487,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); @@ -9318,6 +9504,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 709a736c301c..6ecd0ba2e546 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -21,6 +21,26 @@ #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_X86) + +static struct static_key_false nf_tables_skip_direct_calls; + +static bool nf_skip_indirect_calls(void) +{ + return static_branch_likely(&nf_tables_skip_direct_calls); +} + +static void __init nf_skip_indirect_calls_enable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&nf_tables_skip_direct_calls); +} +#else +static inline bool nf_skip_indirect_calls(void) { return false; } + +static inline void nf_skip_indirect_calls_enable(void) { } +#endif + static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, enum nft_trace_types type) @@ -193,7 +213,12 @@ static void expr_call_ops_eval(const struct nft_expr *expr, struct nft_pktinfo *pkt) { #ifdef CONFIG_RETPOLINE - unsigned long e = (unsigned long)expr->ops->eval; + unsigned long e; + + if (nf_skip_indirect_calls()) + goto indirect_call; + + e = (unsigned long)expr->ops->eval; #define X(e, fun) \ do { if ((e) == (unsigned long)(fun)) \ return fun(expr, regs, pkt); } while (0) @@ -203,13 +228,19 @@ static void expr_call_ops_eval(const struct nft_expr *expr, X(e, nft_counter_eval); X(e, nft_meta_get_eval); X(e, nft_lookup_eval); +#if IS_ENABLED(CONFIG_NFT_CT) + X(e, nft_ct_get_fast_eval); +#endif X(e, nft_range_eval); X(e, nft_immediate_eval); X(e, nft_byteorder_eval); X(e, nft_dynset_eval); X(e, nft_rt_get_eval); X(e, nft_bitwise_eval); + X(e, nft_objref_eval); + X(e, nft_objref_map_eval); #undef X +indirect_call: #endif /* CONFIG_RETPOLINE */ 
expr->ops->eval(expr, regs, pkt); } @@ -369,6 +400,8 @@ int __init nf_tables_core_module_init(void) goto err; } + nf_skip_indirect_calls_enable(); + return 0; err: diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index c68e2151defe..b9c84499438b 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -12,7 +12,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> @@ -23,16 +23,6 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> -struct nft_ct { - enum nft_ct_keys key:8; - enum ip_conntrack_dir dir:8; - u8 len; - union { - u8 dreg; - u8 sreg; - }; -}; - struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; struct nf_conntrack_helper *helper6; @@ -759,6 +749,18 @@ static bool nft_ct_set_reduce(struct nft_regs_track *track, return false; } +#ifdef CONFIG_RETPOLINE +static const struct nft_expr_ops nft_ct_get_fast_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_get_fast_eval, + .init = nft_ct_get_init, + .destroy = nft_ct_get_destroy, + .dump = nft_ct_get_dump, + .reduce = nft_ct_set_reduce, +}; +#endif + static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), @@ -791,8 +793,21 @@ nft_ct_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); - if (tb[NFTA_CT_DREG]) + if (tb[NFTA_CT_DREG]) { +#ifdef CONFIG_RETPOLINE + u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + + switch (k) { + case NFT_CT_STATE: + case NFT_CT_DIRECTION: + case NFT_CT_STATUS: + case NFT_CT_MARK: + case NFT_CT_SECMARK: + return &nft_ct_get_fast_ops; + } +#endif return &nft_ct_get_ops; + } if (tb[NFTA_CT_SREG]) { #ifdef CONFIG_NF_CONNTRACK_ZONES diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c new file mode 100644 index 000000000000..89983b0613fa --- /dev/null +++ b/net/netfilter/nft_ct_fast.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +#if IS_ENABLED(CONFIG_NFT_CT) +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack.h> + +void nft_ct_get_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + u32 *dest = &regs->data[priv->dreg]; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int state; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct) { + regs->verdict.code = NFT_BREAK; + return; + } + + switch (priv->key) { + case NFT_CT_STATE: + if (ct) + state = NF_CT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + state = NF_CT_STATE_UNTRACKED_BIT; + else + state = NF_CT_STATE_INVALID_BIT; + *dest = state; + return; + case NFT_CT_DIRECTION: + nft_reg_store8(dest, CTINFO2DIR(ctinfo)); + return; + case NFT_CT_STATUS: + *dest = ct->status; + return; +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + *dest = ct->mark; + return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + *dest = ct->secmark; + return; +#endif + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + break; + } +} +EXPORT_SYMBOL_GPL(nft_ct_get_fast_eval); +#endif diff --git a/net/netfilter/nft_objref.c 
b/net/netfilter/nft_objref.c index 7b01aa2ef653..cb37169608ba 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -13,9 +13,9 @@ #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) -static void nft_objref_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_object *obj = nft_objref_priv(expr); @@ -100,9 +100,9 @@ struct nft_objref_map { struct nft_set_binding binding; }; -static void nft_objref_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 17b418a5a593..3a3c7746e88f 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -63,7 +63,7 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) return false; if (offset + len > VLAN_ETH_HLEN + vlan_hlen) - ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen; + ethlen -= offset + len - VLAN_ETH_HLEN - vlan_hlen; memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 4f9299b9dcdd..06d46d182634 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1162,6 +1162,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, struct nft_pipapo_match *m = priv->clone; u8 genmask = nft_genmask_next(net); struct nft_pipapo_field *f; + const u8 *start_p, *end_p; int i, bsize_max, err = 0; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) @@ -1202,9 +1203,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, } /* Validate */ + start_p = start; + end_p = end; nft_pipapo_for_each_field(f, i, m) { - const u8 *start_p = start, *end_p = end; - if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) return -ENOSPC; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 7325bee7d144..19ea4d3c3553 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } -static bool nft_rbtree_equal(const struct nft_set *set, const void *this, - const struct nft_rbtree_elem *interval) +static int nft_rbtree_cmp(const struct nft_set *set, + const struct nft_rbtree_elem *e1, + const struct nft_rbtree_elem *e2) { - return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; + return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), + set->klen); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, @@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - const void *this; int d; parent = rcu_dereference_raw(priv->root.rb_node); @@ -62,12 +63,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set rbe = rb_entry(parent, struct nft_rbtree_elem, node); - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); + d = 
memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (interval && - nft_rbtree_equal(set, this, interval) && + !nft_rbtree_cmp(set, rbe, interval) && nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; @@ -215,154 +215,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static int nft_rbtree_gc_elem(const struct nft_set *__set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); + struct nft_rbtree_elem *rbe_prev; + struct nft_set_gc_batch *gcb; + + gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); + if (!gcb) + return -ENOMEM; + + /* search for expired end interval coming before this element. */ + do { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_end(rbe_prev)) + break; + + prev = rb_prev(prev); + } while (prev != NULL); + + rb_erase(&rbe_prev->node, &priv->root); + rb_erase(&rbe->node, &priv->root); + atomic_sub(2, &set->nelems); + + nft_set_gc_batch_add(gcb, rbe); + nft_set_gc_batch_complete(gcb); + + return 0; +} + +static bool nft_rbtree_update_first(const struct nft_set *set, + struct nft_rbtree_elem *rbe, + struct rb_node *first) +{ + struct nft_rbtree_elem *first_elem; + + first_elem = rb_entry(first, struct nft_rbtree_elem, node); + /* this element is closest to where the new element is to be inserted: + * update the first element for the node list path. + */ + if (nft_rbtree_cmp(set, rbe, first_elem) < 0) + return true; + + return false; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) { - bool overlap = false, dup_end_left = false, dup_end_right = false; + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); - struct nft_rbtree_elem *rbe; - struct rb_node *parent, **p; - int d; + int d, err; - /* Detect overlaps as we descend the tree. Set the flag in these cases: - * - * a1. _ _ __>| ?_ _ __| (insert end before existing end) - * a2. _ _ ___| ?_ _ _>| (insert end after existing end) - * a3. _ _ ___? >|_ _ __| (insert start before existing end) - * - * and clear it later on, as we eventually reach the points indicated by - * '?' above, in the cases described below. We'll always meet these - * later, locally, due to tree ordering, and overlaps for the intervals - * that are the closest together are always evaluated last. - * - * b1. _ _ __>| !_ _ __| (insert end before existing start) - * b2. _ _ ___| !_ _ _>| (insert end after existing start) - * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf) - * '--' no nodes falling in this range - * b4. >|_ _ ! (insert start before existing start) - * - * Case a3. resolves to b3.: - * - if the inserted start element is the leftmost, because the '0' - * element in the tree serves as end element - * - otherwise, if an existing end is found immediately to the left. If - * there are existing nodes in between, we need to further descend the - * tree before we can conclude the new start isn't causing an overlap - * - * or to b4., which, preceded by a3., means we already traversed one or - * more existing intervals entirely, from the right. - * - * For a new, rightmost pair of elements, we'll hit cases b3. 
and b2., - * in that order. - * - * The flag is also cleared in two special cases: - * - * b5. |__ _ _!|<_ _ _ (insert start right before existing end) - * b6. |__ _ >|!__ _ _ (insert end right after existing start) - * - * which always happen as last step and imply that no further - * overlapping is possible. - * - * Another special case comes from the fact that start elements matching - * an already existing start element are allowed: insertion is not - * performed but we return -EEXIST in that case, and the error will be - * cleared by the caller if NLM_F_EXCL is not present in the request. - * This way, request for insertion of an exact overlap isn't reported as - * error to userspace if not desired. - * - * However, if the existing start matches a pre-existing start, but the - * end element doesn't match the corresponding pre-existing end element, - * we need to report a partial overlap. This is a local condition that - * can be noticed without need for a tracking flag, by checking for a - * local duplicated end for a corresponding start, from left and right, - * separately. + /* Descend the tree to search for an existing element greater than the + * key value to insert that is greater than the new element. This is the + * first element to walk the ordered elements to find possible overlap. */ - parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), - nft_set_ext_key(&new->ext), - set->klen); + d = nft_rbtree_cmp(set, rbe, new); + if (d < 0) { p = &parent->rb_left; - - if (nft_rbtree_interval_start(new)) { - if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext) && !*p) - overlap = false; - } else { - if (dup_end_left && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_right = true; - continue; - } - } } else if (d > 0) { - p = &parent->rb_right; + if (!first || + nft_rbtree_update_first(set, rbe, first)) + first = &rbe->node; - if (nft_rbtree_interval_end(new)) { - if (dup_end_right && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_left = true; - continue; - } - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - overlap = nft_rbtree_interval_end(rbe); - } + p = &parent->rb_right; } else { - if (nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(new)) { + if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; - - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_rbtree_interval_start(rbe) && - nft_rbtree_interval_end(new)) { + else p = &parent->rb_right; + } + } + + if (!first) + first = rb_first(&priv->root); + + /* Detect overlap by going through the list of valid tree nodes. + * Values stored in the tree are in reversed order, starting from + * highest to lowest value. 
+ */ + for (node = first; node != NULL; node = rb_next(node)) { + rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - *ext = &rbe->ext; - return -EEXIST; - } else { - overlap = false; - if (nft_rbtree_interval_end(rbe)) - p = &parent->rb_left; - else - p = &parent->rb_right; + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* perform garbage collection to avoid bogus overlap reports. */ + if (nft_set_elem_expired(&rbe->ext)) { + err = nft_rbtree_gc_elem(set, priv, rbe); + if (err < 0) + return err; + + continue; + } + + d = nft_rbtree_cmp(set, rbe, new); + if (d == 0) { + /* Matching end element: no need to look for an + * overlapping greater or equal element. + */ + if (nft_rbtree_interval_end(rbe)) { + rbe_le = rbe; + break; + } + + /* first element that is greater or equal to key value. */ + if (!rbe_ge) { + rbe_ge = rbe; + continue; + } + + /* this is a closer more or equal element, update it. */ + if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { + rbe_ge = rbe; + continue; + } + + /* element is equal to key value, make sure flags are + * the same, an existing more or equal start element + * must not be replaced by more or equal end element. + */ + if ((nft_rbtree_interval_start(new) && + nft_rbtree_interval_start(rbe_ge)) || + (nft_rbtree_interval_end(new) && + nft_rbtree_interval_end(rbe_ge))) { + rbe_ge = rbe; + continue; } + } else if (d > 0) { + /* annotate element greater than the new element. */ + rbe_ge = rbe; + continue; + } else if (d < 0) { + /* annotate element less than the new element. */ + rbe_le = rbe; + break; } + } - dup_end_left = dup_end_right = false; + /* - new start element matching existing start element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && + nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { + *ext = &rbe_ge->ext; + return -EEXIST; } - if (overlap) + /* - new end element matching existing end element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && + nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + *ext = &rbe_le->ext; + return -EEXIST; + } + + /* - new start element with existing closest, less or equal key value + * being a start element: partial overlap, reported as -ENOTEMPTY. + * Anonymous sets allow for two consecutive start element since they + * are constant, skip them to avoid bogus overlap reports. + */ + if (!nft_set_is_anonymous(set) && rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, less or equal key value + * being a end element: partial overlap, reported as -ENOTEMPTY. 
+ */ + if (rbe_le && + nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; + /* - new end element with existing closest, greater or equal key value + * being an end element: partial overlap, reported as -ENOTEMPTY + */ + if (rbe_ge && + nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + + /* Accepted element: pick insertion point depending on key value */ + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_rbtree_cmp(set, rbe, new); + + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p = &parent->rb_right; + else if (nft_rbtree_interval_end(rbe)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; @@ -501,23 +563,37 @@ static void nft_rbtree_gc(struct work_struct *work) struct nft_rbtree *priv; struct rb_node *node; struct nft_set *set; + struct net *net; + u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + genmask = nft_genmask_cur(net); write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* elements are reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is + * always visited before the start element. + */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) + + if (nft_set_elem_mark_busy(&rbe->ext)) { + rbe_end = NULL; continue; + } if (rbe_prev) { rb_erase(&rbe_prev->node, &priv->root); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 0f8bb0bf558f..8d36303f3935 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -413,7 +413,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); - del_timer_sync(&info->timer->timer); + timer_shutdown_sync(&info->timer->timer); cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); @@ -441,7 +441,7 @@ static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) if (info->timer->timer_type & XT_IDLETIMER_ALARM) { alarm_cancel(&info->timer->alarm); } else { - del_timer_sync(&info->timer->timer); + timer_shutdown_sync(&info->timer->timer); } cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 0371c387b0d1..66b0f941d8fb 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -166,7 +166,7 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par) list_del(&ledinternal->list); - del_timer_sync(&ledinternal->timer); + timer_shutdown_sync(&ledinternal->timer); led_trigger_unregister(&ledinternal->netfilter_led_trigger); diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 1873da3a945a..b3d623a52885 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -21,7 +21,7 @@ static bool length_mt(const struct sk_buff *skb, struct xt_action_param *par) { 
const struct xt_length_info *info = par->matchinfo; - u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); + u32 pktlen = skb_ip_totlen(skb); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d73091f6bb0f..c64277659753 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -580,7 +580,9 @@ static int netlink_insert(struct sock *sk, u32 portid) if (nlk_sk(sk)->bound) goto err; - nlk_sk(sk)->portid = portid; + /* portid can be read locklessly from netlink_getname(). */ + WRITE_ONCE(nlk_sk(sk)->portid, portid); + sock_hold(sk); err = __netlink_insert(table, sk); @@ -846,7 +848,7 @@ retry: /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ - rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN); + rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; @@ -1096,9 +1098,11 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, return -EINVAL; if (addr->sa_family == AF_UNSPEC) { - sk->sk_state = NETLINK_UNCONNECTED; - nlk->dst_portid = 0; - nlk->dst_group = 0; + /* paired with READ_ONCE() in netlink_getsockbyportid() */ + WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); + /* dst_portid and dst_group can be read locklessly */ + WRITE_ONCE(nlk->dst_portid, 0); + WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) @@ -1119,9 +1123,11 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, err = netlink_autobind(sock); if (err == 0) { - sk->sk_state = NETLINK_CONNECTED; - nlk->dst_portid = nladdr->nl_pid; - nlk->dst_group = ffs(nladdr->nl_groups); + /* paired with READ_ONCE() in netlink_getsockbyportid() */ + WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); + /* dst_portid and dst_group can be read locklessly */ + WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); + WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; @@ -1138,10 +1144,12 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_pad = 0; if (peer) { - nladdr->nl_pid = nlk->dst_portid; - nladdr->nl_groups = netlink_group_mask(nlk->dst_group); + /* Paired with WRITE_ONCE() in netlink_connect() */ + nladdr->nl_pid = READ_ONCE(nlk->dst_portid); + nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { - nladdr->nl_pid = nlk->portid; + /* Paired with WRITE_ONCE() in netlink_insert() */ + nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? 
nlk->groups[0] : 0; netlink_unlock_table(); @@ -1168,8 +1176,9 @@ static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); - if (sock->sk_state == NETLINK_CONNECTED && - nlk->dst_portid != nlk_sk(ssk)->portid) { + /* dst_portid and sk_state can be changed in netlink_connect() */ + if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && + READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } @@ -1886,8 +1895,9 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { - dst_portid = nlk->dst_portid; - dst_group = nlk->dst_group; + /* Paired with WRITE_ONCE() in netlink_connect() */ + dst_portid = READ_ONCE(nlk->dst_portid); + dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 600993c80050..04c4036bf406 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -13,7 +13,7 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> -#include <linux/string.h> +#include <linux/string_helpers.h> #include <linux/skbuff.h> #include <linux/mutex.h> #include <linux/bitmap.h> @@ -457,7 +457,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family) if (WARN_ON(grp->name[0] == '\0')) return -EINVAL; - if (WARN_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL)) + if (WARN_ON(!string_is_terminated(grp->name, GENL_NAMSIZ))) return -EINVAL; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 6f7f4392cffb..5a4cb796150f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -400,6 +400,11 @@ static int nr_listen(struct socket *sock, int backlog) struct sock *sk = sock->sk; lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index a8da88db7893..4e7c968cde2d 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -121,6 +121,7 @@ static void nr_heartbeat_expiry(struct timer_list *t) is accepted() it isn't 'dead' so doesn't get removed. 
*/ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + sock_hold(sk); bh_unlock_sock(sk); nr_destroy_socket(sk); goto out; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 3364caabef8b..a27e1842b2a0 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -157,6 +157,7 @@ static void local_cleanup(struct nfc_llcp_local *local) cancel_work_sync(&local->rx_work); cancel_work_sync(&local->timeout_work); kfree_skb(local->rx_pending); + local->rx_pending = NULL; del_timer_sync(&local->sdreq_timer); cancel_work_sync(&local->sdreq_timeout_work); nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs); diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 282c51051dcc..994a0a1efb58 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -240,6 +240,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev, target->sens_res = nfca_poll->sens_res; target->sel_res = nfca_poll->sel_res; target->nfcid1_len = nfca_poll->nfcid1_len; + if (target->nfcid1_len > ARRAY_SIZE(target->nfcid1)) + return -EPROTO; if (target->nfcid1_len > 0) { memcpy(target->nfcid1, nfca_poll->nfcid1, target->nfcid1_len); @@ -248,6 +250,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev, nfcb_poll = (struct rf_tech_specific_params_nfcb_poll *)params; target->sensb_res_len = nfcb_poll->sensb_res_len; + if (target->sensb_res_len > ARRAY_SIZE(target->sensb_res)) + return -EPROTO; if (target->sensb_res_len > 0) { memcpy(target->sensb_res, nfcb_poll->sensb_res, target->sensb_res_len); @@ -256,6 +260,8 @@ static int nci_add_new_protocol(struct nci_dev *ndev, nfcf_poll = (struct rf_tech_specific_params_nfcf_poll *)params; target->sensf_res_len = nfcf_poll->sensf_res_len; + if (target->sensf_res_len > ARRAY_SIZE(target->sensf_res)) + return -EPROTO; if (target->sensf_res_len > 0) { memcpy(target->sensf_res, nfcf_poll->sensf_res, target->sensf_res_len); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 9d91087b9399..1fc339084d89 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1497,6 +1497,7 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; + int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || @@ -1510,25 +1511,37 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) if (!dev) return -ENODEV; - if (!dev->ops || !dev->ops->se_io) - return -ENOTSUPP; + if (!dev->ops || !dev->ops->se_io) { + rc = -EOPNOTSUPP; + goto put_dev; + } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); - if (apdu_len == 0) - return -EINVAL; + if (apdu_len == 0) { + rc = -EINVAL; + goto put_dev; + } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); - if (!apdu) - return -EINVAL; + if (!apdu) { + rc = -EINVAL; + goto put_dev; + } ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + if (!ctx) { + rc = -ENOMEM; + goto put_dev; + } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; - return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); + rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); + +put_dev: + nfc_put_device(dev); + return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, @@ -1551,14 +1564,21 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); - if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds) + if (!dev) return -ENODEV; + if (!dev->vendor_cmds || !dev->n_vendor_cmds) { + err = -ENODEV; + goto put_dev; + } + 
if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); - if (data_len == 0) - return -EINVAL; + if (data_len == 0) { + err = -EINVAL; + goto put_dev; + } } else { data = NULL; data_len = 0; @@ -1573,10 +1593,14 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; - return err; + goto put_dev; } - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + +put_dev: + nfc_put_device(dev); + return err; } /* message building helper */ diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 15bd287f5cbd..29a7081858cd 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -15,6 +15,8 @@ config OPENVSWITCH select NET_MPLS_GSO select DST_CACHE select NET_NSH + select NF_CONNTRACK_OVS if NF_CONNTRACK + select NF_NAT_OVS if NF_NAT help Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d78f0fc4337d..331730fd3580 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -9,6 +9,7 @@ #include <linux/udp.h> #include <linux/sctp.h> #include <linux/static_key.h> +#include <linux/string_helpers.h> #include <net/ip.h> #include <net/genetlink.h> #include <net/netfilter/nf_conntrack_core.h> @@ -434,52 +435,21 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, return 0; } -/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero - * value if 'skb' is freed. - */ -static int handle_fragments(struct net *net, struct sw_flow_key *key, - u16 zone, struct sk_buff *skb) +static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key, + u16 zone, int family, struct sk_buff *skb) { struct ovs_skb_cb ovs_cb = *OVS_CB(skb); int err; - if (key->eth.type == htons(ETH_P_IP)) { - enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; - - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - err = ip_defrag(net, skb, user); - if (err) - return err; - - ovs_cb.mru = IPCB(skb)->frag_max_size; -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - } else if (key->eth.type == htons(ETH_P_IPV6)) { - enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - - memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); - err = nf_ct_frag6_gather(net, skb, user); - if (err) { - if (err != -EINPROGRESS) - kfree_skb(skb); - return err; - } - - key->ip.proto = ipv6_hdr(skb)->nexthdr; - ovs_cb.mru = IP6CB(skb)->frag_max_size; -#endif - } else { - kfree_skb(skb); - return -EPFNOSUPPORT; - } + err = nf_ct_handle_fragments(net, skb, zone, family, &key->ip.proto, &ovs_cb.mru); + if (err) + return err; /* The key extracted from the fragment that completed this datagram * likely didn't have an L4 header, so regenerate it. */ ovs_flow_key_update_l3l4(skb, key); - key->ip.frag = OVS_FRAG_TYPE_NONE; - skb_clear_hash(skb); - skb->ignore_df = 1; *OVS_CB(skb) = ovs_cb; return 0; @@ -726,147 +696,27 @@ static void ovs_nat_update_key(struct sw_flow_key *key, } } -/* Modelled after nf_nat_ipv[46]_fn(). - * range is only used for new, uninitialized NAT state. - * Returns either NF_ACCEPT or NF_DROP. 
- */ -static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, struct sw_flow_key *key) -{ - int hooknum, nh_off, err = NF_ACCEPT; - - nh_off = skb_network_offset(skb); - skb_pull_rcsum(skb, nh_off); - - /* See HOOK2MANIP(). */ - if (maniptype == NF_NAT_MANIP_SRC) - hooknum = NF_INET_LOCAL_IN; /* Source NAT */ - else - hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (IS_ENABLED(CONFIG_NF_NAT) && - skb->protocol == htons(ETH_P_IP) && - ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - hooknum)) - err = NF_DROP; - goto push; - } else if (IS_ENABLED(CONFIG_IPV6) && - skb->protocol == htons(ETH_P_IPV6)) { - __be16 frag_off; - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - int hdrlen = ipv6_skip_exthdr(skb, - sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, - ctinfo, - hooknum, - hdrlen)) - err = NF_DROP; - goto push; - } - } - /* Non-ICMP, fall thru to initialize if needed. */ - fallthrough; - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - /* Initialize according to the NAT action. */ - err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) - /* Action is set up to establish a new - * mapping. - */ - ? nf_nat_setup_info(ct, range, maniptype) - : nf_nat_alloc_null_binding(ct, hooknum); - if (err != NF_ACCEPT) - goto push; - } - break; - - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - - default: - err = NF_DROP; - goto push; - } - - err = nf_nat_packet(ct, ctinfo, hooknum, skb); -push: - skb_push_rcsum(skb, nh_off); - - /* Update the flow key if NAT successful. */ - if (err == NF_ACCEPT) - ovs_nat_update_key(key, skb, maniptype); - - return err; -} - /* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - enum nf_nat_manip_type maniptype; - int err; + int err, action = 0; - /* Add NAT extension if not confirmed yet. */ - if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) - return NF_ACCEPT; /* Can't NAT. */ + if (!(info->nat & OVS_CT_NAT)) + return NF_ACCEPT; + if (info->nat & OVS_CT_SRC_NAT) + action |= BIT(NF_NAT_MANIP_SRC); + if (info->nat & OVS_CT_DST_NAT) + action |= BIT(NF_NAT_MANIP_DST); - /* Determine NAT type. - * Check if the NAT type can be deduced from the tracked connection. - * Make sure new expected connections (IP_CT_RELATED) are NATted only - * when committing. - */ - if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW && - ct->status & IPS_NAT_MASK && - (ctinfo != IP_CT_RELATED || info->commit)) { - /* NAT an established or related connection like before. */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) - /* This is the REPLY direction for a connection - * for which NAT was applied in the forward - * direction. Do the reverse NAT. - */ - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; - else - maniptype = ct->status & IPS_SRC_NAT - ? 
NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; - } else if (info->nat & OVS_CT_SRC_NAT) { - maniptype = NF_NAT_MANIP_SRC; - } else if (info->nat & OVS_CT_DST_NAT) { - maniptype = NF_NAT_MANIP_DST; - } else { - return NF_ACCEPT; /* Connection is not NATed. */ - } - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype, key); - - if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { - if (ct->status & IPS_SRC_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, - maniptype, key); - } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - err = ovs_ct_nat_execute(skb, ct, ctinfo, NULL, - NF_NAT_MANIP_SRC, key); - } - } + err = nf_ct_nat(skb, ct, ctinfo, &action, &info->range, info->commit); + + if (action & BIT(NF_NAT_MANIP_SRC)) + ovs_nat_update_key(key, skb, NF_NAT_MANIP_SRC); + if (action & BIT(NF_NAT_MANIP_DST)) + ovs_nat_update_key(key, skb, NF_NAT_MANIP_DST); return err; } @@ -1210,36 +1060,6 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, return 0; } -/* Trim the skb to the length specified by the IP/IPv6 header, - * removing any trailing lower-layer padding. This prepares the skb - * for higher-layer processing that assumes skb->len excludes padding - * (such as nf_ip_checksum). The caller needs to pull the skb to the - * network header, and ensure ip_hdr/ipv6_hdr points to valid data. - */ -static int ovs_skb_network_trim(struct sk_buff *skb) -{ - unsigned int len; - int err; - - switch (skb->protocol) { - case htons(ETH_P_IP): - len = ntohs(ip_hdr(skb)->tot_len); - break; - case htons(ETH_P_IPV6): - len = sizeof(struct ipv6hdr) - + ntohs(ipv6_hdr(skb)->payload_len); - break; - default: - len = skb->len; - } - - err = pskb_trim_rcsum(skb, len); - if (err) - kfree_skb(skb); - - return err; -} - /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. 
*/ @@ -1254,12 +1074,15 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, nh_ofs = skb_network_offset(skb); skb_pull_rcsum(skb, nh_ofs); - err = ovs_skb_network_trim(skb); - if (err) + err = nf_ct_skb_network_trim(skb, info->family); + if (err) { + kfree_skb(skb); return err; + } if (key->ip.frag != OVS_FRAG_TYPE_NONE) { - err = handle_fragments(net, key, info->zone.id, skb); + err = ovs_ct_handle_fragments(net, key, info->zone.id, + info->family, skb); if (err) return err; } @@ -1503,7 +1326,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, #endif case OVS_CT_ATTR_HELPER: *helper = nla_data(a); - if (!memchr(*helper, '\0', nla_len(a))) { + if (!string_is_terminated(*helper, nla_len(a))) { OVS_NLERR(log, "Invalid conntrack helper"); return -EINVAL; } @@ -1524,7 +1347,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT case OVS_CT_ATTR_TIMEOUT: memcpy(info->timeout, nla_data(a), nla_len(a)); - if (!memchr(info->timeout, '\0', nla_len(a))) { + if (!string_is_terminated(info->timeout, nla_len(a))) { OVS_NLERR(log, "Invalid conntrack timeout"); return -EINVAL; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 861dfb8daf4a..fcee6012293b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -209,6 +209,26 @@ static struct vport *new_vport(const struct vport_parms *parms) return vport; } +static void ovs_vport_update_upcall_stats(struct sk_buff *skb, + const struct dp_upcall_info *upcall_info, + bool upcall_result) +{ + struct vport *p = OVS_CB(skb)->input_vport; + struct vport_upcall_stats_percpu *stats; + + if (upcall_info->cmd != OVS_PACKET_CMD_MISS && + upcall_info->cmd != OVS_PACKET_CMD_ACTION) + return; + + stats = this_cpu_ptr(p->upcall_stats); + u64_stats_update_begin(&stats->syncp); + if (upcall_result) + u64_stats_inc(&stats->n_success); + else + u64_stats_inc(&stats->n_fail); + u64_stats_update_end(&stats->syncp); +} + void ovs_dp_detach_port(struct vport *p) { ASSERT_OVSL(); @@ -216,6 +236,9 @@ void ovs_dp_detach_port(struct vport *p) /* First drop references to device. */ hlist_del_rcu(&p->dp_hash_node); + /* Free percpu memory */ + free_percpu(p->upcall_stats); + /* Then destroy it. */ ovs_vport_del(p); } @@ -305,6 +328,8 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); + + ovs_vport_update_upcall_stats(skb, upcall_info, !err); if (err) goto err; @@ -948,6 +973,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; + struct sw_flow_key *key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -975,30 +1001,32 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &new_flow->key, false, &mask); + key = kzalloc(sizeof(*key), GFP_KERNEL); + if (!key) { + error = -ENOMEM; + goto err_kfree_flow; + } + + ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) - goto err_kfree_flow; + goto err_kfree_key; + + ovs_flow_mask_key(&new_flow->key, key, true, &mask); /* Extract flow identifier. 
*/ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &new_flow->key, log); + key, log); if (error) - goto err_kfree_flow; - - /* unmasked key is needed to match when ufid is not used. */ - if (ovs_identifier_is_key(&new_flow->id)) - match.key = new_flow->id.unmasked_key; - - ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); + goto err_kfree_key; /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); - goto err_kfree_flow; + goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, @@ -1019,7 +1047,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); + flow = ovs_flow_tbl_lookup(&dp->table, key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); @@ -1089,6 +1117,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); + + kfree(key); return 0; err_unlock_ovs: @@ -1096,6 +1126,8 @@ err_unlock_ovs: kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); +err_kfree_key: + kfree(key); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1826,6 +1858,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_destroy_portids; } + vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); + if (!vport->upcall_stats) { + err = -ENOMEM; + goto err_destroy_vport; + } + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_NEW); BUG_ON(err < 0); @@ -1838,6 +1876,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; +err_destroy_vport: + ovs_dp_detach_port(vport); err_destroy_portids: kfree(rcu_dereference_raw(dp->upcall_portids)); err_unlock_and_destroy_meters: @@ -2098,6 +2138,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, OVS_VPORT_ATTR_PAD)) goto nla_put_failure; + if (ovs_vport_get_upcall_stats(vport, skb)) + goto nla_put_failure; + if (ovs_vport_get_upcall_portids(vport, skb)) goto nla_put_failure; @@ -2279,6 +2322,12 @@ restart: goto exit_unlock_free; } + vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); + if (!vport->upcall_stats) { + err = -ENOMEM; + goto exit_unlock_free_vport; + } + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW, GFP_KERNEL); @@ -2296,6 +2345,8 @@ restart: ovs_notify(&dp_vport_genl_family, reply, info); return 0; +exit_unlock_free_vport: + ovs_dp_detach_port(vport); exit_unlock_free: ovs_unlock(); kfree_skb(reply); @@ -2508,6 +2559,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, + [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, }; static const struct genl_small_ops dp_vport_genl_ops[] = { diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index e20d1a973417..416976f70322 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -107,7 +107,8 @@ void 
ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, rcu_assign_pointer(flow->stats[cpu], new_stats); - cpumask_set_cpu(cpu, &flow->cpu_used_mask); + cpumask_set_cpu(cpu, + flow->cpu_used_mask); goto unlock; } } @@ -135,7 +136,8 @@ void ovs_flow_stats_get(const struct sw_flow *flow, memset(ovs_stats, 0, sizeof(*ovs_stats)); /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { + for (cpu = 0; cpu < nr_cpu_ids; + cpu = cpumask_next(cpu, flow->cpu_used_mask)) { struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { @@ -159,7 +161,8 @@ void ovs_flow_stats_clear(struct sw_flow *flow) int cpu; /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { + for (cpu = 0; cpu < nr_cpu_ids; + cpu = cpumask_next(cpu, flow->cpu_used_mask)) { struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 073ab73ffeaa..b5711aff6e76 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -229,7 +229,7 @@ struct sw_flow { */ struct sw_flow_key key; struct sw_flow_id id; - struct cpumask cpu_used_mask; + struct cpumask *cpu_used_mask; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; struct sw_flow_stats __rcu *stats[]; /* One for each CPU. First one diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 0a0e4c283f02..791504b7f42b 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -79,6 +79,7 @@ struct sw_flow *ovs_flow_alloc(void) return ERR_PTR(-ENOMEM); flow->stats_last_writer = -1; + flow->cpu_used_mask = (struct cpumask *)&flow->stats[nr_cpu_ids]; /* Initialize the default stat node. 
*/ stats = kmem_cache_alloc_node(flow_stats_cache, @@ -91,7 +92,7 @@ struct sw_flow *ovs_flow_alloc(void) RCU_INIT_POINTER(flow->stats[0], stats); - cpumask_set_cpu(0, &flow->cpu_used_mask); + cpumask_set_cpu(0, flow->cpu_used_mask); return flow; err: @@ -115,7 +116,7 @@ static void flow_free(struct sw_flow *flow) flow->sf_acts); /* We open code this to make sure cpu 0 is always considered */ for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { + cpu = cpumask_next(cpu, flow->cpu_used_mask)) { if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct sw_flow_stats __force *)flow->stats[cpu]); @@ -1196,7 +1197,8 @@ int ovs_flow_init(void) flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + (nr_cpu_ids - * sizeof(struct sw_flow_stats *)), + * sizeof(struct sw_flow_stats *)) + + cpumask_size(), 0, 0, NULL); if (flow_cache == NULL) return -ENOMEM; diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 6e38f68f88c2..f2698d2316df 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -449,7 +449,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) err = attach_meter(meter_tbl, meter); if (err) - goto exit_unlock; + goto exit_free_old_meter; ovs_unlock(); @@ -472,6 +472,8 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_free_old_meter: + ovs_meter_free(old_meter); exit_unlock: ovs_unlock(); nlmsg_free(reply); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 82a74f998966..7e0f5c45b512 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -285,6 +285,56 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) } /** + * ovs_vport_get_upcall_stats - retrieve upcall stats + * + * @vport: vport from which to retrieve the stats. + * @skb: sk_buff where upcall stats should be appended. + * + * Retrieves upcall stats for the given device. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) +{ + struct nlattr *nla; + int i; + + __u64 tx_success = 0; + __u64 tx_fail = 0; + + for_each_possible_cpu(i) { + const struct vport_upcall_stats_percpu *stats; + unsigned int start; + + stats = per_cpu_ptr(vport->upcall_stats, i); + do { + start = u64_stats_fetch_begin(&stats->syncp); + tx_success += u64_stats_read(&stats->n_success); + tx_fail += u64_stats_read(&stats->n_fail); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + } + + nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); + if (!nla) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success, + OVS_VPORT_ATTR_PAD)) { + nla_nest_cancel(skb, nla); + return -EMSGSIZE; + } + + if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail, + OVS_VPORT_ATTR_PAD)) { + nla_nest_cancel(skb, nla); + return -EMSGSIZE; + } + nla_nest_end(skb, nla); + + return 0; +} + +/** * ovs_vport_get_options - retrieve device options * * @vport: vport from which to retrieve the options. 
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 6ff45e8a0868..3e71ca8ad8a7 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -32,6 +32,8 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name); void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); +int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb); + int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); @@ -65,6 +67,7 @@ struct vport_portids { * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. + * @upcall_stats: Upcall stats of every ports. * @detach_list: list used for detaching vport in net-exit call. * @rcu: RCU callback head for deferred destruction. */ @@ -78,6 +81,7 @@ struct vport { struct hlist_node hash_node; struct hlist_node dp_hash_node; const struct vport_ops *ops; + struct vport_upcall_stats_percpu __percpu *upcall_stats; struct list_head detach_list; struct rcu_head rcu; @@ -137,6 +141,18 @@ struct vport_ops { struct list_head list; }; +/** + * struct vport_upcall_stats_percpu - per-cpu packet upcall statistics for + * a given vport. + * @n_success: Number of packets that upcall to userspace succeed. + * @n_fail: Number of packets that upcall to userspace failed. + */ +struct vport_upcall_stats_percpu { + struct u64_stats_sync syncp; + u64_stats_t n_success; + u64_stats_t n_fail; +}; + struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 41c4ccc3a5d6..d4e76e2ae153 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1335,8 +1335,6 @@ static void packet_sock_destruct(struct sock *sk) pr_err("Attempt to release alive packet socket: %p\n", sk); return; } - - sk_refcnt_debug_dec(sk); } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) @@ -1350,7 +1348,7 @@ static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) if (READ_ONCE(history[i]) == rxhash) count++; - victim = prandom_u32_max(ROLLOVER_HLEN); + victim = get_random_u32_below(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) @@ -1386,7 +1384,7 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { - return prandom_u32_max(num); + return get_random_u32_below(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, @@ -2296,6 +2294,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; + if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) + status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; @@ -3172,7 +3172,6 @@ static int packet_release(struct socket *sock) skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); - sk_refcnt_debug_release(sk); sock_put(sk); return 0; @@ -3362,7 +3361,6 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; - sk_refcnt_debug_inc(sk); /* * Attach a protocol block @@ -3522,6 +3520,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, else if (skb->pkt_type != PACKET_OUTGOING && 
skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; + if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) + aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c index 1f5df0432d37..7f68d8662cfb 100644 --- a/net/phonet/pep-gprs.c +++ b/net/phonet/pep-gprs.c @@ -19,6 +19,8 @@ #include <net/tcp_states.h> #include <net/phonet/gprs.h> +#include <trace/events/sock.h> + #define GPRS_DEFAULT_MTU 1400 struct gprs_dev { @@ -138,6 +140,8 @@ static void gprs_data_ready(struct sock *sk) struct gprs_dev *gp = sk->sk_user_data; struct sk_buff *skb; + trace_sk_data_ready(sk); + while ((skb = pep_read(sk)) != NULL) { skb_orphan(skb); gprs_recv(gp, skb); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 1990d496fcfc..722936f7dd98 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -12,6 +12,7 @@ #include "qrtr.h" +#include <trace/events/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/qrtr.h> @@ -83,7 +84,10 @@ static struct qrtr_node *node_get(unsigned int node_id) node->id = node_id; - radix_tree_insert(&nodes, node_id, node); + if (radix_tree_insert(&nodes, node_id, node)) { + kfree(node); + return NULL; + } return node; } @@ -752,6 +756,8 @@ static void qrtr_ns_worker(struct work_struct *work) static void qrtr_ns_data_ready(struct sock *sk) { + trace_sk_data_ready(sk); + queue_work(qrtr_ns.workqueue, &qrtr_ns.work); } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index cfbf0e129cba..e53b7f266bd7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -31,6 +31,7 @@ * */ #include <linux/kernel.h> +#include <linux/sched/clock.h> #include <linux/slab.h> #include <linux/pci.h> #include <linux/dma-mapping.h> diff --git a/net/rds/message.c b/net/rds/message.c index b47e4f0a1639..7af59d2443e5 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_lock_irqsave(&q->lock, flags); head = &q->zcookie_head; if (!list_empty(head)) { - info = list_entry(head, struct rds_msg_zcopy_info, - rs_zcookie_next); - if (info && rds_zcookie_add(info, cookie)) { + info = list_first_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (rds_zcookie_add(info, cookie)) { spin_unlock_irqrestore(&q->lock, flags); kfree(rds_info_from_znotifier(znotif)); /* caller invokes rds_wake_sk_sleep() */ @@ -118,7 +118,7 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, ck = &info->zcookies; memset(ck, 0, sizeof(*ck)); WARN_ON(!rds_zcookie_add(info, cookie)); - list_add_tail(&q->zcookie_head, &info->rs_zcookie_next); + list_add_tail(&info->rs_zcookie_next, &q->zcookie_head); spin_unlock_irqrestore(&q->lock, flags); /* caller invokes rds_wake_sk_sleep() */ diff --git a/net/rds/recv.c b/net/rds/recv.c index 5b426dc3634d..c71b923764fd 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -35,6 +35,7 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/export.h> +#include <linux/sched/clock.h> #include <linux/time.h> #include <linux/rds.h> diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 7edf2e69d3fe..014fa24418c1 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -34,6 +34,7 @@ #include <linux/gfp.h> #include <linux/in.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include "rds.h" #include "tcp.h" @@ -234,6 +235,7 @@ void rds_tcp_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); + trace_sk_data_ready(sk); rdsdebug("listen data ready sk %p\n", sk); 
read_lock_bh(&sk->sk_callback_lock); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index f4ee13da90c7..c00f04a1a534 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -33,6 +33,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include "rds.h" #include "tcp.h" @@ -309,6 +310,7 @@ void rds_tcp_data_ready(struct sock *sk) struct rds_conn_path *cp; struct rds_tcp_connection *tc; + trace_sk_data_ready(sk); rdsdebug("data ready sk %p\n", sk); read_lock_bh(&sk->sk_callback_lock); diff --git a/net/rfkill/core.c b/net/rfkill/core.c index dac4fdc7488a..01fca7a10b4b 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -685,7 +685,7 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%s\n", rfkill->name); + return sysfs_emit(buf, "%s\n", rfkill->name); } static DEVICE_ATTR_RO(name); @@ -694,7 +694,7 @@ static ssize_t type_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%s\n", rfkill_types[rfkill->type]); + return sysfs_emit(buf, "%s\n", rfkill_types[rfkill->type]); } static DEVICE_ATTR_RO(type); @@ -703,7 +703,7 @@ static ssize_t index_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%d\n", rfkill->idx); + return sysfs_emit(buf, "%d\n", rfkill->idx); } static DEVICE_ATTR_RO(index); @@ -712,7 +712,7 @@ static ssize_t persistent_show(struct device *dev, { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%d\n", rfkill->persistent); + return sysfs_emit(buf, "%d\n", rfkill->persistent); } static DEVICE_ATTR_RO(persistent); @@ -721,7 +721,7 @@ static ssize_t hard_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_HW) ? 1 : 0 ); + return sysfs_emit(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_HW) ? 1 : 0); } static DEVICE_ATTR_RO(hard); @@ -730,7 +730,7 @@ static ssize_t soft_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 1 : 0 ); + return sysfs_emit(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 
1 : 0); } static ssize_t soft_store(struct device *dev, struct device_attribute *attr, @@ -764,7 +764,7 @@ static ssize_t hard_block_reasons_show(struct device *dev, { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "0x%lx\n", rfkill->hard_block_reasons); + return sysfs_emit(buf, "0x%lx\n", rfkill->hard_block_reasons); } static DEVICE_ATTR_RO(hard_block_reasons); @@ -783,7 +783,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); - return sprintf(buf, "%d\n", user_state_from_blocked(rfkill->state)); + return sysfs_emit(buf, "%d\n", user_state_from_blocked(rfkill->state)); } static ssize_t state_store(struct device *dev, struct device_attribute *attr, @@ -832,7 +832,7 @@ static void rfkill_release(struct device *dev) kfree(rfkill); } -static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) +static int rfkill_dev_uevent(const struct device *dev, struct kobj_uevent_env *env) { struct rfkill *rfkill = to_rfkill(dev); unsigned long flags; diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index f5afc9bcdee6..786dbfdad772 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -75,6 +75,8 @@ static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill; struct gpio_desc *gpio; + const char *name_property; + const char *type_property; const char *type_name; int ret; @@ -82,8 +84,15 @@ static int rfkill_gpio_probe(struct platform_device *pdev) if (!rfkill) return -ENOMEM; - device_property_read_string(&pdev->dev, "name", &rfkill->name); - device_property_read_string(&pdev->dev, "type", &type_name); + if (dev_of_node(&pdev->dev)) { + name_property = "label"; + type_property = "radio-type"; + } else { + name_property = "name"; + type_property = "type"; + } + device_property_read_string(&pdev->dev, name_property, &rfkill->name); + device_property_read_string(&pdev->dev, type_property, &type_name); if (!rfkill->name) rfkill->name = dev_name(&pdev->dev); @@ -157,12 +166,19 @@ static const struct acpi_device_id rfkill_acpi_match[] = { MODULE_DEVICE_TABLE(acpi, rfkill_acpi_match); #endif +static const struct of_device_id rfkill_of_match[] __maybe_unused = { + { .compatible = "rfkill-gpio", }, + { }, +}; +MODULE_DEVICE_TABLE(of, rfkill_of_match); + static struct platform_driver rfkill_gpio_driver = { .probe = rfkill_gpio_probe, .remove = rfkill_gpio_remove, .driver = { .name = "rfkill_gpio", .acpi_match_table = ACPI_PTR(rfkill_acpi_match), + .of_match_table = of_match_ptr(rfkill_of_match), }, }; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 36fefc3957d7..ca2b17f32670 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -488,6 +488,12 @@ static int rose_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; + lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { struct rose_sock *rose = rose_sk(sk); @@ -497,8 +503,10 @@ static int rose_listen(struct socket *sock, int backlog) memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; + release_sock(sk); return 0; } + release_sock(sk); return -EOPNOTSUPP; } diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 7ae023b37a83..a20986806fea 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -36,6 +36,15 @@ config AF_RXRPC_INJECT_LOSS Say Y here to inject packet loss by discarding some received and some 
transmitted packets. +config AF_RXRPC_INJECT_RX_DELAY + bool "Inject delay into packet reception" + depends on SYSCTL + help + Say Y here to inject a delay into packet reception, allowing an + extended RTT time to be modelled. The delay can be configured using + /proc/sys/net/rxrpc/rxrpc_inject_rx_delay, setting a number of + milliseconds up to 0.5s (note that the granularity is actually in + jiffies). config AF_RXRPC_DEBUG bool "RxRPC dynamic debugging" diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index e76d3459d78e..ac5caf5a48e1 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -10,6 +10,7 @@ rxrpc-y := \ call_accept.o \ call_event.o \ call_object.o \ + call_state.o \ conn_client.o \ conn_event.o \ conn_object.o \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7ea576f6ba4b..102f5cbff91a 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -155,10 +155,10 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) if (service_id) { write_lock(&local->services_lock); - if (rcu_access_pointer(local->service)) + if (local->service) goto service_in_use; rx->local = local; - rcu_assign_pointer(local->service, rx); + local->service = rx; write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; @@ -328,7 +328,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, mutex_unlock(&call->user_mutex); } - rxrpc_put_peer(cp.peer, rxrpc_peer_put_discard_tmp); _leave(" = %p", call); return call; } @@ -374,13 +373,17 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * @sock: The socket the call is on * @call: The call to check * - * Allow a kernel service to find out whether a call is still alive - - * ie. whether it has completed. + * Allow a kernel service to find out whether a call is still alive - whether + * it has completed successfully and all received data has been consumed. */ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) { - return call->state != RXRPC_CALL_COMPLETE; + if (!rxrpc_call_is_complete(call)) + return true; + if (call->completion != RXRPC_CALL_SUCCEEDED) + return false; + return !skb_queue_empty(&call->recvmsg_queue); } EXPORT_SYMBOL(rxrpc_kernel_check_life); @@ -783,7 +786,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); - rwlock_init(&rx->recvmsg_lock); + spin_lock_init(&rx->recvmsg_lock); rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); @@ -872,9 +875,9 @@ static int rxrpc_release_sock(struct sock *sk) sk->sk_state = RXRPC_CLOSE; - if (rx->local && rcu_access_pointer(rx->local->service) == rx) { + if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); - rcu_assign_pointer(rx->local->service, NULL); + rx->local->service = NULL; write_unlock(&rx->local->services_lock); } @@ -957,16 +960,9 @@ static const struct net_proto_family rxrpc_family_ops = { static int __init af_rxrpc_init(void) { int ret = -1; - unsigned int tmp; BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof_field(struct sk_buff, cb)); - get_random_bytes(&tmp, sizeof(tmp)); - tmp &= 0x3fffffff; - if (tmp == 0) - tmp = 1; - idr_set_cursor(&rxrpc_client_conn_ids, tmp); - ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, @@ -1062,7 +1058,6 @@ static void __exit af_rxrpc_exit(void) * are released. 
*/ rcu_barrier(); - rxrpc_destroy_client_conn_ids(); destroy_workqueue(rxrpc_workqueue); rxrpc_exit_security(); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e7dccab7b741..9e19688b0e06 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -38,6 +38,7 @@ struct rxrpc_txbuf; enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ + RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; @@ -75,13 +76,7 @@ struct rxrpc_net { bool live; - bool kill_all_client_conns; atomic_t nr_client_conns; - spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ - struct mutex client_conn_discard_lock; /* Prevent multiple discarders */ - struct list_head idle_client_conns; - struct work_struct client_conn_reaper; - struct timer_list client_conn_reap_timer; struct hlist_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ @@ -154,7 +149,7 @@ struct rxrpc_sock { struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ - rwlock_t recvmsg_lock; /* Lock for recvmsg_q */ + spinlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* User ID -> call mapping */ @@ -202,6 +197,7 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { + struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ u16 offset; /* Offset of data */ u16 len; /* Length of data */ u8 flags; @@ -262,13 +258,11 @@ struct rxrpc_security { /* respond to a challenge */ int (*respond_to_challenge)(struct rxrpc_connection *, - struct sk_buff *, - u32 *); + struct sk_buff *); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, - struct sk_buff *, - u32 *); + struct sk_buff *); /* clear connection security */ void (*clear)(struct rxrpc_connection *); @@ -283,21 +277,36 @@ struct rxrpc_local { struct rcu_head rcu; atomic_t active_users; /* Number of users of the local endpoint */ refcount_t ref; /* Number of references to the structure */ - struct rxrpc_net *rxnet; /* The network ns in which this resides */ + struct net *net; /* The network namespace */ + struct rxrpc_net *rxnet; /* Our bits in the network namespace */ struct hlist_node link; struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; - struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ - struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ + struct completion io_thread_ready; /* Indication that the I/O thread started */ + struct rxrpc_sock *service; /* Service(s) listening on this endpoint */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + struct sk_buff_head rx_delay_queue; /* Delay injection queue */ +#endif struct sk_buff_head rx_queue; /* Received packets */ + struct list_head conn_attend_q; /* Conns requiring immediate attention */ struct list_head call_attend_q; /* Calls requiring immediate attention */ + struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ + bool kill_all_client_conns; + struct 
list_head idle_client_conns; + struct timer_list client_conn_reap_timer; + unsigned long client_conn_flags; +#define RXRPC_CLIENT_CONN_REAP_TIMER 0 /* The client conn reap timer expired */ + spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ bool dead; bool service_closed; /* Service socket closed */ + struct idr conn_ids; /* List of connection IDs */ + struct list_head new_client_calls; /* Newly created client calls need connection */ + spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ }; @@ -355,7 +364,6 @@ struct rxrpc_conn_proto { struct rxrpc_conn_parameters { struct rxrpc_local *local; /* Representation of local endpoint */ - struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ @@ -364,10 +372,21 @@ struct rxrpc_conn_parameters { }; /* + * Call completion condition (state == RXRPC_CALL_COMPLETE). + */ +enum rxrpc_call_completion { + RXRPC_CALL_SUCCEEDED, /* - Normal termination */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + NR__RXRPC_CALL_COMPLETIONS +}; + +/* * Bits in the connection flags. */ enum rxrpc_conn_flag { - RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ @@ -387,6 +406,7 @@ enum rxrpc_conn_flag { */ enum rxrpc_conn_event { RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */ + RXRPC_CONN_EV_ABORT_CALLS, /* Abort attached calls */ }; /* @@ -394,13 +414,13 @@ enum rxrpc_conn_event { */ enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ + RXRPC_CONN_CLIENT_UNSECURED, /* Client connection needs security init */ RXRPC_CONN_CLIENT, /* Client connection */ RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ - RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */ - RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */ + RXRPC_CONN_ABORTED, /* Conn aborted */ RXRPC_CONN__NR_STATES }; @@ -411,17 +431,16 @@ struct rxrpc_bundle { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ + const struct rxrpc_security *security; /* applied security module */ refcount_t ref; atomic_t active; /* Number of active users */ unsigned int debug_id; u32 security_level; /* Security level selected */ u16 service_id; /* Service ID for this connection */ bool try_upgrade; /* True if the bundle is attempting upgrade */ - bool alloc_conn; /* True if someone's getting a conn */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ - short alloc_error; /* Error from last conn allocation */ - spinlock_t channel_lock; + unsigned short alloc_error; /* Error from last conn allocation */ struct rb_node local_node; /* Node in local->client_conns */ 
struct list_head waiting_calls; /* Calls waiting for channels */ unsigned long avail_chans; /* Mask of available channels */ @@ -439,6 +458,7 @@ struct rxrpc_connection { struct rxrpc_peer *peer; /* Remote endpoint */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct key *key; /* Security details */ + struct list_head attend_link; /* Link in local->conn_attend_q */ refcount_t ref; atomic_t active; /* Active count for service conns */ @@ -448,7 +468,7 @@ struct rxrpc_connection { unsigned char act_chans; /* Mask of active channels */ struct rxrpc_channel { unsigned long final_ack_at; /* Time at which to issue final ACK */ - struct rxrpc_call __rcu *call; /* Active call */ + struct rxrpc_call *call; /* Active call */ unsigned int call_debug_id; /* call->debug_id */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ @@ -469,6 +489,7 @@ struct rxrpc_connection { struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ + struct mutex security_lock; /* Lock for security management */ const struct rxrpc_security *security; /* applied security module */ union { struct { @@ -482,7 +503,8 @@ struct rxrpc_connection { unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ enum rxrpc_conn_proto_state state; /* current state of connection */ - u32 abort_code; /* Abort code of connection abort */ + enum rxrpc_call_completion completion; /* Completion condition */ + s32 abort_code; /* Abort code of connection abort */ int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ @@ -526,7 +548,8 @@ enum rxrpc_call_flag { RXRPC_CALL_KERNEL, /* The call was made by the kernel */ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ - RXRPC_CALL_RX_IS_IDLE, /* Reception is idle - send an ACK */ + RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ + RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ }; /* @@ -557,18 +580,6 @@ enum rxrpc_call_state { }; /* - * Call completion condition (state == RXRPC_CALL_COMPLETE). - */ -enum rxrpc_call_completion { - RXRPC_CALL_SUCCEEDED, /* - Normal termination */ - RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ - RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ - RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ - RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ - NR__RXRPC_CALL_COMPLETIONS -}; - -/* * Call Tx congestion management modes. 
*/ enum rxrpc_congest_mode { @@ -586,6 +597,7 @@ enum rxrpc_congest_mode { struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ + struct rxrpc_bundle *bundle; /* Connection bundle to use */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_sock __rcu *socket; /* socket responsible */ @@ -608,7 +620,7 @@ struct rxrpc_call { struct work_struct destroyer; /* In-process-context destroyer */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ - struct list_head chan_wait_link; /* Link in conn->bundle->waiting_calls */ + struct list_head wait_link; /* Link in local->new_client_calls */ struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* Link in rx->acceptq */ struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ @@ -622,10 +634,13 @@ struct rxrpc_call { unsigned long flags; unsigned long events; spinlock_t notify_lock; /* Kernel notification lock */ - rwlock_t state_lock; /* lock for state transition */ - u32 abort_code; /* Local/remote abort code */ + unsigned int send_abort_why; /* Why the abort [enum rxrpc_abort_reason] */ + s32 send_abort; /* Abort code to be sent */ + short send_abort_err; /* Error to be associated with the abort */ + rxrpc_seq_t send_abort_seq; /* DATA packet that incurred the abort (or 0) */ + s32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ - enum rxrpc_call_state state; /* current state of call */ + enum rxrpc_call_state _state; /* Current state of call (needs barrier) */ enum rxrpc_call_completion completion; /* Call completion condition */ refcount_t ref; u8 security_ix; /* Security type */ @@ -675,9 +690,11 @@ struct rxrpc_call { /* Receive-phase ACK management (ACKs we send). 
*/ u8 ackr_reason; /* reason to ACK */ + u16 ackr_sack_base; /* Starting slot in SACK table ring */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ - atomic64_t ackr_window; /* Base (in LSW) and top (in MSW) of SACK window */ - atomic_t ackr_nr_unacked; /* Number of unacked packets */ + rxrpc_seq_t ackr_window; /* Base of SACK window */ + rxrpc_seq_t ackr_wtop; /* Base of SACK window */ + unsigned int ackr_nr_unacked; /* Number of unacked packets */ atomic_t ackr_nr_consumed; /* Number of packets needing hard ACK */ struct { #define RXRPC_SACK_SIZE 256 @@ -811,9 +828,11 @@ extern struct workqueue_struct *rxrpc_workqueue; */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); -bool rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_peer *, - struct rxrpc_connection *, struct sockaddr_rxrpc *, - struct sk_buff *); +bool rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_peer *peer, + struct rxrpc_connection *conn, + struct sockaddr_rxrpc *peer_srx, + struct sk_buff *skb); void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); @@ -833,7 +852,7 @@ void rxrpc_reduce_call_timer(struct rxrpc_call *call, unsigned long now, enum rxrpc_timer_trace why); -void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); +bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); /* * call_object.c @@ -850,6 +869,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct sockaddr_rxrpc *, struct rxrpc_call_params *, gfp_t, unsigned int); +void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); @@ -872,32 +892,88 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) } /* + * call_state.c + */ +bool rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error); +bool rxrpc_call_completed(struct rxrpc_call *call); +bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why); +void rxrpc_prefail_call(struct rxrpc_call *call, enum rxrpc_call_completion compl, + int error); + +static inline void rxrpc_set_call_state(struct rxrpc_call *call, + enum rxrpc_call_state state) +{ + /* Order write of completion info before write of ->state. */ + smp_store_release(&call->_state, state); + wake_up(&call->waitq); +} + +static inline enum rxrpc_call_state __rxrpc_call_state(const struct rxrpc_call *call) +{ + return call->_state; /* Only inside I/O thread */ +} + +static inline bool __rxrpc_call_is_complete(const struct rxrpc_call *call) +{ + return __rxrpc_call_state(call) == RXRPC_CALL_COMPLETE; +} + +static inline enum rxrpc_call_state rxrpc_call_state(const struct rxrpc_call *call) +{ + /* Order read ->state before read of completion info. 
*/ + return smp_load_acquire(&call->_state); +} + +static inline bool rxrpc_call_is_complete(const struct rxrpc_call *call) +{ + return rxrpc_call_state(call) == RXRPC_CALL_COMPLETE; +} + +static inline bool rxrpc_call_has_failed(const struct rxrpc_call *call) +{ + return rxrpc_call_is_complete(call) && call->completion != RXRPC_CALL_SUCCEEDED; +} + +/* * conn_client.c */ extern unsigned int rxrpc_reap_client_connections; extern unsigned long rxrpc_conn_idle_client_expiry; extern unsigned long rxrpc_conn_idle_client_fast_expiry; -extern struct idr rxrpc_client_conn_ids; -void rxrpc_destroy_client_conn_ids(void); +void rxrpc_purge_client_connections(struct rxrpc_local *local); struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); void rxrpc_put_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); -int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *, - struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, - gfp_t); +int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp); +void rxrpc_connect_client_calls(struct rxrpc_local *local); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); +void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); -void rxrpc_discard_expired_client_conns(struct work_struct *); -void rxrpc_destroy_all_client_connections(struct rxrpc_net *); +void rxrpc_discard_expired_client_conns(struct rxrpc_local *local); void rxrpc_clean_up_local_conns(struct rxrpc_local *); /* * conn_event.c */ +void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int channel); +int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, + s32 abort_code, int err, enum rxrpc_abort_reason why); void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); -int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); +bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); +void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb); + +static inline bool rxrpc_is_conn_aborted(const struct rxrpc_connection *conn) +{ + /* Order reading the abort info after the state check. 
*/ + return smp_load_acquire(&conn->state) == RXRPC_CONN_ABORTED; +} /* * conn_object.c @@ -905,6 +981,7 @@ int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); extern unsigned int rxrpc_connection_expiry; extern unsigned int rxrpc_closed_conn_expiry; +void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why); struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *, gfp_t); struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *, struct sockaddr_rxrpc *, @@ -960,12 +1037,19 @@ void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); void rxrpc_error_report(struct sock *); +bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, + s32 abort_code, int err); int rxrpc_io_thread(void *data); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { wake_up_process(local->io_thread); } +static inline bool rxrpc_protocol_error(struct sk_buff *skb, enum rxrpc_abort_reason why) +{ + return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EPROTO); +} + /* * insecure.c */ @@ -1029,6 +1113,9 @@ extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY +extern unsigned long rxrpc_inject_rx_delay; +#endif /* * net_ns.c @@ -1047,6 +1134,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *); +void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); @@ -1062,17 +1150,15 @@ void rxrpc_peer_keepalive_worker(struct work_struct *); */ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, const struct sockaddr_rxrpc *); -struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *, struct rxrpc_local *, - struct sockaddr_rxrpc *, gfp_t); +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, gfp_t gfp); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t, enum rxrpc_peer_trace); -void rxrpc_new_incoming_peer(struct rxrpc_sock *, struct rxrpc_local *, - struct rxrpc_peer *); +void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *, enum rxrpc_peer_trace); void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); -void rxrpc_put_peer_locked(struct rxrpc_peer *, enum rxrpc_peer_trace); /* * proc.c @@ -1086,33 +1172,22 @@ extern const struct seq_operations rxrpc_local_seq_ops; * recvmsg.c */ void rxrpc_notify_socket(struct rxrpc_call *); -bool __rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); -bool rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); -bool __rxrpc_call_completed(struct rxrpc_call *); -bool rxrpc_call_completed(struct rxrpc_call *); -bool __rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); -bool 
rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* * Abort a call due to a protocol error. */ -static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, - struct sk_buff *skb, - const char *eproto_why, - const char *why, - u32 abort_code) +static inline int rxrpc_abort_eproto(struct rxrpc_call *call, + struct sk_buff *skb, + s32 abort_code, + enum rxrpc_abort_reason why) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why); - return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO); + rxrpc_abort_call(call, sp->hdr.seq, abort_code, -EPROTO, why); + return -EPROTO; } -#define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \ - __rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \ - (abort_why), (abort_code)) - /* * rtt.c */ @@ -1144,6 +1219,8 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *, /* * sendmsg.c */ +bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, + enum rxrpc_abort_reason why); int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index d1850863507f..0f5a1d77b890 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -99,7 +99,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); - call->state = RXRPC_CALL_SERVER_PREALLOC; + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_PREALLOC); __set_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), @@ -195,7 +195,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_peer *peer = b->peer_backlog[tail]; - rxrpc_put_local(peer->local, rxrpc_local_put_prealloc_conn); + rxrpc_put_local(peer->local, rxrpc_local_put_prealloc_peer); kfree(peer); tail = (tail + 1) & (size - 1); } @@ -280,7 +280,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, (peer_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); - rxrpc_new_incoming_peer(rx, local, peer); + rxrpc_new_incoming_peer(local, peer); } /* Now allocate and set up the connection */ @@ -339,18 +339,17 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, _enter(""); - /* Don't set up a call for anything other than the first DATA packet. */ - if (sp->hdr.seq != 1 || - sp->hdr.type != RXRPC_PACKET_TYPE_DATA) - return true; /* Just discard */ + /* Don't set up a call for anything other than a DATA packet. */ + if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); - rcu_read_lock(); + read_lock(&local->services_lock); /* Weed out packets to services we're not offering. Packets that would * begin a call are explicitly rejected and the rest are just * discarded. 
*/ - rx = rcu_dereference(local->service); + rx = local->service; if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && sp->hdr.serviceId != rx->second_service) ) { @@ -363,16 +362,14 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, if (!conn) { sec = rxrpc_get_incoming_security(rx, skb); if (!sec) - goto reject; + goto unsupported_security; } spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { - trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber, - sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; + rxrpc_direct_abort(skb, rxrpc_abort_shut_down, + RX_INVALID_OPERATION, -ESHUTDOWN); goto no_call; } @@ -402,7 +399,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); - rcu_read_unlock(); + read_unlock(&local->services_lock); if (hlist_unhashed(&call->error_link)) { spin_lock(&call->peer->lock); @@ -416,18 +413,20 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, return true; unsupported_service: - trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EOPNOTSUPP); - skb->priority = RX_INVALID_OPERATION; - goto reject; + read_unlock(&local->services_lock); + return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, + RX_INVALID_OPERATION, -EOPNOTSUPP); +unsupported_security: + read_unlock(&local->services_lock); + return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, + RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); -reject: - rcu_read_unlock(); + read_unlock(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: - rcu_read_unlock(); + read_unlock(&local->services_lock); return true; } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index b2cf448fb02c..e363f21a2014 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -251,6 +251,41 @@ out: _leave(""); } +/* + * Start transmitting the reply to a service. This cancels the need to ACK the + * request if we haven't yet done so. + */ +static void rxrpc_begin_service_reply(struct rxrpc_call *call) +{ + unsigned long now = jiffies; + + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SEND_REPLY); + WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); + if (call->ackr_reason == RXRPC_ACK_DELAY) + call->ackr_reason = 0; + trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); +} + +/* + * Close the transmission phase. After this point there is no more data to be + * transmitted in the call. 
+ */ +static void rxrpc_close_tx_phase(struct rxrpc_call *call) +{ + _debug("________awaiting reply/ACK__________"); + + switch (__rxrpc_call_state(call)) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_REPLY); + break; + case RXRPC_CALL_SERVER_SEND_REPLY: + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_AWAIT_ACK); + break; + default: + break; + } +} + static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) { unsigned int winsize = min_t(unsigned int, call->tx_winsize, @@ -270,9 +305,11 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) { struct rxrpc_txbuf *txb; - if (rxrpc_is_client_call(call) && - !test_bit(RXRPC_CALL_EXPOSED, &call->flags)) + if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + if (list_empty(&call->tx_sendmsg)) + return; rxrpc_expose_client_call(call); + } while ((txb = list_first_entry_or_null(&call->tx_sendmsg, struct rxrpc_txbuf, call_link))) { @@ -283,6 +320,9 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) call->tx_top = txb->seq; list_add_tail(&txb->call_link, &call->tx_buffer); + if (txb->wire.flags & RXRPC_LAST_PACKET) + rxrpc_close_tx_phase(call); + rxrpc_transmit_one(call, txb); if (!rxrpc_tx_window_has_space(call)) @@ -292,16 +332,15 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) static void rxrpc_transmit_some_data(struct rxrpc_call *call) { - switch (call->state) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: if (list_empty(&call->tx_sendmsg)) return; + rxrpc_begin_service_reply(call); fallthrough; case RXRPC_CALL_SERVER_SEND_REPLY: - case RXRPC_CALL_SERVER_AWAIT_ACK: case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_CLIENT_AWAIT_REPLY: if (!rxrpc_tx_window_has_space(call)) return; if (list_empty(&call->tx_sendmsg)) { @@ -331,20 +370,30 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) /* * Handle retransmission and deferred ACK/abort generation. */ -void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) +bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) { unsigned long now, next, t; rxrpc_serial_t ackr_serial; bool resend = false, expired = false; + s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); //printk("\n--------------------\n"); _enter("{%d,%s,%lx}", - call->debug_id, rxrpc_call_states[call->state], call->events); + call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)], + call->events); + + if (__rxrpc_call_is_complete(call)) + goto out; - if (call->state == RXRPC_CALL_COMPLETE) + /* Handle abort request locklessly, vs rxrpc_propose_abort(). 
*/ + abort_code = smp_load_acquire(&call->send_abort); + if (abort_code) { + rxrpc_abort_call(call, 0, call->send_abort, call->send_abort_err, + call->send_abort_why); goto out; + } if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) goto out; @@ -358,7 +407,7 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } t = READ_ONCE(call->expect_req_by); - if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST && + if (__rxrpc_call_state(call) == RXRPC_CALL_SERVER_RECV_REQUEST && time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now); expired = true; @@ -429,11 +478,12 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && (int)call->conn->hi_serial - (int)call->rx_serial > 0) { trace_rxrpc_call_reset(call); - rxrpc_abort_call("EXP", call, 0, RX_CALL_DEAD, -ECONNRESET); + rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ECONNRESET, + rxrpc_abort_call_reset); } else { - rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, -ETIME); + rxrpc_abort_call(call, 0, RX_CALL_TIMEOUT, -ETIME, + rxrpc_abort_call_timeout); } - rxrpc_send_abort_packet(call); goto out; } @@ -441,19 +491,28 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - if (resend && call->state != RXRPC_CALL_CLIENT_RECV_REPLY) + if (resend && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY) rxrpc_resend(call, NULL); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, rxrpc_propose_ack_rx_idle); - if (atomic_read(&call->ackr_nr_unacked) > 2) - rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, - rxrpc_propose_ack_input_data); + if (call->ackr_nr_unacked > 2) { + if (call->peer->rtt_count < 3) + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_rtt); + else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + ktime_get_real())) + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_old_rtt); + else + rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, + rxrpc_propose_ack_input_data); + } /* Make sure the timer is restarted */ - if (call->state != RXRPC_CALL_COMPLETE) { + if (!__rxrpc_call_is_complete(call)) { next = call->expect_rx_by; #define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } @@ -474,9 +533,15 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } out: - if (call->state == RXRPC_CALL_COMPLETE) + if (__rxrpc_call_is_complete(call)) { del_timer_sync(&call->timer); + if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) + rxrpc_disconnect_call(call); + if (call->security) + call->security->free_call_crypto(call); + } if (call->acks_hard_ack != call->tx_bottom) rxrpc_shrink_call_tx_buffer(call); _leave(""); + return true; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index be5eb8cdf549..e9f1f49d18c2 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -50,16 +50,18 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) struct rxrpc_local *local = call->local; bool busy; - if (call->state < RXRPC_CALL_COMPLETE) { + if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { spin_lock_bh(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); + if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) + busy = true; if (!busy) { - rxrpc_get_call(call, rxrpc_call_get_poke); list_add_tail(&call->attend_link, 
&local->call_attend_q); } spin_unlock_bh(&local->lock); - rxrpc_wake_up_io_thread(local); + if (!busy) + rxrpc_wake_up_io_thread(local); } } @@ -69,7 +71,7 @@ static void rxrpc_call_timer_expired(struct timer_list *t) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) { + if (!__rxrpc_call_is_complete(call)) { trace_rxrpc_timer_expired(call, jiffies); rxrpc_poke_call(call, rxrpc_call_poke_timer); } @@ -150,7 +152,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, timer_setup(&call->timer, rxrpc_call_timer_expired, 0); INIT_WORK(&call->destroyer, rxrpc_destroy_call); INIT_LIST_HEAD(&call->link); - INIT_LIST_HEAD(&call->chan_wait_link); + INIT_LIST_HEAD(&call->wait_link); INIT_LIST_HEAD(&call->accept_link); INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); @@ -162,13 +164,13 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); - rwlock_init(&call->state_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; - atomic64_set(&call->ackr_window, 0x100000001ULL); + call->ackr_window = 1; + call->ackr_wtop = 1; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); @@ -211,12 +213,12 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, now = ktime_get_real(); call->acks_latest_ts = now; call->cong_tstamp = now; - call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->dest_srx = *srx; call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; call->key = key_get(cp->key); call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); + call->security_level = cp->security_level; if (p->kernel) __set_bit(RXRPC_CALL_KERNEL, &call->flags); if (cp->upgrade) @@ -226,11 +228,13 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, ret = rxrpc_init_client_call_security(call); if (ret < 0) { - __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); + rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, ret); rxrpc_put_call(call, rxrpc_call_put_discard_error); return ERR_PTR(ret); } + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_CONN); + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), p->user_call_ID, rxrpc_call_new_client); @@ -241,7 +245,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, /* * Initiate the call ack/resend/expiry timer. */ -static void rxrpc_start_call_timer(struct rxrpc_call *call) +void rxrpc_start_call_timer(struct rxrpc_call *call) { unsigned long now = jiffies; unsigned long j = now + MAX_JIFFY_OFFSET; @@ -286,6 +290,39 @@ static void rxrpc_put_call_slot(struct rxrpc_call *call) } /* + * Start the process of connecting a call. We obtain a peer and a connection + * bundle, but the actual association of a call with a connection is offloaded + * to the I/O thread to simplify locking. 
+ */ +static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) +{ + struct rxrpc_local *local = call->local; + int ret = -ENOMEM; + + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); + + call->peer = rxrpc_lookup_peer(local, &call->dest_srx, gfp); + if (!call->peer) + goto error; + + ret = rxrpc_look_up_bundle(call, gfp); + if (ret < 0) + goto error; + + trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); + rxrpc_get_call(call, rxrpc_call_get_io_thread); + spin_lock(&local->client_call_lock); + list_add_tail(&call->wait_link, &local->new_client_calls); + spin_unlock(&local->client_call_lock); + rxrpc_wake_up_io_thread(local); + return 0; + +error: + __set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); + return ret; +} + +/* * Set up a call for the given parameters. * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. @@ -364,14 +401,10 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ - ret = rxrpc_connect_call(rx, call, cp, srx, gfp); + ret = rxrpc_connect_call(call, gfp); if (ret < 0) goto error_attached_to_socket; - rxrpc_see_call(call, rxrpc_call_see_connected); - - rxrpc_start_call_timer(call); - _leave(" = %p [new]", call); return call; @@ -383,27 +416,23 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, error_dup_user_ID: write_unlock(&rx->call_lock); release_sock(&rx->sk); - __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - RX_CALL_DEAD, -EEXIST); + rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EEXIST); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, rxrpc_call_see_userid_exists); - rxrpc_release_call(rx, call); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_userid_exists); _leave(" = -EEXIST"); return ERR_PTR(-EEXIST); /* We got an error, but the call is attached to the socket and is in - * need of release. However, we might now race with recvmsg() when - * completing the call queues it. Return 0 from sys_sendmsg() and + * need of release. However, we might now race with recvmsg() when it + * completion notifies the socket. Return 0 from sys_sendmsg() and * leave the error to recvmsg() to deal with. 
*/ error_attached_to_socket: trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), ret, rxrpc_call_see_connect_failed); - set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); - __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - RX_CALL_DEAD, ret); + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); _leave(" = c=%08x [err]", call->debug_id); return call; } @@ -426,32 +455,32 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->call_id = sp->hdr.callNumber; call->dest_srx.srx_service = sp->hdr.serviceId; call->cid = sp->hdr.cid; - call->state = RXRPC_CALL_SERVER_SECURING; call->cong_tstamp = skb->tstamp; + __set_bit(RXRPC_CALL_EXPOSED, &call->flags); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: - call->state = RXRPC_CALL_SERVER_SECURING; + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); break; case RXRPC_CONN_SERVICE: - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; - case RXRPC_CONN_REMOTELY_ABORTED: - __rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, - conn->abort_code, conn->error); - break; - case RXRPC_CONN_LOCALLY_ABORTED: - __rxrpc_abort_call("CON", call, 1, - conn->abort_code, conn->error); + case RXRPC_CONN_ABORTED: + rxrpc_set_call_completion(call, conn->completion, + conn->abort_code, conn->error); break; default: BUG(); } + rxrpc_get_call(call, rxrpc_call_get_io_thread); + /* Set the channel for this call. We don't get channel_lock as we're * only defending against the data_ready handler (which we're called * from) and the RESPONSE packet parser (which is only really @@ -461,7 +490,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, chan = sp->hdr.cid & RXRPC_CHANNELMASK; conn->channels[chan].call_counter = call->call_id; conn->channels[chan].call_id = call->call_id; - rcu_assign_pointer(conn->channels[chan].call, call); + conn->channels[chan].call = call; spin_unlock(&conn->state_lock); spin_lock(&conn->peer->lock); @@ -521,23 +550,20 @@ static void rxrpc_cleanup_ring(struct rxrpc_call *call) void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; - bool put = false; + bool put = false, putu = false; _enter("{%d,%d}", call->debug_id, refcount_read(&call->ref)); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), call->flags, rxrpc_call_see_release); - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); rxrpc_put_call_slot(call); - del_timer_sync(&call->timer); /* Make sure we don't get any more notifications */ - write_lock(&rx->recvmsg_lock); + spin_lock(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -550,7 +576,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; - write_unlock(&rx->recvmsg_lock); + spin_unlock(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); @@ -559,7 +585,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); - rxrpc_put_call(call, rxrpc_call_put_userid_exists); + putu = true; } list_del(&call->sock_link); @@ 
-567,10 +593,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); - if (conn && !test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) - rxrpc_disconnect_call(call); - if (call->security) - call->security->free_call_crypto(call); + if (putu) + rxrpc_put_call(call, rxrpc_call_put_userid); + _leave(""); } @@ -587,7 +612,8 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); list_del(&call->accept_link); - rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, -ECONNRESET); + rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, + rxrpc_abort_call_sock_release_tba); rxrpc_put_call(call, rxrpc_call_put_release_sock_tba); } @@ -595,8 +621,8 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); rxrpc_get_call(call, rxrpc_call_get_release_sock); - rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, -ECONNRESET); - rxrpc_send_abort_packet(call); + rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, + rxrpc_abort_call_sock_release); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put_release_sock); } @@ -619,7 +645,7 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) dead = __refcount_dec_and_test(&call->ref, &r); trace_rxrpc_call(debug_id, r - 1, 0, why); if (dead) { - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); + ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { spin_lock(&rxnet->call_lock); @@ -668,6 +694,8 @@ static void rxrpc_destroy_call(struct work_struct *work) rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); + rxrpc_deactivate_bundle(call->bundle); + rxrpc_put_bundle(call->bundle, rxrpc_bundle_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); rxrpc_put_local(call->local, rxrpc_local_put_call); call_rcu(&call->rcu, rxrpc_rcu_free_call); @@ -680,7 +708,7 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) { memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); + ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); del_timer(&call->timer); @@ -718,7 +746,7 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, refcount_read(&call->ref), - rxrpc_call_states[call->state], + rxrpc_call_states[__rxrpc_call_state(call)], call->flags, call->events); spin_unlock(&rxnet->call_lock); diff --git a/net/rxrpc/call_state.c b/net/rxrpc/call_state.c new file mode 100644 index 000000000000..6afb54373ebb --- /dev/null +++ b/net/rxrpc/call_state.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Call state changing functions. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include "ar-internal.h" + +/* + * Transition a call to the complete state. 
+ */ +bool rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error) +{ + if (__rxrpc_call_state(call) == RXRPC_CALL_COMPLETE) + return false; + + call->abort_code = abort_code; + call->error = error; + call->completion = compl; + /* Allow reader of completion state to operate locklessly */ + rxrpc_set_call_state(call, RXRPC_CALL_COMPLETE); + trace_rxrpc_call_complete(call); + wake_up(&call->waitq); + rxrpc_notify_socket(call); + return true; +} + +/* + * Record that a call successfully completed. + */ +bool rxrpc_call_completed(struct rxrpc_call *call) +{ + return rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); +} + +/* + * Record that a call is locally aborted. + */ +bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, + u32 abort_code, int error, enum rxrpc_abort_reason why) +{ + trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, + abort_code, error); + if (!rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error)) + return false; + if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) + rxrpc_send_abort_packet(call); + return true; +} + +/* + * Record that a call errored out before even getting off the ground, thereby + * setting the state to allow it to be destroyed. + */ +void rxrpc_prefail_call(struct rxrpc_call *call, enum rxrpc_call_completion compl, + int error) +{ + call->abort_code = RX_CALL_DEAD; + call->error = error; + call->completion = compl; + call->_state = RXRPC_CALL_COMPLETE; + trace_rxrpc_call_complete(call); + WARN_ON_ONCE(__test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)); +} diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index a08e33c9e54b..981ca5b98bcb 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -34,104 +34,59 @@ __read_mostly unsigned int rxrpc_reap_client_connections = 900; __read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; -/* - * We use machine-unique IDs for our client connections. - */ -DEFINE_IDR(rxrpc_client_conn_ids); -static DEFINE_SPINLOCK(rxrpc_conn_id_lock); - -static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); - -/* - * Get a connection ID and epoch for a client connection from the global pool. - * The connection struct pointer is then recorded in the idr radix tree. The - * epoch doesn't change until the client is rebooted (or, at least, unless the - * module is unloaded). - */ -static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, - gfp_t gfp) +static void rxrpc_activate_bundle(struct rxrpc_bundle *bundle) { - struct rxrpc_net *rxnet = conn->rxnet; - int id; - - _enter(""); - - idr_preload(gfp); - spin_lock(&rxrpc_conn_id_lock); - - id = idr_alloc_cyclic(&rxrpc_client_conn_ids, conn, - 1, 0x40000000, GFP_NOWAIT); - if (id < 0) - goto error; - - spin_unlock(&rxrpc_conn_id_lock); - idr_preload_end(); - - conn->proto.epoch = rxnet->epoch; - conn->proto.cid = id << RXRPC_CIDSHIFT; - set_bit(RXRPC_CONN_HAS_IDR, &conn->flags); - _leave(" [CID %x]", conn->proto.cid); - return 0; - -error: - spin_unlock(&rxrpc_conn_id_lock); - idr_preload_end(); - _leave(" = %d", id); - return id; + atomic_inc(&bundle->active); } /* - * Release a connection ID for a client connection from the global pool. + * Release a connection ID for a client connection. 
*/ -static void rxrpc_put_client_connection_id(struct rxrpc_connection *conn) +static void rxrpc_put_client_connection_id(struct rxrpc_local *local, + struct rxrpc_connection *conn) { - if (test_bit(RXRPC_CONN_HAS_IDR, &conn->flags)) { - spin_lock(&rxrpc_conn_id_lock); - idr_remove(&rxrpc_client_conn_ids, - conn->proto.cid >> RXRPC_CIDSHIFT); - spin_unlock(&rxrpc_conn_id_lock); - } + idr_remove(&local->conn_ids, conn->proto.cid >> RXRPC_CIDSHIFT); } /* * Destroy the client connection ID tree. */ -void rxrpc_destroy_client_conn_ids(void) +static void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) { struct rxrpc_connection *conn; int id; - if (!idr_is_empty(&rxrpc_client_conn_ids)) { - idr_for_each_entry(&rxrpc_client_conn_ids, conn, id) { + if (!idr_is_empty(&local->conn_ids)) { + idr_for_each_entry(&local->conn_ids, conn, id) { pr_err("AF_RXRPC: Leaked client conn %p {%d}\n", conn, refcount_read(&conn->ref)); } BUG(); } - idr_destroy(&rxrpc_client_conn_ids); + idr_destroy(&local->conn_ids); } /* * Allocate a connection bundle. */ -static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, +static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_bundle *bundle; bundle = kzalloc(sizeof(*bundle), gfp); if (bundle) { - bundle->local = cp->local; - bundle->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_bundle); - bundle->key = cp->key; - bundle->exclusive = cp->exclusive; - bundle->upgrade = cp->upgrade; - bundle->service_id = cp->service_id; - bundle->security_level = cp->security_level; + bundle->local = call->local; + bundle->peer = rxrpc_get_peer(call->peer, rxrpc_peer_get_bundle); + bundle->key = key_get(call->key); + bundle->security = call->security; + bundle->exclusive = test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); + bundle->upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); + bundle->service_id = call->dest_srx.srx_service; + bundle->security_level = call->security_level; refcount_set(&bundle->ref, 1); atomic_set(&bundle->active, 1); - spin_lock_init(&bundle->channel_lock); INIT_LIST_HEAD(&bundle->waiting_calls); trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_new); } @@ -152,84 +107,87 @@ static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_free); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); + key_put(bundle->key); kfree(bundle); } void rxrpc_put_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why) { - unsigned int id = bundle->debug_id; + unsigned int id; bool dead; int r; - dead = __refcount_dec_and_test(&bundle->ref, &r); - trace_rxrpc_bundle(id, r - 1, why); - if (dead) - rxrpc_free_bundle(bundle); + if (bundle) { + id = bundle->debug_id; + dead = __refcount_dec_and_test(&bundle->ref, &r); + trace_rxrpc_bundle(id, r - 1, why); + if (dead) + rxrpc_free_bundle(bundle); + } +} + +/* + * Get rid of outstanding client connection preallocations when a local + * endpoint is destroyed. + */ +void rxrpc_purge_client_connections(struct rxrpc_local *local) +{ + rxrpc_destroy_client_conn_ids(local); } /* * Allocate a client connection. 
*/ static struct rxrpc_connection * -rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) +rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle) { struct rxrpc_connection *conn; - struct rxrpc_net *rxnet = bundle->local->rxnet; - int ret; + struct rxrpc_local *local = bundle->local; + struct rxrpc_net *rxnet = local->rxnet; + int id; _enter(""); - conn = rxrpc_alloc_connection(rxnet, gfp); - if (!conn) { - _leave(" = -ENOMEM"); + conn = rxrpc_alloc_connection(rxnet, GFP_ATOMIC | __GFP_NOWARN); + if (!conn) return ERR_PTR(-ENOMEM); + + id = idr_alloc_cyclic(&local->conn_ids, conn, 1, 0x40000000, + GFP_ATOMIC | __GFP_NOWARN); + if (id < 0) { + kfree(conn); + return ERR_PTR(id); } refcount_set(&conn->ref, 1); - conn->bundle = bundle; - conn->local = bundle->local; - conn->peer = bundle->peer; - conn->key = bundle->key; + conn->proto.cid = id << RXRPC_CIDSHIFT; + conn->proto.epoch = local->rxnet->epoch; + conn->out_clientflag = RXRPC_CLIENT_INITIATED; + conn->bundle = rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_conn); + conn->local = rxrpc_get_local(bundle->local, rxrpc_local_get_client_conn); + conn->peer = rxrpc_get_peer(bundle->peer, rxrpc_peer_get_client_conn); + conn->key = key_get(bundle->key); + conn->security = bundle->security; conn->exclusive = bundle->exclusive; conn->upgrade = bundle->upgrade; conn->orig_service_id = bundle->service_id; conn->security_level = bundle->security_level; - conn->out_clientflag = RXRPC_CLIENT_INITIATED; - conn->state = RXRPC_CONN_CLIENT; + conn->state = RXRPC_CONN_CLIENT_UNSECURED; conn->service_id = conn->orig_service_id; - ret = rxrpc_get_client_connection_id(conn, gfp); - if (ret < 0) - goto error_0; - - ret = rxrpc_init_client_conn_security(conn); - if (ret < 0) - goto error_1; + if (conn->security == &rxrpc_no_security) + conn->state = RXRPC_CONN_CLIENT; atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_conn); - rxrpc_get_peer(conn->peer, rxrpc_peer_get_client_conn); - rxrpc_get_local(conn->local, rxrpc_local_get_client_conn); - key_get(conn->key); - - trace_rxrpc_conn(conn->debug_id, refcount_read(&conn->ref), - rxrpc_conn_new_client); + rxrpc_see_connection(conn, rxrpc_conn_new_client); atomic_inc(&rxnet->nr_client_conns); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); - _leave(" = %p", conn); return conn; - -error_1: - rxrpc_put_client_connection_id(conn); -error_0: - kfree(conn); - _leave(" = %d", ret); - return ERR_PTR(ret); } /* @@ -247,7 +205,8 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) goto dont_reuse; - if (conn->state != RXRPC_CONN_CLIENT || + if ((conn->state != RXRPC_CONN_CLIENT_UNSECURED && + conn->state != RXRPC_CONN_CLIENT) || conn->proto.epoch != rxnet->epoch) goto mark_dont_reuse; @@ -257,7 +216,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) * times the maximum number of client conns away from the current * allocation point to try and keep the IDs concentrated. */ - id_cursor = idr_get_cursor(&rxrpc_client_conn_ids); + id_cursor = idr_get_cursor(&conn->local->conn_ids); id = conn->proto.cid >> RXRPC_CIDSHIFT; distance = id - id_cursor; if (distance < 0) @@ -278,20 +237,23 @@ dont_reuse: * Look up the conn bundle that matches the connection parameters, adding it if * it doesn't yet exist. 
*/ -static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *cp, - gfp_t gfp) +int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) { static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle, *candidate; - struct rxrpc_local *local = cp->local; + struct rxrpc_local *local = call->local; struct rb_node *p, **pp, *parent; long diff; + bool upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); _enter("{%px,%x,%u,%u}", - cp->peer, key_serial(cp->key), cp->security_level, cp->upgrade); + call->peer, key_serial(call->key), call->security_level, + upgrade); - if (cp->exclusive) - return rxrpc_alloc_bundle(cp, gfp); + if (test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags)) { + call->bundle = rxrpc_alloc_bundle(call, gfp); + return call->bundle ? 0 : -ENOMEM; + } /* First, see if the bundle is already there. */ _debug("search 1"); @@ -300,11 +262,11 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c while (p) { bundle = rb_entry(p, struct rxrpc_bundle, local_node); -#define cmp(X) ((long)bundle->X - (long)cp->X) - diff = (cmp(peer) ?: - cmp(key) ?: - cmp(security_level) ?: - cmp(upgrade)); +#define cmp(X, Y) ((long)(X) - (long)(Y)) + diff = (cmp(bundle->peer, call->peer) ?: + cmp(bundle->key, call->key) ?: + cmp(bundle->security_level, call->security_level) ?: + cmp(bundle->upgrade, upgrade)); #undef cmp if (diff < 0) p = p->rb_left; @@ -317,9 +279,9 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c _debug("not found"); /* It wasn't. We need to add one. */ - candidate = rxrpc_alloc_bundle(cp, gfp); + candidate = rxrpc_alloc_bundle(call, gfp); if (!candidate) - return NULL; + return -ENOMEM; _debug("search 2"); spin_lock(&local->client_bundles_lock); @@ -329,11 +291,11 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c parent = *pp; bundle = rb_entry(parent, struct rxrpc_bundle, local_node); -#define cmp(X) ((long)bundle->X - (long)cp->X) - diff = (cmp(peer) ?: - cmp(key) ?: - cmp(security_level) ?: - cmp(upgrade)); +#define cmp(X, Y) ((long)(X) - (long)(Y)) + diff = (cmp(bundle->peer, call->peer) ?: + cmp(bundle->key, call->key) ?: + cmp(bundle->security_level, call->security_level) ?: + cmp(bundle->upgrade, upgrade)); #undef cmp if (diff < 0) pp = &(*pp)->rb_left; @@ -347,178 +309,89 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id); rb_link_node(&candidate->local_node, parent, pp); rb_insert_color(&candidate->local_node, &local->client_bundles); - rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); + call->bundle = rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); spin_unlock(&local->client_bundles_lock); - _leave(" = %u [new]", candidate->debug_id); - return candidate; + _leave(" = B=%u [new]", call->bundle->debug_id); + return 0; found_bundle_free: rxrpc_free_bundle(candidate); found_bundle: - rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_call); - atomic_inc(&bundle->active); + call->bundle = rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_call); + rxrpc_activate_bundle(bundle); spin_unlock(&local->client_bundles_lock); - _leave(" = %u [found]", bundle->debug_id); - return bundle; -} - -/* - * Create or find a client bundle to use for a call. - * - * If we return with a connection, the call will be on its waiting list. It's - * left to the caller to assign a channel and wake up the call. 
- */ -static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) -{ - struct rxrpc_bundle *bundle; - - _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - - cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp); - if (!cp->peer) - goto error; - - call->tx_last_sent = ktime_get_real(); - call->cong_ssthresh = cp->peer->cong_ssthresh; - if (call->cong_cwnd >= call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - else - call->cong_mode = RXRPC_CALL_SLOW_START; - if (cp->upgrade) - __set_bit(RXRPC_CALL_UPGRADE, &call->flags); - - /* Find the client connection bundle. */ - bundle = rxrpc_look_up_bundle(cp, gfp); - if (!bundle) - goto error; - - /* Get this call queued. Someone else may activate it whilst we're - * lining up a new connection, but that's fine. - */ - spin_lock(&bundle->channel_lock); - list_add_tail(&call->chan_wait_link, &bundle->waiting_calls); - spin_unlock(&bundle->channel_lock); - - _leave(" = [B=%x]", bundle->debug_id); - return bundle; - -error: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + _leave(" = B=%u [found]", call->bundle->debug_id); + return 0; } /* * Allocate a new connection and add it into a bundle. */ -static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) - __releases(bundle->channel_lock) +static bool rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, + unsigned int slot) { - struct rxrpc_connection *candidate = NULL, *old = NULL; - bool conflict; - int i; - - _enter(""); - - conflict = bundle->alloc_conn; - if (!conflict) - bundle->alloc_conn = true; - spin_unlock(&bundle->channel_lock); - if (conflict) { - _leave(" [conf]"); - return; - } - - candidate = rxrpc_alloc_client_connection(bundle, gfp); - - spin_lock(&bundle->channel_lock); - bundle->alloc_conn = false; - - if (IS_ERR(candidate)) { - bundle->alloc_error = PTR_ERR(candidate); - spin_unlock(&bundle->channel_lock); - _leave(" [err %ld]", PTR_ERR(candidate)); - return; - } - - bundle->alloc_error = 0; - - for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) { - unsigned int shift = i * RXRPC_MAXCALLS; - int j; - - old = bundle->conns[i]; - if (!rxrpc_may_reuse_conn(old)) { - if (old) - trace_rxrpc_client(old, -1, rxrpc_client_replace); - candidate->bundle_shift = shift; - atomic_inc(&bundle->active); - bundle->conns[i] = candidate; - for (j = 0; j < RXRPC_MAXCALLS; j++) - set_bit(shift + j, &bundle->avail_chans); - candidate = NULL; - break; - } + struct rxrpc_connection *conn, *old; + unsigned int shift = slot * RXRPC_MAXCALLS; + unsigned int i; - old = NULL; + old = bundle->conns[slot]; + if (old) { + bundle->conns[slot] = NULL; + trace_rxrpc_client(old, -1, rxrpc_client_replace); + rxrpc_put_connection(old, rxrpc_conn_put_noreuse); } - spin_unlock(&bundle->channel_lock); - - if (candidate) { - _debug("discard C=%x", candidate->debug_id); - trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); - rxrpc_put_connection(candidate, rxrpc_conn_put_discard); + conn = rxrpc_alloc_client_connection(bundle); + if (IS_ERR(conn)) { + bundle->alloc_error = PTR_ERR(conn); + return false; } - rxrpc_put_connection(old, rxrpc_conn_put_noreuse); - _leave(""); + rxrpc_activate_bundle(bundle); + conn->bundle_shift = shift; + bundle->conns[slot] = conn; + for (i = 0; i < RXRPC_MAXCALLS; i++) + set_bit(shift + i, &bundle->avail_chans); + return true; } /* * Add a connection to a bundle if there are no usable connections or we have * connections 
waiting for extra capacity. */ -static void rxrpc_maybe_add_conn(struct rxrpc_bundle *bundle, gfp_t gfp) +static bool rxrpc_bundle_has_space(struct rxrpc_bundle *bundle) { - struct rxrpc_call *call; - int i, usable; + int slot = -1, i, usable; _enter(""); - spin_lock(&bundle->channel_lock); + bundle->alloc_error = 0; /* See if there are any usable connections. */ usable = 0; - for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) + for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) { if (rxrpc_may_reuse_conn(bundle->conns[i])) usable++; - - if (!usable && !list_empty(&bundle->waiting_calls)) { - call = list_first_entry(&bundle->waiting_calls, - struct rxrpc_call, chan_wait_link); - if (test_bit(RXRPC_CALL_UPGRADE, &call->flags)) - bundle->try_upgrade = true; + else if (slot == -1) + slot = i; } + if (!usable && bundle->upgrade) + bundle->try_upgrade = true; + if (!usable) goto alloc_conn; if (!bundle->avail_chans && !bundle->try_upgrade && - !list_empty(&bundle->waiting_calls) && usable < ARRAY_SIZE(bundle->conns)) goto alloc_conn; - spin_unlock(&bundle->channel_lock); _leave(""); - return; + return usable; alloc_conn: - return rxrpc_add_conn_to_bundle(bundle, gfp); + return slot >= 0 ? rxrpc_add_conn_to_bundle(bundle, slot) : false; } /* @@ -532,11 +405,13 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, struct rxrpc_channel *chan = &conn->channels[channel]; struct rxrpc_bundle *bundle = conn->bundle; struct rxrpc_call *call = list_entry(bundle->waiting_calls.next, - struct rxrpc_call, chan_wait_link); + struct rxrpc_call, wait_link); u32 call_id = chan->call_counter + 1; _enter("C=%x,%u", conn->debug_id, channel); + list_del_init(&call->wait_link); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); /* Cancel the final ACK on the previous call if it hasn't been sent yet @@ -546,70 +421,50 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, clear_bit(conn->bundle_shift + channel, &bundle->avail_chans); rxrpc_see_call(call, rxrpc_call_see_activate_client); - list_del_init(&call->chan_wait_link); - call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_activate_call); call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); call->cid = conn->proto.cid | channel; call->call_id = call_id; - call->security = conn->security; - call->security_ix = conn->security_ix; call->dest_srx.srx_service = conn->service_id; - - trace_rxrpc_connect_call(call); - - write_lock(&call->state_lock); - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; - write_unlock(&call->state_lock); - - /* Paired with the read barrier in rxrpc_connect_call(). This orders - * cid and epoch in the connection wrt to call_id without the need to - * take the channel_lock. - * - * We provisionally assign a callNumber at this point, but we don't - * confirm it until the call is about to be exposed. - * - * TODO: Pair with a barrier in the data_ready handler when that looks - * at the call ID through a connection channel. 
- */ - smp_wmb(); + call->cong_ssthresh = call->peer->cong_ssthresh; + if (call->cong_cwnd >= call->cong_ssthresh) + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + else + call->cong_mode = RXRPC_CALL_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; - rcu_assign_pointer(chan->call, call); + chan->call = call; + + rxrpc_see_call(call, rxrpc_call_see_connected); + trace_rxrpc_connect_call(call); + call->tx_last_sent = ktime_get_real(); + rxrpc_start_call_timer(call); + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_SEND_REQUEST); wake_up(&call->waitq); } /* * Remove a connection from the idle list if it's on it. */ -static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connection *conn) +static void rxrpc_unidle_conn(struct rxrpc_connection *conn) { - struct rxrpc_net *rxnet = bundle->local->rxnet; - bool drop_ref; - if (!list_empty(&conn->cache_link)) { - drop_ref = false; - spin_lock(&rxnet->client_conn_cache_lock); - if (!list_empty(&conn->cache_link)) { - list_del_init(&conn->cache_link); - drop_ref = true; - } - spin_unlock(&rxnet->client_conn_cache_lock); - if (drop_ref) - rxrpc_put_connection(conn, rxrpc_conn_put_unidle); + list_del_init(&conn->cache_link); + rxrpc_put_connection(conn, rxrpc_conn_put_unidle); } } /* - * Assign channels and callNumbers to waiting calls with channel_lock - * held by caller. + * Assign channels and callNumbers to waiting calls. */ -static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle) +static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) { struct rxrpc_connection *conn; unsigned long avail, mask; unsigned int channel, slot; + trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans); + if (bundle->try_upgrade) mask = 1; else @@ -629,7 +484,7 @@ static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle) if (bundle->try_upgrade) set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); - rxrpc_unidle_conn(bundle, conn); + rxrpc_unidle_conn(conn); channel &= (RXRPC_MAXCALLS - 1); conn->act_chans |= 1 << channel; @@ -638,132 +493,24 @@ static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle) } /* - * Assign channels and callNumbers to waiting calls. - */ -static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) -{ - _enter("B=%x", bundle->debug_id); - - trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans); - - if (!bundle->avail_chans) - return; - - spin_lock(&bundle->channel_lock); - rxrpc_activate_channels_locked(bundle); - spin_unlock(&bundle->channel_lock); - _leave(""); -} - -/* - * Wait for a callNumber and a channel to be granted to a call. 
- */ -static int rxrpc_wait_for_channel(struct rxrpc_bundle *bundle, - struct rxrpc_call *call, gfp_t gfp) -{ - DECLARE_WAITQUEUE(myself, current); - int ret = 0; - - _enter("%d", call->debug_id); - - if (!gfpflags_allow_blocking(gfp)) { - rxrpc_maybe_add_conn(bundle, gfp); - rxrpc_activate_channels(bundle); - ret = bundle->alloc_error ?: -EAGAIN; - goto out; - } - - add_wait_queue_exclusive(&call->waitq, &myself); - for (;;) { - rxrpc_maybe_add_conn(bundle, gfp); - rxrpc_activate_channels(bundle); - ret = bundle->alloc_error; - if (ret < 0) - break; - - switch (call->interruptibility) { - case RXRPC_INTERRUPTIBLE: - case RXRPC_PREINTERRUPTIBLE: - set_current_state(TASK_INTERRUPTIBLE); - break; - case RXRPC_UNINTERRUPTIBLE: - default: - set_current_state(TASK_UNINTERRUPTIBLE); - break; - } - if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_AWAIT_CONN) - break; - if ((call->interruptibility == RXRPC_INTERRUPTIBLE || - call->interruptibility == RXRPC_PREINTERRUPTIBLE) && - signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - schedule(); - } - remove_wait_queue(&call->waitq, &myself); - __set_current_state(TASK_RUNNING); - -out: - _leave(" = %d", ret); - return ret; -} - -/* - * find a connection for a call - * - called in process context with IRQs enabled + * Connect waiting channels (called from the I/O thread). */ -int rxrpc_connect_call(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +void rxrpc_connect_client_calls(struct rxrpc_local *local) { - struct rxrpc_bundle *bundle; - struct rxrpc_net *rxnet = cp->local->rxnet; - int ret = 0; - - _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - - rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper); + struct rxrpc_call *call; - bundle = rxrpc_prep_call(rx, call, cp, srx, gfp); - if (IS_ERR(bundle)) { - ret = PTR_ERR(bundle); - goto out; - } + while ((call = list_first_entry_or_null(&local->new_client_calls, + struct rxrpc_call, wait_link)) + ) { + struct rxrpc_bundle *bundle = call->bundle; - if (call->state == RXRPC_CALL_CLIENT_AWAIT_CONN) { - ret = rxrpc_wait_for_channel(bundle, call, gfp); - if (ret < 0) - goto wait_failed; - } + spin_lock(&local->client_call_lock); + list_move_tail(&call->wait_link, &bundle->waiting_calls); + spin_unlock(&local->client_call_lock); -granted_channel: - /* Paired with the write barrier in rxrpc_activate_one_channel(). */ - smp_rmb(); - -out_put_bundle: - rxrpc_deactivate_bundle(bundle); - rxrpc_put_bundle(bundle, rxrpc_bundle_get_client_call); -out: - _leave(" = %d", ret); - return ret; - -wait_failed: - spin_lock(&bundle->channel_lock); - list_del_init(&call->chan_wait_link); - spin_unlock(&bundle->channel_lock); - - if (call->state != RXRPC_CALL_CLIENT_AWAIT_CONN) { - ret = 0; - goto granted_channel; + if (rxrpc_bundle_has_space(bundle)) + rxrpc_activate_channels(bundle); } - - trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed); - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); - rxrpc_disconnect_client_call(bundle, call); - goto out_put_bundle; } /* @@ -796,14 +543,14 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) /* * Set the reap timer. 
*/ -static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet) +static void rxrpc_set_client_reap_timer(struct rxrpc_local *local) { - if (!rxnet->kill_all_client_conns) { + if (!local->kill_all_client_conns) { unsigned long now = jiffies; unsigned long reap_at = now + rxrpc_conn_idle_client_expiry; - if (rxnet->live) - timer_reduce(&rxnet->client_conn_reap_timer, reap_at); + if (local->rxnet->live) + timer_reduce(&local->client_conn_reap_timer, reap_at); } } @@ -814,16 +561,13 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call { struct rxrpc_connection *conn; struct rxrpc_channel *chan = NULL; - struct rxrpc_net *rxnet = bundle->local->rxnet; + struct rxrpc_local *local = bundle->local; unsigned int channel; bool may_reuse; u32 cid; _enter("c=%x", call->debug_id); - spin_lock(&bundle->channel_lock); - set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); - /* Calls that have never actually been assigned a channel can simply be * discarded. */ @@ -832,8 +576,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); - list_del_init(&call->chan_wait_link); - goto out; + list_del_init(&call->wait_link); + return; } cid = call->cid; @@ -841,10 +585,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call chan = &conn->channels[channel]; trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); - if (rcu_access_pointer(chan->call) != call) { - spin_unlock(&bundle->channel_lock); - BUG(); - } + if (WARN_ON(chan->call != call)) + return; may_reuse = rxrpc_may_reuse_conn(conn); @@ -865,16 +607,15 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call trace_rxrpc_client(conn, channel, rxrpc_client_to_active); bundle->try_upgrade = false; if (may_reuse) - rxrpc_activate_channels_locked(bundle); + rxrpc_activate_channels(bundle); } - } /* See if we can pass the channel directly to another call. */ if (may_reuse && !list_empty(&bundle->waiting_calls)) { trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); - goto out; + return; } /* Schedule the final ACK to be transmitted in a short while so that it @@ -892,7 +633,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call } /* Deactivate the channel. 
*/ - rcu_assign_pointer(chan->call, NULL); + chan->call = NULL; set_bit(conn->bundle_shift + channel, &conn->bundle->avail_chans); conn->act_chans &= ~(1 << channel); @@ -905,17 +646,10 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call conn->idle_timestamp = jiffies; rxrpc_get_connection(conn, rxrpc_conn_get_idle); - spin_lock(&rxnet->client_conn_cache_lock); - list_move_tail(&conn->cache_link, &rxnet->idle_client_conns); - spin_unlock(&rxnet->client_conn_cache_lock); + list_move_tail(&conn->cache_link, &local->idle_client_conns); - rxrpc_set_client_reap_timer(rxnet); + rxrpc_set_client_reap_timer(local); } - -out: - spin_unlock(&bundle->channel_lock); - _leave(""); - return; } /* @@ -925,7 +659,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) { struct rxrpc_bundle *bundle = conn->bundle; unsigned int bindex; - bool need_drop = false; int i; _enter("C=%x", conn->debug_id); @@ -933,18 +666,13 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, true); - spin_lock(&bundle->channel_lock); bindex = conn->bundle_shift / RXRPC_MAXCALLS; if (bundle->conns[bindex] == conn) { _debug("clear slot %u", bindex); bundle->conns[bindex] = NULL; for (i = 0; i < RXRPC_MAXCALLS; i++) clear_bit(conn->bundle_shift + i, &bundle->avail_chans); - need_drop = true; - } - spin_unlock(&bundle->channel_lock); - - if (need_drop) { + rxrpc_put_client_connection_id(bundle->local, conn); rxrpc_deactivate_bundle(bundle); rxrpc_put_connection(conn, rxrpc_conn_put_unbundle); } @@ -953,11 +681,15 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) /* * Drop the active count on a bundle. */ -static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) +void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) { - struct rxrpc_local *local = bundle->local; + struct rxrpc_local *local; bool need_put = false; + if (!bundle) + return; + + local = bundle->local; if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { if (!bundle->exclusive) { _debug("erase bundle"); @@ -984,7 +716,7 @@ void rxrpc_kill_client_conn(struct rxrpc_connection *conn) trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); atomic_dec(&rxnet->nr_client_conns); - rxrpc_put_client_connection_id(conn); + rxrpc_put_client_connection_id(local, conn); } /* @@ -994,42 +726,26 @@ void rxrpc_kill_client_conn(struct rxrpc_connection *conn) * This may be called from conn setup or from a work item so cannot be * considered non-reentrant. */ -void rxrpc_discard_expired_client_conns(struct work_struct *work) +void rxrpc_discard_expired_client_conns(struct rxrpc_local *local) { struct rxrpc_connection *conn; - struct rxrpc_net *rxnet = - container_of(work, struct rxrpc_net, client_conn_reaper); unsigned long expiry, conn_expires_at, now; unsigned int nr_conns; _enter(""); - if (list_empty(&rxnet->idle_client_conns)) { - _leave(" [empty]"); - return; - } - - /* Don't double up on the discarding */ - if (!mutex_trylock(&rxnet->client_conn_discard_lock)) { - _leave(" [already]"); - return; - } - /* We keep an estimate of what the number of conns ought to be after * we've discarded some so that we don't overdo the discarding. 
*/ - nr_conns = atomic_read(&rxnet->nr_client_conns); + nr_conns = atomic_read(&local->rxnet->nr_client_conns); next: - spin_lock(&rxnet->client_conn_cache_lock); - - if (list_empty(&rxnet->idle_client_conns)) - goto out; - - conn = list_entry(rxnet->idle_client_conns.next, - struct rxrpc_connection, cache_link); + conn = list_first_entry_or_null(&local->idle_client_conns, + struct rxrpc_connection, cache_link); + if (!conn) + return; - if (!rxnet->kill_all_client_conns) { + if (!local->kill_all_client_conns) { /* If the number of connections is over the reap limit, we * expedite discard by reducing the expiry timeout. We must, * however, have at least a short grace period to be able to do @@ -1052,8 +768,6 @@ next: trace_rxrpc_client(conn, -1, rxrpc_client_discard); list_del_init(&conn->cache_link); - spin_unlock(&rxnet->client_conn_cache_lock); - rxrpc_unbundle_conn(conn); /* Drop the ->cache_link ref */ rxrpc_put_connection(conn, rxrpc_conn_put_discard_idle); @@ -1070,31 +784,8 @@ not_yet_expired: * then things get messier. */ _debug("not yet"); - if (!rxnet->kill_all_client_conns) - timer_reduce(&rxnet->client_conn_reap_timer, conn_expires_at); - -out: - spin_unlock(&rxnet->client_conn_cache_lock); - mutex_unlock(&rxnet->client_conn_discard_lock); - _leave(""); -} - -/* - * Preemptively destroy all the client connection records rather than waiting - * for them to time out - */ -void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet) -{ - _enter(""); - - spin_lock(&rxnet->client_conn_cache_lock); - rxnet->kill_all_client_conns = true; - spin_unlock(&rxnet->client_conn_cache_lock); - - del_timer_sync(&rxnet->client_conn_reap_timer); - - if (!rxrpc_queue_work(&rxnet->client_conn_reaper)) - _debug("destroy: queue failed"); + if (!local->kill_all_client_conns) + timer_reduce(&local->client_conn_reap_timer, conn_expires_at); _leave(""); } @@ -1104,29 +795,19 @@ void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet) */ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) { - struct rxrpc_connection *conn, *tmp; - struct rxrpc_net *rxnet = local->rxnet; - LIST_HEAD(graveyard); + struct rxrpc_connection *conn; _enter(""); - spin_lock(&rxnet->client_conn_cache_lock); - - list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns, - cache_link) { - if (conn->local == local) { - atomic_dec(&conn->active); - trace_rxrpc_client(conn, -1, rxrpc_client_discard); - list_move(&conn->cache_link, &graveyard); - } - } + local->kill_all_client_conns = true; - spin_unlock(&rxnet->client_conn_cache_lock); + del_timer_sync(&local->client_conn_reap_timer); - while (!list_empty(&graveyard)) { - conn = list_entry(graveyard.next, - struct rxrpc_connection, cache_link); + while ((conn = list_first_entry_or_null(&local->idle_client_conns, + struct rxrpc_connection, cache_link))) { list_del_init(&conn->cache_link); + atomic_dec(&conn->active); + trace_rxrpc_client(conn, -1, rxrpc_client_discard); rxrpc_unbundle_conn(conn); rxrpc_put_connection(conn, rxrpc_conn_put_local_dead); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 480364bcbf85..95f4bc206b3d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -17,11 +17,65 @@ #include "ar-internal.h" /* + * Set the completion state on an aborted connection. 
+ */ +static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb, + s32 abort_code, int err, + enum rxrpc_call_completion compl) +{ + bool aborted = false; + + if (conn->state != RXRPC_CONN_ABORTED) { + spin_lock(&conn->state_lock); + if (conn->state != RXRPC_CONN_ABORTED) { + conn->abort_code = abort_code; + conn->error = err; + conn->completion = compl; + /* Order the abort info before the state change. */ + smp_store_release(&conn->state, RXRPC_CONN_ABORTED); + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events); + aborted = true; + } + spin_unlock(&conn->state_lock); + } + + return aborted; +} + +/* + * Mark a socket buffer to indicate that the connection it's on should be aborted. + */ +int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, + s32 abort_code, int err, enum rxrpc_abort_reason why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (rxrpc_set_conn_aborted(conn, skb, abort_code, err, + RXRPC_CALL_LOCALLY_ABORTED)) { + trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, + sp->hdr.seq, abort_code, err); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort); + } + return -EPROTO; +} + +/* + * Mark a connection as being remotely aborted. + */ +static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + RXRPC_CALL_REMOTELY_ABORTED); +} + +/* * Retransmit terminal ACK or ABORT of the previous call. */ -static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, - struct sk_buff *skb, - unsigned int channel) +void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, + struct sk_buff *skb, + unsigned int channel) { struct rxrpc_skb_priv *sp = skb ? rxrpc_skb(skb) : NULL; struct rxrpc_channel *chan; @@ -46,9 +100,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, /* If the last call got moved on whilst we were waiting to run, just * ignore this packet. */ - call_id = READ_ONCE(chan->last_call); - /* Sync with __rxrpc_disconnect_call() */ - smp_rmb(); + call_id = chan->last_call; if (skb && call_id != sp->hdr.callNumber) return; @@ -65,9 +117,12 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, iov[2].iov_base = &ack_info; iov[2].iov_len = sizeof(ack_info); + serial = atomic_inc_return(&conn->serial); + pkt.whdr.epoch = htonl(conn->proto.epoch); pkt.whdr.cid = htonl(conn->proto.cid | channel); pkt.whdr.callNumber = htonl(call_id); + pkt.whdr.serial = htonl(serial); pkt.whdr.seq = 0; pkt.whdr.type = chan->last_type; pkt.whdr.flags = conn->out_clientflag; @@ -104,31 +159,15 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, iov[0].iov_len += sizeof(pkt.ack); len += sizeof(pkt.ack) + 3 + sizeof(ack_info); ioc = 3; - break; - - default: - return; - } - - /* Resync with __rxrpc_disconnect_call() and check that the last call - * didn't get advanced whilst we were filling out the packets. 
- */ - smp_rmb(); - if (READ_ONCE(chan->last_call) != call_id) - return; - - serial = atomic_inc_return(&conn->serial); - pkt.whdr.serial = htonl(serial); - switch (chan->last_type) { - case RXRPC_PACKET_TYPE_ABORT: - break; - case RXRPC_PACKET_TYPE_ACK: trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), - pkt.ack.reason, 0); + pkt.ack.reason, 0, rxrpc_rx_window_size); break; + + default: + return; } ret = kernel_sendmsg(conn->local->socket, &msg, iov, ioc, len); @@ -146,131 +185,34 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, /* * pass a connection-level abort onto all calls on that connection */ -static void rxrpc_abort_calls(struct rxrpc_connection *conn, - enum rxrpc_call_completion compl, - rxrpc_serial_t serial) +static void rxrpc_abort_calls(struct rxrpc_connection *conn) { struct rxrpc_call *call; int i; _enter("{%d},%x", conn->debug_id, conn->abort_code); - spin_lock(&conn->bundle->channel_lock); - for (i = 0; i < RXRPC_MAXCALLS; i++) { - call = rcu_dereference_protected( - conn->channels[i].call, - lockdep_is_held(&conn->bundle->channel_lock)); - if (call) { - if (compl == RXRPC_CALL_LOCALLY_ABORTED) - trace_rxrpc_abort(call->debug_id, - "CON", call->cid, - call->call_id, 0, + call = conn->channels[i].call; + if (call) + rxrpc_set_call_completion(call, + conn->completion, conn->abort_code, conn->error); - else - trace_rxrpc_rx_abort(call, serial, - conn->abort_code); - rxrpc_set_call_completion(call, compl, - conn->abort_code, - conn->error); - } } - spin_unlock(&conn->bundle->channel_lock); _leave(""); } /* - * generate a connection-level abort - */ -static int rxrpc_abort_connection(struct rxrpc_connection *conn, - int error, u32 abort_code) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[2]; - __be32 word; - size_t len; - u32 serial; - int ret; - - _enter("%d,,%u,%u", conn->debug_id, error, abort_code); - - /* generate a connection-level abort */ - spin_lock(&conn->state_lock); - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - spin_unlock(&conn->state_lock); - _leave(" = 0 [already dead]"); - return 0; - } - - conn->error = error; - conn->abort_code = abort_code; - conn->state = RXRPC_CONN_LOCALLY_ABORTED; - set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - spin_unlock(&conn->state_lock); - - msg.msg_name = &conn->peer->srx.transport; - msg.msg_namelen = conn->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - whdr.epoch = htonl(conn->proto.epoch); - whdr.cid = htonl(conn->proto.cid); - whdr.callNumber = 0; - whdr.seq = 0; - whdr.type = RXRPC_PACKET_TYPE_ABORT; - whdr.flags = conn->out_clientflag; - whdr.userStatus = 0; - whdr.securityIndex = conn->security_ix; - whdr._rsvd = 0; - whdr.serviceId = htons(conn->service_id); - - word = htonl(conn->abort_code); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = &word; - iov[1].iov_len = sizeof(word); - - len = iov[0].iov_len + iov[1].iov_len; - - serial = atomic_inc_return(&conn->serial); - rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, serial); - whdr.serial = htonl(serial); - - ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); - if (ret < 0) { - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_point_conn_abort); - _debug("sendmsg failed: %d", ret); - return -EAGAIN; - } - - trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort); - - conn->peer->last_tx_at = ktime_get_seconds(); - - _leave(" = 0"); - 
return 0; -} - -/* * mark a call as being on a now-secured channel * - must be called with BH's disabled. */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { - _enter("%p", call); - if (call) { - write_lock(&call->state_lock); - if (call->state == RXRPC_CALL_SERVER_SECURING) { - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; - rxrpc_notify_socket(call); - } - write_unlock(&call->state_lock); + if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) { + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); + rxrpc_notify_socket(call); } } @@ -278,44 +220,22 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) * connection-level Rx packet processor */ static int rxrpc_process_event(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int loop, ret; + int ret; - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - _leave(" = -ECONNABORTED [%u]", conn->state); + if (conn->state == RXRPC_CONN_ABORTED) return -ECONNABORTED; - } _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_DATA: - case RXRPC_PACKET_TYPE_ACK: - rxrpc_conn_retransmit_call(conn, skb, - sp->hdr.cid & RXRPC_CHANNELMASK); - return 0; - - case RXRPC_PACKET_TYPE_BUSY: - /* Just ignore BUSY packets for now. */ - return 0; - - case RXRPC_PACKET_TYPE_ABORT: - conn->error = -ECONNABORTED; - conn->abort_code = skb->priority; - conn->state = RXRPC_CONN_REMOTELY_ABORTED; - set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); - return -ECONNABORTED; - case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb, - _abort_code); + return conn->security->respond_to_challenge(conn, skb); case RXRPC_PACKET_TYPE_RESPONSE: - ret = conn->security->verify_response(conn, skb, _abort_code); + ret = conn->security->verify_response(conn, skb); if (ret < 0) return ret; @@ -324,27 +244,25 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - spin_lock(&conn->bundle->channel_lock); spin_lock(&conn->state_lock); - - if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { + if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) conn->state = RXRPC_CONN_SERVICE; - spin_unlock(&conn->state_lock); - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure( - rcu_dereference_protected( - conn->channels[loop].call, - lockdep_is_held(&conn->bundle->channel_lock))); - } else { - spin_unlock(&conn->state_lock); - } + spin_unlock(&conn->state_lock); - spin_unlock(&conn->bundle->channel_lock); + if (conn->state == RXRPC_CONN_SERVICE) { + /* Offload call state flipping to the I/O thread. As + * we've already received the packet, put it on the + * front of the queue. 
+ */ + skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; + rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); + skb_queue_head(&conn->local->rx_queue, skb); + rxrpc_wake_up_io_thread(conn->local); + } return 0; default: - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("bad_conn_pkt")); + WARN_ON_ONCE(1); return -EPROTO; } } @@ -354,26 +272,9 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, */ static void rxrpc_secure_connection(struct rxrpc_connection *conn) { - u32 abort_code; - int ret; - - _enter("{%d}", conn->debug_id); - - ASSERT(conn->security_ix != 0); - - if (conn->security->issue_challenge(conn) < 0) { - abort_code = RX_CALL_DEAD; - ret = -ENOMEM; - goto abort; - } - - _leave(""); - return; - -abort: - _debug("abort %d, %d", ret, abort_code); - rxrpc_abort_connection(conn, ret, abort_code); - _leave(" [aborted]"); + if (conn->security->issue_challenge(conn) < 0) + rxrpc_abort_conn(conn, NULL, RX_CALL_DEAD, -ENOMEM, + rxrpc_abort_nomem); } /* @@ -395,9 +296,7 @@ again: if (!test_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags)) continue; - smp_rmb(); /* vs rxrpc_disconnect_client_call */ - ack_at = READ_ONCE(chan->final_ack_at); - + ack_at = chan->final_ack_at; if (time_before(j, ack_at) && !force) { if (time_before(ack_at, next_j)) { next_j = ack_at; @@ -424,47 +323,27 @@ again: static void rxrpc_do_process_connection(struct rxrpc_connection *conn) { struct sk_buff *skb; - u32 abort_code = RX_PROTOCOL_ERROR; int ret; if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); - /* Process delayed ACKs whose time has come. */ - if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) - rxrpc_process_delayed_final_acks(conn, false); - /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { rxrpc_see_skb(skb, rxrpc_skb_see_conn_work); - ret = rxrpc_process_event(conn, skb, &abort_code); + ret = rxrpc_process_event(conn, skb); switch (ret) { - case -EPROTO: - case -EKEYEXPIRED: - case -EKEYREJECTED: - goto protocol_error; case -ENOMEM: case -EAGAIN: - goto requeue_and_leave; - case -ECONNABORTED: + skb_queue_head(&conn->rx_queue, skb); + rxrpc_queue_conn(conn, rxrpc_conn_queue_retry_work); + break; default: rxrpc_free_skb(skb, rxrpc_skb_put_conn_work); break; } } - - return; - -requeue_and_leave: - skb_queue_head(&conn->rx_queue, skb); - return; - -protocol_error: - if (rxrpc_abort_connection(conn, ret, abort_code) < 0) - goto requeue_and_leave; - rxrpc_free_skb(skb, rxrpc_skb_put_conn_work); - return; } void rxrpc_process_connection(struct work_struct *work) @@ -498,44 +377,59 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, /* * Input a connection-level packet. */ -int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) +bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - _leave(" = -ECONNABORTED [%u]", conn->state); - return -ECONNABORTED; - } - - _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_DATA: - case RXRPC_PACKET_TYPE_ACK: - rxrpc_conn_retransmit_call(conn, skb, - sp->hdr.cid & RXRPC_CHANNELMASK); - return 0; - case RXRPC_PACKET_TYPE_BUSY: /* Just ignore BUSY packets for now. 
*/ - return 0; + return true; case RXRPC_PACKET_TYPE_ABORT: - conn->error = -ECONNABORTED; - conn->abort_code = skb->priority; - conn->state = RXRPC_CONN_REMOTELY_ABORTED; - set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); - return -ECONNABORTED; + if (rxrpc_is_conn_aborted(conn)) + return true; + rxrpc_input_conn_abort(conn, skb); + rxrpc_abort_calls(conn); + return true; case RXRPC_PACKET_TYPE_CHALLENGE: case RXRPC_PACKET_TYPE_RESPONSE: + if (rxrpc_is_conn_aborted(conn)) { + if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) + rxrpc_send_conn_abort(conn); + return true; + } rxrpc_post_packet_to_conn(conn, skb); - return 0; + return true; default: - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("bad_conn_pkt")); - return -EPROTO; + WARN_ON_ONCE(1); + return true; } } + +/* + * Input a connection event. + */ +void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + unsigned int loop; + + if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) + rxrpc_abort_calls(conn); + + switch (skb->mark) { + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + if (conn->state != RXRPC_CONN_SERVICE) + break; + + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop].call); + break; + } + + /* Process delayed ACKs whose time has come. */ + if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) + rxrpc_process_delayed_final_acks(conn, false); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 3c8f83dacb2b..ac85d4644a3c 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -23,12 +23,30 @@ static void rxrpc_clean_up_connection(struct work_struct *work); static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, unsigned long reap_at); +void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) +{ + struct rxrpc_local *local = conn->local; + bool busy; + + if (WARN_ON_ONCE(!local)) + return; + + spin_lock_bh(&local->lock); + busy = !list_empty(&conn->attend_link); + if (!busy) { + rxrpc_get_connection(conn, why); + list_add_tail(&conn->attend_link, &local->conn_attend_q); + } + spin_unlock_bh(&local->lock); + rxrpc_wake_up_io_thread(local); +} + static void rxrpc_connection_timer(struct timer_list *timer) { struct rxrpc_connection *conn = container_of(timer, struct rxrpc_connection, timer); - rxrpc_queue_conn(conn, rxrpc_conn_queue_timer); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_timer); } /* @@ -49,6 +67,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); + mutex_init(&conn->security_lock); skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; @@ -82,10 +101,10 @@ struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *lo _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); - /* Look up client connections by connection ID alone as their IDs are - * unique for this machine. + /* Look up client connections by connection ID alone as their + * IDs are unique for this machine. 
*/ - conn = idr_find(&rxrpc_client_conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT); + conn = idr_find(&local->conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT); if (!conn || refcount_read(&conn->ref) == 0) { _debug("no conn"); goto not_found; @@ -139,7 +158,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, _enter("%d,%x", conn->debug_id, call->cid); - if (rcu_access_pointer(chan->call) == call) { + if (chan->call == call) { /* Save the result of the call so that we can repeat it if necessary * through the channel, whilst disposing of the actual call record. */ @@ -159,12 +178,9 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, break; } - /* Sync with rxrpc_conn_retransmit(). */ - smp_wmb(); chan->last_call = chan->call_id; chan->call_id = chan->call_counter; - - rcu_assign_pointer(chan->call, NULL); + chan->call = NULL; } _leave(""); @@ -178,6 +194,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; + set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); + rxrpc_see_call(call, rxrpc_call_see_disconnected); + call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { @@ -186,18 +205,17 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) spin_unlock(&call->peer->lock); } - if (rxrpc_is_client_call(call)) - return rxrpc_disconnect_client_call(conn->bundle, call); - - spin_lock(&conn->bundle->channel_lock); - __rxrpc_disconnect_call(conn, call); - spin_unlock(&conn->bundle->channel_lock); + if (rxrpc_is_client_call(call)) { + rxrpc_disconnect_client_call(call->bundle, call); + } else { + __rxrpc_disconnect_call(conn, call); + conn->idle_timestamp = jiffies; + if (atomic_dec_and_test(&conn->active)) + rxrpc_set_service_reap_timer(conn->rxnet, + jiffies + rxrpc_connection_expiry); + } - set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); - conn->idle_timestamp = jiffies; - if (atomic_dec_and_test(&conn->active)) - rxrpc_set_service_reap_timer(conn->rxnet, - jiffies + rxrpc_connection_expiry); + rxrpc_put_call(call, rxrpc_call_put_io_thread); } /* @@ -293,10 +311,10 @@ static void rxrpc_clean_up_connection(struct work_struct *work) container_of(work, struct rxrpc_connection, destructor); struct rxrpc_net *rxnet = conn->rxnet; - ASSERT(!rcu_access_pointer(conn->channels[0].call) && - !rcu_access_pointer(conn->channels[1].call) && - !rcu_access_pointer(conn->channels[2].call) && - !rcu_access_pointer(conn->channels[3].call)); + ASSERT(!conn->channels[0].call && + !conn->channels[1].call && + !conn->channels[2].call && + !conn->channels[3].call); ASSERT(list_empty(&conn->cache_link)); del_timer_sync(&conn->timer); @@ -447,7 +465,6 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) _enter(""); atomic_dec(&rxnet->nr_conns); - rxrpc_destroy_all_client_connections(rxnet); del_timer_sync(&rxnet->service_conn_reap_timer); rxrpc_queue_work(&rxnet->service_conn_reaper); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 2a55a88b2a5b..89ac05a711a4 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -8,12 +8,6 @@ #include <linux/slab.h> #include "ar-internal.h" -static struct rxrpc_bundle rxrpc_service_dummy_bundle = { - .ref = REFCOUNT_INIT(1), - .debug_id = UINT_MAX, - .channel_lock = __SPIN_LOCK_UNLOCKED(&rxrpc_service_dummy_bundle.channel_lock), -}; - /* * Find a service connection under RCU conditions. 
* @@ -133,8 +127,6 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn */ conn->state = RXRPC_CONN_SERVICE_PREALLOC; refcount_set(&conn->ref, 2); - conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle, - rxrpc_bundle_get_service_conn); atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index d0e20e946e48..030d64f282f3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -9,11 +9,10 @@ #include "ar-internal.h" -static void rxrpc_proto_abort(const char *why, - struct rxrpc_call *call, rxrpc_seq_t seq) +static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, + enum rxrpc_abort_reason why) { - if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG)) - rxrpc_send_abort_packet(call); + rxrpc_abort_call(call, seq, RX_PROTOCOL_ERROR, -EBADMSG, why); } /* @@ -185,7 +184,7 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) if (call->cong_mode != RXRPC_CALL_SLOW_START && call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE) return; - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) + if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY) return; rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8)); @@ -250,47 +249,34 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, * This occurs when we get an ACKALL packet, the first DATA packet of a reply, * or a final ACK packet. */ -static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, - const char *abort_why) +static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, + enum rxrpc_abort_reason abort_why) { - unsigned int state; - ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); - write_lock(&call->state_lock); - - state = call->state; - switch (state) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: - if (reply_begun) - call->state = state = RXRPC_CALL_CLIENT_RECV_REPLY; - else - call->state = state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + if (reply_begun) { + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_RECV_REPLY); + trace_rxrpc_txqueue(call, rxrpc_txqueue_end); + break; + } + + rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_REPLY); + trace_rxrpc_txqueue(call, rxrpc_txqueue_await_reply); break; case RXRPC_CALL_SERVER_AWAIT_ACK: - __rxrpc_call_completed(call); - state = call->state; + rxrpc_call_completed(call); + trace_rxrpc_txqueue(call, rxrpc_txqueue_end); break; default: - goto bad_state; + kdebug("end_tx %s", rxrpc_call_states[__rxrpc_call_state(call)]); + rxrpc_proto_abort(call, call->tx_top, abort_why); + break; } - - write_unlock(&call->state_lock); - if (state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - trace_rxrpc_txqueue(call, rxrpc_txqueue_await_reply); - else - trace_rxrpc_txqueue(call, rxrpc_txqueue_end); - _leave(" = ok"); - return true; - -bad_state: - write_unlock(&call->state_lock); - kdebug("end_tx %s", rxrpc_call_states[call->state]); - rxrpc_proto_abort(abort_why, call, call->tx_top); - return false; } /* @@ -305,24 +291,55 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) if (call->ackr_reason) { now = jiffies; timo = now + MAX_JIFFY_OFFSET; - WRITE_ONCE(call->resend_at, timo); + WRITE_ONCE(call->delay_ack_at, timo); trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now); } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { if (!rxrpc_rotate_tx_window(call, top, &summary)) { - rxrpc_proto_abort("TXL", call, top); + rxrpc_proto_abort(call, top, 
rxrpc_eproto_early_reply); return false; } } - return rxrpc_end_tx_phase(call, true, "ETD"); + + rxrpc_end_tx_phase(call, true, rxrpc_eproto_unexpected_reply); + return true; +} + +/* + * End the packet reception phase. + */ +static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) +{ + rxrpc_seq_t whigh = READ_ONCE(call->rx_highest_seq); + + _enter("%d,%s", call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)]); + + trace_rxrpc_receive(call, rxrpc_receive_end, 0, whigh); + + switch (__rxrpc_call_state(call)) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); + rxrpc_call_completed(call); + break; + + case RXRPC_CALL_SERVER_RECV_REQUEST: + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_ACK_REQUEST); + call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; + rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_processing_op); + break; + + default: + break; + } } static void rxrpc_input_update_ack_window(struct rxrpc_call *call, rxrpc_seq_t window, rxrpc_seq_t wtop) { - atomic64_set_release(&call->ackr_window, ((u64)wtop) << 32 | window); + call->ackr_window = window; + call->ackr_wtop = wtop; } /* @@ -337,8 +354,9 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, __skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); - trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); + if (last) + rxrpc_end_rx_phase(call, sp->hdr.serial); } /* @@ -350,9 +368,9 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sk_buff *oos; rxrpc_serial_t serial = sp->hdr.serial; - u64 win = atomic64_read(&call->ackr_window); - rxrpc_seq_t window = lower_32_bits(win); - rxrpc_seq_t wtop = upper_32_bits(win); + unsigned int sack = call->ackr_sack_base; + rxrpc_seq_t window = call->ackr_window; + rxrpc_seq_t wtop = call->ackr_wtop; rxrpc_seq_t wlimit = window + call->rx_winsize - 1; rxrpc_seq_t seq = sp->hdr.seq; bool last = sp->hdr.flags & RXRPC_LAST_PACKET; @@ -366,17 +384,14 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, if (last) { if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && - seq + 1 != wtop) { - rxrpc_proto_abort("LSN", call, seq); - return; - } + seq + 1 != wtop) + return rxrpc_proto_abort(call, seq, rxrpc_eproto_different_last); } else { if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && after_eq(seq, wtop)) { pr_warn("Packet beyond last: c=%x q=%x window=%x-%x wlimit=%x\n", call->debug_id, seq, window, wtop, wlimit); - rxrpc_proto_abort("LSA", call, seq); - return; + return rxrpc_proto_abort(call, seq, rxrpc_eproto_data_after_last); } } @@ -396,20 +411,23 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, /* Queue the packet. 
*/ if (seq == window) { - rxrpc_seq_t reset_from; - bool reset_sack = false; - if (sp->hdr.flags & RXRPC_REQUEST_ACK) ack_reason = RXRPC_ACK_REQUESTED; /* Send an immediate ACK if we fill in a hole */ else if (!skb_queue_empty(&call->rx_oos_queue)) ack_reason = RXRPC_ACK_DELAY; else - atomic_inc_return(&call->ackr_nr_unacked); + call->ackr_nr_unacked++; window++; - if (after(window, wtop)) + if (after(window, wtop)) { + trace_rxrpc_sack(call, seq, sack, rxrpc_sack_none); wtop = window; + } else { + trace_rxrpc_sack(call, seq, sack, rxrpc_sack_advance); + sack = (sack + 1) % RXRPC_SACK_SIZE; + } + rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg); @@ -426,43 +444,39 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, __skb_unlink(oos, &call->rx_oos_queue); last = osp->hdr.flags & RXRPC_LAST_PACKET; seq = osp->hdr.seq; - if (!reset_sack) { - reset_from = seq; - reset_sack = true; - } + call->ackr_sack_table[sack] = 0; + trace_rxrpc_sack(call, seq, sack, rxrpc_sack_fill); + sack = (sack + 1) % RXRPC_SACK_SIZE; window++; rxrpc_input_queue_data(call, oos, window, wtop, - rxrpc_receive_queue_oos); + rxrpc_receive_queue_oos); } spin_unlock(&call->recvmsg_queue.lock); - if (reset_sack) { - do { - call->ackr_sack_table[reset_from % RXRPC_SACK_SIZE] = 0; - } while (reset_from++, before(reset_from, window)); - } + call->ackr_sack_base = sack; } else { - bool keep = false; + unsigned int slot; ack_reason = RXRPC_ACK_OUT_OF_SEQUENCE; - if (!call->ackr_sack_table[seq % RXRPC_SACK_SIZE]) { - call->ackr_sack_table[seq % RXRPC_SACK_SIZE] = 1; - keep = 1; + slot = seq - window; + sack = (sack + slot) % RXRPC_SACK_SIZE; + + if (call->ackr_sack_table[sack % RXRPC_SACK_SIZE]) { + ack_reason = RXRPC_ACK_DUPLICATE; + goto send_ack; } + call->ackr_sack_table[sack % RXRPC_SACK_SIZE] |= 1; + trace_rxrpc_sack(call, seq, sack, rxrpc_sack_oos); + if (after(seq + 1, wtop)) { wtop = seq + 1; rxrpc_input_update_ack_window(call, window, wtop); } - if (!keep) { - ack_reason = RXRPC_ACK_DUPLICATE; - goto send_ack; - } - skb_queue_walk(&call->rx_oos_queue, oos) { struct rxrpc_skb_priv *osp = rxrpc_skb(oos); @@ -550,19 +564,27 @@ protocol_error: static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - enum rxrpc_call_state state; rxrpc_serial_t serial = sp->hdr.serial; rxrpc_seq_t seq0 = sp->hdr.seq; - _enter("{%llx,%x},{%u,%x}", - atomic64_read(&call->ackr_window), call->rx_highest_seq, + _enter("{%x,%x,%x},{%u,%x}", + call->ackr_window, call->ackr_wtop, call->rx_highest_seq, skb->len, seq0); - state = READ_ONCE(call->state); - if (state >= RXRPC_CALL_COMPLETE) + if (__rxrpc_call_is_complete(call)) return; - if (state == RXRPC_CALL_SERVER_RECV_REQUEST) { + switch (__rxrpc_call_state(call)) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + /* Received data implicitly ACKs all of the request + * packets we sent when we're acting as a client. + */ + if (!rxrpc_receiving_reply(call)) + goto out_notify; + break; + + case RXRPC_CALL_SERVER_RECV_REQUEST: { unsigned long timo = READ_ONCE(call->next_req_timo); unsigned long now, expect_req_by; @@ -573,21 +595,18 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_reduce_call_timer(call, expect_req_by, now, rxrpc_timer_set_for_idle); } + break; } - /* Received data implicitly ACKs all of the request packets we sent - * when we're acting as a client. 
- */ - if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST || - state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && - !rxrpc_receiving_reply(call)) - goto out_notify; + default: + break; + } if (!rxrpc_input_split_jumbo(call, skb)) { - rxrpc_proto_abort("VLD", call, sp->hdr.seq); + rxrpc_proto_abort(call, sp->hdr.seq, rxrpc_badmsg_bad_jumbo); goto out_notify; } - skb = NULL; + return; out_notify: trace_rxrpc_notify_socket(call->debug_id, serial); @@ -765,7 +784,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) offset = sizeof(struct rxrpc_wire_header); if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) - return rxrpc_proto_abort("XAK", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack); offset += sizeof(ack); ack_serial = sp->hdr.serial; @@ -845,7 +864,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) ioffset = offset + nr_acks + 3; if (skb->len >= ioffset + sizeof(info) && skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) - return rxrpc_proto_abort("XAI", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack_info); if (nr_acks > 0) skb_condense(skb); @@ -868,10 +887,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_input_ackinfo(call, skb, &info); if (first_soft_ack == 0) - return rxrpc_proto_abort("AK0", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); /* Ignore ACKs unless we are or have just been transmitting. */ - switch (READ_ONCE(call->state)) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: case RXRPC_CALL_SERVER_SEND_REPLY: @@ -883,20 +902,20 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (before(hard_ack, call->acks_hard_ack) || after(hard_ack, call->tx_top)) - return rxrpc_proto_abort("AKW", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window); if (nr_acks > call->tx_top - hard_ack) - return rxrpc_proto_abort("AKN", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); if (after(hard_ack, call->acks_hard_ack)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { - rxrpc_end_tx_phase(call, false, "ETA"); + rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); return; } } if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) - return rxrpc_proto_abort("XSA", call, 0); + return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); rxrpc_input_soft_acks(call, skb->data + offset, first_soft_ack, nr_acks, &summary); } @@ -918,7 +937,7 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_ack_summary summary = { 0 }; if (rxrpc_rotate_tx_window(call, call->tx_top, &summary)) - rxrpc_end_tx_phase(call, false, "ETL"); + rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ackall); } /* @@ -963,27 +982,23 @@ void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: - rxrpc_input_data(call, skb); - break; + return rxrpc_input_data(call, skb); case RXRPC_PACKET_TYPE_ACK: - rxrpc_input_ack(call, skb); - break; + return rxrpc_input_ack(call, skb); case RXRPC_PACKET_TYPE_BUSY: /* Just ignore BUSY packets from the server; the retry and * lifespan timers will take care of business. BUSY packets * from the client don't make sense. 
*/ - break; + return; case RXRPC_PACKET_TYPE_ABORT: - rxrpc_input_abort(call, skb); - break; + return rxrpc_input_abort(call, skb); case RXRPC_PACKET_TYPE_ACKALL: - rxrpc_input_ackall(call, skb); - break; + return rxrpc_input_ackall(call, skb); default: break; @@ -998,24 +1013,18 @@ void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) */ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) { - struct rxrpc_connection *conn = call->conn; - - switch (READ_ONCE(call->state)) { + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_AWAIT_ACK: rxrpc_call_completed(call); fallthrough; case RXRPC_CALL_COMPLETE: break; default: - if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN)) - rxrpc_send_abort_packet(call); + rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ESHUTDOWN, + rxrpc_eproto_improper_term); trace_rxrpc_improper_term(call); break; } rxrpc_input_call_event(call, skb); - - spin_lock(&conn->bundle->channel_lock); - __rxrpc_disconnect_call(conn, call); - spin_unlock(&conn->bundle->channel_lock); } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 0eb8471bfc53..34353b6e584b 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -43,25 +43,17 @@ static void none_free_call_crypto(struct rxrpc_call *call) } static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("chall_none")); - return -EPROTO; + return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxrpc_eproto_rxnull_challenge); } static int none_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("resp_none")); - return -EPROTO; + return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxrpc_eproto_rxnull_response); } static void none_clear(struct rxrpc_connection *conn) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index d83ae3193032..4a3a08a0e2cd 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -25,6 +25,7 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, */ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) { + struct sk_buff_head *rx_queue; struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); if (unlikely(!local)) { @@ -36,7 +37,16 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) skb->mark = RXRPC_SKB_MARK_PACKET; rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); - skb_queue_tail(&local->rx_queue, skb); + rx_queue = &local->rx_queue; +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + if (rxrpc_inject_rx_delay || + !skb_queue_empty(&local->rx_delay_queue)) { + skb->tstamp = ktime_add_ms(skb->tstamp, rxrpc_inject_rx_delay); + rx_queue = &local->rx_delay_queue; + } +#endif + + skb_queue_tail(rx_queue, skb); rxrpc_wake_up_io_thread(local); return 0; } @@ -67,9 +77,31 @@ void rxrpc_error_report(struct sock *sk) } /* + * Directly produce an abort from a packet. 
+ */ +bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, + s32 abort_code, int err) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + abort_code, err); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = abort_code; + return false; +} + +static bool rxrpc_bad_message(struct sk_buff *skb, enum rxrpc_abort_reason why) +{ + return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EBADMSG); +} + +#define just_discard true + +/* * Process event packets targeted at a local endpoint. */ -static void rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) +static bool rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); char v; @@ -81,22 +113,21 @@ static void rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) if (v == 0) rxrpc_send_version_request(local, &sp->hdr, skb); } + + return true; } /* * Extract the wire header from a packet and translate the byte order. */ -static noinline -int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) +static bool rxrpc_extract_header(struct rxrpc_skb_priv *sp, + struct sk_buff *skb) { struct rxrpc_wire_header whdr; /* dig out the RxRPC connection details */ - if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) { - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("bad_hdr")); - return -EBADMSG; - } + if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) + return rxrpc_bad_message(skb, rxrpc_badmsg_short_hdr); memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); @@ -110,7 +141,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); - return 0; + return true; } /* @@ -130,28 +161,28 @@ static bool rxrpc_extract_abort(struct sk_buff *skb) /* * Process packets received on the local endpoint */ -static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) +static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) { struct rxrpc_connection *conn; struct sockaddr_rxrpc peer_srx; struct rxrpc_skb_priv *sp; struct rxrpc_peer *peer = NULL; struct sk_buff *skb = *_skb; - int ret = 0; + bool ret = false; skb_pull(skb, sizeof(struct udphdr)); sp = rxrpc_skb(skb); /* dig out the RxRPC connection details */ - if (rxrpc_extract_header(sp, skb) < 0) - goto bad_message; + if (!rxrpc_extract_header(sp, skb)) + return just_discard; if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); - return 0; + return just_discard; } } @@ -160,28 +191,28 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) - return 0; - rxrpc_input_version(local, skb); - return 0; + return just_discard; + return rxrpc_input_version(local, skb); case RXRPC_PACKET_TYPE_BUSY: if (rxrpc_to_server(sp)) - return 0; + return just_discard; fallthrough; case RXRPC_PACKET_TYPE_ACK: case RXRPC_PACKET_TYPE_ACKALL: if (sp->hdr.callNumber == 0) - goto bad_message; + return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); break; case RXRPC_PACKET_TYPE_ABORT: if (!rxrpc_extract_abort(skb)) - return 0; /* Just discard if malformed */ + return just_discard; /* Just discard if malformed */ break; case RXRPC_PACKET_TYPE_DATA: - if 
(sp->hdr.callNumber == 0 || - sp->hdr.seq == 0) - goto bad_message; + if (sp->hdr.callNumber == 0) + return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); + if (sp->hdr.seq == 0) + return rxrpc_bad_message(skb, rxrpc_badmsg_zero_seq); /* Unshare the packet so that it can be modified for in-place * decryption. @@ -191,7 +222,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) if (!skb) { rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare_nomem); *_skb = NULL; - return 0; + return just_discard; } if (skb != *_skb) { @@ -205,28 +236,28 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) case RXRPC_PACKET_TYPE_CHALLENGE: if (rxrpc_to_server(sp)) - return 0; + return just_discard; break; case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_to_client(sp)) - return 0; + return just_discard; break; /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: - return 0; + return just_discard; default: - goto bad_message; + return rxrpc_bad_message(skb, rxrpc_badmsg_unsupported_packet); } if (sp->hdr.serviceId == 0) - goto bad_message; + return rxrpc_bad_message(skb, rxrpc_badmsg_zero_service); if (WARN_ON_ONCE(rxrpc_extract_addr_from_skb(&peer_srx, skb) < 0)) - return true; /* Unsupported address type - discard. */ + return just_discard; /* Unsupported address type. */ if (peer_srx.transport.family != local->srx.transport.family && (peer_srx.transport.family == AF_INET && @@ -234,7 +265,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", peer_srx.transport.family, local->srx.transport.family); - return true; /* Wrong address type - discard. */ + return just_discard; /* Wrong address type. 
*/ } if (rxrpc_to_client(sp)) { @@ -242,12 +273,8 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) conn = rxrpc_find_client_connection_rcu(local, &peer_srx, skb); conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); rcu_read_unlock(); - if (!conn) { - trace_rxrpc_abort(0, "NCC", sp->hdr.cid, - sp->hdr.callNumber, sp->hdr.seq, - RXKADINCONSISTENCY, EBADMSG); - goto protocol_error; - } + if (!conn) + return rxrpc_protocol_error(skb, rxrpc_eproto_no_client_conn); ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb); rxrpc_put_connection(conn, rxrpc_conn_put_call_input); @@ -280,18 +307,6 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) ret = rxrpc_new_incoming_call(local, peer, NULL, &peer_srx, skb); rxrpc_put_peer(peer, rxrpc_peer_put_input); - if (ret < 0) - goto reject_packet; - return 0; - -bad_message: - trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_PROTOCOL_ERROR, EBADMSG); -protocol_error: - skb->priority = RX_PROTOCOL_ERROR; - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; -reject_packet: - rxrpc_reject_packet(local, skb); return ret; } @@ -306,21 +321,23 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; + bool ret; if (sp->hdr.securityIndex != conn->security_ix) - goto wrong_security; + return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, + RXKADINCONSISTENCY, -EBADMSG); if (sp->hdr.serviceId != conn->service_id) { int old_id; if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) - goto reupgrade; + return rxrpc_protocol_error(skb, rxrpc_eproto_reupgrade); + old_id = cmpxchg(&conn->service_id, conn->orig_service_id, sp->hdr.serviceId); - if (old_id != conn->orig_service_id && old_id != sp->hdr.serviceId) - goto reupgrade; + return rxrpc_protocol_error(skb, rxrpc_eproto_bad_upgrade); } if (after(sp->hdr.serial, conn->hi_serial)) @@ -336,19 +353,19 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) - return 0; + return just_discard; if (sp->hdr.callNumber == chan->last_call) { if (chan->call || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) - return 0; + return just_discard; /* For the previous service call, if completed successfully, we * discard all further packets. */ if (rxrpc_conn_is_service(conn) && chan->last_type == RXRPC_PACKET_TYPE_ACK) - return 0; + return just_discard; /* But otherwise we need to retransmit the final packet from * data cached in the connection record. 
@@ -358,19 +375,17 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, sp->hdr.seq, sp->hdr.serial, sp->hdr.flags); - rxrpc_input_conn_packet(conn, skb); - return 0; + rxrpc_conn_retransmit_call(conn, skb, channel); + return just_discard; } - rcu_read_lock(); - call = rxrpc_try_get_call(rcu_dereference(chan->call), - rxrpc_call_get_input); - rcu_read_unlock(); + call = rxrpc_try_get_call(chan->call, rxrpc_call_get_input); if (sp->hdr.callNumber > chan->call_id) { if (rxrpc_to_client(sp)) { rxrpc_put_call(call, rxrpc_call_put_input); - goto reject_packet; + return rxrpc_protocol_error(skb, + rxrpc_eproto_unexpected_implicit_end); } if (call) { @@ -382,38 +397,14 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (!call) { if (rxrpc_to_client(sp)) - goto bad_message; - if (rxrpc_new_incoming_call(conn->local, conn->peer, conn, - peer_srx, skb)) - return 0; - goto reject_packet; + return rxrpc_protocol_error(skb, rxrpc_eproto_no_client_call); + return rxrpc_new_incoming_call(conn->local, conn->peer, conn, + peer_srx, skb); } - rxrpc_input_call_event(call, skb); + ret = rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return 0; - -wrong_security: - trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RXKADINCONSISTENCY, EBADMSG); - skb->priority = RXKADINCONSISTENCY; - goto post_abort; - -reupgrade: - trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_PROTOCOL_ERROR, EBADMSG); - goto protocol_error; - -bad_message: - trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_PROTOCOL_ERROR, EBADMSG); -protocol_error: - skb->priority = RX_PROTOCOL_ERROR; -post_abort: - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; -reject_packet: - rxrpc_reject_packet(conn->local, skb); - return 0; + return ret; } /* @@ -421,10 +412,17 @@ reject_packet: */ int rxrpc_io_thread(void *data) { + struct rxrpc_connection *conn; struct sk_buff_head rx_queue; struct rxrpc_local *local = data; struct rxrpc_call *call; struct sk_buff *skb; +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + ktime_t now; +#endif + bool should_stop; + + complete(&local->io_thread_ready); skb_queue_head_init(&rx_queue); @@ -433,6 +431,24 @@ int rxrpc_io_thread(void *data) for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); + /* Deal with connections that want immediate attention. */ + conn = list_first_entry_or_null(&local->conn_attend_q, + struct rxrpc_connection, + attend_link); + if (conn) { + spin_lock_bh(&local->lock); + list_del_init(&conn->attend_link); + spin_unlock_bh(&local->lock); + + rxrpc_input_conn_event(conn, NULL); + rxrpc_put_connection(conn, rxrpc_conn_put_poke); + continue; + } + + if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) + rxrpc_discard_expired_client_conns(local); + /* Deal with calls that want immediate attention. */ if ((call = list_first_entry_or_null(&local->call_attend_q, struct rxrpc_call, @@ -447,12 +463,17 @@ int rxrpc_io_thread(void *data) continue; } + if (!list_empty(&local->new_client_calls)) + rxrpc_connect_client_calls(local); + /* Process received packets and errors. 
*/ if ((skb = __skb_dequeue(&rx_queue))) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: skb->priority = 0; - rxrpc_input_packet(local, &skb); + if (!rxrpc_input_packet(local, &skb)) + rxrpc_reject_packet(local, skb); trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_free_skb(skb, rxrpc_skb_put_input); break; @@ -460,6 +481,11 @@ int rxrpc_io_thread(void *data) rxrpc_input_error(local, skb); rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + rxrpc_input_conn_event(sp->conn, skb); + rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); + rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); + break; default: WARN_ON_ONCE(1); rxrpc_free_skb(skb, rxrpc_skb_put_unknown); @@ -468,6 +494,17 @@ int rxrpc_io_thread(void *data) continue; } + /* Inject a delay into packets if requested. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + now = ktime_get_real(); + while ((skb = skb_peek(&local->rx_delay_queue))) { + if (ktime_before(now, skb->tstamp)) + break; + skb = skb_dequeue(&local->rx_delay_queue); + skb_queue_tail(&local->rx_queue, skb); + } +#endif + if (!skb_queue_empty(&local->rx_queue)) { spin_lock_irq(&local->rx_queue.lock); skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); @@ -476,14 +513,41 @@ int rxrpc_io_thread(void *data) } set_current_state(TASK_INTERRUPTIBLE); + should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || - !list_empty(&local->call_attend_q)) { + !list_empty(&local->call_attend_q) || + !list_empty(&local->conn_attend_q) || + !list_empty(&local->new_client_calls) || + test_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) { __set_current_state(TASK_RUNNING); continue; } - if (kthread_should_stop()) + if (should_stop) break; + +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + skb = skb_peek(&local->rx_delay_queue); + if (skb) { + unsigned long timeout; + ktime_t tstamp = skb->tstamp; + ktime_t now = ktime_get_real(); + s64 delay_ns = ktime_to_ns(ktime_sub(tstamp, now)); + + if (delay_ns <= 0) { + __set_current_state(TASK_RUNNING); + continue; + } + + timeout = nsecs_to_jiffies(delay_ns); + timeout = max(timeout, 1UL); + schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + continue; + } +#endif + schedule(); } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 44222923c0d1..7d910aee4f8c 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -82,30 +82,61 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, } } +static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) +{ + struct rxrpc_local *local = + container_of(timer, struct rxrpc_local, client_conn_reap_timer); + + if (local->kill_all_client_conns && + test_and_set_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) + rxrpc_wake_up_io_thread(local); +} + /* * Allocate a new local endpoint. 
*/ -static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, +static struct rxrpc_local *rxrpc_alloc_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; + u32 tmp; local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { refcount_set(&local->ref, 1); atomic_set(&local->active_users, 1); - local->rxnet = rxnet; + local->net = net; + local->rxnet = rxrpc_net(net); INIT_HLIST_NODE(&local->link); - init_rwsem(&local->defrag_sem); + init_completion(&local->io_thread_ready); +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + skb_queue_head_init(&local->rx_delay_queue); +#endif skb_queue_head_init(&local->rx_queue); + INIT_LIST_HEAD(&local->conn_attend_q); INIT_LIST_HEAD(&local->call_attend_q); + local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); + local->kill_all_client_conns = false; + INIT_LIST_HEAD(&local->idle_client_conns); + timer_setup(&local->client_conn_reap_timer, + rxrpc_client_conn_reap_timeout, 0); + spin_lock_init(&local->lock); rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; + idr_init(&local->conn_ids); + get_random_bytes(&tmp, sizeof(tmp)); + tmp &= 0x3fffffff; + if (tmp == 0) + tmp = 1; + idr_set_cursor(&local->conn_ids, tmp); + INIT_LIST_HEAD(&local->new_client_calls); + spin_lock_init(&local->client_call_lock); + trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, 1); } @@ -189,6 +220,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) goto error_sock; } + wait_for_completion(&local->io_thread_ready); local->io_thread = io_thread; _leave(" = 0"); return 0; @@ -246,7 +278,7 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, goto found; } - local = rxrpc_alloc_local(rxnet, srx); + local = rxrpc_alloc_local(net, srx); if (!local) goto nomem; @@ -357,10 +389,11 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { - unsigned int debug_id = local->debug_id; + unsigned int debug_id; int r, u; if (local) { + debug_id = local->debug_id; r = refcount_read(&local->ref); u = atomic_dec_return(&local->active_users); trace_rxrpc_local(debug_id, why, r, u); @@ -403,7 +436,11 @@ void rxrpc_destroy_local(struct rxrpc_local *local) /* At this point, there should be no more packets coming in to the * local endpoint. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + rxrpc_purge_queue(&local->rx_delay_queue); +#endif rxrpc_purge_queue(&local->rx_queue); + rxrpc_purge_client_connections(local); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 056c428d8bf3..825b81183046 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -53,3 +53,10 @@ unsigned int rxrpc_rx_mtu = 5692; * sender that we're willing to handle. */ unsigned int rxrpc_rx_jumbo_max = 4; + +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY +/* + * The delay to inject into packet reception. 
+ */ +unsigned long rxrpc_inject_rx_delay; +#endif diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 5905530e2f33..a0319c040c25 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -10,15 +10,6 @@ unsigned int rxrpc_net_id; -static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) -{ - struct rxrpc_net *rxnet = - container_of(timer, struct rxrpc_net, client_conn_reap_timer); - - if (rxnet->live) - rxrpc_queue_work(&rxnet->client_conn_reaper); -} - static void rxrpc_service_conn_reap_timeout(struct timer_list *timer) { struct rxrpc_net *rxnet = @@ -63,14 +54,6 @@ static __net_init int rxrpc_init_net(struct net *net) rxrpc_service_conn_reap_timeout, 0); atomic_set(&rxnet->nr_client_conns, 0); - rxnet->kill_all_client_conns = false; - spin_lock_init(&rxnet->client_conn_cache_lock); - mutex_init(&rxnet->client_conn_discard_lock); - INIT_LIST_HEAD(&rxnet->idle_client_conns); - INIT_WORK(&rxnet->client_conn_reaper, - rxrpc_discard_expired_client_conns); - timer_setup(&rxnet->client_conn_reap_timer, - rxrpc_client_conn_reap_timeout, 0); INIT_HLIST_HEAD(&rxnet->local_endpoints); mutex_init(&rxnet->local_mutex); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3d8c9f830ee0..5e53429c6922 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -80,62 +80,40 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call) */ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, struct rxrpc_call *call, - struct rxrpc_txbuf *txb) + struct rxrpc_txbuf *txb, + u16 *_rwind) { struct rxrpc_ackinfo ackinfo; - unsigned int qsize; - rxrpc_seq_t window, wtop, wrap_point, ix, first; + unsigned int qsize, sack, wrap, to; + rxrpc_seq_t window, wtop; int rsize; - u64 wtmp; u32 mtu, jmax; u8 *ackp = txb->acks; - u8 sack_buffer[sizeof(call->ackr_sack_table)] __aligned(8); - atomic_set(&call->ackr_nr_unacked, 0); + call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); + clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); - /* Barrier against rxrpc_input_data(). */ -retry: - wtmp = atomic64_read_acquire(&call->ackr_window); - window = lower_32_bits(wtmp); - wtop = upper_32_bits(wtmp); + window = call->ackr_window; + wtop = call->ackr_wtop; + sack = call->ackr_sack_base % RXRPC_SACK_SIZE; txb->ack.firstPacket = htonl(window); - txb->ack.nAcks = 0; + txb->ack.nAcks = wtop - window; if (after(wtop, window)) { - /* Try to copy the SACK ring locklessly. We can use the copy, - * only if the now-current top of the window didn't go past the - * previously read base - otherwise we can't know whether we - * have old data or new data. - */ - memcpy(sack_buffer, call->ackr_sack_table, sizeof(sack_buffer)); - wrap_point = window + RXRPC_SACK_SIZE - 1; - wtmp = atomic64_read_acquire(&call->ackr_window); - window = lower_32_bits(wtmp); - wtop = upper_32_bits(wtmp); - if (after(wtop, wrap_point)) { - cond_resched(); - goto retry; - } - - /* The buffer is maintained as a ring with an invariant mapping - * between bit position and sequence number, so we'll probably - * need to rotate it. 
- */ - txb->ack.nAcks = wtop - window; - ix = window % RXRPC_SACK_SIZE; - first = sizeof(sack_buffer) - ix; + wrap = RXRPC_SACK_SIZE - sack; + to = min_t(unsigned int, txb->ack.nAcks, RXRPC_SACK_SIZE); - if (ix + txb->ack.nAcks <= RXRPC_SACK_SIZE) { - memcpy(txb->acks, sack_buffer + ix, txb->ack.nAcks); + if (sack + txb->ack.nAcks <= RXRPC_SACK_SIZE) { + memcpy(txb->acks, call->ackr_sack_table + sack, txb->ack.nAcks); } else { - memcpy(txb->acks, sack_buffer + ix, first); - memcpy(txb->acks + first, sack_buffer, - txb->ack.nAcks - first); + memcpy(txb->acks, call->ackr_sack_table + sack, wrap); + memcpy(txb->acks + wrap, call->ackr_sack_table, + to - wrap); } - ackp += txb->ack.nAcks; + ackp += to; } else if (before(wtop, window)) { pr_warn("ack window backward %x %x", window, wtop); } else if (txb->ack.reason == RXRPC_ACK_DELAY) { @@ -147,6 +125,7 @@ retry: jmax = rxrpc_rx_jumbo_max; qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); + *_rwind = rsize; ackinfo.rxMTU = htonl(rxrpc_rx_mtu); ackinfo.maxMTU = htonl(mtu); ackinfo.rwind = htonl(rsize); @@ -213,6 +192,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) rxrpc_serial_t serial; size_t len, n; int ret, rtt_slot = -1; + u16 rwind; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return -ECONNRESET; @@ -228,7 +208,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) if (txb->ack.reason == RXRPC_ACK_PING) txb->wire.flags |= RXRPC_REQUEST_ACK; - n = rxrpc_fill_out_ack(conn, call, txb); + n = rxrpc_fill_out_ack(conn, call, txb, &rwind); if (n == 0) return 0; @@ -240,7 +220,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) txb->wire.serial = htonl(serial); trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(txb->ack.firstPacket), - ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks); + ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks, + rwind); if (txb->ack.reason == RXRPC_ACK_PING) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); @@ -253,15 +234,18 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); ret = do_udp_sendmsg(conn->local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); - if (ret < 0) + if (ret < 0) { trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); - else + } else { trace_rxrpc_tx_packet(call->debug_id, &txb->wire, rxrpc_tx_point_call_ack); + if (txb->wire.flags & RXRPC_REQUEST_ACK) + call->peer->rtt_last_req = ktime_get_real(); + } rxrpc_tx_backoff(call, ret); - if (call->state < RXRPC_CALL_COMPLETE) { + if (!__rxrpc_call_is_complete(call)) { if (ret < 0) rxrpc_cancel_rtt_probe(call, serial, rtt_slot); rxrpc_set_keepalive(call); @@ -429,8 +413,6 @@ dont_set_request_ack: if (txb->len >= call->peer->maxdata) goto send_fragmentable; - down_read(&conn->local->defrag_sem); - txb->last_sent = ktime_get_real(); if (txb->wire.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); @@ -445,7 +427,6 @@ dont_set_request_ack: ret = do_udp_sendmsg(conn->local->socket, &msg, len); conn->peer->last_tx_at = ktime_get_seconds(); - up_read(&conn->local->defrag_sem); if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); rxrpc_cancel_rtt_probe(call, serial, rtt_slot); @@ -506,8 +487,6 @@ send_fragmentable: /* attempt to send this message with fragmentation enabled */ _debug("send fragment"); - 
down_write(&conn->local->defrag_sem); - txb->last_sent = ktime_get_real(); if (txb->wire.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); @@ -539,12 +518,66 @@ send_fragmentable: rxrpc_tx_point_call_data_frag); } rxrpc_tx_backoff(call, ret); - - up_write(&conn->local->defrag_sem); goto done; } /* + * Transmit a connection-level abort. + */ +void rxrpc_send_conn_abort(struct rxrpc_connection *conn) +{ + struct rxrpc_wire_header whdr; + struct msghdr msg; + struct kvec iov[2]; + __be32 word; + size_t len; + u32 serial; + int ret; + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(conn->proto.cid); + whdr.callNumber = 0; + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ABORT; + whdr.flags = conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(conn->service_id); + + word = htonl(conn->abort_code); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &word; + iov[1].iov_len = sizeof(word); + + len = iov[0].iov_len + iov[1].iov_len; + + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret < 0) { + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_conn_abort); + _debug("sendmsg failed: %d", ret); + return; + } + + trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort); + + conn->peer->last_tx_at = ktime_get_seconds(); +} + +/* * Reject a packet through the local endpoint. 
*/ void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) @@ -667,7 +700,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) static inline void rxrpc_instant_resend(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { - if (call->state < RXRPC_CALL_COMPLETE) + if (!__rxrpc_call_is_complete(call)) kdebug("resend"); } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 6685bf917aa6..552ba84a255c 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -235,6 +235,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, struct rxrpc_peer *peer; const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1; time64_t keepalive_at; + bool use; int slot; spin_lock(&rxnet->peer_hash_lock); @@ -247,9 +248,10 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, if (!rxrpc_get_peer_maybe(peer, rxrpc_peer_get_keepalive)) continue; - if (__rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive)) { - spin_unlock(&rxnet->peer_hash_lock); + use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); + spin_unlock(&rxnet->peer_hash_lock); + if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; slot = keepalive_at - base; _debug("%02x peer %u t=%d {%pISp}", @@ -270,9 +272,11 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, spin_lock(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); + spin_unlock(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } - rxrpc_put_peer_locked(peer, rxrpc_peer_put_keepalive); + rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); + spin_lock(&rxnet->peer_hash_lock); } spin_unlock(&rxnet->peer_hash_lock); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 608946dcc505..8d7a715a0bb1 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -147,10 +147,10 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, * assess the MTU size for the network interface through which this peer is * reached */ -static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, +static void rxrpc_assess_MTU_size(struct rxrpc_local *local, struct rxrpc_peer *peer) { - struct net *net = sock_net(&rx->sk); + struct net *net = local->net; struct dst_entry *dst; struct rtable *rt; struct flowi fl; @@ -226,7 +226,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, rxrpc_peer_init_rtt(peer); peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; - trace_rxrpc_peer(peer->debug_id, why, 1); + trace_rxrpc_peer(peer->debug_id, 1, why); } _leave(" = %p", peer); @@ -236,11 +236,11 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, /* * Initialise peer record. */ -static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer, +static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(rx, peer); + rxrpc_assess_MTU_size(local, peer); peer->mtu = peer->if_mtu; peer->rtt_last_req = ktime_get_real(); @@ -272,8 +272,7 @@ static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer, /* * Set up a new peer. 
*/ -static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, - struct rxrpc_local *local, +static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, unsigned long hash_key, gfp_t gfp) @@ -285,7 +284,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, peer = rxrpc_alloc_peer(local, gfp, rxrpc_peer_new_client); if (peer) { memcpy(&peer->srx, srx, sizeof(*srx)); - rxrpc_init_peer(rx, peer, hash_key); + rxrpc_init_peer(local, peer, hash_key); } _leave(" = %p", peer); @@ -304,14 +303,13 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer) * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. */ -void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, - struct rxrpc_peer *peer) +void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &peer->srx); - rxrpc_init_peer(rx, peer, hash_key); + rxrpc_init_peer(local, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); @@ -322,8 +320,7 @@ void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, /* * obtain a remote transport endpoint for the specified address */ -struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, - struct rxrpc_local *local, +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_peer *peer, *candidate; @@ -343,7 +340,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, /* The peer is not yet present in hash - create a candidate * for a new record and then redo the search. */ - candidate = rxrpc_create_peer(rx, local, srx, hash_key, gfp); + candidate = rxrpc_create_peer(local, srx, hash_key, gfp); if (!candidate) { _leave(" = NULL [nomem]"); return NULL; @@ -382,7 +379,7 @@ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace int r; __refcount_inc(&peer->ref, &r); - trace_rxrpc_peer(peer->debug_id, why, r + 1); + trace_rxrpc_peer(peer->debug_id, r + 1, why); return peer; } @@ -439,25 +436,6 @@ void rxrpc_put_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) } /* - * Drop a ref on a peer record where the caller already holds the - * peer_hash_lock. - */ -void rxrpc_put_peer_locked(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) -{ - unsigned int debug_id = peer->debug_id; - bool dead; - int r; - - dead = __refcount_dec_and_test(&peer->ref, &r); - trace_rxrpc_peer(debug_id, r - 1, why); - if (dead) { - hash_del_rcu(&peer->hash_link); - list_del_init(&peer->keepalive_link); - rxrpc_free_peer(peer); - } -} - -/* * Make sure all peer records have been discarded. 
*/ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 3a59591ec061..682636d3b060 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -12,13 +12,13 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { [RXRPC_CONN_UNUSED] = "Unused ", + [RXRPC_CONN_CLIENT_UNSECURED] = "ClUnsec ", [RXRPC_CONN_CLIENT] = "Client ", [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc", [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", [RXRPC_CONN_SERVICE] = "SvSecure", - [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", - [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CONN_ABORTED] = "Aborted ", }; /* @@ -51,10 +51,10 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_local *local; struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + enum rxrpc_call_state state; unsigned long timeout = 0; rxrpc_seq_t acks_hard_ack; char lbuff[50], rbuff[50]; - u64 wtmp; if (v == &rxnet->calls) { seq_puts(seq, @@ -75,13 +75,13 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) sprintf(rbuff, "%pISpc", &call->dest_srx.transport); - if (call->state != RXRPC_CALL_SERVER_PREALLOC) { + state = rxrpc_call_state(call); + if (state != RXRPC_CALL_SERVER_PREALLOC) { timeout = READ_ONCE(call->expect_rx_by); timeout -= jiffies; } acks_hard_ack = READ_ONCE(call->acks_hard_ack); - wtmp = atomic64_read_acquire(&call->ackr_window); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n", @@ -92,11 +92,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call->call_id, rxrpc_is_service_call(call) ? "Svc" : "Clt", refcount_read(&call->ref), - rxrpc_call_states[call->state], + rxrpc_call_states[state], call->abort_code, call->debug_id, acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, - lower_32_bits(wtmp), upper_32_bits(wtmp) - lower_32_bits(wtmp), + call->ackr_window, call->ackr_wtop - call->ackr_window, call->rx_serial, call->cong_cwnd, timeout); @@ -143,6 +143,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) { struct rxrpc_connection *conn; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + const char *state; char lbuff[50], rbuff[50]; if (v == &rxnet->conn_proc_list) { @@ -163,9 +164,11 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) } sprintf(lbuff, "%pISpc", &conn->local->srx.transport); - sprintf(rbuff, "%pISpc", &conn->peer->srx.transport); print: + state = rxrpc_is_conn_aborted(conn) ? + rxrpc_call_completions[conn->completion] : + rxrpc_conn_states[conn->state]; seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %s %3u %3d" " %s %08x %08x %08x %08x %08x %08x %08x\n", @@ -176,7 +179,7 @@ print: rxrpc_conn_is_service(conn) ? 
"Svc" : "Clt", refcount_read(&conn->ref), atomic_read(&conn->active), - rxrpc_conn_states[conn->state], + state, key_serial(conn->key), atomic_read(&conn->serial), conn->hi_serial, diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 36b25d003cf0..a482f88c5fc5 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -40,12 +40,12 @@ void rxrpc_notify_socket(struct rxrpc_call *call) call->notify_rx(sk, call, call->user_call_ID); spin_unlock(&call->notify_lock); } else { - write_lock(&rx->recvmsg_lock); + spin_lock(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } - write_unlock(&rx->recvmsg_lock); + spin_unlock(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); @@ -59,85 +59,6 @@ void rxrpc_notify_socket(struct rxrpc_call *call) } /* - * Transition a call to the complete state. - */ -bool __rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - if (call->state < RXRPC_CALL_COMPLETE) { - call->abort_code = abort_code; - call->error = error; - call->completion = compl; - call->state = RXRPC_CALL_COMPLETE; - trace_rxrpc_call_complete(call); - wake_up(&call->waitq); - rxrpc_notify_socket(call); - return true; - } - return false; -} - -bool rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - bool ret = false; - - if (call->state < RXRPC_CALL_COMPLETE) { - write_lock(&call->state_lock); - ret = __rxrpc_set_call_completion(call, compl, abort_code, error); - write_unlock(&call->state_lock); - } - return ret; -} - -/* - * Record that a call successfully completed. - */ -bool __rxrpc_call_completed(struct rxrpc_call *call) -{ - return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); -} - -bool rxrpc_call_completed(struct rxrpc_call *call) -{ - bool ret = false; - - if (call->state < RXRPC_CALL_COMPLETE) { - write_lock(&call->state_lock); - ret = __rxrpc_call_completed(call); - write_unlock(&call->state_lock); - } - return ret; -} - -/* - * Record that a call is locally aborted. - */ -bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, - rxrpc_seq_t seq, u32 abort_code, int error) -{ - trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, - abort_code, error); - return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, - abort_code, error); -} - -bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, - rxrpc_seq_t seq, u32 abort_code, int error) -{ - bool ret; - - write_lock(&call->state_lock); - ret = __rxrpc_abort_call(why, call, seq, abort_code, error); - write_unlock(&call->state_lock); - return ret; -} - -/* * Pass a call terminating message to userspace. */ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) @@ -168,53 +89,18 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); break; default: - pr_err("Invalid terminal call state %u\n", call->state); + pr_err("Invalid terminal call state %u\n", call->completion); BUG(); break; } trace_rxrpc_recvdata(call, rxrpc_recvmsg_terminal, - lower_32_bits(atomic64_read(&call->ackr_window)) - 1, + call->ackr_window - 1, call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } /* - * End the packet reception phase. 
- */ -static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) -{ - rxrpc_seq_t whigh = READ_ONCE(call->rx_highest_seq); - - _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); - - trace_rxrpc_receive(call, rxrpc_receive_end, 0, whigh); - - if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) - rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); - - write_lock(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - __rxrpc_call_completed(call); - write_unlock(&call->state_lock); - break; - - case RXRPC_CALL_SERVER_RECV_REQUEST: - call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; - write_unlock(&call->state_lock); - rxrpc_propose_delay_ACK(call, serial, - rxrpc_propose_ack_processing_op); - break; - default: - write_unlock(&call->state_lock); - break; - } -} - -/* * Discard a packet we've used up and advance the Rx window by one. */ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) @@ -244,15 +130,14 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) trace_rxrpc_receive(call, last ? rxrpc_receive_rotate_last : rxrpc_receive_rotate, serial, call->rx_consumed); - if (last) { - rxrpc_end_rx_phase(call, serial); - return; - } + + if (last) + set_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags); /* Check to see if there's an ACK that needs sending. */ acked = atomic_add_return(call->rx_consumed - old_consumed, &call->ackr_nr_consumed); - if (acked > 2 && + if (acked > 8 && !test_and_set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_poke_call(call, rxrpc_call_poke_idle); } @@ -272,7 +157,8 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) /* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA - * (returns 1). If more packets are required, it returns -EAGAIN. + * (returns 1). If more packets are required, it returns -EAGAIN and if the + * call has failed it returns -EIO. 
*/ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, struct iov_iter *iter, @@ -288,8 +174,14 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; - if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { - seq = lower_32_bits(atomic64_read(&call->ackr_window)) - 1; + if (rxrpc_call_has_failed(call)) { + seq = call->ackr_window - 1; + ret = -EIO; + goto done; + } + + if (test_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags)) { + seq = call->ackr_window - 1; ret = 1; goto done; } @@ -312,14 +204,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (rx_pkt_offset == 0) { ret2 = rxrpc_verify_data(call, skb); - rx_pkt_offset = sp->offset; - rx_pkt_len = sp->len; trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, - rx_pkt_offset, rx_pkt_len, ret2); + sp->offset, sp->len, ret2); if (ret2 < 0) { + kdebug("verify = %d", ret2); ret = ret2; goto out; } + rx_pkt_offset = sp->offset; + rx_pkt_len = sp->len; } else { trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); @@ -388,13 +281,14 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct list_head *l; + unsigned int call_debug_id = 0; size_t copied = 0; long timeo; int ret; DEFINE_WAIT(wait); - trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0); + trace_rxrpc_recvmsg(0, rxrpc_recvmsg_enter, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; @@ -431,7 +325,7 @@ try_again: if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; - trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, 0); + trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); @@ -440,17 +334,31 @@ try_again: /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. + * We also want to weed out calls that got requeued whilst we were + * shovelling data out. */ - write_lock(&rx->recvmsg_lock); + spin_lock(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); + + if (!rxrpc_call_is_complete(call) && + skb_queue_empty(&call->recvmsg_queue)) { + list_del_init(&call->recvmsg_link); + spin_unlock(&rx->recvmsg_lock); + release_sock(&rx->sk); + trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); + rxrpc_put_call(call, rxrpc_call_put_recvmsg); + goto try_again; + } + if (!(flags & MSG_PEEK)) list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); - write_unlock(&rx->recvmsg_lock); + spin_unlock(&rx->recvmsg_lock); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0); + call_debug_id = call->debug_id; + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); /* We're going to drop the socket lock, so we need to lock the call * against interference by sendmsg. 
@@ -492,36 +400,37 @@ try_again: msg->msg_namelen = len; } - switch (READ_ONCE(call->state)) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, - flags, &copied); - if (ret == -EAGAIN) - ret = 0; - - if (!skb_queue_empty(&call->recvmsg_queue)) - rxrpc_notify_socket(call); - break; - default: + ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, + flags, &copied); + if (ret == -EAGAIN) ret = 0; - break; - } - + if (ret == -EIO) + goto call_failed; if (ret < 0) goto error_unlock_call; - if (call->state == RXRPC_CALL_COMPLETE) { - ret = rxrpc_recvmsg_term(call, msg); - if (ret < 0) - goto error_unlock_call; - if (!(flags & MSG_PEEK)) - rxrpc_release_call(rx, call); - msg->msg_flags |= MSG_EOR; - ret = 1; - } + if (rxrpc_call_is_complete(call) && + skb_queue_empty(&call->recvmsg_queue)) + goto call_complete; + if (rxrpc_call_has_failed(call)) + goto call_failed; + + if (!skb_queue_empty(&call->recvmsg_queue)) + rxrpc_notify_socket(call); + goto not_yet_complete; +call_failed: + rxrpc_purge_queue(&call->recvmsg_queue); +call_complete: + ret = rxrpc_recvmsg_term(call, msg); + if (ret < 0) + goto error_unlock_call; + if (!(flags & MSG_PEEK)) + rxrpc_release_call(rx, call); + msg->msg_flags |= MSG_EOR; + ret = 1; + +not_yet_complete: if (ret == 0) msg->msg_flags |= MSG_MORE; else @@ -531,22 +440,22 @@ try_again: error_unlock_call: mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_recvmsg); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret); return ret; error_requeue_call: if (!(flags & MSG_PEEK)) { - write_lock(&rx->recvmsg_lock); + spin_lock(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); - write_unlock(&rx->recvmsg_lock); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0); + spin_unlock(&rx->recvmsg_lock); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); } error_no_call: release_sock(&rx->sk); error_trace: - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret); return ret; wait_interrupted: @@ -584,49 +493,34 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, size_t offset = 0; int ret; - _enter("{%d,%s},%zu,%d", - call->debug_id, rxrpc_call_states[call->state], - *_len, want_more); - - ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_SECURING); + _enter("{%d},%zu,%d", call->debug_id, *_len, want_more); mutex_lock(&call->user_mutex); - switch (READ_ONCE(call->state)) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = rxrpc_recvmsg_data(sock, call, NULL, iter, - *_len, 0, &offset); - *_len -= offset; - if (ret < 0) - goto out; - - /* We can only reach here with a partially full buffer if we - * have reached the end of the data. We must otherwise have a - * full buffer or have been given -EAGAIN. 
- */ - if (ret == 1) { - if (iov_iter_count(iter) > 0) - goto short_data; - if (!want_more) - goto read_phase_complete; - ret = 0; - goto out; - } - - if (!want_more) - goto excess_data; + ret = rxrpc_recvmsg_data(sock, call, NULL, iter, *_len, 0, &offset); + *_len -= offset; + if (ret == -EIO) + goto call_failed; + if (ret < 0) goto out; - case RXRPC_CALL_COMPLETE: - goto call_complete; - - default: - ret = -EINPROGRESS; + /* We can only reach here with a partially full buffer if we have + * reached the end of the data. We must otherwise have a full buffer + * or have been given -EAGAIN. + */ + if (ret == 1) { + if (iov_iter_count(iter) > 0) + goto short_data; + if (!want_more) + goto read_phase_complete; + ret = 0; goto out; } + if (!want_more) + goto excess_data; + goto out; + read_phase_complete: ret = 1; out: @@ -637,14 +531,18 @@ out: return ret; short_data: - trace_rxrpc_rx_eproto(call, 0, tracepoint_string("short_data")); + trace_rxrpc_abort(call->debug_id, rxrpc_recvmsg_short_data, + call->cid, call->call_id, call->rx_consumed, + 0, -EBADMSG); ret = -EBADMSG; goto out; excess_data: - trace_rxrpc_rx_eproto(call, 0, tracepoint_string("excess_data")); + trace_rxrpc_abort(call->debug_id, rxrpc_recvmsg_excess_data, + call->cid, call->call_id, call->rx_consumed, + 0, -EMSGSIZE); ret = -EMSGSIZE; goto out; -call_complete: +call_failed: *_abort = call->abort_code; ret = call->error; if (call->completion == RXRPC_CALL_SUCCEEDED) { diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index d1233720e05f..1bf571a66e02 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -411,18 +411,15 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; struct scatterlist sg[16]; - bool aborted; u32 data_size, buf; u16 check; int ret; _enter(""); - if (sp->len < 8) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_hdr", "V1H", - RXKADSEALEDINCON); - goto protocol_error; - } + if (sp->len < 8) + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_1_short_header); /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. 
@@ -442,11 +439,9 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_zero(req); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_len", "XV1", - RXKADDATALEN); - goto protocol_error; - } + if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) + return rxrpc_abort_eproto(call, skb, RXKADDATALEN, + rxkad_abort_1_short_encdata); sp->offset += sizeof(sechdr); sp->len -= sizeof(sechdr); @@ -456,26 +451,16 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, check = buf >> 16; check ^= seq ^ call->call_id; check &= 0xffff; - if (check != 0) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_check", "V1C", - RXKADSEALEDINCON); - goto protocol_error; - } - - if (data_size > sp->len) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_datalen", "V1L", - RXKADDATALEN); - goto protocol_error; - } + if (check != 0) + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_1_short_check); + if (data_size > sp->len) + return rxrpc_abort_eproto(call, skb, RXKADDATALEN, + rxkad_abort_1_short_data); sp->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; - -protocol_error: - if (aborted) - rxrpc_send_abort_packet(call); - return -EPROTO; } /* @@ -490,18 +475,15 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; - bool aborted; u32 data_size, buf; u16 check; int nsg, ret; _enter(",{%d}", sp->len); - if (sp->len < 8) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_hdr", "V2H", - RXKADSEALEDINCON); - goto protocol_error; - } + if (sp->len < 8) + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_2_short_header); /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. 
@@ -513,7 +495,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } else { sg = kmalloc_array(nsg, sizeof(*sg), GFP_NOIO); if (!sg) - goto nomem; + return -ENOMEM; } sg_init_table(sg, nsg); @@ -537,11 +519,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, kfree(sg); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_len", "XV2", - RXKADDATALEN); - goto protocol_error; - } + if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) + return rxrpc_abort_eproto(call, skb, RXKADDATALEN, + rxkad_abort_2_short_len); sp->offset += sizeof(sechdr); sp->len -= sizeof(sechdr); @@ -551,30 +531,17 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, check = buf >> 16; check ^= seq ^ call->call_id; check &= 0xffff; - if (check != 0) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_check", "V2C", - RXKADSEALEDINCON); - goto protocol_error; - } + if (check != 0) + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_2_short_check); - if (data_size > sp->len) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_datalen", "V2L", - RXKADDATALEN); - goto protocol_error; - } + if (data_size > sp->len) + return rxrpc_abort_eproto(call, skb, RXKADDATALEN, + rxkad_abort_2_short_data); sp->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; - -protocol_error: - if (aborted) - rxrpc_send_abort_packet(call); - return -EPROTO; - -nomem: - _leave(" = -ENOMEM"); - return -ENOMEM; } /* @@ -590,7 +557,6 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) __be32 buf[2]; } crypto __aligned(8); rxrpc_seq_t seq = sp->hdr.seq; - bool aborted; int ret; u16 cksum; u32 x, y; @@ -627,9 +593,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) cksum = 1; /* zero checksums are not permitted */ if (cksum != sp->hdr.cksum) { - aborted = rxrpc_abort_eproto(call, skb, "rxkad_csum", "VCK", - RXKADSEALEDINCON); - goto protocol_error; + ret = rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_bad_checksum); + goto out; } switch (call->conn->security_level) { @@ -647,13 +613,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) break; } +out: skcipher_request_free(req); return ret; - -protocol_error: - if (aborted) - rxrpc_send_abort_packet(call); - return -EPROTO; } /* @@ -821,34 +783,30 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, * respond to a challenge packet */ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { const struct rxrpc_key_token *token; struct rxkad_challenge challenge; struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - const char *eproto; - u32 version, nonce, min_level, abort_code; - int ret; + u32 version, nonce, min_level; + int ret = -EPROTO; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); - eproto = tracepoint_string("chall_no_key"); - abort_code = RX_PROTOCOL_ERROR; if (!conn->key) - goto protocol_error; + return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxkad_abort_chall_no_key); - abort_code = RXKADEXPIRED; ret = key_validate(conn->key); if (ret < 0) - goto other_error; + return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); - eproto = tracepoint_string("chall_short"); - abort_code = 
RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &challenge, sizeof(challenge)) < 0) - goto protocol_error; + return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_chall_short); version = ntohl(challenge.version); nonce = ntohl(challenge.nonce); @@ -856,15 +814,13 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); - eproto = tracepoint_string("chall_ver"); - abort_code = RXKADINCONSISTENCY; if (version != RXKAD_VERSION) - goto protocol_error; + return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_chall_version); - abort_code = RXKADLEVELFAIL; - ret = -EACCES; if (conn->security_level < min_level) - goto other_error; + return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, + rxkad_abort_chall_level); token = conn->key->payload.data[0]; @@ -893,13 +849,6 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); kfree(resp); return ret; - -protocol_error: - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); - ret = -EPROTO; -other_error: - *_abort_code = abort_code; - return ret; } /* @@ -910,20 +859,15 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, void *ticket, size_t ticket_len, struct rxrpc_crypt *_session_key, - time64_t *_expiry, - u32 *_abort_code) + time64_t *_expiry) { struct skcipher_request *req; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv, key; struct scatterlist sg[1]; struct in_addr addr; unsigned int life; - const char *eproto; time64_t issue, now; bool little_endian; - int ret; - u32 abort_code; u8 *p, *q, *name, *end; _enter("{%d},{%x}", conn->debug_id, key_serial(server_key)); @@ -935,10 +879,9 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, memcpy(&iv, &server_key->payload.data[2], sizeof(iv)); - ret = -ENOMEM; req = skcipher_request_alloc(server_key->payload.data[0], GFP_NOFS); if (!req) - goto temporary_error; + return -ENOMEM; sg_init_one(&sg[0], ticket, ticket_len); skcipher_request_set_callback(req, 0, NULL, NULL); @@ -949,18 +892,21 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, p = ticket; end = p + ticket_len; -#define Z(field) \ - ({ \ - u8 *__str = p; \ - eproto = tracepoint_string("rxkad_bad_"#field); \ - q = memchr(p, 0, end - p); \ - if (!q || q - p > (field##_SZ)) \ - goto bad_ticket; \ - for (; p < q; p++) \ - if (!isprint(*p)) \ - goto bad_ticket; \ - p++; \ - __str; \ +#define Z(field, fieldl) \ + ({ \ + u8 *__str = p; \ + q = memchr(p, 0, end - p); \ + if (!q || q - p > field##_SZ) \ + return rxrpc_abort_conn( \ + conn, skb, RXKADBADTICKET, -EPROTO, \ + rxkad_abort_resp_tkt_##fieldl); \ + for (; p < q; p++) \ + if (!isprint(*p)) \ + return rxrpc_abort_conn( \ + conn, skb, RXKADBADTICKET, -EPROTO, \ + rxkad_abort_resp_tkt_##fieldl); \ + p++; \ + __str; \ }) /* extract the ticket flags */ @@ -969,20 +915,20 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, p++; /* extract the authentication name */ - name = Z(ANAME); + name = Z(ANAME, aname); _debug("KIV ANAME: %s", name); /* extract the principal's instance */ - name = Z(INST); + name = Z(INST, inst); _debug("KIV INST : %s", name); /* extract the principal's authentication domain */ - name = Z(REALM); + name = Z(REALM, realm); _debug("KIV REALM: %s", name); - eproto = tracepoint_string("rxkad_bad_len"); if (end - p < 4 + 8 + 4 + 2) - goto 
bad_ticket; + return rxrpc_abort_conn(conn, skb, RXKADBADTICKET, -EPROTO, + rxkad_abort_resp_tkt_short); /* get the IPv4 address of the entity that requested the ticket */ memcpy(&addr, p, sizeof(addr)); @@ -1014,38 +960,23 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, _debug("KIV ISSUE: %llx [%llx]", issue, now); /* check the ticket is in date */ - if (issue > now) { - abort_code = RXKADNOAUTH; - ret = -EKEYREJECTED; - goto other_error; - } - - if (issue < now - life) { - abort_code = RXKADEXPIRED; - ret = -EKEYEXPIRED; - goto other_error; - } + if (issue > now) + return rxrpc_abort_conn(conn, skb, RXKADNOAUTH, -EKEYREJECTED, + rxkad_abort_resp_tkt_future); + if (issue < now - life) + return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, -EKEYEXPIRED, + rxkad_abort_resp_tkt_expired); *_expiry = issue + life; /* get the service name */ - name = Z(SNAME); + name = Z(SNAME, sname); _debug("KIV SNAME: %s", name); /* get the service instance name */ - name = Z(INST); + name = Z(INST, sinst); _debug("KIV SINST: %s", name); return 0; - -bad_ticket: - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); - abort_code = RXKADBADTICKET; - ret = -EPROTO; -other_error: - *_abort_code = abort_code; - return ret; -temporary_error: - return ret; } /* @@ -1086,17 +1017,15 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, * verify a response */ static int rxkad_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb) { struct rxkad_response *response; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; struct key *server_key; - const char *eproto; time64_t expiry; void *ticket; - u32 abort_code, version, kvno, ticket_len, level; + u32 version, kvno, ticket_len, level; __be32 csum; int ret, i; @@ -1104,22 +1033,18 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, server_key = rxrpc_look_up_server_security(conn, skb, 0, 0); if (IS_ERR(server_key)) { - switch (PTR_ERR(server_key)) { + ret = PTR_ERR(server_key); + switch (ret) { case -ENOKEY: - abort_code = RXKADUNKNOWNKEY; - break; + return rxrpc_abort_conn(conn, skb, RXKADUNKNOWNKEY, ret, + rxkad_abort_resp_nokey); case -EKEYEXPIRED: - abort_code = RXKADEXPIRED; - break; + return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_resp_key_expired); default: - abort_code = RXKADNOAUTH; - break; + return rxrpc_abort_conn(conn, skb, RXKADNOAUTH, ret, + rxkad_abort_resp_key_rejected); } - trace_rxrpc_abort(0, "SVK", - sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - abort_code, PTR_ERR(server_key)); - *_abort_code = abort_code; - return -EPROTO; } ret = -ENOMEM; @@ -1127,11 +1052,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (!response) goto temporary_error; - eproto = tracepoint_string("rxkad_rsp_short"); - abort_code = RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - response, sizeof(*response)) < 0) + response, sizeof(*response)) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_resp_short); goto protocol_error; + } version = ntohl(response->version); ticket_len = ntohl(response->ticket_len); @@ -1139,20 +1065,23 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len); - eproto = tracepoint_string("rxkad_rsp_ver"); - abort_code = RXKADINCONSISTENCY; - if (version != RXKAD_VERSION) + if (version != RXKAD_VERSION) { + rxrpc_abort_conn(conn, skb, 
RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_resp_version); goto protocol_error; + } - eproto = tracepoint_string("rxkad_rsp_tktlen"); - abort_code = RXKADTICKETLEN; - if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) + if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) { + rxrpc_abort_conn(conn, skb, RXKADTICKETLEN, -EPROTO, + rxkad_abort_resp_tkt_len); goto protocol_error; + } - eproto = tracepoint_string("rxkad_rsp_unkkey"); - abort_code = RXKADUNKNOWNKEY; - if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) + if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) { + rxrpc_abort_conn(conn, skb, RXKADUNKNOWNKEY, -EPROTO, + rxkad_abort_resp_unknown_tkt); goto protocol_error; + } /* extract the kerberos ticket and decrypt and decode it */ ret = -ENOMEM; @@ -1160,15 +1089,15 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (!ticket) goto temporary_error_free_resp; - eproto = tracepoint_string("rxkad_tkt_short"); - abort_code = RXKADPACKETSHORT; - ret = skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), - ticket, ticket_len); - if (ret < 0) - goto temporary_error_free_ticket; + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), + ticket, ticket_len) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_resp_short_tkt); + goto protocol_error; + } ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len, - &session_key, &expiry, _abort_code); + &session_key, &expiry); if (ret < 0) goto temporary_error_free_ticket; @@ -1176,56 +1105,61 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, * response */ rxkad_decrypt_response(conn, response, &session_key); - eproto = tracepoint_string("rxkad_rsp_param"); - abort_code = RXKADSEALEDINCON; - if (ntohl(response->encrypted.epoch) != conn->proto.epoch) - goto protocol_error_free; - if (ntohl(response->encrypted.cid) != conn->proto.cid) - goto protocol_error_free; - if (ntohl(response->encrypted.securityIndex) != conn->security_ix) + if (ntohl(response->encrypted.epoch) != conn->proto.epoch || + ntohl(response->encrypted.cid) != conn->proto.cid || + ntohl(response->encrypted.securityIndex) != conn->security_ix) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_param); goto protocol_error_free; + } + csum = response->encrypted.checksum; response->encrypted.checksum = 0; rxkad_calc_response_checksum(response); - eproto = tracepoint_string("rxkad_rsp_csum"); - if (response->encrypted.checksum != csum) + if (response->encrypted.checksum != csum) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_checksum); goto protocol_error_free; + } - spin_lock(&conn->bundle->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { - struct rxrpc_call *call; u32 call_id = ntohl(response->encrypted.call_id[i]); + u32 counter = READ_ONCE(conn->channels[i].call_counter); + + if (call_id > INT_MAX) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_callid); + goto protocol_error_free; + } - eproto = tracepoint_string("rxkad_rsp_callid"); - if (call_id > INT_MAX) - goto protocol_error_unlock; - - eproto = tracepoint_string("rxkad_rsp_callctr"); - if (call_id < conn->channels[i].call_counter) - goto protocol_error_unlock; - - eproto = tracepoint_string("rxkad_rsp_callst"); - if (call_id > conn->channels[i].call_counter) { - call = rcu_dereference_protected( - conn->channels[i].call, - lockdep_is_held(&conn->bundle->channel_lock)); - if (call && call->state < RXRPC_CALL_COMPLETE) - goto 
protocol_error_unlock; + if (call_id < counter) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_call_ctr); + goto protocol_error_free; + } + + if (call_id > counter) { + if (conn->channels[i].call) { + rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_call_state); + goto protocol_error_free; + } conn->channels[i].call_counter = call_id; } } - spin_unlock(&conn->bundle->channel_lock); - eproto = tracepoint_string("rxkad_rsp_seq"); - abort_code = RXKADOUTOFSEQUENCE; - if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) + if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) { + rxrpc_abort_conn(conn, skb, RXKADOUTOFSEQUENCE, -EPROTO, + rxkad_abort_resp_ooseq); goto protocol_error_free; + } - eproto = tracepoint_string("rxkad_rsp_level"); - abort_code = RXKADLEVELFAIL; level = ntohl(response->encrypted.level); - if (level > RXRPC_SECURITY_ENCRYPT) + if (level > RXRPC_SECURITY_ENCRYPT) { + rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EPROTO, + rxkad_abort_resp_level); goto protocol_error_free; + } conn->security_level = level; /* create a key to hold the security data and expiration time - after @@ -1240,15 +1174,11 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _leave(" = 0"); return 0; -protocol_error_unlock: - spin_unlock(&conn->bundle->channel_lock); protocol_error_free: kfree(ticket); protocol_error: kfree(response); - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); key_put(server_key); - *_abort_code = abort_code; return -EPROTO; temporary_error_free_ticket: diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 66f5eea291ff..16dcabb71ebe 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -10,6 +10,8 @@ #include <linux/slab.h> #include <net/sock.h> #include <net/af_rxrpc.h> +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> MODULE_DESCRIPTION("rxperf test server (afs)"); MODULE_AUTHOR("Red Hat, Inc."); @@ -275,7 +277,7 @@ static void rxperf_deliver_to_call(struct work_struct *work) struct rxperf_call *call = container_of(work, struct rxperf_call, work); enum rxperf_call_state state; u32 abort_code, remote_abort = 0; - int ret; + int ret = 0; if (call->state == RXPERF_CALL_COMPLETE) return; @@ -307,12 +309,14 @@ static void rxperf_deliver_to_call(struct work_struct *work) case -EOPNOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - abort_code, ret, "GOP"); + abort_code, ret, + rxperf_abort_op_not_supported); goto call_complete; case -ENOTSUPP: abort_code = RX_USER_ABORT; rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - abort_code, ret, "GUA"); + abort_code, ret, + rxperf_abort_op_not_supported); goto call_complete; case -EIO: pr_err("Call %u in bad state %u\n", @@ -324,11 +328,13 @@ static void rxperf_deliver_to_call(struct work_struct *work) case -ENOMEM: case -EFAULT: rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - RXGEN_SS_UNMARSHAL, ret, "GUM"); + RXGEN_SS_UNMARSHAL, ret, + rxperf_abort_unmarshal_error); goto call_complete; default: rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - RX_CALL_DEAD, ret, "GER"); + RX_CALL_DEAD, ret, + rxperf_abort_general_error); goto call_complete; } } @@ -523,7 +529,8 @@ static int rxperf_process_call(struct rxperf_call *call) if (n == -ENOMEM) rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, - RXGEN_SS_MARSHAL, -ENOMEM, "GOM"); + RXGEN_SS_MARSHAL, -ENOMEM, + rxperf_abort_oom); return n; } diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 
209f2c25a0da..cb8dd1d3b1d4 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -67,13 +67,13 @@ const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) */ int rxrpc_init_client_call_security(struct rxrpc_call *call) { - const struct rxrpc_security *sec; + const struct rxrpc_security *sec = &rxrpc_no_security; struct rxrpc_key_token *token; struct key *key = call->key; int ret; if (!key) - return 0; + goto found; ret = key_validate(key); if (ret < 0) @@ -88,7 +88,7 @@ int rxrpc_init_client_call_security(struct rxrpc_call *call) found: call->security = sec; - _leave(" = 0"); + call->security_ix = sec->security_index; return 0; } @@ -97,38 +97,31 @@ found: */ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) { - const struct rxrpc_security *sec; struct rxrpc_key_token *token; struct key *key = conn->key; - int ret; + int ret = 0; _enter("{%d},{%x}", conn->debug_id, key_serial(key)); - if (!key) - return 0; - - ret = key_validate(key); - if (ret < 0) - return ret; - for (token = key->payload.data[0]; token; token = token->next) { - sec = rxrpc_security_lookup(token->security_index); - if (sec) + if (token->security_index == conn->security->security_index) goto found; } return -EKEYREJECTED; found: - conn->security = sec; - - ret = conn->security->init_connection_security(conn, token); - if (ret < 0) { - conn->security = &rxrpc_no_security; - return ret; + mutex_lock(&conn->security_lock); + if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) { + ret = conn->security->init_connection_security(conn, token); + if (ret == 0) { + spin_lock(&conn->state_lock); + if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) + conn->state = RXRPC_CONN_CLIENT; + spin_unlock(&conn->state_lock); + } } - - _leave(" = 0"); - return 0; + mutex_unlock(&conn->security_lock); + return ret; } /* @@ -144,21 +137,15 @@ const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *rx, sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { - trace_rxrpc_abort(0, "SVS", - sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EKEYREJECTED); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; + rxrpc_direct_abort(skb, rxrpc_abort_unsupported_security, + RX_INVALID_OPERATION, -EKEYREJECTED); return NULL; } if (sp->hdr.securityIndex != RXRPC_SECURITY_NONE && !rx->securities) { - trace_rxrpc_abort(0, "SVR", - sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EKEYREJECTED); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = sec->no_key_abort; + rxrpc_direct_abort(skb, rxrpc_abort_no_service_key, + sec->no_key_abort, -EKEYREJECTED); return NULL; } @@ -191,9 +178,9 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn, sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex); - rcu_read_lock(); + read_lock(&conn->local->services_lock); - rx = rcu_dereference(conn->local->service); + rx = conn->local->service; if (!rx) goto out; @@ -215,6 +202,6 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn, } out: - rcu_read_unlock(); + read_unlock(&conn->local->services_lock); return key; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9fa7e37f7155..da49fcf1c456 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -18,6 +18,81 @@ #include "ar-internal.h" /* + * Propose an abort to be made in the I/O thread. 
+ */ +bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, + enum rxrpc_abort_reason why) +{ + _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); + + if (!call->send_abort && !rxrpc_call_is_complete(call)) { + call->send_abort_why = why; + call->send_abort_err = error; + call->send_abort_seq = 0; + /* Request abort locklessly vs rxrpc_input_call_event(). */ + smp_store_release(&call->send_abort, abort_code); + rxrpc_poke_call(call, rxrpc_call_poke_abort); + return true; + } + + return false; +} + +/* + * Wait for a call to become connected. Interruption here doesn't cause the + * call to be aborted. + */ +static int rxrpc_wait_to_be_connected(struct rxrpc_call *call, long *timeo) +{ + DECLARE_WAITQUEUE(myself, current); + int ret = 0; + + _enter("%d", call->debug_id); + + if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) + return call->error; + + add_wait_queue_exclusive(&call->waitq, &myself); + + for (;;) { + ret = call->error; + if (ret < 0) + break; + + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + case RXRPC_PREINTERRUPTIBLE: + set_current_state(TASK_INTERRUPTIBLE); + break; + case RXRPC_UNINTERRUPTIBLE: + default: + set_current_state(TASK_UNINTERRUPTIBLE); + break; + } + if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) { + ret = call->error; + break; + } + if ((call->interruptibility == RXRPC_INTERRUPTIBLE || + call->interruptibility == RXRPC_PREINTERRUPTIBLE) && + signal_pending(current)) { + ret = sock_intr_errno(*timeo); + break; + } + *timeo = schedule_timeout(*timeo); + } + + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); + + if (ret == 0 && rxrpc_call_is_complete(call)) + ret = call->error; + + _leave(" = %d", ret); + return ret; +} + +/* * Return true if there's sufficient Tx queue space. 
*/ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) @@ -39,7 +114,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, if (rxrpc_check_tx_space(call, NULL)) return 0; - if (call->state >= RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) return call->error; if (signal_pending(current)) @@ -74,7 +149,7 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, if (rxrpc_check_tx_space(call, &tx_win)) return 0; - if (call->state >= RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) return call->error; if (timeout == 0 && @@ -103,7 +178,7 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, if (rxrpc_check_tx_space(call, NULL)) return 0; - if (call->state >= RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) return call->error; trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); @@ -168,7 +243,6 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { - unsigned long now; rxrpc_seq_t seq = txb->seq; bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags), poke; @@ -191,36 +265,10 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, poke = list_empty(&call->tx_sendmsg); list_add_tail(&txb->call_link, &call->tx_sendmsg); call->tx_prepared = seq; + if (last) + rxrpc_notify_end_tx(rx, call, notify_end_tx); spin_unlock(&call->tx_lock); - if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { - _debug("________awaiting reply/ACK__________"); - write_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; - rxrpc_notify_end_tx(rx, call, notify_end_tx); - break; - case RXRPC_CALL_SERVER_ACK_REQUEST: - call->state = RXRPC_CALL_SERVER_SEND_REPLY; - now = jiffies; - WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); - if (call->ackr_reason == RXRPC_ACK_DELAY) - call->ackr_reason = 0; - trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); - if (!last) - break; - fallthrough; - case RXRPC_CALL_SERVER_SEND_REPLY: - call->state = RXRPC_CALL_SERVER_AWAIT_ACK; - rxrpc_notify_end_tx(rx, call, notify_end_tx); - break; - default: - break; - } - write_unlock(&call->state_lock); - } - if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } @@ -245,6 +293,16 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + ret = rxrpc_wait_to_be_connected(call, &timeo); + if (ret < 0) + return ret; + + if (call->conn->state == RXRPC_CONN_CLIENT_UNSECURED) { + ret = rxrpc_init_client_conn_security(call->conn); + if (ret < 0) + return ret; + } + /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -252,15 +310,20 @@ reload: ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; - state = READ_ONCE(call->state); + state = rxrpc_call_state(call); ret = -ESHUTDOWN; if (state >= RXRPC_CALL_COMPLETE) goto maybe_error; ret = -EPROTO; if (state != RXRPC_CALL_CLIENT_SEND_REQUEST && state != RXRPC_CALL_SERVER_ACK_REQUEST && - state != RXRPC_CALL_SERVER_SEND_REPLY) + state != RXRPC_CALL_SERVER_SEND_REPLY) { + /* Request phase complete for this client call */ + trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, + call->cid, call->call_id, call->rx_consumed, + 0, -EPROTO); goto maybe_error; + } ret = -EMSGSIZE; if (call->tx_total_len != -1) { @@ -329,7 +392,7 @@ reload: /* check for the far side aborting the call or a network error * occurring 
*/ - if (call->state == RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) goto call_terminated; /* add the packet to the send queue if it's now full */ @@ -354,12 +417,9 @@ reload: success: ret = copied; - if (READ_ONCE(call->state) == RXRPC_CALL_COMPLETE) { - read_lock(&call->state_lock); - if (call->error < 0) - ret = call->error; - read_unlock(&call->state_lock); - } + if (rxrpc_call_is_complete(call) && + call->error < 0) + ret = call->error; out: call->tx_pending = txb; _leave(" = %d", ret); @@ -543,7 +603,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ - rxrpc_put_peer(cp.peer, rxrpc_peer_put_discard_tmp); _leave(" = %p\n", call); return call; } @@ -556,7 +615,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) { - enum rxrpc_call_state state; struct rxrpc_call *call; unsigned long now, j; bool dropped_lock = false; @@ -598,10 +656,10 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); /* ... and we have the call lock. */ ret = 0; - if (READ_ONCE(call->state) == RXRPC_CALL_COMPLETE) + if (rxrpc_call_is_complete(call)) goto out_put_unlock; } else { - switch (READ_ONCE(call->state)) { + switch (rxrpc_call_state(call)) { case RXRPC_CALL_UNINITIALISED: case RXRPC_CALL_CLIENT_AWAIT_CONN: case RXRPC_CALL_SERVER_PREALLOC: @@ -625,7 +683,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) - goto error_put; + goto out_put_unlock; call->tx_total_len = p.call.tx_total_len; } } @@ -655,17 +713,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) break; } - state = READ_ONCE(call->state); - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, state, call->conn); - - if (state >= RXRPC_CALL_COMPLETE) { + if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; } else if (p.command == RXRPC_CMD_SEND_ABORT) { + rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, + rxrpc_abort_call_sendmsg); ret = 0; - if (rxrpc_abort_call("CMD", call, 0, p.abort_code, -ECONNABORTED)) - ret = rxrpc_send_abort_packet(call); } else if (p.command != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else { @@ -705,34 +759,17 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, bool dropped_lock = false; int ret; - _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); + _enter("{%d},", call->debug_id); ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); mutex_lock(&call->user_mutex); - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - switch (READ_ONCE(call->state)) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - case RXRPC_CALL_SERVER_SEND_REPLY: - ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, - notify_end_tx, &dropped_lock); - break; - case RXRPC_CALL_COMPLETE: - read_lock(&call->state_lock); + ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, + notify_end_tx, &dropped_lock); + if (ret == -ESHUTDOWN) ret = call->error; - read_unlock(&call->state_lock); - break; - default: - /* Request phase complete for this client call */ - trace_rxrpc_rx_eproto(call, 0, 
tracepoint_string("late_send")); - ret = -EPROTO; - break; - } if (!dropped_lock) mutex_unlock(&call->user_mutex); @@ -747,24 +784,20 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet * @error: Local error value - * @why: 3-char string indicating why. + * @why: Indication as to why. * * Allow a kernel service to abort a call, if it's still in an abortable state * and return true if the call was aborted, false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, - u32 abort_code, int error, const char *why) + u32 abort_code, int error, enum rxrpc_abort_reason why) { bool aborted; - _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); + _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); mutex_lock(&call->user_mutex); - - aborted = rxrpc_abort_call(why, call, 0, abort_code, error); - if (aborted) - rxrpc_send_abort_packet(call); - + aborted = rxrpc_propose_abort(call, abort_code, error, why); mutex_unlock(&call->user_mutex); return aborted; } diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index ebe0c75e7b07..3bcd6ee80396 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -63,7 +63,7 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) if (skb) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); - kfree_skb(skb); + consume_skb(skb); } } @@ -78,6 +78,6 @@ void rxrpc_purge_queue(struct sk_buff_head *list) int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, rxrpc_skb_put_purge); - kfree_skb(skb); + consume_skb(skb); } } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index cde3224a5cd2..ecaeb4ecfb58 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -17,6 +17,9 @@ static const unsigned int n_65535 = 65535; static const unsigned int n_max_acks = 255; static const unsigned long one_jiffy = 1; static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY +static const unsigned long max_500 = 500; +#endif /* * RxRPC operating parameters. 
@@ -63,6 +66,19 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra2 = (void *)&max_jiffies, }, + /* Values used in milliseconds */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + { + .procname = "inject_rx_delay", + .data = &rxrpc_inject_rx_delay, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (void *)SYSCTL_LONG_ZERO, + .extra2 = (void *)&max_500, + }, +#endif + /* Non-time values */ { .procname = "reap_client_conns", @@ -109,7 +125,6 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&four, }, - { } }; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index d2cf2aac3adb..d43be8512386 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -110,12 +110,8 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top); - for (;;) { - spin_lock(&call->tx_lock); - txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link); - if (!txb) - break; + while ((txb = list_first_entry_or_null(&call->tx_buffer, + struct rxrpc_txbuf, call_link))) { hard_ack = smp_load_acquire(&call->acks_hard_ack); if (before(hard_ack, txb->seq)) break; @@ -128,15 +124,11 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); - spin_unlock(&call->tx_lock); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated); if (after(call->acks_hard_ack, call->tx_bottom + 128)) wake = true; } - spin_unlock(&call->tx_lock); - if (wake) wake_up(&call->waitq); } diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 4662a6ce8a7e..4b95cb1ac435 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -45,23 +45,6 @@ if NET_SCHED comment "Queueing/Scheduling" -config NET_SCH_CBQ - tristate "Class Based Queueing (CBQ)" - help - Say Y here if you want to use the Class-Based Queueing (CBQ) packet - scheduling algorithm. This algorithm classifies the waiting packets - into a tree-like hierarchy of classes; the leaves of this tree are - in turn scheduled by separate algorithms. - - See the top of <file:net/sched/sch_cbq.c> for more details. - - CBQ is a commonly used scheduler, so if you're unsure, you should - say Y here. Then say Y to all the queueing algorithms below that you - want to use as leaf disciplines. - - To compile this code as a module, choose M here: the - module will be called sch_cbq. - config NET_SCH_HTB tristate "Hierarchical Token Bucket (HTB)" help @@ -85,20 +68,6 @@ config NET_SCH_HFSC To compile this code as a module, choose M here: the module will be called sch_hfsc. -config NET_SCH_ATM - tristate "ATM Virtual Circuits (ATM)" - depends on ATM - help - Say Y here if you want to use the ATM pseudo-scheduler. This - provides a framework for invoking classifiers, which in turn - select classes of this queuing discipline. Each class maps - the flow(s) it is handling to a given virtual circuit. - - See the top of <file:net/sched/sch_atm.c> for more details. - - To compile this code as a module, choose M here: the - module will be called sch_atm. - config NET_SCH_PRIO tristate "Multi Band Priority Queueing (PRIO)" help @@ -195,8 +164,14 @@ config NET_SCH_ETF To compile this code as a module, choose M here: the module will be called sch_etf. +config NET_SCH_MQPRIO_LIB + tristate + help + Common library for manipulating mqprio queue configurations. 
+ config NET_SCH_TAPRIO tristate "Time Aware Priority (taprio) Scheduler" + select NET_SCH_MQPRIO_LIB help Say Y here if you want to use the Time Aware Priority (taprio) packet scheduling algorithm. @@ -217,17 +192,6 @@ config NET_SCH_GRED To compile this code as a module, choose M here: the module will be called sch_gred. -config NET_SCH_DSMARK - tristate "Differentiated Services marker (DSMARK)" - help - Say Y if you want to schedule packets according to the - Differentiated Services architecture proposed in RFC 2475. - Technical information on this method, with pointers to associated - RFCs, is available at <http://www.gta.ufrj.br/diffserv/>. - - To compile this code as a module, choose M here: the - module will be called sch_dsmark. - config NET_SCH_NETEM tristate "Network emulator (NETEM)" help @@ -253,6 +217,7 @@ config NET_SCH_DRR config NET_SCH_MQPRIO tristate "Multi-queue priority scheduler (MQPRIO)" + select NET_SCH_MQPRIO_LIB help Say Y here if you want to use the Multi-queue Priority scheduler. This scheduler allows QOS to be offloaded on NICs that have support @@ -337,7 +302,7 @@ config NET_SCH_FQ Say Y here if you want to use the FQ packet scheduling algorithm. FQ does flow separation, and is able to respect pacing requirements - set by TCP stack into sk->sk_pacing_rate (for localy generated + set by TCP stack into sk->sk_pacing_rate (for locally generated traffic) To compile this driver as a module, choose M here: the module @@ -503,17 +468,6 @@ config NET_CLS_BASIC To compile this code as a module, choose M here: the module will be called cls_basic. -config NET_CLS_TCINDEX - tristate "Traffic-Control Index (TCINDEX)" - select NET_CLS - help - Say Y here if you want to be able to classify packets based on - traffic control indices. You will want this feature if you want - to implement Differentiated Services together with DSMARK. - - To compile this code as a module, choose M here: the - module will be called cls_tcindex. - config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" depends on INET @@ -559,34 +513,6 @@ config CLS_U32_MARK help Say Y here to be able to use netfilter marks as u32 key. -config NET_CLS_RSVP - tristate "IPv4 Resource Reservation Protocol (RSVP)" - select NET_CLS - help - The Resource Reservation Protocol (RSVP) permits end systems to - request a minimum and maximum data flow rate for a connection; this - is important for real time data such as streaming sound or video. - - Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests. - - To compile this code as a module, choose M here: the - module will be called cls_rsvp. - -config NET_CLS_RSVP6 - tristate "IPv6 Resource Reservation Protocol (RSVP6)" - select NET_CLS - help - The Resource Reservation Protocol (RSVP) permits end systems to - request a minimum and maximum data flow rate for a connection; this - is important for real time data such as streaming sound or video. - - Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests and you are using the IPv6 protocol. - - To compile this code as a module, choose M here: the - module will be called cls_rsvp6. - config NET_CLS_FLOW tristate "Flow classifier" select NET_CLS @@ -977,6 +903,8 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE + select NF_CONNTRACK_OVS + select NF_NAT_OVS if NF_NAT help Say Y here to allow sending the packets to conntrack module. 
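Aside on the NET_SCH_MQPRIO_LIB hunk above: both mqprio and taprio select the new common library, whose help text describes it as manipulating mqprio queue configurations. Such a configuration is carried by the uapi structure struct tc_mqprio_qopt, which maps the 16 skb priorities onto traffic classes and gives each class a contiguous range of TX queues. Below is a minimal user-space sketch of filling in that structure; the values and the helper name fill_example_qopt are illustrative only and are not taken from this patch or from the new library's API.

    #include <string.h>
    #include <linux/pkt_sched.h>	/* struct tc_mqprio_qopt (uapi) */

    /* Illustrative example: 3 traffic classes over 6 TX queues.
     * Priorities 0-1 map to TC0, 2-3 to TC1, everything else to TC2.
     */
    static void fill_example_qopt(struct tc_mqprio_qopt *qopt)
    {
    	int prio;

    	memset(qopt, 0, sizeof(*qopt));
    	qopt->num_tc = 3;
    	qopt->hw = 0;		/* 0 = software mqprio; nonzero requests hardware offload */

    	for (prio = 0; prio < 16; prio++)	/* prio_tc_map[] has 16 entries */
    		qopt->prio_tc_map[prio] = prio < 2 ? 0 : (prio < 4 ? 1 : 2);

    	/* TC0 -> queues 0-1, TC1 -> queues 2-3, TC2 -> queues 4-5 */
    	qopt->count[0] = 2; qopt->offset[0] = 0;
    	qopt->count[1] = 2; qopt->offset[1] = 2;
    	qopt->count[2] = 2; qopt->offset[2] = 4;
    }
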
diff --git a/net/sched/Makefile b/net/sched/Makefile index dd14ef413fda..b5fd49641d91 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -33,25 +33,23 @@ obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_ACT_CT) += act_ct.o obj-$(CONFIG_NET_ACT_GATE) += act_gate.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o -obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o -obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o -obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_MQPRIO_LIB) += sch_mqprio_lib.o obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o @@ -69,9 +67,6 @@ obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o -obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o -obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o -obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9b31a10cc639..eda58b78da13 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -23,6 +23,7 @@ #include <net/act_api.h> #include <net/netlink.h> #include <net/flow_offload.h> +#include <net/tc_wrapper.h> #ifdef CONFIG_INET DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count); @@ -168,11 +169,6 @@ static bool tc_act_skip_sw(u32 flags) return (flags & TCA_ACT_FLAGS_SKIP_SW) ? true : false; } -static bool tc_act_in_hw(struct tc_action *act) -{ - return !!act->in_hw_count; -} - /* SKIP_HW and SKIP_SW are mutually exclusive flags. 
*/ static bool tc_act_flags_valid(u32 flags) { @@ -191,6 +187,7 @@ static int offload_action_init(struct flow_offload_action *fl_action, fl_action->extack = extack; fl_action->command = cmd; fl_action->index = act->tcfa_index; + fl_action->cookie = (unsigned long)act; if (act->ops->offload_act_setup) { spin_lock_bh(&act->tcfa_lock); @@ -306,9 +303,6 @@ int tcf_action_update_hw_stats(struct tc_action *action) struct flow_offload_action fl_act = {}; int err; - if (!tc_act_in_hw(action)) - return -EOPNOTSUPP; - err = offload_action_init(&fl_act, action, FLOW_ACT_STATS, NULL); if (err) return err; @@ -538,6 +532,8 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, (unsigned long)p->tcfa_tm.lastuse)) continue; + tcf_action_update_hw_stats(p); + nest = nla_nest_start_noflag(skb, n_i); if (!nest) { index--; @@ -1080,7 +1076,7 @@ restart_act_graph: repeat_ttl = 32; repeat: - ret = a->ops->act(skb, a, res); + ret = tc_act(skb, a, res); if (unlikely(ret == TC_ACT_REPEAT)) { if (--repeat_ttl != 0) goto repeat; @@ -1538,9 +1534,6 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (p == NULL) goto errout; - /* update hw stats for this action */ - tcf_action_update_hw_stats(p); - /* compat_mode being true specifies a call that is supposed * to add additional backward compatibility statistic TLVs. */ @@ -1581,7 +1574,7 @@ errout: static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, - int ref) + int ref, struct netlink_ext_ack *extack) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -1605,7 +1598,12 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], nla_nest_end(skb, nest); + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1624,7 +1622,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, - 0, 1) <= 0) { + 0, 1, NULL) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; @@ -1798,7 +1796,7 @@ tcf_reoffload_del_notify(struct net *net, struct tc_action *action) if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1) <= 0) { + if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1885,7 +1883,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, - 0, 2) <= 0) { + 0, 2, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; @@ -1964,7 +1962,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, - RTM_NEWACTION, 0, 0) <= 0) { + RTM_NEWACTION, 0, 0, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index b79eee44e24e..b0455fda7d0b 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -18,6 +18,7 @@ #include <linux/tc_act/tc_bpf.h> #include <net/tc_act/tc_bpf.h> +#include <net/tc_wrapper.h> #define ACT_BPF_NAME_LEN 256 @@ 
-31,8 +32,9 @@ struct tcf_bpf_cfg { static struct tc_action_ops act_bpf_ops; -static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, + const struct tc_action *act, + struct tcf_result *res) { bool at_ingress = skb_at_tc_ingress(skb); struct tcf_bpf *prog = to_bpf(act); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index d41002e4613f..8dabfb52ea3d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -20,6 +20,7 @@ #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_connmark.h> #include <net/tc_act/tc_connmark.h> +#include <net/tc_wrapper.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -27,20 +28,23 @@ static struct tc_action_ops act_connmark_ops; -static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct tcf_connmark_info *ca = to_connmark(a); + struct tcf_connmark_parms *parms; struct nf_conntrack_zone zone; struct nf_conn *c; int proto; - spin_lock(&ca->tcf_lock); tcf_lastuse_update(&ca->tcf_tm); - bstats_update(&ca->tcf_bstats, skb); + tcf_action_update_bstats(&ca->common, skb); + + parms = rcu_dereference_bh(ca->parms); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): @@ -62,31 +66,29 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, c = nf_ct_get(skb, &ctinfo); if (c) { skb->mark = READ_ONCE(c->mark); - /* using overlimits stats to count how many packets marked */ - ca->tcf_qstats.overlimits++; - goto out; + goto count; } - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - proto, ca->net, &tuple)) + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, parms->net, + &tuple)) goto out; - zone.id = ca->zone; + zone.id = parms->zone; zone.dir = NF_CT_DEFAULT_ZONE_DIR; - thash = nf_conntrack_find_get(ca->net, &zone, &tuple); + thash = nf_conntrack_find_get(parms->net, &zone, &tuple); if (!thash) goto out; c = nf_ct_tuplehash_to_ctrack(thash); - /* using overlimits stats to count how many packets marked */ - ca->tcf_qstats.overlimits++; skb->mark = READ_ONCE(c->mark); nf_ct_put(c); +count: + /* using overlimits stats to count how many packets marked */ + tcf_action_inc_overlimit_qstats(&ca->common); out: - spin_unlock(&ca->tcf_lock); - return ca->tcf_action; + return READ_ONCE(ca->tcf_action); } static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { @@ -99,6 +101,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id); + struct tcf_connmark_parms *nparms, *oparms; struct nlattr *tb[TCA_CONNMARK_MAX + 1]; bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; @@ -118,52 +121,66 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (!tb[TCA_CONNMARK_PARMS]) return -EINVAL; + nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); + if (!nparms) + return -ENOMEM; + parm = nla_data(tb[TCA_CONNMARK_PARMS]); index = parm->index; ret = tcf_idr_check_alloc(tn, &index, a, bind); if (!ret) { - ret = tcf_idr_create(tn, index, est, a, - &act_connmark_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + 
&act_connmark_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); - return ret; + err = ret; + goto out_free; } ci = to_connmark(*a); - err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, - extack); - if (err < 0) - goto release_idr; - tcf_action_set_ctrlact(*a, parm->action, goto_ch); - ci->net = net; - ci->zone = parm->zone; + + nparms->net = net; + nparms->zone = parm->zone; ret = ACT_P_CREATED; } else if (ret > 0) { ci = to_connmark(*a); - if (bind) - return 0; - if (!(flags & TCA_ACT_FLAGS_REPLACE)) { - tcf_idr_release(*a, bind); - return -EEXIST; + if (bind) { + err = 0; + goto out_free; } - err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, - extack); - if (err < 0) + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { + err = -EEXIST; goto release_idr; - /* replacing action and zone */ - spin_lock_bh(&ci->tcf_lock); - goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - ci->zone = parm->zone; - spin_unlock_bh(&ci->tcf_lock); - if (goto_ch) - tcf_chain_put_by_act(goto_ch); + } + + nparms->net = rtnl_dereference(ci->parms)->net; + nparms->zone = parm->zone; + ret = 0; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + spin_lock_bh(&ci->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); + spin_unlock_bh(&ci->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + + if (oparms) + kfree_rcu(oparms, rcu); + return ret; + release_idr: tcf_idr_release(*a, bind); +out_free: + kfree(nparms); return err; } @@ -177,11 +194,14 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; + struct tcf_connmark_parms *parms; struct tcf_t t; spin_lock_bh(&ci->tcf_lock); + parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock)); + opt.action = ci->tcf_action; - opt.zone = ci->zone; + opt.zone = parms->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -199,6 +219,16 @@ nla_put_failure: return -1; } +static void tcf_connmark_cleanup(struct tc_action *a) +{ + struct tcf_connmark_info *ci = to_connmark(a); + struct tcf_connmark_parms *parms; + + parms = rcu_dereference_protected(ci->parms, 1); + if (parms) + kfree_rcu(parms, rcu); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .id = TCA_ID_CONNMARK, @@ -206,6 +236,7 @@ static struct tc_action_ops act_connmark_ops = { .act = tcf_connmark_act, .dump = tcf_connmark_dump, .init = tcf_connmark_init, + .cleanup = tcf_connmark_cleanup, .size = sizeof(struct tcf_connmark_info), }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 1366adf9b909..95e9304024b7 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -32,6 +32,7 @@ #include <linux/tc_act/tc_csum.h> #include <net/tc_act/tc_csum.h> +#include <net/tc_wrapper.h> static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, @@ -563,8 +564,9 @@ fail: return 0; } -static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); bool orig_vlan_tag_present = false; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index dd5ae7551956..9cc0bc7c71ed 100644 --- 
a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -24,6 +24,7 @@ #include <net/ipv6_frag.h> #include <uapi/linux/tc_act/tc_ct.h> #include <net/tc_act/tc_ct.h> +#include <net/tc_wrapper.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> @@ -169,11 +170,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple, static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, enum ip_conntrack_dir dir, + enum ip_conntrack_info ctinfo, struct flow_action *action) { struct nf_conn_labels *ct_labels; struct flow_action_entry *entry; - enum ip_conntrack_info ctinfo; u32 *act_ct_labels; entry = tcf_ct_flow_table_flow_action_get_next(action); @@ -181,8 +182,6 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) entry->ct_metadata.mark = READ_ONCE(ct->mark); #endif - ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED : - IP_CT_ESTABLISHED_REPLY; /* aligns with the CT reference on the SKB nf_ct_set */ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo; entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL; @@ -236,22 +235,28 @@ static int tcf_ct_flow_table_add_action_nat(struct net *net, } static int tcf_ct_flow_table_fill_actions(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir tdir, struct nf_flow_rule *flow_rule) { struct flow_action *action = &flow_rule->rule->action; int num_entries = action->num_entries; struct nf_conn *ct = flow->ct; + enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; int i, err; switch (tdir) { case FLOW_OFFLOAD_DIR_ORIGINAL: dir = IP_CT_DIR_ORIGINAL; + ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? + IP_CT_ESTABLISHED : IP_CT_NEW; + if (ctinfo == IP_CT_ESTABLISHED) + set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); break; case FLOW_OFFLOAD_DIR_REPLY: dir = IP_CT_DIR_REPLY; + ctinfo = IP_CT_ESTABLISHED_REPLY; break; default: return -EOPNOTSUPP; @@ -261,7 +266,7 @@ static int tcf_ct_flow_table_fill_actions(struct net *net, if (err) goto err_nat; - tcf_ct_flow_table_add_action_meta(ct, dir, action); + tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action); return 0; err_nat: @@ -364,7 +369,7 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, - bool tcp) + bool tcp, bool bidirectional) { struct nf_conn_act_ct_ext *act_ct_ext; struct flow_offload *entry; @@ -383,6 +388,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } + if (bidirectional) + __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags); act_ct_ext = nf_conn_act_ct_ext_find(ct); if (act_ct_ext) { @@ -406,26 +413,34 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - bool tcp = false; - - if ((ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) || - !test_bit(IPS_ASSURED_BIT, &ct->status)) - return; + bool tcp = false, bidirectional = true; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: - tcp = true; - if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) + if ((ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) || + !test_bit(IPS_ASSURED_BIT, &ct->status) || + ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) return; + + tcp = true; break; case IPPROTO_UDP: + if 
(!nf_ct_is_confirmed(ct)) + return; + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) + bidirectional = false; break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; - if (ct->status & IPS_NAT_MASK) + if ((ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) || + !test_bit(IPS_ASSURED_BIT, &ct->status) || + ct->status & IPS_NAT_MASK) return; + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) @@ -441,7 +456,7 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, ct->status & IPS_SEQ_ADJUST) return; - tcf_ct_flow_table_add(ct_ft, ct, tcp); + tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional); } static bool @@ -620,13 +635,30 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); ct = flow->ct; + if (dir == FLOW_OFFLOAD_DIR_REPLY && + !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) { + /* Only offload reply direction after connection became + * assured. + */ + if (test_bit(IPS_ASSURED_BIT, &ct->status)) + set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); + else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags)) + /* If flow_table flow has already been updated to the + * established state, then don't refresh. + */ + return false; + } + if (tcph && (unlikely(tcph->fin || tcph->rst))) { flow_offload_teardown(flow); return false; } - ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : - IP_CT_ESTABLISHED_REPLY; + if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) + ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? + IP_CT_ESTABLISHED : IP_CT_NEW; + else + ctinfo = IP_CT_ESTABLISHED_REPLY; flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); @@ -694,31 +726,6 @@ drop_ct: return false; } -/* Trim the skb to the length specified by the IP/IPv6 header, - * removing any trailing lower-layer padding. This prepares the skb - * for higher-layer processing that assumes skb->len excludes padding - * (such as nf_ip_checksum). The caller needs to pull the skb to the - * network header, and ensure ip_hdr/ipv6_hdr points to valid data. - */ -static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) -{ - unsigned int len; - - switch (family) { - case NFPROTO_IPV4: - len = ntohs(ip_hdr(skb)->tot_len); - break; - case NFPROTO_IPV6: - len = sizeof(struct ipv6hdr) - + ntohs(ipv6_hdr(skb)->payload_len); - break; - default: - len = skb->len; - } - - return pskb_trim_rcsum(skb, len); -} - static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) { u8 family = NFPROTO_UNSPEC; @@ -778,6 +785,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, struct nf_conn *ct; int err = 0; bool frag; + u8 proto; u16 mru; /* Previously seen (loopback)? Ignore. 
*/ @@ -793,50 +801,14 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, return err; skb_get(skb); - mru = tc_skb_cb(skb)->mru; - - if (family == NFPROTO_IPV4) { - enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; - - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - local_bh_disable(); - err = ip_defrag(net, skb, user); - local_bh_enable(); - if (err && err != -EINPROGRESS) - return err; - - if (!err) { - *defrag = true; - mru = IPCB(skb)->frag_max_size; - } - } else { /* NFPROTO_IPV6 */ -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - - memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); - err = nf_ct_frag6_gather(net, skb, user); - if (err && err != -EINPROGRESS) - goto out_free; - - if (!err) { - *defrag = true; - mru = IP6CB(skb)->frag_max_size; - } -#else - err = -EOPNOTSUPP; - goto out_free; -#endif - } + err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru); + if (err) + return err; - if (err != -EINPROGRESS) - tc_skb_cb(skb)->mru = mru; - skb_clear_hash(skb); - skb->ignore_df = 1; - return err; + *defrag = true; + tc_skb_cb(skb)->mru = mru; -out_free: - kfree_skb(skb); - return err; + return 0; } static void tcf_ct_params_free(struct tcf_ct_params *params) @@ -863,90 +835,6 @@ static void tcf_ct_params_free_rcu(struct rcu_head *head) tcf_ct_params_free(params); } -#if IS_ENABLED(CONFIG_NF_NAT) -/* Modelled after nf_nat_ipv[46]_fn(). - * range is only used for new, uninitialized NAT state. - * Returns either NF_ACCEPT or NF_DROP. - */ -static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype) -{ - __be16 proto = skb_protocol(skb, true); - int hooknum, err = NF_ACCEPT; - - /* See HOOK2MANIP(). */ - if (maniptype == NF_NAT_MANIP_SRC) - hooknum = NF_INET_LOCAL_IN; /* Source NAT */ - else - hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (proto == htons(ETH_P_IP) && - ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - hooknum)) - err = NF_DROP; - goto out; - } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { - __be16 frag_off; - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - int hdrlen = ipv6_skip_exthdr(skb, - sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, - ctinfo, - hooknum, - hdrlen)) - err = NF_DROP; - goto out; - } - } - /* Non-ICMP, fall thru to initialize if needed. */ - fallthrough; - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - /* Initialize according to the NAT action. */ - err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) - /* Action is set up to establish a new - * mapping. - */ - ? 
nf_nat_setup_info(ct, range, maniptype) - : nf_nat_alloc_null_binding(ct, hooknum); - if (err != NF_ACCEPT) - goto out; - } - break; - - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - - default: - err = NF_DROP; - goto out; - } - - err = nf_nat_packet(ct, ctinfo, hooknum, skb); - if (err == NF_ACCEPT) { - if (maniptype == NF_NAT_MANIP_SRC) - tc_skb_cb(skb)->post_ct_snat = 1; - if (maniptype == NF_NAT_MANIP_DST) - tc_skb_cb(skb)->post_ct_dnat = 1; - } -out: - return err; -} -#endif /* CONFIG_NF_NAT */ - static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) @@ -986,60 +874,30 @@ static int tcf_ct_act_nat(struct sk_buff *skb, bool commit) { #if IS_ENABLED(CONFIG_NF_NAT) - int err; - enum nf_nat_manip_type maniptype; + int err, action = 0; if (!(ct_action & TCA_CT_ACT_NAT)) return NF_ACCEPT; + if (ct_action & TCA_CT_ACT_NAT_SRC) + action |= BIT(NF_NAT_MANIP_SRC); + if (ct_action & TCA_CT_ACT_NAT_DST) + action |= BIT(NF_NAT_MANIP_DST); - /* Add NAT extension if not confirmed yet. */ - if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) - return NF_DROP; /* Can't NAT. */ - - if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && - (ctinfo != IP_CT_RELATED || commit)) { - /* NAT an established or related connection like before. */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) - /* This is the REPLY direction for a connection - * for which NAT was applied in the forward - * direction. Do the reverse NAT. - */ - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; - else - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; - } else if (ct_action & TCA_CT_ACT_NAT_SRC) { - maniptype = NF_NAT_MANIP_SRC; - } else if (ct_action & TCA_CT_ACT_NAT_DST) { - maniptype = NF_NAT_MANIP_DST; - } else { - return NF_ACCEPT; - } + err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit); + + if (action & BIT(NF_NAT_MANIP_SRC)) + tc_skb_cb(skb)->post_ct_snat = 1; + if (action & BIT(NF_NAT_MANIP_DST)) + tc_skb_cb(skb)->post_ct_dnat = 1; - err = ct_nat_execute(skb, ct, ctinfo, range, maniptype); - if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { - if (ct->status & IPS_SRC_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ct_nat_execute(skb, ct, ctinfo, range, - maniptype); - } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - err = ct_nat_execute(skb, ct, ctinfo, NULL, - NF_NAT_MANIP_SRC); - } - } return err; #else return NF_ACCEPT; #endif } -static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct net *net = dev_net(skb->dev); enum ip_conntrack_info ctinfo; @@ -1093,7 +951,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, if (err) goto drop; - err = tcf_ct_skb_network_trim(skb, family); + err = nf_ct_skb_network_trim(skb, family); if (err) goto drop; diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index eaa02f098d1c..4d15b6a6169c 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -18,6 +18,7 @@ #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_ctinfo.h> #include <net/tc_act/tc_ctinfo.h> +#include <net/tc_wrapper.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -75,8 +76,9 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo 
*ca, skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } -static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash = NULL; struct tcf_ctinfo *ca = to_ctinfo(a); @@ -91,7 +93,7 @@ static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); - bstats_update(&ca->tcf_bstats, skb); + tcf_action_update_bstats(&ca->common, skb); action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); @@ -210,8 +212,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, index = actparm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_ctinfo_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_ctinfo_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 62d682b96b88..904ab3d457ef 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -18,6 +18,7 @@ #include <net/pkt_cls.h> #include <linux/tc_act/tc_gact.h> #include <net/tc_act/tc_gact.h> +#include <net/tc_wrapper.h> static struct tc_action_ops act_gact_ops; @@ -25,7 +26,7 @@ static struct tc_action_ops act_gact_ops; static int gact_net_rand(struct tcf_gact *gact) { smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ - if (prandom_u32_max(gact->tcfg_pval)) + if (get_random_u32_below(gact->tcfg_pval)) return gact->tcf_action; return gact->tcfg_paction; } @@ -145,8 +146,9 @@ release_idr: return err; } -static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_gact_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 3049878e7315..c9a811f4c7ee 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -14,6 +14,7 @@ #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_gate.h> +#include <net/tc_wrapper.h> static struct tc_action_ops act_gate_ops; @@ -113,39 +114,42 @@ static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) return HRTIMER_RESTART; } -static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_gate *gact = to_gate(a); - - spin_lock(&gact->tcf_lock); + int action = READ_ONCE(gact->tcf_action); tcf_lastuse_update(&gact->tcf_tm); - bstats_update(&gact->tcf_bstats, skb); + tcf_action_update_bstats(&gact->common, skb); + spin_lock(&gact->tcf_lock); if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { spin_unlock(&gact->tcf_lock); - return gact->tcf_action; + return action; } - if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) + if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) { + spin_unlock(&gact->tcf_lock); goto drop; + } if (gact->current_max_octets >= 0) { gact->current_entry_octets += qdisc_pkt_len(skb); if (gact->current_entry_octets > gact->current_max_octets) { - gact->tcf_qstats.overlimits++; - goto drop; + spin_unlock(&gact->tcf_lock); + goto overlimit; } } - spin_unlock(&gact->tcf_lock); - return 
gact->tcf_action; -drop: - gact->tcf_qstats.drops++; - spin_unlock(&gact->tcf_lock); + return action; +overlimit: + tcf_action_inc_overlimit_qstats(&gact->common); +drop: + tcf_action_inc_drop_qstats(&gact->common); return TC_ACT_SHOT; } @@ -355,8 +359,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, return 0; if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_gate_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_gate_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 41d63b33461d..bc7611b0744c 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -29,6 +29,7 @@ #include <net/tc_act/tc_ife.h> #include <linux/etherdevice.h> #include <net/ife.h> +#include <net/tc_wrapper.h> static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; @@ -861,8 +862,9 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, return action; } -static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_ife_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 1625e1037416..5d96ffebd40f 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -20,6 +20,7 @@ #include <net/pkt_sched.h> #include <linux/tc_act/tc_ipt.h> #include <net/tc_act/tc_ipt.h> +#include <net/tc_wrapper.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -216,8 +217,9 @@ static int tcf_xt_init(struct net *net, struct nlattr *nla, a, &act_xt_ops, tp, flags); } -static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_ipt_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *ipt = to_ipt(a); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b8ad6ae282c0..8037ec9b1d31 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -24,12 +24,13 @@ #include <net/pkt_cls.h> #include <linux/tc_act/tc_mirred.h> #include <net/tc_act/tc_mirred.h> +#include <net/tc_wrapper.h> static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); -#define MIRRED_RECURSION_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_rec_level); +#define MIRRED_NEST_LIMIT 4 +static DEFINE_PER_CPU(unsigned int, mirred_nest_level); static bool tcf_mirred_is_act_redirect(int action) { @@ -205,26 +206,34 @@ release_idr: return err; } +static bool is_mirred_nested(void) +{ + return unlikely(__this_cpu_read(mirred_nest_level) > 1); +} + static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb) { int err; if (!want_ingress) err = tcf_dev_queue_xmit(skb, dev_queue_xmit); + else if (is_mirred_nested()) + err = netif_rx(skb); else err = netif_receive_skb(skb); return err; } -static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); struct sk_buff *skb2 = skb; bool m_mac_header_xmit; struct net_device *dev; - unsigned int rec_level; + unsigned int nest_level; int retval, err = 0; bool use_reinsert; bool want_ingress; @@ -235,11 +244,11 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action 
*a, int mac_len; bool at_nh; - rec_level = __this_cpu_inc_return(mirred_rec_level); - if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) { + nest_level = __this_cpu_inc_return(mirred_nest_level); + if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return TC_ACT_SHOT; } @@ -308,7 +317,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, err = tcf_mirred_forward(want_ingress, skb); if (err) tcf_action_inc_overlimit_qstats(&m->common); - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return TC_ACT_CONSUMED; } } @@ -320,7 +329,7 @@ out: if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return retval; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 8ad25cc8ccd5..6b26bdb999d7 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -14,6 +14,7 @@ #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_mpls.h> +#include <net/tc_wrapper.h> static struct tc_action_ops act_mpls_ops; @@ -49,8 +50,9 @@ static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse, return cpu_to_be32(new_lse); } -static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; @@ -132,6 +134,11 @@ static int valid_label(const struct nlattr *attr, { const u32 *label = nla_data(attr); + if (nla_len(attr) != sizeof(*label)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid MPLS label length"); + return -EINVAL; + } + if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) { NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range"); return -EINVAL; @@ -143,7 +150,8 @@ static int valid_label(const struct nlattr *attr, static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), [TCA_MPLS_PROTO] = { .type = NLA_U16 }, - [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label), + [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + valid_label), [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7), [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1), [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9265145f1040..4184af5abbf3 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -24,7 +24,7 @@ #include <net/tc_act/tc_nat.h> #include <net/tcp.h> #include <net/udp.h> - +#include <net/tc_wrapper.h> static struct tc_action_ops act_nat_ops; @@ -38,6 +38,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, { struct tc_action_net *tn = net_generic(net, act_nat_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; + struct tcf_nat_parms *nparm, *oparm; struct nlattr *tb[TCA_NAT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_nat *parm; @@ -59,8 +60,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_nat_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, &act_nat_ops, + bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ 
-79,29 +80,43 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; + + nparm = kzalloc(sizeof(*nparm), GFP_KERNEL); + if (!nparm) { + err = -ENOMEM; + goto release_idr; + } + + nparm->old_addr = parm->old_addr; + nparm->new_addr = parm->new_addr; + nparm->mask = parm->mask; + nparm->flags = parm->flags; + p = to_tcf_nat(*a); spin_lock_bh(&p->tcf_lock); - p->old_addr = parm->old_addr; - p->new_addr = parm->new_addr; - p->mask = parm->mask; - p->flags = parm->flags; - goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + oparm = rcu_replace_pointer(p->parms, nparm, lockdep_is_held(&p->tcf_lock)); spin_unlock_bh(&p->tcf_lock); + if (goto_ch) tcf_chain_put_by_act(goto_ch); + if (oparm) + kfree_rcu(oparm, rcu); + return ret; release_idr: tcf_idr_release(*a, bind); return err; } -static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_nat *p = to_tcf_nat(a); + struct tcf_nat_parms *parms; struct iphdr *iph; __be32 old_addr; __be32 new_addr; @@ -112,18 +127,16 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, int ihl; int noff; - spin_lock(&p->tcf_lock); - tcf_lastuse_update(&p->tcf_tm); - old_addr = p->old_addr; - new_addr = p->new_addr; - mask = p->mask; - egress = p->flags & TCA_NAT_FLAG_EGRESS; - action = p->tcf_action; + tcf_action_update_bstats(&p->common, skb); - bstats_update(&p->tcf_bstats, skb); + action = READ_ONCE(p->tcf_action); - spin_unlock(&p->tcf_lock); + parms = rcu_dereference_bh(p->parms); + old_addr = parms->old_addr; + new_addr = parms->new_addr; + mask = parms->mask; + egress = parms->flags & TCA_NAT_FLAG_EGRESS; if (unlikely(action == TC_ACT_SHOT)) goto drop; @@ -247,9 +260,7 @@ out: return action; drop: - spin_lock(&p->tcf_lock); - p->tcf_qstats.drops++; - spin_unlock(&p->tcf_lock); + tcf_action_inc_drop_qstats(&p->common); return TC_ACT_SHOT; } @@ -263,15 +274,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; + struct tcf_nat_parms *parms; struct tcf_t t; spin_lock_bh(&p->tcf_lock); - opt.old_addr = p->old_addr; - opt.new_addr = p->new_addr; - opt.mask = p->mask; - opt.flags = p->flags; + opt.action = p->tcf_action; + parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock)); + + opt.old_addr = parms->old_addr; + opt.new_addr = parms->new_addr; + opt.mask = parms->mask; + opt.flags = parms->flags; + if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -288,6 +304,16 @@ nla_put_failure: return -1; } +static void tcf_nat_cleanup(struct tc_action *a) +{ + struct tcf_nat *p = to_tcf_nat(a); + struct tcf_nat_parms *parms; + + parms = rcu_dereference_protected(p->parms, 1); + if (parms) + kfree_rcu(parms, rcu); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .id = TCA_ID_NAT, @@ -295,6 +321,7 @@ static struct tc_action_ops act_nat_ops = { .act = tcf_nat_act, .dump = tcf_nat_dump, .init = tcf_nat_init, + .cleanup = tcf_nat_cleanup, .size = sizeof(struct tcf_nat), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 94ed5857ce67..77d288d384ae 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -20,6 +20,7 @@ #include <net/tc_act/tc_pedit.h> #include 
<uapi/linux/tc_act/tc_pedit.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> static struct tc_action_ops act_pedit_ops; @@ -133,6 +134,17 @@ nla_failure: return -EINVAL; } +static void tcf_pedit_cleanup_rcu(struct rcu_head *head) +{ + struct tcf_pedit_parms *parms = + container_of(head, struct tcf_pedit_parms, rcu); + + kfree(parms->tcfp_keys_ex); + kfree(parms->tcfp_keys); + + kfree(parms); +} + static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, @@ -140,10 +152,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; - struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; - struct tc_pedit_key *keys = NULL; - struct tcf_pedit_key_ex *keys_ex; + struct tcf_pedit_parms *oparms, *nparms; + struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; @@ -180,18 +191,25 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, return -EINVAL; } - keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); - if (IS_ERR(keys_ex)) - return PTR_ERR(keys_ex); + nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); + if (!nparms) + return -ENOMEM; + + nparms->tcfp_keys_ex = + tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); + if (IS_ERR(nparms->tcfp_keys_ex)) { + ret = PTR_ERR(nparms->tcfp_keys_ex); + goto out_free; + } index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_pedit_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); - goto out_free; + goto out_free_ex; } ret = ACT_P_CREATED; } else if (err > 0) { @@ -203,7 +221,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } } else { ret = err; - goto out_free; + goto out_free_ex; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); @@ -211,48 +229,50 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, ret = err; goto out_release; } - p = to_pedit(*a); - spin_lock_bh(&p->tcf_lock); - if (ret == ACT_P_CREATED || - (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) { - keys = kmalloc(ksize, GFP_ATOMIC); - if (!keys) { - spin_unlock_bh(&p->tcf_lock); - ret = -ENOMEM; - goto put_chain; - } - kfree(p->tcfp_keys); - p->tcfp_keys = keys; - p->tcfp_nkeys = parm->nkeys; + nparms->tcfp_off_max_hint = 0; + nparms->tcfp_flags = parm->flags; + nparms->tcfp_nkeys = parm->nkeys; + + nparms->tcfp_keys = kmalloc(ksize, GFP_KERNEL); + if (!nparms->tcfp_keys) { + ret = -ENOMEM; + goto put_chain; } - memcpy(p->tcfp_keys, parm->keys, ksize); - p->tcfp_off_max_hint = 0; - for (i = 0; i < p->tcfp_nkeys; ++i) { - u32 cur = p->tcfp_keys[i].off; + + memcpy(nparms->tcfp_keys, parm->keys, ksize); + + for (i = 0; i < nparms->tcfp_nkeys; ++i) { + u32 cur = nparms->tcfp_keys[i].off; /* sanitize the shift value for any later use */ - p->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, - p->tcfp_keys[i].shift); + nparms->tcfp_keys[i].shift = min_t(size_t, + BITS_PER_TYPE(int) - 1, + nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. 
*/ - cur += (0xff & p->tcfp_keys[i].offmask) >> p->tcfp_keys[i].shift; + cur += (0xff & nparms->tcfp_keys[i].offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ - p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4); + nparms->tcfp_off_max_hint = + max(nparms->tcfp_off_max_hint, cur + 4); } - p->tcfp_flags = parm->flags; + p = to_pedit(*a); + + spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + oparms = rcu_replace_pointer(p->parms, nparms, 1); + spin_unlock_bh(&p->tcf_lock); - kfree(p->tcfp_keys_ex); - p->tcfp_keys_ex = keys_ex; + if (oparms) + call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); - spin_unlock_bh(&p->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); + return ret; put_chain: @@ -260,19 +280,22 @@ put_chain: tcf_chain_put_by_act(goto_ch); out_release: tcf_idr_release(*a, bind); +out_free_ex: + kfree(nparms->tcfp_keys_ex); out_free: - kfree(keys_ex); + kfree(nparms); return ret; - } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); - struct tc_pedit_key *keys = p->tcfp_keys; + struct tcf_pedit_parms *parms; - kfree(keys); - kfree(p->tcfp_keys_ex); + parms = rcu_dereference_protected(p->parms, 1); + + if (parms) + call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) @@ -319,112 +342,109 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb, return ret; } -static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { + enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; struct tcf_pedit *p = to_pedit(a); + struct tcf_pedit_key_ex *tkey_ex; + struct tcf_pedit_parms *parms; + struct tc_pedit_key *tkey; u32 max_offset; int i; - spin_lock(&p->tcf_lock); + parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? 
skb_transport_offset(skb) : skb_network_offset(skb)) + - p->tcfp_off_max_hint; + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) - goto unlock; + goto done; tcf_lastuse_update(&p->tcf_tm); + tcf_action_update_bstats(&p->common, skb); - if (p->tcfp_nkeys > 0) { - struct tc_pedit_key *tkey = p->tcfp_keys; - struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; - enum pedit_header_type htype = - TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; - enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; - - for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { - u32 *ptr, hdata; - int offset = tkey->off; - int hoffset; - u32 val; - int rc; - - if (tkey_ex) { - htype = tkey_ex->htype; - cmd = tkey_ex->cmd; - - tkey_ex++; - } + tkey = parms->tcfp_keys; + tkey_ex = parms->tcfp_keys_ex; - rc = pedit_skb_hdr_offset(skb, htype, &hoffset); - if (rc) { - pr_info("tc action pedit bad header type specified (0x%x)\n", - htype); - goto bad; - } + for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { + int offset = tkey->off; + u32 *ptr, hdata; + int hoffset; + u32 val; + int rc; - if (tkey->offmask) { - u8 *d, _d; - - if (!offset_valid(skb, hoffset + tkey->at)) { - pr_info("tc action pedit 'at' offset %d out of bounds\n", - hoffset + tkey->at); - goto bad; - } - d = skb_header_pointer(skb, hoffset + tkey->at, - sizeof(_d), &_d); - if (!d) - goto bad; - offset += (*d & tkey->offmask) >> tkey->shift; - } + if (tkey_ex) { + htype = tkey_ex->htype; + cmd = tkey_ex->cmd; - if (offset % 4) { - pr_info("tc action pedit offset must be on 32 bit boundaries\n"); - goto bad; - } + tkey_ex++; + } - if (!offset_valid(skb, hoffset + offset)) { - pr_info("tc action pedit offset %d out of bounds\n", - hoffset + offset); - goto bad; - } + rc = pedit_skb_hdr_offset(skb, htype, &hoffset); + if (rc) { + pr_info("tc action pedit bad header type specified (0x%x)\n", + htype); + goto bad; + } - ptr = skb_header_pointer(skb, hoffset + offset, - sizeof(hdata), &hdata); - if (!ptr) - goto bad; - /* just do it, baby */ - switch (cmd) { - case TCA_PEDIT_KEY_EX_CMD_SET: - val = tkey->val; - break; - case TCA_PEDIT_KEY_EX_CMD_ADD: - val = (*ptr + tkey->val) & ~tkey->mask; - break; - default: - pr_info("tc action pedit bad command (%d)\n", - cmd); + if (tkey->offmask) { + u8 *d, _d; + + if (!offset_valid(skb, hoffset + tkey->at)) { + pr_info("tc action pedit 'at' offset %d out of bounds\n", + hoffset + tkey->at); goto bad; } + d = skb_header_pointer(skb, hoffset + tkey->at, + sizeof(_d), &_d); + if (!d) + goto bad; + offset += (*d & tkey->offmask) >> tkey->shift; + } - *ptr = ((*ptr & tkey->mask) ^ val); - if (ptr == &hdata) - skb_store_bits(skb, hoffset + offset, ptr, 4); + if (offset % 4) { + pr_info("tc action pedit offset must be on 32 bit boundaries\n"); + goto bad; } - goto done; - } else { - WARN(1, "pedit BUG: index %d\n", p->tcf_index); + if (!offset_valid(skb, hoffset + offset)) { + pr_info("tc action pedit offset %d out of bounds\n", + hoffset + offset); + goto bad; + } + + ptr = skb_header_pointer(skb, hoffset + offset, + sizeof(hdata), &hdata); + if (!ptr) + goto bad; + /* just do it, baby */ + switch (cmd) { + case TCA_PEDIT_KEY_EX_CMD_SET: + val = tkey->val; + break; + case TCA_PEDIT_KEY_EX_CMD_ADD: + val = (*ptr + tkey->val) & ~tkey->mask; + break; + default: + pr_info("tc action pedit bad command (%d)\n", + cmd); + goto bad; + } + + *ptr = ((*ptr & tkey->mask) ^ val); + if (ptr == &hdata) + skb_store_bits(skb, hoffset + offset, ptr, 4); } + goto done; + bad: - p->tcf_qstats.overlimits++; + 
tcf_action_inc_overlimit_qstats(&p->common); done: - bstats_update(&p->tcf_bstats, skb); -unlock: - spin_unlock(&p->tcf_lock); return p->tcf_action; } @@ -443,30 +463,33 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); + struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; - s = struct_size(opt, keys, p->tcfp_nkeys); + spin_lock_bh(&p->tcf_lock); + parms = rcu_dereference_protected(p->parms, 1); + s = struct_size(opt, keys, parms->tcfp_nkeys); - /* netlink spinlocks held above us - must use ATOMIC */ opt = kzalloc(s, GFP_ATOMIC); - if (unlikely(!opt)) + if (unlikely(!opt)) { + spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; + } - spin_lock_bh(&p->tcf_lock); - memcpy(opt->keys, p->tcfp_keys, flex_array_size(opt, keys, p->tcfp_nkeys)); + memcpy(opt->keys, parms->tcfp_keys, + flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; - opt->nkeys = p->tcfp_nkeys; - opt->flags = p->tcfp_flags; + opt->nkeys = parms->tcfp_nkeys; + opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; - if (p->tcfp_keys_ex) { - if (tcf_pedit_key_ex_dump(skb, - p->tcfp_keys_ex, - p->tcfp_nkeys)) + if (parms->tcfp_keys_ex) { + if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, + parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) @@ -520,7 +543,28 @@ static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data, } *index_inc = k; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + u32 cmd = tcf_pedit_cmd(act, 0); + int k; + + switch (cmd) { + case TCA_PEDIT_KEY_EX_CMD_SET: + fl_action->id = FLOW_ACTION_MANGLE; + break; + case TCA_PEDIT_KEY_EX_CMD_ADD: + fl_action->id = FLOW_ACTION_ADD; + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); + return -EOPNOTSUPP; + } + + for (k = 1; k < tcf_pedit_nkeys(act); k++) { + if (cmd != tcf_pedit_cmd(act, k)) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); + return -EOPNOTSUPP; + } + } } return 0; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0adb26e366a7..227cba58ce9f 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -19,6 +19,7 @@ #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_police.h> +#include <net/tc_wrapper.h> /* Each policer is serialized by its individual spinlock */ @@ -242,8 +243,9 @@ static bool tcf_police_mtu_check(struct sk_buff *skb, u32 limit) return len <= limit; } -static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_police *police = to_police(a); s64 now, toks, ppstoks = 0, ptoks = 0; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 7a25477f5d99..f7416b5598e0 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -20,6 +20,7 @@ #include <net/tc_act/tc_sample.h> #include <net/psample.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> #include <linux/if_arp.h> @@ -153,8 +154,9 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev) } } -static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb, + 
const struct tc_action *a, + struct tcf_result *res) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; @@ -168,7 +170,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, psample_group = rcu_dereference_bh(s->psample_group); /* randomly sample packets according to rate */ - if (psample_group && (prandom_u32_max(s->rate) == 0)) { + if (psample_group && (get_random_u32_below(s->rate) == 0)) { if (!skb_at_tc_ingress(skb)) { md.in_ifindex = skb->skb_iif; md.out_ifindex = skb->dev->ifindex; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 18d376135461..4b84514534f3 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -14,6 +14,7 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> @@ -21,8 +22,9 @@ static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 -static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_simp_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_defact *d = to_defact(a); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1710780c908a..ce7008cf291c 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -16,6 +16,7 @@ #include <net/ipv6.h> #include <net/dsfield.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> #include <linux/tc_act/tc_skbedit.h> #include <net/tc_act/tc_skbedit.h> @@ -36,8 +37,9 @@ static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params, return netdev_cap_txqueue(skb->dev, queue_mapping); } -static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index d98758a63934..dffa990a9629 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -15,14 +15,16 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> #include <linux/tc_act/tc_skbmod.h> #include <net/tc_act/tc_skbmod.h> static struct tc_action_ops act_skbmod_ops; -static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); int action, max_edit_len, err; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2691a3d8e451..2d12d2626415 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -16,14 +16,16 @@ #include <net/pkt_sched.h> #include <net/dst.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> #include <linux/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_tunnel_key.h> static struct tc_action_ops act_tunnel_key_ops; -static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 7b24e898a3e6..0251442f5f29 100644 --- a/net/sched/act_vlan.c +++ 
b/net/sched/act_vlan.c @@ -12,14 +12,16 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> #include <linux/tc_act/tc_vlan.h> #include <net/tc_act/tc_vlan.h> static struct tc_action_ops act_vlan_ops; -static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 23d1cfa4f58c..bfabc9c95fa9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -40,6 +40,7 @@ #include <net/tc_act/tc_mpls.h> #include <net/tc_act/tc_gate.h> #include <net/flow_offload.h> +#include <net/tc_wrapper.h> extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -487,7 +488,8 @@ static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block, #endif static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, - u32 seq, u16 flags, int event, bool unicast); + u32 seq, u16 flags, int event, bool unicast, + struct netlink_ext_ack *extack); static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create, @@ -520,7 +522,7 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, */ if (is_first_reference && !by_act) tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + RTM_NEWCHAIN, false, NULL); return chain; @@ -1564,7 +1566,7 @@ reclassify: tp->protocol != htons(ETH_P_ALL)) continue; - err = tp->classify(skb, tp, res); + err = tc_classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { first_tp = orig_tp; @@ -1816,7 +1818,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool terse_dump, bool rtnl_held) + bool terse_dump, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1856,7 +1859,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } + + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto nla_put_failure; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1870,7 +1879,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, int event, bool unicast, - bool rtnl_held) + bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; @@ -1882,7 +1891,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held) <= 0) { + false, rtnl_held, extack) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1911,7 +1920,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held) <= 0) { + false, rtnl_held, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -1937,14 +1946,15 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, - struct tcf_chain *chain, int event) + struct tcf_chain *chain, int event, + struct netlink_ext_ack *extack) { struct tcf_proto *tp; for (tp = tcf_get_next_proto(chain, NULL); tp; tp = tcf_get_next_proto(chain, tp)) - tfilter_notify(net, oskb, n, tp, block, - q, parent, NULL, event, false, true); + tfilter_notify(net, oskb, n, tp, block, q, parent, NULL, + event, false, true, extack); } static void tfilter_put(struct tcf_proto *tp, void *fh) @@ -2155,7 +2165,7 @@ replay: flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_NEWTFILTER, false, rtnl_held); + RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); /* q pointer is NULL for shared blocks */ if (q) @@ -2283,7 +2293,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, extack); tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; @@ -2307,7 +2317,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, tcf_proto_put(tp, rtnl_held, NULL); tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_DELTFILTER, false, rtnl_held); + RTM_DELTFILTER, false, rtnl_held, extack); err = 0; goto errout; } @@ -2451,7 +2461,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = -ENOENT; } else { err = tfilter_notify(net, skb, n, tp, block, q, parent, - fh, RTM_NEWTFILTER, true, rtnl_held); + fh, RTM_NEWTFILTER, true, rtnl_held, NULL); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); } @@ -2489,7 +2499,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, a->terse_dump, true); + RTM_NEWTFILTER, a->terse_dump, true, NULL); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, @@ -2523,7 +2533,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, false, true) <= 0) + RTM_NEWTFILTER, false, true, NULL) <= 0) goto errout; cb->args[1] = 1; } @@ -2666,7 +2676,8 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct net *net, struct sk_buff *skb, struct tcf_block *block, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct 
netlink_ext_ack *extack) { unsigned char *b = skb_tail_pointer(skb); const struct tcf_proto_ops *ops; @@ -2703,7 +2714,12 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, goto nla_put_failure; } + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -2713,7 +2729,8 @@ nla_put_failure: } static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, - u32 seq, u16 flags, int event, bool unicast) + u32 seq, u16 flags, int event, bool unicast, + struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct tcf_block *block = chain->block; @@ -2727,7 +2744,7 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, portid, - seq, flags, event) <= 0) { + seq, flags, event, extack) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -2755,7 +2772,7 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, return -ENOBUFS; if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb, - block, portid, seq, flags, RTM_DELCHAIN) <= 0) { + block, portid, seq, flags, RTM_DELCHAIN, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -2907,11 +2924,11 @@ replay: } tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + RTM_NEWCHAIN, false, extack); break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, extack); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference @@ -2921,7 +2938,7 @@ replay: break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, - n->nlmsg_flags, n->nlmsg_type, true); + n->nlmsg_flags, n->nlmsg_type, true, extack); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); break; @@ -3021,7 +3038,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) chain->index, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWCHAIN); + RTM_NEWCHAIN, NULL); if (err <= 0) break; index++; @@ -3560,6 +3577,7 @@ int tc_setup_action(struct flow_action *flow_action, for (k = 0; k < index ; k++) { entry[k].hw_stats = tc_act_hw_stats(act->hw_stats); entry[k].hw_index = act->tcfa_index; + entry[k].act_cookie = (unsigned long)act; } j += index; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d229ce99e554..1b92c33b5f81 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -18,6 +18,7 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> struct basic_head { struct list_head flist; @@ -36,8 +37,9 @@ struct basic_filter { struct rcu_work rwork; }; -static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int basic_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { int r; struct basic_head *head = rcu_dereference_bh(tp->root); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index bc317b3eac12..466c26df853a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -19,6 +19,7 @@ #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> +#include <net/tc_wrapper.h> MODULE_LICENSE("GPL"); 
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); @@ -77,8 +78,9 @@ static int cls_bpf_exec_opcode(int code) } } -static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); bool at_ingress = skb_at_tc_ingress(skb); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index ed00001b528a..bd9322d71910 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -13,6 +13,7 @@ #include <net/pkt_cls.h> #include <net/sock.h> #include <net/cls_cgroup.h> +#include <net/tc_wrapper.h> struct cls_cgroup_head { u32 handle; @@ -22,8 +23,9 @@ struct cls_cgroup_head { struct rcu_work rwork; }; -static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int cls_cgroup_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); u32 classid = task_get_classid(skb); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 014cd3de7b5d..6ab317b48d6c 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -24,6 +24,7 @@ #include <net/ip.h> #include <net/route.h> #include <net/flow_dissector.h> +#include <net/tc_wrapper.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> @@ -292,8 +293,9 @@ static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ (1 << FLOW_KEY_NFCT_PROTO_DST)) -static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct flow_head *head = rcu_dereference_bh(tp->root); struct flow_filter *f; @@ -367,7 +369,7 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { static void __flow_destroy_filter(struct flow_filter *f) { - del_timer_sync(&f->perturb_timer); + timer_shutdown_sync(&f->perturb_timer); tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); tcf_exts_put_net(&f->exts); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 25bc57ee6ea1..885c95191ccf 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -27,6 +27,7 @@ #include <net/vxlan.h> #include <net/erspan.h> #include <net/gtp.h> +#include <net/tc_wrapper.h> #include <net/dst.h> #include <net/dst_metadata.h> @@ -305,8 +306,9 @@ static u16 fl_ct_info_to_flower_map[] = { TCA_FLOWER_KEY_CT_FLAGS_NEW, }; -static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); bool post_ct = tc_skb_cb(skb)->post_ct; @@ -500,12 +502,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, rtnl_held); - tcf_exts_hw_stats_update(&f->exts, cls_flower.stats.bytes, - cls_flower.stats.pkts, - cls_flower.stats.drops, - cls_flower.stats.lastused, - cls_flower.stats.used_hw_stats, - cls_flower.stats.used_hw_stats_valid); + tcf_exts_hw_stats_update(&f->exts, &cls_flower.stats, cls_flower.use_act_stats); } static void __fl_put(struct cls_fl_filter *f) diff --git 
a/net/sched/cls_fw.c b/net/sched/cls_fw.c index a32351da968c..ae9439a6c56c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -21,6 +21,7 @@ #include <net/act_api.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> +#include <net/tc_wrapper.h> #define HTSIZE 256 @@ -47,8 +48,9 @@ static u32 fw_hash(u32 handle) return handle % HTSIZE; } -static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int fw_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 39a5d9c170de..fa3bbd187eb9 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -12,6 +12,7 @@ #include <net/sch_generic.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> struct cls_mall_head { struct tcf_exts exts; @@ -24,8 +25,9 @@ struct cls_mall_head { bool deleting; }; -static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int mall_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct cls_mall_head *head = rcu_dereference_bh(tp->root); @@ -329,11 +331,7 @@ static void mall_stats_hw_filter(struct tcf_proto *tp, tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true); - tcf_exts_hw_stats_update(&head->exts, cls_mall.stats.bytes, - cls_mall.stats.pkts, cls_mall.stats.drops, - cls_mall.stats.lastused, - cls_mall.stats.used_hw_stats, - cls_mall.stats.used_hw_stats_valid); + tcf_exts_hw_stats_update(&head->exts, &cls_mall.stats, cls_mall.use_act_stats); } static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 9e43b929d4ca..d0c53724d3e8 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -17,6 +17,7 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> /* * 1. For now we assume that route tags < 256. @@ -121,8 +122,9 @@ static inline int route4_hash_wild(void) return 0; \ } -static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int route4_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct route4_head *head = rcu_dereference_bh(tp->root); struct dst_entry *dst; diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c deleted file mode 100644 index de1c1d4da597..000000000000 --- a/net/sched/cls_rsvp.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <net/ip.h> -#include <net/netlink.h> -#include <net/act_api.h> -#include <net/pkt_cls.h> - -#define RSVP_DST_LEN 1 -#define RSVP_ID "rsvp" -#define RSVP_OPS cls_rsvp_ops - -#include "cls_rsvp.h" -MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h deleted file mode 100644 index b00a7dbd0587..000000000000 --- a/net/sched/cls_rsvp.h +++ /dev/null @@ -1,764 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. 
- * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -/* - Comparing to general packet classification problem, - RSVP needs only several relatively simple rules: - - * (dst, protocol) are always specified, - so that we are able to hash them. - * src may be exact, or may be wildcard, so that - we can keep a hash table plus one wildcard entry. - * source port (or flow label) is important only if src is given. - - IMPLEMENTATION. - - We use a two level hash table: The top level is keyed by - destination address and protocol ID, every bucket contains a list - of "rsvp sessions", identified by destination address, protocol and - DPI(="Destination Port ID"): triple (key, mask, offset). - - Every bucket has a smaller hash table keyed by source address - (cf. RSVP flowspec) and one wildcard entry for wildcard reservations. - Every bucket is again a list of "RSVP flows", selected by - source address and SPI(="Source Port ID" here rather than - "security parameter index"): triple (key, mask, offset). - - - NOTE 1. All the packets with IPv6 extension headers (but AH and ESP) - and all fragmented packets go to the best-effort traffic class. - - - NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires - only one "Generalized Port Identifier". So that for classic - ah, esp (and udp,tcp) both *pi should coincide or one of them - should be wildcard. - - At first sight, this redundancy is just a waste of CPU - resources. But DPI and SPI add the possibility to assign different - priorities to GPIs. Look also at note 4 about tunnels below. - - - NOTE 3. One complication is the case of tunneled packets. - We implement it as following: if the first lookup - matches a special session with "tunnelhdr" value not zero, - flowid doesn't contain the true flow ID, but the tunnel ID (1...255). - In this case, we pull tunnelhdr bytes and restart lookup - with tunnel ID added to the list of keys. Simple and stupid 8)8) - It's enough for PIMREG and IPIP. - - - NOTE 4. Two GPIs make it possible to parse even GRE packets. - F.e. DPI can select ETH_P_IP (and necessary flags to make - tunnelhdr correct) in GRE protocol field and SPI matches - GRE key. Is it not nice? 8)8) - - - Well, as result, despite its simplicity, we get a pretty - powerful classification engine. 
*/ - - -struct rsvp_head { - u32 tmap[256/32]; - u32 hgenerator; - u8 tgenerator; - struct rsvp_session __rcu *ht[256]; - struct rcu_head rcu; -}; - -struct rsvp_session { - struct rsvp_session __rcu *next; - __be32 dst[RSVP_DST_LEN]; - struct tc_rsvp_gpi dpi; - u8 protocol; - u8 tunnelid; - /* 16 (src,sport) hash slots, and one wildcard source slot */ - struct rsvp_filter __rcu *ht[16 + 1]; - struct rcu_head rcu; -}; - - -struct rsvp_filter { - struct rsvp_filter __rcu *next; - __be32 src[RSVP_DST_LEN]; - struct tc_rsvp_gpi spi; - u8 tunnelhdr; - - struct tcf_result res; - struct tcf_exts exts; - - u32 handle; - struct rsvp_session *sess; - struct rcu_work rwork; -}; - -static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) -{ - unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1]; - - h ^= h>>16; - h ^= h>>8; - return (h ^ protocol ^ tunnelid) & 0xFF; -} - -static inline unsigned int hash_src(__be32 *src) -{ - unsigned int h = (__force __u32)src[RSVP_DST_LEN-1]; - - h ^= h>>16; - h ^= h>>8; - h ^= h>>4; - return h & 0xF; -} - -#define RSVP_APPLY_RESULT() \ -{ \ - int r = tcf_exts_exec(skb, &f->exts, res); \ - if (r < 0) \ - continue; \ - else if (r > 0) \ - return r; \ -} - -static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - struct rsvp_head *head = rcu_dereference_bh(tp->root); - struct rsvp_session *s; - struct rsvp_filter *f; - unsigned int h1, h2; - __be32 *dst, *src; - u8 protocol; - u8 tunnelid = 0; - u8 *xprt; -#if RSVP_DST_LEN == 4 - struct ipv6hdr *nhptr; - - if (!pskb_network_may_pull(skb, sizeof(*nhptr))) - return -1; - nhptr = ipv6_hdr(skb); -#else - struct iphdr *nhptr; - - if (!pskb_network_may_pull(skb, sizeof(*nhptr))) - return -1; - nhptr = ip_hdr(skb); -#endif -restart: - -#if RSVP_DST_LEN == 4 - src = &nhptr->saddr.s6_addr32[0]; - dst = &nhptr->daddr.s6_addr32[0]; - protocol = nhptr->nexthdr; - xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr); -#else - src = &nhptr->saddr; - dst = &nhptr->daddr; - protocol = nhptr->protocol; - xprt = ((u8 *)nhptr) + (nhptr->ihl<<2); - if (ip_is_fragment(nhptr)) - return -1; -#endif - - h1 = hash_dst(dst, protocol, tunnelid); - h2 = hash_src(src); - - for (s = rcu_dereference_bh(head->ht[h1]); s; - s = rcu_dereference_bh(s->next)) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && - protocol == s->protocol && - !(s->dpi.mask & - (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) && -#if RSVP_DST_LEN == 4 - dst[0] == s->dst[0] && - dst[1] == s->dst[1] && - dst[2] == s->dst[2] && -#endif - tunnelid == s->tunnelid) { - - for (f = rcu_dereference_bh(s->ht[h2]); f; - f = rcu_dereference_bh(f->next)) { - if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && - !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) -#if RSVP_DST_LEN == 4 - && - src[0] == f->src[0] && - src[1] == f->src[1] && - src[2] == f->src[2] -#endif - ) { - *res = f->res; - RSVP_APPLY_RESULT(); - -matched: - if (f->tunnelhdr == 0) - return 0; - - tunnelid = f->res.classid; - nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr)); - goto restart; - } - } - - /* And wildcard bucket... 
*/ - for (f = rcu_dereference_bh(s->ht[16]); f; - f = rcu_dereference_bh(f->next)) { - *res = f->res; - RSVP_APPLY_RESULT(); - goto matched; - } - return -1; - } - } - return -1; -} - -static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_session *s; - struct rsvp_filter __rcu **ins; - struct rsvp_filter *pins; - unsigned int h1 = h & 0xFF; - unsigned int h2 = (h >> 8) & 0xFF; - - for (s = rtnl_dereference(head->ht[h1]); s; - s = rtnl_dereference(s->next)) { - for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ; - ins = &pins->next, pins = rtnl_dereference(*ins)) { - if (pins->handle == h) { - RCU_INIT_POINTER(n->next, pins->next); - rcu_assign_pointer(*ins, n); - return; - } - } - } - - /* Something went wrong if we are trying to replace a non-existent - * node. Mind as well halt instead of silently failing. - */ - BUG_ON(1); -} - -static void *rsvp_get(struct tcf_proto *tp, u32 handle) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_session *s; - struct rsvp_filter *f; - unsigned int h1 = handle & 0xFF; - unsigned int h2 = (handle >> 8) & 0xFF; - - if (h2 > 16) - return NULL; - - for (s = rtnl_dereference(head->ht[h1]); s; - s = rtnl_dereference(s->next)) { - for (f = rtnl_dereference(s->ht[h2]); f; - f = rtnl_dereference(f->next)) { - if (f->handle == handle) - return f; - } - } - return NULL; -} - -static int rsvp_init(struct tcf_proto *tp) -{ - struct rsvp_head *data; - - data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL); - if (data) { - rcu_assign_pointer(tp->root, data); - return 0; - } - return -ENOBUFS; -} - -static void __rsvp_delete_filter(struct rsvp_filter *f) -{ - tcf_exts_destroy(&f->exts); - tcf_exts_put_net(&f->exts); - kfree(f); -} - -static void rsvp_delete_filter_work(struct work_struct *work) -{ - struct rsvp_filter *f = container_of(to_rcu_work(work), - struct rsvp_filter, - rwork); - rtnl_lock(); - __rsvp_delete_filter(f); - rtnl_unlock(); -} - -static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) -{ - tcf_unbind_filter(tp, &f->res); - /* all classifiers are required to call tcf_exts_destroy() after rcu - * grace period, since converted-to-rcu actions are relying on that - * in cleanup() callback - */ - if (tcf_exts_get_net(&f->exts)) - tcf_queue_work(&f->rwork, rsvp_delete_filter_work); - else - __rsvp_delete_filter(f); -} - -static void rsvp_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - int h1, h2; - - if (data == NULL) - return; - - for (h1 = 0; h1 < 256; h1++) { - struct rsvp_session *s; - - while ((s = rtnl_dereference(data->ht[h1])) != NULL) { - RCU_INIT_POINTER(data->ht[h1], s->next); - - for (h2 = 0; h2 <= 16; h2++) { - struct rsvp_filter *f; - - while ((f = rtnl_dereference(s->ht[h2])) != NULL) { - rcu_assign_pointer(s->ht[h2], f->next); - rsvp_delete_filter(tp, f); - } - } - kfree_rcu(s, rcu); - } - } - kfree_rcu(data, rcu); -} - -static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_filter *nfp, *f = arg; - struct rsvp_filter __rcu **fp; - unsigned int h = f->handle; - struct rsvp_session __rcu **sp; - struct rsvp_session *nsp, *s = f->sess; - int i, h1; - - fp = &s->ht[(h >> 8) & 0xFF]; - for (nfp = rtnl_dereference(*fp); nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) { - if 
(nfp == f) { - RCU_INIT_POINTER(*fp, f->next); - rsvp_delete_filter(tp, f); - - /* Strip tree */ - - for (i = 0; i <= 16; i++) - if (s->ht[i]) - goto out; - - /* OK, session has no flows */ - sp = &head->ht[h & 0xFF]; - for (nsp = rtnl_dereference(*sp); nsp; - sp = &nsp->next, nsp = rtnl_dereference(*sp)) { - if (nsp == s) { - RCU_INIT_POINTER(*sp, s->next); - kfree_rcu(s, rcu); - goto out; - } - } - - break; - } - } - -out: - *last = true; - for (h1 = 0; h1 < 256; h1++) { - if (rcu_access_pointer(head->ht[h1])) { - *last = false; - break; - } - } - - return 0; -} - -static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - int i = 0xFFFF; - - while (i-- > 0) { - u32 h; - - if ((data->hgenerator += 0x10000) == 0) - data->hgenerator = 0x10000; - h = data->hgenerator|salt; - if (!rsvp_get(tp, h)) - return h; - } - return 0; -} - -static int tunnel_bts(struct rsvp_head *data) -{ - int n = data->tgenerator >> 5; - u32 b = 1 << (data->tgenerator & 0x1F); - - if (data->tmap[n] & b) - return 0; - data->tmap[n] |= b; - return 1; -} - -static void tunnel_recycle(struct rsvp_head *data) -{ - struct rsvp_session __rcu **sht = data->ht; - u32 tmap[256/32]; - int h1, h2; - - memset(tmap, 0, sizeof(tmap)); - - for (h1 = 0; h1 < 256; h1++) { - struct rsvp_session *s; - for (s = rtnl_dereference(sht[h1]); s; - s = rtnl_dereference(s->next)) { - for (h2 = 0; h2 <= 16; h2++) { - struct rsvp_filter *f; - - for (f = rtnl_dereference(s->ht[h2]); f; - f = rtnl_dereference(f->next)) { - if (f->tunnelhdr == 0) - continue; - data->tgenerator = f->res.classid; - tunnel_bts(data); - } - } - } - } - - memcpy(data->tmap, tmap, sizeof(tmap)); -} - -static u32 gen_tunnel(struct rsvp_head *data) -{ - int i, k; - - for (k = 0; k < 2; k++) { - for (i = 255; i > 0; i--) { - if (++data->tgenerator == 0) - data->tgenerator = 1; - if (tunnel_bts(data)) - return data->tgenerator; - } - tunnel_recycle(data); - } - return 0; -} - -static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { - [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, - [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) }, - [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) }, - [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, -}; - -static int rsvp_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, - u32 handle, struct nlattr **tca, - void **arg, u32 flags, - struct netlink_ext_ack *extack) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - struct rsvp_filter *f, *nfp; - struct rsvp_filter __rcu **fp; - struct rsvp_session *nsp, *s; - struct rsvp_session __rcu **sp; - struct tc_rsvp_pinfo *pinfo = NULL; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_RSVP_MAX + 1]; - struct tcf_exts e; - unsigned int h1, h2; - __be32 *dst; - int err; - - if (opt == NULL) - return handle ? 
-EINVAL : 0; - - err = nla_parse_nested_deprecated(tb, TCA_RSVP_MAX, opt, rsvp_policy, - NULL); - if (err < 0) - return err; - - err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); - if (err < 0) - return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, flags, - extack); - if (err < 0) - goto errout2; - - f = *arg; - if (f) { - /* Node exists: adjust only classid */ - struct rsvp_filter *n; - - if (f->handle != handle && handle) - goto errout2; - - n = kmemdup(f, sizeof(*f), GFP_KERNEL); - if (!n) { - err = -ENOMEM; - goto errout2; - } - - err = tcf_exts_init(&n->exts, net, TCA_RSVP_ACT, - TCA_RSVP_POLICE); - if (err < 0) { - kfree(n); - goto errout2; - } - - if (tb[TCA_RSVP_CLASSID]) { - n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - tcf_bind_filter(tp, &n->res, base); - } - - tcf_exts_change(&n->exts, &e); - rsvp_replace(tp, n, handle); - return 0; - } - - /* Now more serious part... */ - err = -EINVAL; - if (handle) - goto errout2; - if (tb[TCA_RSVP_DST] == NULL) - goto errout2; - - err = -ENOBUFS; - f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL); - if (f == NULL) - goto errout2; - - err = tcf_exts_init(&f->exts, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); - if (err < 0) - goto errout; - h2 = 16; - if (tb[TCA_RSVP_SRC]) { - memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); - h2 = hash_src(f->src); - } - if (tb[TCA_RSVP_PINFO]) { - pinfo = nla_data(tb[TCA_RSVP_PINFO]); - f->spi = pinfo->spi; - f->tunnelhdr = pinfo->tunnelhdr; - } - if (tb[TCA_RSVP_CLASSID]) - f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - - dst = nla_data(tb[TCA_RSVP_DST]); - h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); - - err = -ENOMEM; - if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) - goto errout; - - if (f->tunnelhdr) { - err = -EINVAL; - if (f->res.classid > 255) - goto errout; - - err = -ENOMEM; - if (f->res.classid == 0 && - (f->res.classid = gen_tunnel(data)) == 0) - goto errout; - } - - for (sp = &data->ht[h1]; - (s = rtnl_dereference(*sp)) != NULL; - sp = &s->next) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && - pinfo && pinfo->protocol == s->protocol && - memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && -#if RSVP_DST_LEN == 4 - dst[0] == s->dst[0] && - dst[1] == s->dst[1] && - dst[2] == s->dst[2] && -#endif - pinfo->tunnelid == s->tunnelid) { - -insert: - /* OK, we found appropriate session */ - - fp = &s->ht[h2]; - - f->sess = s; - if (f->tunnelhdr == 0) - tcf_bind_filter(tp, &f->res, base); - - tcf_exts_change(&f->exts, &e); - - fp = &s->ht[h2]; - for (nfp = rtnl_dereference(*fp); nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) { - __u32 mask = nfp->spi.mask & f->spi.mask; - - if (mask != f->spi.mask) - break; - } - RCU_INIT_POINTER(f->next, nfp); - rcu_assign_pointer(*fp, f); - - *arg = f; - return 0; - } - } - - /* No session found. Create new one. 
*/ - - err = -ENOBUFS; - s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL); - if (s == NULL) - goto errout; - memcpy(s->dst, dst, sizeof(s->dst)); - - if (pinfo) { - s->dpi = pinfo->dpi; - s->protocol = pinfo->protocol; - s->tunnelid = pinfo->tunnelid; - } - sp = &data->ht[h1]; - for (nsp = rtnl_dereference(*sp); nsp; - sp = &nsp->next, nsp = rtnl_dereference(*sp)) { - if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask) - break; - } - RCU_INIT_POINTER(s->next, nsp); - rcu_assign_pointer(*sp, s); - - goto insert; - -errout: - tcf_exts_destroy(&f->exts); - kfree(f); -errout2: - tcf_exts_destroy(&e); - return err; -} - -static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg, - bool rtnl_held) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - unsigned int h, h1; - - if (arg->stop) - return; - - for (h = 0; h < 256; h++) { - struct rsvp_session *s; - - for (s = rtnl_dereference(head->ht[h]); s; - s = rtnl_dereference(s->next)) { - for (h1 = 0; h1 <= 16; h1++) { - struct rsvp_filter *f; - - for (f = rtnl_dereference(s->ht[h1]); f; - f = rtnl_dereference(f->next)) { - if (!tc_cls_stats_dump(tp, arg, f)) - return; - } - } - } - } -} - -static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) -{ - struct rsvp_filter *f = fh; - struct rsvp_session *s; - struct nlattr *nest; - struct tc_rsvp_pinfo pinfo; - - if (f == NULL) - return skb->len; - s = f->sess; - - t->tcm_handle = f->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst)) - goto nla_put_failure; - pinfo.dpi = s->dpi; - pinfo.spi = f->spi; - pinfo.protocol = s->protocol; - pinfo.tunnelid = s->tunnelid; - pinfo.tunnelhdr = f->tunnelhdr; - pinfo.pad = 0; - if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo)) - goto nla_put_failure; - if (f->res.classid && - nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid)) - goto nla_put_failure; - if (((f->handle >> 8) & 0xFF) != 16 && - nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) - goto nla_put_failure; - - if (tcf_exts_dump(skb, &f->exts) < 0) - goto nla_put_failure; - - nla_nest_end(skb, nest); - - if (tcf_exts_dump_stats(skb, &f->exts) < 0) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, - unsigned long base) -{ - struct rsvp_filter *f = fh; - - tc_cls_bind_class(classid, cl, q, &f->res, base); -} - -static struct tcf_proto_ops RSVP_OPS __read_mostly = { - .kind = RSVP_ID, - .classify = rsvp_classify, - .init = rsvp_init, - .destroy = rsvp_destroy, - .get = rsvp_get, - .change = rsvp_change, - .delete = rsvp_delete, - .walk = rsvp_walk, - .dump = rsvp_dump, - .bind_class = rsvp_bind_class, - .owner = THIS_MODULE, -}; - -static int __init init_rsvp(void) -{ - return register_tcf_proto_ops(&RSVP_OPS); -} - -static void __exit exit_rsvp(void) -{ - unregister_tcf_proto_ops(&RSVP_OPS); -} - -module_init(init_rsvp) -module_exit(exit_rsvp) diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c deleted file mode 100644 index 64078846000e..000000000000 --- a/net/sched/cls_rsvp6.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. 
- * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/ipv6.h> -#include <linux/skbuff.h> -#include <net/act_api.h> -#include <net/pkt_cls.h> -#include <net/netlink.h> - -#define RSVP_DST_LEN 4 -#define RSVP_ID "rsvp6" -#define RSVP_OPS cls_rsvp6_ops - -#include "cls_rsvp.h" -MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c deleted file mode 100644 index 1c9eeb98d826..000000000000 --- a/net/sched/cls_tcindex.c +++ /dev/null @@ -1,723 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/sched/cls_tcindex.c Packet classifier for skb->tc_index - * - * Written 1998,1999 by Werner Almesberger, EPFL ICA - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/refcount.h> -#include <net/act_api.h> -#include <net/netlink.h> -#include <net/pkt_cls.h> -#include <net/sch_generic.h> - -/* - * Passing parameters to the root seems to be done more awkwardly than really - * necessary. At least, u32 doesn't seem to use such dirty hacks. To be - * verified. FIXME. - */ - -#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ -#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ - - -struct tcindex_data; - -struct tcindex_filter_result { - struct tcf_exts exts; - struct tcf_result res; - struct tcindex_data *p; - struct rcu_work rwork; -}; - -struct tcindex_filter { - u16 key; - struct tcindex_filter_result result; - struct tcindex_filter __rcu *next; - struct rcu_work rwork; -}; - - -struct tcindex_data { - struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter __rcu **h; /* imperfect hash; */ - struct tcf_proto *tp; - u16 mask; /* AND key with mask */ - u32 shift; /* shift ANDed key to the right */ - u32 hash; /* hash table size; 0 if undefined */ - u32 alloc_hash; /* allocated size */ - u32 fall_through; /* 0: only classify if explicit match */ - refcount_t refcnt; /* a temporary refcnt for perfect hash */ - struct rcu_work rwork; -}; - -static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) -{ - return tcf_exts_has_actions(&r->exts) || r->res.classid; -} - -static void tcindex_data_get(struct tcindex_data *p) -{ - refcount_inc(&p->refcnt); -} - -static void tcindex_data_put(struct tcindex_data *p) -{ - if (refcount_dec_and_test(&p->refcnt)) { - kfree(p->perfect); - kfree(p->h); - kfree(p); - } -} - -static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, - u16 key) -{ - if (p->perfect) { - struct tcindex_filter_result *f = p->perfect + key; - - return tcindex_filter_is_set(f) ? 
f : NULL; - } else if (p->h) { - struct tcindex_filter __rcu **fp; - struct tcindex_filter *f; - - fp = &p->h[key % p->hash]; - for (f = rcu_dereference_bh_rtnl(*fp); - f; - fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) - if (f->key == key) - return &f->result; - } - - return NULL; -} - - -static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - struct tcindex_data *p = rcu_dereference_bh(tp->root); - struct tcindex_filter_result *f; - int key = (skb->tc_index & p->mask) >> p->shift; - - pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", - skb, tp, res, p); - - f = tcindex_lookup(p, key); - if (!f) { - struct Qdisc *q = tcf_block_q(tp->chain->block); - - if (!p->fall_through) - return -1; - res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key); - res->class = 0; - pr_debug("alg 0x%x\n", res->classid); - return 0; - } - *res = f->res; - pr_debug("map 0x%x\n", res->classid); - - return tcf_exts_exec(skb, &f->exts, res); -} - - -static void *tcindex_get(struct tcf_proto *tp, u32 handle) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r; - - pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); - if (p->perfect && handle >= p->alloc_hash) - return NULL; - r = tcindex_lookup(p, handle); - return r && tcindex_filter_is_set(r) ? r : NULL; -} - -static int tcindex_init(struct tcf_proto *tp) -{ - struct tcindex_data *p; - - pr_debug("tcindex_init(tp %p)\n", tp); - p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); - if (!p) - return -ENOMEM; - - p->mask = 0xffff; - p->hash = DEFAULT_HASH_SIZE; - p->fall_through = 1; - refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */ - - rcu_assign_pointer(tp->root, p); - return 0; -} - -static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) -{ - tcf_exts_destroy(&r->exts); - tcf_exts_put_net(&r->exts); - tcindex_data_put(r->p); -} - -static void tcindex_destroy_rexts_work(struct work_struct *work) -{ - struct tcindex_filter_result *r; - - r = container_of(to_rcu_work(work), - struct tcindex_filter_result, - rwork); - rtnl_lock(); - __tcindex_destroy_rexts(r); - rtnl_unlock(); -} - -static void __tcindex_destroy_fexts(struct tcindex_filter *f) -{ - tcf_exts_destroy(&f->result.exts); - tcf_exts_put_net(&f->result.exts); - kfree(f); -} - -static void tcindex_destroy_fexts_work(struct work_struct *work) -{ - struct tcindex_filter *f = container_of(to_rcu_work(work), - struct tcindex_filter, - rwork); - - rtnl_lock(); - __tcindex_destroy_fexts(f); - rtnl_unlock(); -} - -static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = arg; - struct tcindex_filter __rcu **walk; - struct tcindex_filter *f = NULL; - - pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p); - if (p->perfect) { - if (!r->res.class) - return -ENOENT; - } else { - int i; - - for (i = 0; i < p->hash; i++) { - walk = p->h + i; - for (f = rtnl_dereference(*walk); f; - walk = &f->next, f = rtnl_dereference(*walk)) { - if (&f->result == r) - goto found; - } - } - return -ENOENT; - -found: - rcu_assign_pointer(*walk, rtnl_dereference(f->next)); - } - tcf_unbind_filter(tp, &r->res); - /* all classifiers are required to call tcf_exts_destroy() after rcu - * grace period, since converted-to-rcu actions are relying on that - * in cleanup() callback - */ - if (f) { - if (tcf_exts_get_net(&f->result.exts)) - 
tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work); - else - __tcindex_destroy_fexts(f); - } else { - tcindex_data_get(p); - - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - - *last = false; - return 0; -} - -static void tcindex_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - tcindex_data_put(p); -} - -static inline int -valid_perfect_hash(struct tcindex_data *p) -{ - return p->hash > (p->mask >> p->shift); -} - -static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { - [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, - [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, - [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, - [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, - [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, -}; - -static int tcindex_filter_result_init(struct tcindex_filter_result *r, - struct tcindex_data *p, - struct net *net) -{ - memset(r, 0, sizeof(*r)); - r->p = p; - return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, - TCA_TCINDEX_POLICE); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp); - -static void tcindex_partial_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - rtnl_lock(); - if (p->perfect) - tcindex_free_perfect_hash(p); - kfree(p); - rtnl_unlock(); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp) -{ - int i; - - for (i = 0; i < cp->hash; i++) - tcf_exts_destroy(&cp->perfect[i].exts); - kfree(cp->perfect); -} - -static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) -{ - int i, err = 0; - - cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), - GFP_KERNEL | __GFP_NOWARN); - if (!cp->perfect) - return -ENOMEM; - - for (i = 0; i < cp->hash; i++) { - err = tcf_exts_init(&cp->perfect[i].exts, net, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - goto errout; - cp->perfect[i].p = cp; - } - - return 0; - -errout: - tcindex_free_perfect_hash(cp); - return err; -} - -static int -tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, - u32 handle, struct tcindex_data *p, - struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) -{ - struct tcindex_filter_result new_filter_result, *old_r = r; - struct tcindex_data *cp = NULL, *oldp; - struct tcindex_filter *f = NULL; /* make gcc behave */ - struct tcf_result cr = {}; - int err, balloc = 0; - struct tcf_exts e; - - err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - return err; - err = tcf_exts_validate(net, tp, tb, est, &e, flags, extack); - if (err < 0) - goto errout; - - err = -ENOMEM; - /* tcindex_data attributes must look atomic to classifier/lookup so - * allocate new tcindex data and RCU assign it onto root. Keeping - * perfect hash and hash pointers from old data. 
- */ - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) - goto errout; - - cp->mask = p->mask; - cp->shift = p->shift; - cp->hash = p->hash; - cp->alloc_hash = p->alloc_hash; - cp->fall_through = p->fall_through; - cp->tp = tp; - refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ - - if (tb[TCA_TCINDEX_HASH]) - cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - - if (tb[TCA_TCINDEX_MASK]) - cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - - if (tb[TCA_TCINDEX_SHIFT]) { - cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - if (cp->shift > 16) { - err = -EINVAL; - goto errout; - } - } - if (!cp->hash) { - /* Hash not specified, use perfect hash if the upper limit - * of the hashing index is below the threshold. - */ - if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) - cp->hash = (cp->mask >> cp->shift) + 1; - else - cp->hash = DEFAULT_HASH_SIZE; - } - - if (p->perfect) { - int i; - - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout; - cp->alloc_hash = cp->hash; - for (i = 0; i < min(cp->hash, p->hash); i++) - cp->perfect[i].res = p->perfect[i].res; - balloc = 1; - } - cp->h = p->h; - - err = tcindex_filter_result_init(&new_filter_result, cp, net); - if (err < 0) - goto errout_alloc; - if (old_r) - cr = r->res; - - err = -EBUSY; - - /* Hash already allocated, make sure that we still meet the - * requirements for the allocated hash. - */ - if (cp->perfect) { - if (!valid_perfect_hash(cp) || - cp->hash > cp->alloc_hash) - goto errout_alloc; - } else if (cp->h && cp->hash != cp->alloc_hash) { - goto errout_alloc; - } - - err = -EINVAL; - if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - - if (!cp->perfect && !cp->h) - cp->alloc_hash = cp->hash; - - /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) - * but then, we'd fail handles that may become valid after some future - * mask change. While this is extremely unlikely to ever matter, - * the check below is safer (and also more backwards-compatible). - */ - if (cp->perfect || valid_perfect_hash(cp)) - if (handle >= cp->alloc_hash) - goto errout_alloc; - - - err = -ENOMEM; - if (!cp->perfect && !cp->h) { - if (valid_perfect_hash(cp)) { - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout_alloc; - balloc = 1; - } else { - struct tcindex_filter __rcu **hash; - - hash = kcalloc(cp->hash, - sizeof(struct tcindex_filter *), - GFP_KERNEL); - - if (!hash) - goto errout_alloc; - - cp->h = hash; - balloc = 2; - } - } - - if (cp->perfect) - r = cp->perfect + handle; - else - r = tcindex_lookup(cp, handle) ? 
: &new_filter_result; - - if (r == &new_filter_result) { - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (!f) - goto errout_alloc; - f->key = handle; - f->next = NULL; - err = tcindex_filter_result_init(&f->result, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - if (tb[TCA_TCINDEX_CLASSID]) { - cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); - tcf_bind_filter(tp, &cr, base); - } - - if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - oldp = p; - r->res = cr; - tcf_exts_change(&r->exts, &e); - - rcu_assign_pointer(tp->root, cp); - - if (r == &new_filter_result) { - struct tcindex_filter *nfp; - struct tcindex_filter __rcu **fp; - - f->result.res = r->res; - tcf_exts_change(&f->result.exts, &r->exts); - - fp = cp->h + (handle % cp->hash); - for (nfp = rtnl_dereference(*fp); - nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) - ; /* nothing */ - - rcu_assign_pointer(*fp, f); - } else { - tcf_exts_destroy(&new_filter_result.exts); - } - - if (oldp) - tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work); - return 0; - -errout_alloc: - if (balloc == 1) - tcindex_free_perfect_hash(cp); - else if (balloc == 2) - kfree(cp->h); - tcf_exts_destroy(&new_filter_result.exts); -errout: - kfree(cp); - tcf_exts_destroy(&e); - return err; -} - -static int -tcindex_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, u32 flags, - struct netlink_ext_ack *extack) -{ - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = *arg; - int err; - - pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," - "p %p,r %p,*arg %p\n", - tp, handle, tca, arg, opt, p, r, *arg); - - if (!opt) - return 0; - - err = nla_parse_nested_deprecated(tb, TCA_TCINDEX_MAX, opt, - tcindex_policy, NULL); - if (err < 0) - return err; - - return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], flags, extack); -} - -static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, - bool rtnl_held) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter *f, *next; - int i; - - pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - if (!p->perfect[i].res.class) - continue; - if (!tc_cls_stats_dump(tp, walker, p->perfect + i)) - return; - } - } - if (!p->h) - return; - for (i = 0; i < p->hash; i++) { - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - if (!tc_cls_stats_dump(tp, walker, &f->result)) - return; - } - } -} - -static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - int i; - - pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); - - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - struct tcindex_filter_result *r = p->perfect + i; - - /* tcf_queue_work() does not guarantee the ordering we - * want, so we have to take this refcnt temporarily to - * ensure 'p' is freed after all tcindex_filter_result - * here. Imperfect hash does not need this, because it - * uses linked lists rather than an array. 
- */ - tcindex_data_get(p); - - tcf_unbind_filter(tp, &r->res); - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, - tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - } - - for (i = 0; p->h && i < p->hash; i++) { - struct tcindex_filter *f, *next; - bool last; - - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - tcindex_delete(tp, &f->result, &last, rtnl_held, NULL); - } - } - - tcf_queue_work(&p->rwork, tcindex_destroy_work); -} - - -static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = fh; - struct nlattr *nest; - - pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n", - tp, fh, skb, t, p, r); - pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (!fh) { - t->tcm_handle = ~0; /* whatever ... */ - if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || - nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || - nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || - nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) - goto nla_put_failure; - nla_nest_end(skb, nest); - } else { - if (p->perfect) { - t->tcm_handle = r - p->perfect; - } else { - struct tcindex_filter *f; - struct tcindex_filter __rcu **fp; - int i; - - t->tcm_handle = 0; - for (i = 0; !t->tcm_handle && i < p->hash; i++) { - fp = &p->h[i]; - for (f = rtnl_dereference(*fp); - !t->tcm_handle && f; - fp = &f->next, f = rtnl_dereference(*fp)) { - if (&f->result == r) - t->tcm_handle = f->key; - } - } - } - pr_debug("handle = %d\n", t->tcm_handle); - if (r->res.class && - nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) - goto nla_put_failure; - - if (tcf_exts_dump(skb, &r->exts) < 0) - goto nla_put_failure; - nla_nest_end(skb, nest); - - if (tcf_exts_dump_stats(skb, &r->exts) < 0) - goto nla_put_failure; - } - - return skb->len; - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, - void *q, unsigned long base) -{ - struct tcindex_filter_result *r = fh; - - tc_cls_bind_class(classid, cl, q, &r->res, base); -} - -static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { - .kind = "tcindex", - .classify = tcindex_classify, - .init = tcindex_init, - .destroy = tcindex_destroy, - .get = tcindex_get, - .change = tcindex_change, - .delete = tcindex_delete, - .walk = tcindex_walk, - .dump = tcindex_dump, - .bind_class = tcindex_bind_class, - .owner = THIS_MODULE, -}; - -static int __init init_tcindex(void) -{ - return register_tcf_proto_ops(&cls_tcindex_ops); -} - -static void __exit exit_tcindex(void) -{ - unregister_tcf_proto_ops(&cls_tcindex_ops); -} - -module_init(init_tcindex) -module_exit(exit_tcindex) -MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 34d25f7a0687..4e2e269f121f 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -39,6 +39,7 @@ #include <net/act_api.h> #include <net/pkt_cls.h> #include <linux/idr.h> +#include <net/tc_wrapper.h> struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -100,8 +101,9 @@ static inline unsigned int u32_hash_fold(__be32 key, return h; } -static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int u32_classify(struct sk_buff *skb, + const struct 
tcf_proto *tp, + struct tcf_result *res) { struct { struct tc_u_knode *knode; diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 4ce681361851..5c1235e6076a 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -255,6 +255,8 @@ static int tcf_em_validate(struct tcf_proto *tp, * the value carried. */ if (em_hdr->flags & TCF_EM_SIMPLE) { + if (em->ops->datalen > 0) + goto errout; if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 4a27dfb1ba0f..aba789c30a2e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -31,6 +31,7 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h> #include <trace/events/qdisc.h> @@ -901,7 +902,8 @@ static void qdisc_offload_graft_root(struct net_device *dev, } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct netlink_ext_ack *extack) { struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; @@ -969,7 +971,12 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -990,7 +997,8 @@ static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1001,12 +1009,12 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, - 0, RTM_DELQDISC) < 0) + 0, RTM_DELQDISC, extack) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, - old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) goto err_out; } @@ -1021,10 +1029,11 @@ err_out: static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { if (new || old) - qdisc_notify(net, skb, n, clid, old, new); + qdisc_notify(net, skb, n, clid, old, new, extack); if (old) qdisc_put(old); @@ -1104,12 +1113,12 @@ skip: qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? 
: &noop_qdisc); - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); if (new && new->ops->attach) new->ops->attach(new); } else { - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } if (dev->flags & IFF_UP) @@ -1132,10 +1141,15 @@ skip: return -ENOENT; } + if (new && new->ops == &noqueue_qdisc_ops) { + NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class"); + return -EINVAL; + } + err = cops->graft(parent, cl, new, &old, extack); if (err) return err; - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } return 0; } @@ -1268,20 +1282,21 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (err) goto err_out3; - if (ops->init) { - err = ops->init(sch, tca[TCA_OPTIONS], extack); - if (err != 0) - goto err_out5; - } - if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) { err = PTR_ERR(stab); - goto err_out4; + goto err_out3; } rcu_assign_pointer(sch->stab, stab); } + + if (ops->init) { + err = ops->init(sch, tca[TCA_OPTIONS], extack); + if (err != 0) + goto err_out4; + } + if (tca[TCA_RATE]) { err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { @@ -1306,10 +1321,13 @@ static struct Qdisc *qdisc_create(struct net_device *dev, return sch; -err_out5: - /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */ +err_out4: + /* Even if ops->init() failed, we call ops->destroy() + * like qdisc_create_dflt(). + */ if (ops->destroy) ops->destroy(sch); + qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); @@ -1318,16 +1336,6 @@ err_out2: err_out: *errp = err; return NULL; - -err_out4: - /* - * Any broken qdiscs that would require a ops->reset() here? - * The qdisc was never in action so it shouldn't be necessary. 
- */ - qdisc_put_stab(rtnl_dereference(sch->stab)); - if (ops->destroy) - ops->destroy(sch); - goto err_out3; } static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, @@ -1503,7 +1511,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, if (err != 0) return err; } else { - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q, NULL); } return 0; } @@ -1642,7 +1650,7 @@ replay: } err = qdisc_change(q, tca, extack); if (err == 0) - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q, extack); return err; create_n_graft: @@ -1709,7 +1717,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1731,7 +1739,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1804,8 +1812,8 @@ done: ************************************************/ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, - unsigned long cl, - u32 portid, u32 seq, u16 flags, int event) + unsigned long cl, u32 portid, u32 seq, u16 flags, + int event, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1840,7 +1848,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1851,7 +1864,7 @@ nla_put_failure: static int tclass_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, - unsigned long cl, int event) + unsigned long cl, int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1860,7 +1873,7 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -1887,7 +1900,7 @@ static int tclass_del_notify(struct net *net, return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, - RTM_DELTCLASS) < 0) { + RTM_DELTCLASS, extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -2094,7 +2107,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: - err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); + err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack); goto out; default: err = -EINVAL; @@ -2112,7 +2125,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, if (cops->change) err = cops->change(q, clid, portid, tca, &new_cl, extack); if (err == 0) { - tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); /* We just create a new class, need to do reverse binding. 
*/ if (cl != new_cl) tc_bind_tclass(q, portid, clid, new_cl); @@ -2134,7 +2147,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTCLASS); + RTM_NEWTCLASS, NULL); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, @@ -2273,6 +2286,8 @@ static struct pernet_operations psched_net_ops = { .exit = psched_net_exit, }; +DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); + static int __init pktsched_init(void) { int err; @@ -2300,6 +2315,8 @@ static int __init pktsched_init(void) rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, 0); + tc_wrapper_init(); + return 0; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c deleted file mode 100644 index f52255fea652..000000000000 --- a/net/sched/sch_atm.c +++ /dev/null @@ -1,703 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */ - -/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <linux/atmdev.h> -#include <linux/atmclip.h> -#include <linux/rtnetlink.h> -#include <linux/file.h> /* for fput */ -#include <net/netlink.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> - -/* - * The ATM queuing discipline provides a framework for invoking classifiers - * (aka "filters"), which in turn select classes of this queuing discipline. - * Each class maps the flow(s) it is handling to a given VC. Multiple classes - * may share the same VC. - * - * When creating a class, VCs are specified by passing the number of the open - * socket descriptor by which the calling process references the VC. The kernel - * keeps the VC open at least until all classes using it are removed. - * - * In this file, most functions are named atm_tc_* to avoid confusion with all - * the atm_* in net/atm. This naming convention differs from what's used in the - * rest of net/sched. - * - * Known bugs: - * - sometimes messes up the IP stack - * - any manipulations besides the few operations described in the README, are - * untested and likely to crash the system - * - should lock the flow while there is data in the queue (?) - */ - -#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) - -struct atm_flow_data { - struct Qdisc_class_common common; - struct Qdisc *q; /* FIFO, TBF, etc. 
*/ - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ - void (*old_pop)(struct atm_vcc *vcc, - struct sk_buff *skb); /* chaining */ - struct atm_qdisc_data *parent; /* parent qdisc */ - struct socket *sock; /* for closing */ - int ref; /* reference count */ - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct list_head list; - struct atm_flow_data *excess; /* flow for excess traffic; - NULL to set CLP instead */ - int hdr_len; - unsigned char hdr[]; /* header data; MUST BE LAST */ -}; - -struct atm_qdisc_data { - struct atm_flow_data link; /* unclassified skbs go here */ - struct list_head flows; /* NB: "link" is also on this - list */ - struct tasklet_struct task; /* dequeue tasklet */ -}; - -/* ------------------------- Class/flow operations ------------------------- */ - -static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - list_for_each_entry(flow, &p->flows, list) { - if (flow->common.classid == classid) - return flow; - } - return NULL; -} - -static int atm_tc_graft(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n", - sch, p, flow, new, old); - if (list_empty(&flow->list)) - return -EINVAL; - if (!new) - new = &noop_qdisc; - *old = flow->q; - flow->q = new; - if (*old) - qdisc_reset(*old); - return 0; -} - -static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl) -{ - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow); - return flow ? flow->q : NULL; -} - -static unsigned long atm_tc_find(struct Qdisc *sch, u32 classid) -{ - struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid); - flow = lookup_flow(sch, classid); - pr_debug("%s: flow %p\n", __func__, flow); - return (unsigned long)flow; -} - -static unsigned long atm_tc_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) -{ - struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid); - flow = lookup_flow(sch, classid); - if (flow) - flow->ref++; - pr_debug("%s: flow %p\n", __func__, flow); - return (unsigned long)flow; -} - -/* - * atm_tc_put handles all destructions, including the ones that are explicitly - * requested (atm_tc_destroy, etc.). The assumption here is that we never drop - * anything that still seems to be in use. 
- */ -static void atm_tc_put(struct Qdisc *sch, unsigned long cl) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - if (--flow->ref) - return; - pr_debug("atm_tc_put: destroying\n"); - list_del_init(&flow->list); - pr_debug("atm_tc_put: qdisc %p\n", flow->q); - qdisc_put(flow->q); - tcf_block_put(flow->block); - if (flow->sock) { - pr_debug("atm_tc_put: f_count %ld\n", - file_count(flow->sock->file)); - flow->vcc->pop = flow->old_pop; - sockfd_put(flow->sock); - } - if (flow->excess) - atm_tc_put(sch, (unsigned long)flow->excess); - if (flow != &p->link) - kfree(flow); - /* - * If flow == &p->link, the qdisc no longer works at this point and - * needs to be removed. (By the caller of atm_tc_put.) - */ -} - -static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb) -{ - struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; - - pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p); - VCC2FLOW(vcc)->old_pop(vcc, skb); - tasklet_schedule(&p->task); -} - -static const u8 llc_oui_ip[] = { - 0xaa, /* DSAP: non-ISO */ - 0xaa, /* SSAP: non-ISO */ - 0x03, /* Ctrl: Unnumbered Information Command PDU */ - 0x00, /* OUI: EtherType */ - 0x00, 0x00, - 0x08, 0x00 -}; /* Ethertype IP (0800) */ - -static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = { - [TCA_ATM_FD] = { .type = NLA_U32 }, - [TCA_ATM_EXCESS] = { .type = NLA_U32 }, -}; - -static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, - struct nlattr **tca, unsigned long *arg, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)*arg; - struct atm_flow_data *excess = NULL; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_ATM_MAX + 1]; - struct socket *sock; - int fd, error, hdr_len; - void *hdr; - - pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," - "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt); - /* - * The concept of parents doesn't apply for this qdisc. - */ - if (parent && parent != TC_H_ROOT && parent != sch->handle) - return -EINVAL; - /* - * ATM classes cannot be changed. In order to change properties of the - * ATM connection, that socket needs to be modified directly (via the - * native ATM API. In order to send a flow to a different VC, the old - * class needs to be removed and a new one added. (This may be changed - * later.) 
- */ - if (flow) - return -EBUSY; - if (opt == NULL) - return -EINVAL; - - error = nla_parse_nested_deprecated(tb, TCA_ATM_MAX, opt, atm_policy, - NULL); - if (error < 0) - return error; - - if (!tb[TCA_ATM_FD]) - return -EINVAL; - fd = nla_get_u32(tb[TCA_ATM_FD]); - pr_debug("atm_tc_change: fd %d\n", fd); - if (tb[TCA_ATM_HDR]) { - hdr_len = nla_len(tb[TCA_ATM_HDR]); - hdr = nla_data(tb[TCA_ATM_HDR]); - } else { - hdr_len = RFC1483LLC_LEN; - hdr = NULL; /* default LLC/SNAP for IP */ - } - if (!tb[TCA_ATM_EXCESS]) - excess = NULL; - else { - excess = (struct atm_flow_data *) - atm_tc_find(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); - if (!excess) - return -ENOENT; - } - pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n", - opt->nla_type, nla_len(opt), hdr_len); - sock = sockfd_lookup(fd, &error); - if (!sock) - return error; /* f_count++ */ - pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file)); - if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { - error = -EPROTOTYPE; - goto err_out; - } - /* @@@ should check if the socket is really operational or we'll crash - on vcc->send */ - if (classid) { - if (TC_H_MAJ(classid ^ sch->handle)) { - pr_debug("atm_tc_change: classid mismatch\n"); - error = -EINVAL; - goto err_out; - } - } else { - int i; - unsigned long cl; - - for (i = 1; i < 0x8000; i++) { - classid = TC_H_MAKE(sch->handle, 0x8000 | i); - cl = atm_tc_find(sch, classid); - if (!cl) - break; - } - } - pr_debug("atm_tc_change: new id %x\n", classid); - flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL); - pr_debug("atm_tc_change: flow %p\n", flow); - if (!flow) { - error = -ENOBUFS; - goto err_out; - } - - error = tcf_block_get(&flow->block, &flow->filter_list, sch, - extack); - if (error) { - kfree(flow); - goto err_out; - } - - flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, - extack); - if (!flow->q) - flow->q = &noop_qdisc; - pr_debug("atm_tc_change: qdisc %p\n", flow->q); - flow->sock = sock; - flow->vcc = ATM_SD(sock); /* speedup */ - flow->vcc->user_back = flow; - pr_debug("atm_tc_change: vcc %p\n", flow->vcc); - flow->old_pop = flow->vcc->pop; - flow->parent = p; - flow->vcc->pop = sch_atm_pop; - flow->common.classid = classid; - flow->ref = 1; - flow->excess = excess; - list_add(&flow->list, &p->link.list); - flow->hdr_len = hdr_len; - if (hdr) - memcpy(flow->hdr, hdr, hdr_len); - else - memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip)); - *arg = (unsigned long)flow; - return 0; -err_out: - sockfd_put(sock); - return error; -} - -static int atm_tc_delete(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - if (list_empty(&flow->list)) - return -EINVAL; - if (rcu_access_pointer(flow->filter_list) || flow == &p->link) - return -EBUSY; - /* - * Reference count must be 2: one for "keepalive" (set at class - * creation), and one for the reference held when calling delete. - */ - if (flow->ref < 2) { - pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref); - return -EINVAL; - } - if (flow->ref > 2) - return -EBUSY; /* catch references via excess, etc. 
*/ - atm_tc_put(sch, arg); - return 0; -} - -static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); - if (walker->stop) - return; - list_for_each_entry(flow, &p->flows, list) { - if (!tc_qdisc_stats_dump(sch, (unsigned long)flow, walker)) - break; - } -} - -static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - return flow ? flow->block : p->link.block; -} - -/* --------------------------- Qdisc operations ---------------------------- */ - -static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - struct tcf_result res; - int result; - int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - - pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); - result = TC_ACT_OK; /* be nice to gcc */ - flow = NULL; - if (TC_H_MAJ(skb->priority) != sch->handle || - !(flow = (struct atm_flow_data *)atm_tc_find(sch, skb->priority))) { - struct tcf_proto *fl; - - list_for_each_entry(flow, &p->flows, list) { - fl = rcu_dereference_bh(flow->filter_list); - if (fl) { - result = tcf_classify(skb, NULL, fl, &res, true); - if (result < 0) - continue; - flow = (struct atm_flow_data *)res.class; - if (!flow) - flow = lookup_flow(sch, res.classid); - goto done; - } - } - flow = NULL; -done: - ; - } - if (!flow) { - flow = &p->link; - } else { - if (flow->vcc) - ATM_SKB(skb)->atm_options = flow->vcc->atm_options; - /*@@@ looks good ... but it's not supposed to work :-) */ -#ifdef CONFIG_NET_CLS_ACT - switch (result) { - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - __qdisc_drop(skb, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - case TC_ACT_SHOT: - __qdisc_drop(skb, to_free); - goto drop; - case TC_ACT_RECLASSIFY: - if (flow->excess) - flow = flow->excess; - else - ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP; - break; - } -#endif - } - - ret = qdisc_enqueue(skb, flow->q, to_free); - if (ret != NET_XMIT_SUCCESS) { -drop: __maybe_unused - if (net_xmit_drop_count(ret)) { - qdisc_qstats_drop(sch); - if (flow) - flow->qstats.drops++; - } - return ret; - } - /* - * Okay, this may seem weird. We pretend we've dropped the packet if - * it goes via ATM. The reason for this is that the outer qdisc - * expects to be able to q->dequeue the packet later on if we return - * success at this place. Also, sch->q.qdisc needs to reflect whether - * there is a packet egligible for dequeuing or not. Note that the - * statistics of the outer qdisc are necessarily wrong because of all - * this. There's currently no correct solution for this. - */ - if (flow == &p->link) { - sch->q.qlen++; - return NET_XMIT_SUCCESS; - } - tasklet_schedule(&p->task); - return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; -} - -/* - * Dequeue packets and send them over ATM. Note that we quite deliberately - * avoid checking net_device's flow control here, simply because sch_atm - * uses its own channels, which have nothing to do with any CLIP/LANE/or - * non-ATM interfaces. 
- */ - -static void sch_atm_dequeue(struct tasklet_struct *t) -{ - struct atm_qdisc_data *p = from_tasklet(p, t, task); - struct Qdisc *sch = qdisc_from_priv(p); - struct atm_flow_data *flow; - struct sk_buff *skb; - - pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - if (flow == &p->link) - continue; - /* - * If traffic is properly shaped, this won't generate nasty - * little bursts. Otherwise, it may ... (but that's okay) - */ - while ((skb = flow->q->ops->peek(flow->q))) { - if (!atm_may_send(flow->vcc, skb->truesize)) - break; - - skb = qdisc_dequeue_peeked(flow->q); - if (unlikely(!skb)) - break; - - qdisc_bstats_update(sch, skb); - bstats_update(&flow->bstats, skb); - pr_debug("atm_tc_dequeue: sending on class %p\n", flow); - /* remove any LL header somebody else has attached */ - skb_pull(skb, skb_network_offset(skb)); - if (skb_headroom(skb) < flow->hdr_len) { - struct sk_buff *new; - - new = skb_realloc_headroom(skb, flow->hdr_len); - dev_kfree_skb(skb); - if (!new) - continue; - skb = new; - } - pr_debug("sch_atm_dequeue: ip %p, data %p\n", - skb_network_header(skb), skb->data); - ATM_SKB(skb)->vcc = flow->vcc; - memcpy(skb_push(skb, flow->hdr_len), flow->hdr, - flow->hdr_len); - refcount_add(skb->truesize, - &sk_atm(flow->vcc)->sk_wmem_alloc); - /* atm.atm_options are already set by atm_tc_enqueue */ - flow->vcc->send(flow->vcc, skb); - } - } -} - -static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct sk_buff *skb; - - pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p); - tasklet_schedule(&p->task); - skb = qdisc_dequeue_peeked(p->link.q); - if (skb) - sch->q.qlen--; - return skb; -} - -static struct sk_buff *atm_tc_peek(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - - pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p); - - return p->link.q->ops->peek(p->link.q); -} - -static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - int err; - - pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); - INIT_LIST_HEAD(&p->flows); - INIT_LIST_HEAD(&p->link.list); - gnet_stats_basic_sync_init(&p->link.bstats); - list_add(&p->link.list, &p->flows); - p->link.q = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, sch->handle, extack); - if (!p->link.q) - p->link.q = &noop_qdisc; - pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - p->link.vcc = NULL; - p->link.sock = NULL; - p->link.common.classid = sch->handle; - p->link.ref = 1; - - err = tcf_block_get(&p->link.block, &p->link.filter_list, sch, - extack); - if (err) - return err; - - tasklet_setup(&p->task, sch_atm_dequeue); - return 0; -} - -static void atm_tc_reset(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) - qdisc_reset(flow->q); -} - -static void atm_tc_destroy(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow, *tmp; - - pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - tcf_block_put(flow->block); - flow->block = NULL; - } - - list_for_each_entry_safe(flow, tmp, &p->flows, list) { - if (flow->ref > 1) - pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref); - atm_tc_put(sch, (unsigned long)flow); - } - 
tasklet_kill(&p->task); -} - -static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - struct nlattr *nest; - - pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", - sch, p, flow, skb, tcm); - if (list_empty(&flow->list)) - return -EINVAL; - tcm->tcm_handle = flow->common.classid; - tcm->tcm_info = flow->q->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr)) - goto nla_put_failure; - if (flow->vcc) { - struct sockaddr_atmpvc pvc; - int state; - - memset(&pvc, 0, sizeof(pvc)); - pvc.sap_family = AF_ATMPVC; - pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; - pvc.sap_addr.vpi = flow->vcc->vpi; - pvc.sap_addr.vci = flow->vcc->vci; - if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc)) - goto nla_put_failure; - state = ATM_VF2VS(flow->vcc->flags); - if (nla_put_u32(skb, TCA_ATM_STATE, state)) - goto nla_put_failure; - } - if (flow->excess) { - if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->common.classid)) - goto nla_put_failure; - } else { - if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) - goto nla_put_failure; - } - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} -static int -atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) -{ - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 || - gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) - return -1; - - return 0; -} - -static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - return 0; -} - -static const struct Qdisc_class_ops atm_class_ops = { - .graft = atm_tc_graft, - .leaf = atm_tc_leaf, - .find = atm_tc_find, - .change = atm_tc_change, - .delete = atm_tc_delete, - .walk = atm_tc_walk, - .tcf_block = atm_tc_tcf_block, - .bind_tcf = atm_tc_bind_filter, - .unbind_tcf = atm_tc_put, - .dump = atm_tc_dump_class, - .dump_stats = atm_tc_dump_class_stats, -}; - -static struct Qdisc_ops atm_qdisc_ops __read_mostly = { - .cl_ops = &atm_class_ops, - .id = "atm", - .priv_size = sizeof(struct atm_qdisc_data), - .enqueue = atm_tc_enqueue, - .dequeue = atm_tc_dequeue, - .peek = atm_tc_peek, - .init = atm_tc_init, - .reset = atm_tc_reset, - .destroy = atm_tc_destroy, - .dump = atm_tc_dump, - .owner = THIS_MODULE, -}; - -static int __init atm_init(void) -{ - return register_qdisc(&atm_qdisc_ops); -} - -static void __exit atm_exit(void) -{ - unregister_qdisc(&atm_qdisc_ops); -} - -module_init(atm_init) -module_exit(atm_exit) -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 3ed0c3342189..7970217b565a 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1209,7 +1209,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, iph_check->daddr != iph->daddr) continue; - seglen = ntohs(iph_check->tot_len) - + seglen = iph_totlen(skb, iph_check) - (4 * iph_check->ihl); } else if (iph_check->version == 6) { ipv6h = (struct ipv6hdr *)iph; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c deleted file mode 100644 index 6568e17c4c63..000000000000 --- a/net/sched/sch_cbq.c +++ /dev/null @@ -1,1727 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/sch_cbq.c Class-Based Queueing discipline. 
- * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <net/netlink.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> - - -/* Class-Based Queueing (CBQ) algorithm. - ======================================= - - Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource - Management Models for Packet Networks", - IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 - - [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 - - [3] Sally Floyd, "Notes on Class-Based Queueing: Setting - Parameters", 1996 - - [4] Sally Floyd and Michael Speer, "Experimental Results - for Class-Based Queueing", 1998, not published. - - ----------------------------------------------------------------------- - - Algorithm skeleton was taken from NS simulator cbq.cc. - If someone wants to check this code against the LBL version, - he should take into account that ONLY the skeleton was borrowed, - the implementation is different. Particularly: - - --- The WRR algorithm is different. Our version looks more - reasonable (I hope) and works when quanta are allowed to be - less than MTU, which is always the case when real time classes - have small rates. Note, that the statement of [3] is - incomplete, delay may actually be estimated even if class - per-round allotment is less than MTU. Namely, if per-round - allotment is W*r_i, and r_1+...+r_k = r < 1 - - delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B - - In the worst case we have IntServ estimate with D = W*r+k*MTU - and C = MTU*r. The proof (if correct at all) is trivial. - - - --- It seems that cbq-2.0 is not very accurate. At least, I cannot - interpret some places, which look like wrong translations - from NS. Anyone is advised to find these differences - and explain to me, why I am wrong 8). - - --- Linux has no EOI event, so that we cannot estimate true class - idle time. Workaround is to consider the next dequeue event - as sign that previous packet is finished. This is wrong because of - internal device queueing, but on a permanently loaded link it is true. - Moreover, combined with clock integrator, this scheme looks - very close to an ideal solution. */ - -struct cbq_sched_data; - - -struct cbq_class { - struct Qdisc_class_common common; - struct cbq_class *next_alive; /* next class with backlog in this priority band */ - -/* Parameters */ - unsigned char priority; /* class priority */ - unsigned char priority2; /* priority to be used after overlimit */ - unsigned char ewma_log; /* time constant for idle time calculation */ - - u32 defmap; - - /* Link-sharing scheduler parameters */ - long maxidle; /* Class parameters: see below. 
*/ - long offtime; - long minidle; - u32 avpkt; - struct qdisc_rate_table *R_tab; - - /* General scheduler (WRR) parameters */ - long allot; - long quantum; /* Allotment per WRR round */ - long weight; /* Relative allotment: see below */ - - struct Qdisc *qdisc; /* Ptr to CBQ discipline */ - struct cbq_class *split; /* Ptr to split node */ - struct cbq_class *share; /* Ptr to LS parent in the class tree */ - struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ - struct cbq_class *borrow; /* NULL if class is bandwidth limited; - parent otherwise */ - struct cbq_class *sibling; /* Sibling chain */ - struct cbq_class *children; /* Pointer to children chain */ - - struct Qdisc *q; /* Elementary queueing discipline */ - - -/* Variables */ - unsigned char cpriority; /* Effective priority */ - unsigned char delayed; - unsigned char level; /* level of the class in hierarchy: - 0 for leaf classes, and maximal - level of children + 1 for nodes. - */ - - psched_time_t last; /* Last end of service */ - psched_time_t undertime; - long avgidle; - long deficit; /* Saved deficit for WRR */ - psched_time_t penalized; - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct net_rate_estimator __rcu *rate_est; - struct tc_cbq_xstats xstats; - - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - - int filters; - - struct cbq_class *defaults[TC_PRIO_MAX + 1]; -}; - -struct cbq_sched_data { - struct Qdisc_class_hash clhash; /* Hash table of all classes */ - int nclasses[TC_CBQ_MAXPRIO + 1]; - unsigned int quanta[TC_CBQ_MAXPRIO + 1]; - - struct cbq_class link; - - unsigned int activemask; - struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes - with backlog */ - -#ifdef CONFIG_NET_CLS_ACT - struct cbq_class *rx_class; -#endif - struct cbq_class *tx_class; - struct cbq_class *tx_borrowed; - int tx_len; - psched_time_t now; /* Cached timestamp */ - unsigned int pmask; - - struct qdisc_watchdog watchdog; /* Watchdog timer, - started when CBQ has - backlog, but cannot - transmit just now */ - psched_tdiff_t wd_expires; - int toplevel; - u32 hgenerator; -}; - - -#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len) - -static inline struct cbq_class * -cbq_class_lookup(struct cbq_sched_data *q, u32 classid) -{ - struct Qdisc_class_common *clc; - - clc = qdisc_class_find(&q->clhash, classid); - if (clc == NULL) - return NULL; - return container_of(clc, struct cbq_class, common); -} - -#ifdef CONFIG_NET_CLS_ACT - -static struct cbq_class * -cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) -{ - struct cbq_class *cl; - - for (cl = this->tparent; cl; cl = cl->tparent) { - struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT]; - - if (new != NULL && new != this) - return new; - } - return NULL; -} - -#endif - -/* Classify packet. The procedure is pretty complicated, but - * it allows us to combine link sharing and priority scheduling - * transparently. - * - * Namely, you can put link sharing rules (f.e. route based) at root of CBQ, - * so that it resolves to split nodes. Then packets are classified - * by logical priority, or a more specific classifier may be attached - * to the split node. - */ - -static struct cbq_class * -cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *head = &q->link; - struct cbq_class **defmap; - struct cbq_class *cl = NULL; - u32 prio = skb->priority; - struct tcf_proto *fl; - struct tcf_result res; - - /* - * Step 1. 
If skb->priority points to one of our classes, use it. - */ - if (TC_H_MAJ(prio ^ sch->handle) == 0 && - (cl = cbq_class_lookup(q, prio)) != NULL) - return cl; - - *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - for (;;) { - int result = 0; - defmap = head->defaults; - - fl = rcu_dereference_bh(head->filter_list); - /* - * Step 2+n. Apply classifier. - */ - result = tcf_classify(skb, NULL, fl, &res, true); - if (!fl || result < 0) - goto fallback; - - cl = (void *)res.class; - if (!cl) { - if (TC_H_MAJ(res.classid)) - cl = cbq_class_lookup(q, res.classid); - else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) - cl = defmap[TC_PRIO_BESTEFFORT]; - - if (cl == NULL) - goto fallback; - } - if (cl->level >= head->level) - goto fallback; -#ifdef CONFIG_NET_CLS_ACT - switch (result) { - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - fallthrough; - case TC_ACT_SHOT: - return NULL; - case TC_ACT_RECLASSIFY: - return cbq_reclassify(skb, cl); - } -#endif - if (cl->level == 0) - return cl; - - /* - * Step 3+n. If classifier selected a link sharing class, - * apply agency specific classifier. - * Repeat this procedure until we hit a leaf node. - */ - head = cl; - } - -fallback: - cl = head; - - /* - * Step 4. No success... - */ - if (TC_H_MAJ(prio) == 0 && - !(cl = head->defaults[prio & TC_PRIO_MAX]) && - !(cl = head->defaults[TC_PRIO_BESTEFFORT])) - return head; - - return cl; -} - -/* - * A packet has just been enqueued on the empty class. - * cbq_activate_class adds it to the tail of active class list - * of its priority band. - */ - -static inline void cbq_activate_class(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - int prio = cl->cpriority; - struct cbq_class *cl_tail; - - cl_tail = q->active[prio]; - q->active[prio] = cl; - - if (cl_tail != NULL) { - cl->next_alive = cl_tail->next_alive; - cl_tail->next_alive = cl; - } else { - cl->next_alive = cl; - q->activemask |= (1<<prio); - } -} - -/* - * Unlink class from active chain. - * Note that this same procedure is done directly in cbq_dequeue* - * during round-robin procedure. 
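The cbq_activate_class()/cbq_deactivate_class() pair above maintains, per priority band, a circular singly linked list of backlogged classes in which q->active[prio] always points at the tail, so linking the new class right after the tail effectively queues it at the head of the next round. A minimal user-space sketch of that tail-pointer ring, with invented names rather than the kernel structures:

    #include <stdio.h>

    struct node {                          /* stand-in for struct cbq_class */
        int id;
        struct node *next;                 /* stand-in for next_alive */
    };

    /* Mirror cbq_activate_class(): *tailp is the tail of a circular list
     * (NULL when empty); the new node becomes the tail and is linked right
     * after the old tail, i.e. just ahead of the old head.
     */
    static void activate(struct node **tailp, struct node *n)
    {
        struct node *tail = *tailp;

        *tailp = n;
        if (tail) {
            n->next = tail->next;          /* old head */
            tail->next = n;
        } else {
            n->next = n;                   /* one-element ring */
        }
    }

    int main(void)
    {
        struct node a = { 1, 0 }, b = { 2, 0 }, c = { 3, 0 };
        struct node *tail = NULL, *p;

        activate(&tail, &a);
        activate(&tail, &b);
        activate(&tail, &c);

        p = tail->next;                    /* head of the ring */
        do {
            printf("%d ", p->id);          /* prints: 1 2 3 */
            p = p->next;
        } while (p != tail->next);
        printf("\n");
        return 0;
    }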
- */ - -static void cbq_deactivate_class(struct cbq_class *this) -{ - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - int prio = this->cpriority; - struct cbq_class *cl; - struct cbq_class *cl_prev = q->active[prio]; - - do { - cl = cl_prev->next_alive; - if (cl == this) { - cl_prev->next_alive = cl->next_alive; - cl->next_alive = NULL; - - if (cl == q->active[prio]) { - q->active[prio] = cl_prev; - if (cl == q->active[prio]) { - q->active[prio] = NULL; - q->activemask &= ~(1<<prio); - return; - } - } - return; - } - } while ((cl_prev = cl) != q->active[prio]); -} - -static void -cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) -{ - int toplevel = q->toplevel; - - if (toplevel > cl->level) { - psched_time_t now = psched_get_time(); - - do { - if (cl->undertime < now) { - q->toplevel = cl->level; - return; - } - } while ((cl = cl->borrow) != NULL && toplevel > cl->level); - } -} - -static int -cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - int ret; - struct cbq_class *cl = cbq_classify(skb, sch, &ret); - -#ifdef CONFIG_NET_CLS_ACT - q->rx_class = cl; -#endif - if (cl == NULL) { - if (ret & __NET_XMIT_BYPASS) - qdisc_qstats_drop(sch); - __qdisc_drop(skb, to_free); - return ret; - } - - ret = qdisc_enqueue(skb, cl->q, to_free); - if (ret == NET_XMIT_SUCCESS) { - sch->q.qlen++; - cbq_mark_toplevel(q, cl); - if (!cl->next_alive) - cbq_activate_class(cl); - return ret; - } - - if (net_xmit_drop_count(ret)) { - qdisc_qstats_drop(sch); - cbq_mark_toplevel(q, cl); - cl->qstats.drops++; - } - return ret; -} - -/* Overlimit action: penalize leaf class by adding offtime */ -static void cbq_overlimit(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = cl->undertime - q->now; - - if (!cl->delayed) { - delay += cl->offtime; - - /* - * Class goes to sleep, so that it will have no - * chance to work avgidle. Let's forgive it 8) - * - * BTW cbq-2.0 has a crap in this - * place, apparently they forgot to shift it by cl->ewma_log. - */ - if (cl->avgidle < 0) - delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); - if (cl->avgidle < cl->minidle) - cl->avgidle = cl->minidle; - if (delay <= 0) - delay = 1; - cl->undertime = q->now + delay; - - cl->xstats.overactions++; - cl->delayed = 1; - } - if (q->wd_expires == 0 || q->wd_expires > delay) - q->wd_expires = delay; - - /* Dirty work! We must schedule wakeups based on - * real available rate, rather than leaf rate, - * which may be tiny (even zero). - */ - if (q->toplevel == TC_CBQ_MAXLEVEL) { - struct cbq_class *b; - psched_tdiff_t base_delay = q->wd_expires; - - for (b = cl->borrow; b; b = b->borrow) { - delay = b->undertime - q->now; - if (delay < base_delay) { - if (delay <= 0) - delay = 1; - base_delay = delay; - } - } - - q->wd_expires = base_delay; - } -} - -/* - * It is mission critical procedure. - * - * We "regenerate" toplevel cutoff, if transmitting class - * has backlog and it is not regulated. It is not part of - * original CBQ description, but looks more reasonable. - * Probably, it is wrong. This question needs further investigation. 
- */ - -static inline void -cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, - struct cbq_class *borrowed) -{ - if (cl && q->toplevel >= borrowed->level) { - if (cl->q->q.qlen > 1) { - do { - if (borrowed->undertime == PSCHED_PASTPERFECT) { - q->toplevel = borrowed->level; - return; - } - } while ((borrowed = borrowed->borrow) != NULL); - } -#if 0 - /* It is not necessary now. Uncommenting it - will save CPU cycles, but decrease fairness. - */ - q->toplevel = TC_CBQ_MAXLEVEL; -#endif - } -} - -static void -cbq_update(struct cbq_sched_data *q) -{ - struct cbq_class *this = q->tx_class; - struct cbq_class *cl = this; - int len = q->tx_len; - psched_time_t now; - - q->tx_class = NULL; - /* Time integrator. We calculate EOS time - * by adding expected packet transmission time. - */ - now = q->now + L2T(&q->link, len); - - for ( ; cl; cl = cl->share) { - long avgidle = cl->avgidle; - long idle; - - _bstats_update(&cl->bstats, len, 1); - - /* - * (now - last) is total time between packet right edges. - * (last_pktlen/rate) is "virtual" busy time, so that - * - * idle = (now - last) - last_pktlen/rate - */ - - idle = now - cl->last; - if ((unsigned long)idle > 128*1024*1024) { - avgidle = cl->maxidle; - } else { - idle -= L2T(cl, len); - - /* true_avgidle := (1-W)*true_avgidle + W*idle, - * where W=2^{-ewma_log}. But cl->avgidle is scaled: - * cl->avgidle == true_avgidle/W, - * hence: - */ - avgidle += idle - (avgidle>>cl->ewma_log); - } - - if (avgidle <= 0) { - /* Overlimit or at-limit */ - - if (avgidle < cl->minidle) - avgidle = cl->minidle; - - cl->avgidle = avgidle; - - /* Calculate expected time, when this class - * will be allowed to send. - * It will occur, when: - * (1-W)*true_avgidle + W*delay = 0, i.e. - * idle = (1/W - 1)*(-true_avgidle) - * or - * idle = (1 - W)*(-cl->avgidle); - */ - idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); - - /* - * That is not all. - * To maintain the rate allocated to the class, - * we add to undertime virtual clock, - * necessary to complete transmitted packet. - * (len/phys_bandwidth has been already passed - * to the moment of cbq_update) - */ - - idle -= L2T(&q->link, len); - idle += L2T(cl, len); - - cl->undertime = now + idle; - } else { - /* Underlimit */ - - cl->undertime = PSCHED_PASTPERFECT; - if (avgidle > cl->maxidle) - cl->avgidle = cl->maxidle; - else - cl->avgidle = avgidle; - } - if ((s64)(now - cl->last) > 0) - cl->last = now; - } - - cbq_update_toplevel(q, this, q->tx_borrowed); -} - -static inline struct cbq_class * -cbq_under_limit(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - struct cbq_class *this_cl = cl; - - if (cl->tparent == NULL) - return cl; - - if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { - cl->delayed = 0; - return cl; - } - - do { - /* It is very suspicious place. Now overlimit - * action is generated for not bounded classes - * only if link is completely congested. - * Though it is in agree with ancestor-only paradigm, - * it looks very stupid. Particularly, - * it means that this chunk of code will either - * never be called or result in strong amplification - * of burstiness. Dangerous, silly, and, however, - * no another solution exists. 
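The scaled integrator in cbq_update() above is the step that is easiest to misread: cl->avgidle holds true_avgidle/W with W = 2^(-ewma_log), so the fixed-point update avgidle += idle - (avgidle >> ewma_log) is the same recurrence as true_avgidle := (1-W)*true_avgidle + W*idle, up to the truncation of the right shift. A stand-alone check of that equivalence (plain user-space C with made-up idle samples, not kernel code):

    #include <stdio.h>

    int main(void)
    {
        const int ewma_log = 5;                 /* W = 2^-5 = 1/32 */
        const double W = 1.0 / (1 << ewma_log);
        long scaled = 0;                        /* plays the role of cl->avgidle */
        double true_avg = 0.0;                  /* the comment's true_avgidle */
        long idle[] = { 400, -200, 800, 100, -50 };
        int i;

        for (i = 0; i < 5; i++) {
            /* fixed-point form used by cbq_update() */
            scaled += idle[i] - (scaled >> ewma_log);
            /* reference form from the comment */
            true_avg = (1.0 - W) * true_avg + W * idle[i];

            /* scaled * W tracks true_avg up to shift truncation */
            printf("scaled=%ld scaled*W=%.2f true=%.2f\n",
                   scaled, scaled * W, true_avg);
        }
        return 0;
    }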
- */ - cl = cl->borrow; - if (!cl) { - this_cl->qstats.overlimits++; - cbq_overlimit(this_cl); - return NULL; - } - if (cl->level > q->toplevel) - return NULL; - } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); - - cl->delayed = 0; - return cl; -} - -static inline struct sk_buff * -cbq_dequeue_prio(struct Qdisc *sch, int prio) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl_tail, *cl_prev, *cl; - struct sk_buff *skb; - int deficit; - - cl_tail = cl_prev = q->active[prio]; - cl = cl_prev->next_alive; - - do { - deficit = 0; - - /* Start round */ - do { - struct cbq_class *borrow = cl; - - if (cl->q->q.qlen && - (borrow = cbq_under_limit(cl)) == NULL) - goto skip_class; - - if (cl->deficit <= 0) { - /* Class exhausted its allotment per - * this round. Switch to the next one. - */ - deficit = 1; - cl->deficit += cl->quantum; - goto next_class; - } - - skb = cl->q->dequeue(cl->q); - - /* Class did not give us any skb :-( - * It could occur even if cl->q->q.qlen != 0 - * f.e. if cl->q == "tbf" - */ - if (skb == NULL) - goto skip_class; - - cl->deficit -= qdisc_pkt_len(skb); - q->tx_class = cl; - q->tx_borrowed = borrow; - if (borrow != cl) { -#ifndef CBQ_XSTATS_BORROWS_BYTES - borrow->xstats.borrows++; - cl->xstats.borrows++; -#else - borrow->xstats.borrows += qdisc_pkt_len(skb); - cl->xstats.borrows += qdisc_pkt_len(skb); -#endif - } - q->tx_len = qdisc_pkt_len(skb); - - if (cl->deficit <= 0) { - q->active[prio] = cl; - cl = cl->next_alive; - cl->deficit += cl->quantum; - } - return skb; - -skip_class: - if (cl->q->q.qlen == 0 || prio != cl->cpriority) { - /* Class is empty or penalized. - * Unlink it from active chain. - */ - cl_prev->next_alive = cl->next_alive; - cl->next_alive = NULL; - - /* Did cl_tail point to it? */ - if (cl == cl_tail) { - /* Repair it! */ - cl_tail = cl_prev; - - /* Was it the last class in this band? */ - if (cl == cl_tail) { - /* Kill the band! */ - q->active[prio] = NULL; - q->activemask &= ~(1<<prio); - if (cl->q->q.qlen) - cbq_activate_class(cl); - return NULL; - } - - q->active[prio] = cl_tail; - } - if (cl->q->q.qlen) - cbq_activate_class(cl); - - cl = cl_prev; - } - -next_class: - cl_prev = cl; - cl = cl->next_alive; - } while (cl_prev != cl_tail); - } while (deficit); - - q->active[prio] = cl_prev; - - return NULL; -} - -static inline struct sk_buff * -cbq_dequeue_1(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct sk_buff *skb; - unsigned int activemask; - - activemask = q->activemask & 0xFF; - while (activemask) { - int prio = ffz(~activemask); - activemask &= ~(1<<prio); - skb = cbq_dequeue_prio(sch, prio); - if (skb) - return skb; - } - return NULL; -} - -static struct sk_buff * -cbq_dequeue(struct Qdisc *sch) -{ - struct sk_buff *skb; - struct cbq_sched_data *q = qdisc_priv(sch); - psched_time_t now; - - now = psched_get_time(); - - if (q->tx_class) - cbq_update(q); - - q->now = now; - - for (;;) { - q->wd_expires = 0; - - skb = cbq_dequeue_1(sch); - if (skb) { - qdisc_bstats_update(sch, skb); - sch->q.qlen--; - return skb; - } - - /* All the classes are overlimit. - * - * It is possible, if: - * - * 1. Scheduler is empty. - * 2. Toplevel cutoff inhibited borrowing. - * 3. Root class is overlimit. - * - * Reset 2d and 3d conditions and retry. - * - * Note, that NS and cbq-2.0 are buggy, peeking - * an arbitrary class is appropriate for ancestor-only - * sharing, but not for toplevel algorithm. 
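The dequeue loop in cbq_dequeue_prio() above is a weighted round robin driven by a per-class deficit: a class keeps sending while cl->deficit is positive and gets cl->quantum added back when it runs out. The same control flow in isolation, stripped of borrowing, activation, and the penalty handling (a sketch with invented packet sizes, not the kernel code):

    #include <stdio.h>

    #define NCLASSES 2

    int main(void)
    {
        /* quantum[] plays the role of cl->quantum, deficit[] of cl->deficit */
        long quantum[NCLASSES] = { 1500, 500 };
        long deficit[NCLASSES] = { 1500, 500 };
        /* per-class backlog of fixed-size packets, invented for the demo */
        int backlog[NCLASSES] = { 3, 3 };
        int pktlen[NCLASSES]  = { 1000, 400 };
        int cl = 0, sent = 0;

        while (backlog[0] || backlog[1]) {
            if (!backlog[cl] || deficit[cl] <= 0) {
                /* empty or out of allotment: recharge and try the next class */
                if (deficit[cl] <= 0)
                    deficit[cl] += quantum[cl];
                cl = (cl + 1) % NCLASSES;
                continue;
            }
            deficit[cl] -= pktlen[cl];
            backlog[cl]--;
            printf("send %d: class %d, %d bytes, deficit now %ld\n",
                   ++sent, cl, pktlen[cl], deficit[cl]);
        }
        return 0;
    }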
- * - * Our version is better, but slower, because it requires - * two passes, but it is unavoidable with top-level sharing. - */ - - if (q->toplevel == TC_CBQ_MAXLEVEL && - q->link.undertime == PSCHED_PASTPERFECT) - break; - - q->toplevel = TC_CBQ_MAXLEVEL; - q->link.undertime = PSCHED_PASTPERFECT; - } - - /* No packets in scheduler or nobody wants to give them to us :-( - * Sigh... start watchdog timer in the last case. - */ - - if (sch->q.qlen) { - qdisc_qstats_overlimit(sch); - if (q->wd_expires) - qdisc_watchdog_schedule(&q->watchdog, - now + q->wd_expires); - } - return NULL; -} - -/* CBQ class maintenance routines */ - -static void cbq_adjust_levels(struct cbq_class *this) -{ - if (this == NULL) - return; - - do { - int level = 0; - struct cbq_class *cl; - - cl = this->children; - if (cl) { - do { - if (cl->level > level) - level = cl->level; - } while ((cl = cl->sibling) != this->children); - } - this->level = level + 1; - } while ((this = this->tparent) != NULL); -} - -static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) -{ - struct cbq_class *cl; - unsigned int h; - - if (q->quanta[prio] == 0) - return; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - /* BUGGGG... Beware! This expression suffer of - * arithmetic overflows! - */ - if (cl->priority == prio) { - cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ - q->quanta[prio]; - } - if (cl->quantum <= 0 || - cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { - pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", - cl->common.classid, cl->quantum); - cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; - } - } - } -} - -static void cbq_sync_defmap(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - struct cbq_class *split = cl->split; - unsigned int h; - int i; - - if (split == NULL) - return; - - for (i = 0; i <= TC_PRIO_MAX; i++) { - if (split->defaults[i] == cl && !(cl->defmap & (1<<i))) - split->defaults[i] = NULL; - } - - for (i = 0; i <= TC_PRIO_MAX; i++) { - int level = split->level; - - if (split->defaults[i]) - continue; - - for (h = 0; h < q->clhash.hashsize; h++) { - struct cbq_class *c; - - hlist_for_each_entry(c, &q->clhash.hash[h], - common.hnode) { - if (c->split == split && c->level < level && - c->defmap & (1<<i)) { - split->defaults[i] = c; - level = c->level; - } - } - } - } -} - -static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) -{ - struct cbq_class *split = NULL; - - if (splitid == 0) { - split = cl->split; - if (!split) - return; - splitid = split->common.classid; - } - - if (split == NULL || split->common.classid != splitid) { - for (split = cl->tparent; split; split = split->tparent) - if (split->common.classid == splitid) - break; - } - - if (split == NULL) - return; - - if (cl->split != split) { - cl->defmap = 0; - cbq_sync_defmap(cl); - cl->split = split; - cl->defmap = def & mask; - } else - cl->defmap = (cl->defmap & ~mask) | (def & mask); - - cbq_sync_defmap(cl); -} - -static void cbq_unlink_class(struct cbq_class *this) -{ - struct cbq_class *cl, **clp; - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - - qdisc_class_hash_remove(&q->clhash, &this->common); - - if (this->tparent) { - clp = &this->sibling; - cl = *clp; - do { - if (cl == this) { - *clp = cl->sibling; - break; - } - clp = &cl->sibling; - } while ((cl = *clp) != this->sibling); - - if (this->tparent->children == this) { - this->tparent->children = this->sibling; - if (this->sibling == this) 
- this->tparent->children = NULL; - } - } else { - WARN_ON(this->sibling != this); - } -} - -static void cbq_link_class(struct cbq_class *this) -{ - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - struct cbq_class *parent = this->tparent; - - this->sibling = this; - qdisc_class_hash_insert(&q->clhash, &this->common); - - if (parent == NULL) - return; - - if (parent->children == NULL) { - parent->children = this; - } else { - this->sibling = parent->children->sibling; - parent->children->sibling = this; - } -} - -static void -cbq_reset(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl; - int prio; - unsigned int h; - - q->activemask = 0; - q->pmask = 0; - q->tx_class = NULL; - q->tx_borrowed = NULL; - qdisc_watchdog_cancel(&q->watchdog); - q->toplevel = TC_CBQ_MAXLEVEL; - q->now = psched_get_time(); - - for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) - q->active[prio] = NULL; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - qdisc_reset(cl->q); - - cl->next_alive = NULL; - cl->undertime = PSCHED_PASTPERFECT; - cl->avgidle = cl->maxidle; - cl->deficit = cl->quantum; - cl->cpriority = cl->priority; - } - } -} - - -static void cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) -{ - if (lss->change & TCF_CBQ_LSS_FLAGS) { - cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; - cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; - } - if (lss->change & TCF_CBQ_LSS_EWMA) - cl->ewma_log = lss->ewma_log; - if (lss->change & TCF_CBQ_LSS_AVPKT) - cl->avpkt = lss->avpkt; - if (lss->change & TCF_CBQ_LSS_MINIDLE) - cl->minidle = -(long)lss->minidle; - if (lss->change & TCF_CBQ_LSS_MAXIDLE) { - cl->maxidle = lss->maxidle; - cl->avgidle = lss->maxidle; - } - if (lss->change & TCF_CBQ_LSS_OFFTIME) - cl->offtime = lss->offtime; -} - -static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) -{ - q->nclasses[cl->priority]--; - q->quanta[cl->priority] -= cl->weight; - cbq_normalize_quanta(q, cl->priority); -} - -static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) -{ - q->nclasses[cl->priority]++; - q->quanta[cl->priority] += cl->weight; - cbq_normalize_quanta(q, cl->priority); -} - -static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - - if (wrr->allot) - cl->allot = wrr->allot; - if (wrr->weight) - cl->weight = wrr->weight; - if (wrr->priority) { - cl->priority = wrr->priority - 1; - cl->cpriority = cl->priority; - if (cl->priority >= cl->priority2) - cl->priority2 = TC_CBQ_MAXPRIO - 1; - } - - cbq_addprio(q, cl); - return 0; -} - -static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) -{ - cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); - return 0; -} - -static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { - [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, - [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, - [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, - [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, - [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, - [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, - [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, -}; - -static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], - struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - int err; - - if (!opt) { - NL_SET_ERR_MSG(extack, 
"CBQ options are required for this operation"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, - cbq_policy, extack); - if (err < 0) - return err; - - if (tb[TCA_CBQ_WRROPT]) { - const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]); - - if (wrr->priority > TC_CBQ_MAXPRIO) { - NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO"); - err = -EINVAL; - } - } - return err; -} - -static int cbq_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct nlattr *tb[TCA_CBQ_MAX + 1]; - struct tc_ratespec *r; - int err; - - qdisc_watchdog_init(&q->watchdog, sch); - - err = cbq_opt_parse(tb, opt, extack); - if (err < 0) - return err; - - if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) { - NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete"); - return -EINVAL; - } - - r = nla_data(tb[TCA_CBQ_RATE]); - - q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack); - if (!q->link.R_tab) - return -EINVAL; - - err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack); - if (err) - goto put_rtab; - - err = qdisc_class_hash_init(&q->clhash); - if (err < 0) - goto put_block; - - q->link.sibling = &q->link; - q->link.common.classid = sch->handle; - q->link.qdisc = sch; - q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle, NULL); - if (!q->link.q) - q->link.q = &noop_qdisc; - else - qdisc_hash_add(q->link.q, true); - - q->link.priority = TC_CBQ_MAXPRIO - 1; - q->link.priority2 = TC_CBQ_MAXPRIO - 1; - q->link.cpriority = TC_CBQ_MAXPRIO - 1; - q->link.allot = psched_mtu(qdisc_dev(sch)); - q->link.quantum = q->link.allot; - q->link.weight = q->link.R_tab->rate.rate; - - q->link.ewma_log = TC_CBQ_DEF_EWMA; - q->link.avpkt = q->link.allot/2; - q->link.minidle = -0x7FFFFFFF; - - q->toplevel = TC_CBQ_MAXLEVEL; - q->now = psched_get_time(); - - cbq_link_class(&q->link); - - if (tb[TCA_CBQ_LSSOPT]) - cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); - - cbq_addprio(q, &q->link); - return 0; - -put_block: - tcf_block_put(q->link.block); - -put_rtab: - qdisc_put_rtab(q->link.R_tab); - return err; -} - -static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - - if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_lssopt opt; - - opt.flags = 0; - if (cl->borrow == NULL) - opt.flags |= TCF_CBQ_LSS_BOUNDED; - if (cl->share == NULL) - opt.flags |= TCF_CBQ_LSS_ISOLATED; - opt.ewma_log = cl->ewma_log; - opt.level = cl->level; - opt.avpkt = cl->avpkt; - opt.maxidle = cl->maxidle; - opt.minidle = (u32)(-cl->minidle); - opt.offtime = cl->offtime; - opt.change = ~0; - if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_wrropt opt; - - memset(&opt, 0, sizeof(opt)); - opt.flags = 0; - opt.allot = cl->allot; - opt.priority = cl->priority + 1; - opt.cpriority = cl->cpriority + 1; - opt.weight = cl->weight; - if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt)) - goto nla_put_failure; - return skb->len; - 
-nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_fopt opt; - - if (cl->split || cl->defmap) { - opt.split = cl->split ? cl->split->common.classid : 0; - opt.defmap = cl->defmap; - opt.defchange = ~0; - if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt)) - goto nla_put_failure; - } - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) -{ - if (cbq_dump_lss(skb, cl) < 0 || - cbq_dump_rate(skb, cl) < 0 || - cbq_dump_wrr(skb, cl) < 0 || - cbq_dump_fopt(skb, cl) < 0) - return -1; - return 0; -} - -static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct nlattr *nest; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - if (cbq_dump_attr(skb, &q->link) < 0) - goto nla_put_failure; - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static int -cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - q->link.xstats.avgidle = q->link.avgidle; - return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); -} - -static int -cbq_dump_class(struct Qdisc *sch, unsigned long arg, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - struct nlattr *nest; - - if (cl->tparent) - tcm->tcm_parent = cl->tparent->common.classid; - else - tcm->tcm_parent = TC_H_ROOT; - tcm->tcm_handle = cl->common.classid; - tcm->tcm_info = cl->q->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - if (cbq_dump_attr(skb, cl) < 0) - goto nla_put_failure; - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static int -cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - __u32 qlen; - - cl->xstats.avgidle = cl->avgidle; - cl->xstats.undertime = 0; - qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog); - - if (cl->undertime != PSCHED_PASTPERFECT) - cl->xstats.undertime = cl->undertime - q->now; - - if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) - return -1; - - return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); -} - -static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old, struct netlink_ext_ack *extack) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - cl->common.classid, extack); - if (new == NULL) - return -ENOBUFS; - } - - *old = qdisc_replace(sch, new, &cl->q); - return 0; -} - -static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - return cl->q; -} - -static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - cbq_deactivate_class(cl); -} - -static unsigned long cbq_find(struct Qdisc *sch, u32 classid) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - return (unsigned long)cbq_class_lookup(q, classid); -} 
- -static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - WARN_ON(cl->filters); - - tcf_block_put(cl->block); - qdisc_put(cl->q); - qdisc_put_rtab(cl->R_tab); - gen_kill_estimator(&cl->rate_est); - if (cl != &q->link) - kfree(cl); -} - -static void cbq_destroy(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct hlist_node *next; - struct cbq_class *cl; - unsigned int h; - -#ifdef CONFIG_NET_CLS_ACT - q->rx_class = NULL; -#endif - /* - * Filters must be destroyed first because we don't destroy the - * classes from root to leafs which means that filters can still - * be bound to classes which have been destroyed already. --TGR '04 - */ - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - tcf_block_put(cl->block); - cl->block = NULL; - } - } - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], - common.hnode) - cbq_destroy_class(sch, cl); - } - qdisc_class_hash_destroy(&q->clhash); -} - -static int -cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, - unsigned long *arg, struct netlink_ext_ack *extack) -{ - int err; - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)*arg; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_CBQ_MAX + 1]; - struct cbq_class *parent; - struct qdisc_rate_table *rtab = NULL; - - err = cbq_opt_parse(tb, opt, extack); - if (err < 0) - return err; - - if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) { - NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params"); - return -EOPNOTSUPP; - } - - if (cl) { - /* Check parent */ - if (parentid) { - if (cl->tparent && - cl->tparent->common.classid != parentid) { - NL_SET_ERR_MSG(extack, "Invalid parent id"); - return -EINVAL; - } - if (!cl->tparent && parentid != TC_H_ROOT) { - NL_SET_ERR_MSG(extack, "Parent must be root"); - return -EINVAL; - } - } - - if (tb[TCA_CBQ_RATE]) { - rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), - tb[TCA_CBQ_RTAB], extack); - if (rtab == NULL) - return -EINVAL; - } - - if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, - true, - tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); - qdisc_put_rtab(rtab); - return err; - } - } - - /* Change class parameters */ - sch_tree_lock(sch); - - if (cl->next_alive != NULL) - cbq_deactivate_class(cl); - - if (rtab) { - qdisc_put_rtab(cl->R_tab); - cl->R_tab = rtab; - } - - if (tb[TCA_CBQ_LSSOPT]) - cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - - if (tb[TCA_CBQ_WRROPT]) { - cbq_rmprio(q, cl); - cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); - } - - if (tb[TCA_CBQ_FOPT]) - cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); - - if (cl->q->q.qlen) - cbq_activate_class(cl); - - sch_tree_unlock(sch); - - return 0; - } - - if (parentid == TC_H_ROOT) - return -EINVAL; - - if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) { - NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing"); - return -EINVAL; - } - - rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB], - extack); - if (rtab == NULL) - return -EINVAL; - - if (classid) { - err = -EINVAL; - if (TC_H_MAJ(classid ^ sch->handle) || - cbq_class_lookup(q, classid)) { - NL_SET_ERR_MSG(extack, "Specified 
class not found"); - goto failure; - } - } else { - int i; - classid = TC_H_MAKE(sch->handle, 0x8000); - - for (i = 0; i < 0x8000; i++) { - if (++q->hgenerator >= 0x8000) - q->hgenerator = 1; - if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) - break; - } - err = -ENOSR; - if (i >= 0x8000) { - NL_SET_ERR_MSG(extack, "Unable to generate classid"); - goto failure; - } - classid = classid|q->hgenerator; - } - - parent = &q->link; - if (parentid) { - parent = cbq_class_lookup(q, parentid); - err = -EINVAL; - if (!parent) { - NL_SET_ERR_MSG(extack, "Failed to find parentid"); - goto failure; - } - } - - err = -ENOBUFS; - cl = kzalloc(sizeof(*cl), GFP_KERNEL); - if (cl == NULL) - goto failure; - - gnet_stats_basic_sync_init(&cl->bstats); - err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); - if (err) { - kfree(cl); - goto failure; - } - - if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, true, tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); - tcf_block_put(cl->block); - kfree(cl); - goto failure; - } - } - - cl->R_tab = rtab; - rtab = NULL; - cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, - NULL); - if (!cl->q) - cl->q = &noop_qdisc; - else - qdisc_hash_add(cl->q, true); - - cl->common.classid = classid; - cl->tparent = parent; - cl->qdisc = sch; - cl->allot = parent->allot; - cl->quantum = cl->allot; - cl->weight = cl->R_tab->rate.rate; - - sch_tree_lock(sch); - cbq_link_class(cl); - cl->borrow = cl->tparent; - if (cl->tparent != &q->link) - cl->share = cl->tparent; - cbq_adjust_levels(parent); - cl->minidle = -0x7FFFFFFF; - cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); - if (cl->ewma_log == 0) - cl->ewma_log = q->link.ewma_log; - if (cl->maxidle == 0) - cl->maxidle = q->link.maxidle; - if (cl->avpkt == 0) - cl->avpkt = q->link.avpkt; - if (tb[TCA_CBQ_FOPT]) - cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); - sch_tree_unlock(sch); - - qdisc_class_hash_grow(sch, &q->clhash); - - *arg = (unsigned long)cl; - return 0; - -failure: - qdisc_put_rtab(rtab); - return err; -} - -static int cbq_delete(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - - if (cl->filters || cl->children || cl == &q->link) - return -EBUSY; - - sch_tree_lock(sch); - - qdisc_purge_queue(cl->q); - - if (cl->next_alive) - cbq_deactivate_class(cl); - - if (q->tx_borrowed == cl) - q->tx_borrowed = q->tx_class; - if (q->tx_class == cl) { - q->tx_class = NULL; - q->tx_borrowed = NULL; - } -#ifdef CONFIG_NET_CLS_ACT - if (q->rx_class == cl) - q->rx_class = NULL; -#endif - - cbq_unlink_class(cl); - cbq_adjust_levels(cl->tparent); - cl->defmap = 0; - cbq_sync_defmap(cl); - - cbq_rmprio(q, cl); - sch_tree_unlock(sch); - - cbq_destroy_class(sch, cl); - return 0; -} - -static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - - if (cl == NULL) - cl = &q->link; - - return cl->block; -} - -static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, - u32 classid) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *p = (struct cbq_class *)parent; - struct cbq_class *cl = cbq_class_lookup(q, classid); - - if (cl) { - if (p && p->level <= cl->level) - return 0; - 
cl->filters++; - return (unsigned long)cl; - } - return 0; -} - -static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - cl->filters--; -} - -static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl; - unsigned int h; - - if (arg->stop) - return; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) - return; - } - } -} - -static const struct Qdisc_class_ops cbq_class_ops = { - .graft = cbq_graft, - .leaf = cbq_leaf, - .qlen_notify = cbq_qlen_notify, - .find = cbq_find, - .change = cbq_change_class, - .delete = cbq_delete, - .walk = cbq_walk, - .tcf_block = cbq_tcf_block, - .bind_tcf = cbq_bind_filter, - .unbind_tcf = cbq_unbind_filter, - .dump = cbq_dump_class, - .dump_stats = cbq_dump_class_stats, -}; - -static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { - .next = NULL, - .cl_ops = &cbq_class_ops, - .id = "cbq", - .priv_size = sizeof(struct cbq_sched_data), - .enqueue = cbq_enqueue, - .dequeue = cbq_dequeue, - .peek = qdisc_peek_dequeued, - .init = cbq_init, - .reset = cbq_reset, - .destroy = cbq_destroy, - .change = NULL, - .dump = cbq_dump, - .dump_stats = cbq_dump_stats, - .owner = THIS_MODULE, -}; - -static int __init cbq_module_init(void) -{ - return register_qdisc(&cbq_qdisc_ops); -} -static void __exit cbq_module_exit(void) -{ - unregister_qdisc(&cbq_qdisc_ops); -} -module_init(cbq_module_init) -module_exit(cbq_module_exit) -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 3ac3e5c80b6f..19c851125901 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -183,7 +183,7 @@ static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, int retrys = 3; do { - *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask; + *pidx = (q->head + get_random_u32_below(choke_len(q))) & q->tab_mask; skb = q->tab[*pidx]; if (skb) return skb; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c deleted file mode 100644 index 401ffaf87d62..000000000000 --- a/net/sched/sch_dsmark.c +++ /dev/null @@ -1,518 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* net/sched/sch_dsmark.c - Differentiated Services field marker */ - -/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ - - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/bitops.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <asm/byteorder.h> - -/* - * classid class marking - * ------- ----- ------- - * n/a 0 n/a - * x:0 1 use entry [0] - * ... ... ... - * x:y y>0 y+1 use entry [y] - * ... ... ... - * x:indices-1 indices use entry [indices-1] - * ... ... ... - * x:y y+1 use entry [y & (indices-1)] - * ... ... ... 
- * 0xffff 0x10000 use entry [indices-1] - */ - - -#define NO_DEFAULT_INDEX (1 << 16) - -struct mask_value { - u8 mask; - u8 value; -}; - -struct dsmark_qdisc_data { - struct Qdisc *q; - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - struct mask_value *mv; - u16 indices; - u8 set_tc_index; - u32 default_index; /* index range is 0...0xffff */ -#define DSMARK_EMBEDDED_SZ 16 - struct mask_value embedded[DSMARK_EMBEDDED_SZ]; -}; - -static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) -{ - return index <= p->indices && index > 0; -} - -/* ------------------------- Class/flow operations ------------------------- */ - -static int dsmark_graft(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", - __func__, sch, p, new, old); - - if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle, NULL); - if (new == NULL) - new = &noop_qdisc; - } - - *old = qdisc_replace(sch, new, &p->q); - return 0; -} - -static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - return p->q; -} - -static unsigned long dsmark_find(struct Qdisc *sch, u32 classid) -{ - return TC_H_MIN(classid) + 1; -} - -static unsigned long dsmark_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) -{ - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", - __func__, sch, qdisc_priv(sch), classid); - - return dsmark_find(sch, classid); -} - -static void dsmark_unbind_filter(struct Qdisc *sch, unsigned long cl) -{ -} - -static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = { - [TCA_DSMARK_INDICES] = { .type = NLA_U16 }, - [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 }, - [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG }, - [TCA_DSMARK_MASK] = { .type = NLA_U8 }, - [TCA_DSMARK_VALUE] = { .type = NLA_U8 }, -}; - -static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, - struct nlattr **tca, unsigned long *arg, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_DSMARK_MAX + 1]; - int err = -EINVAL; - - pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", - __func__, sch, p, classid, parent, *arg); - - if (!dsmark_valid_index(p, *arg)) { - err = -ENOENT; - goto errout; - } - - if (!opt) - goto errout; - - err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt, - dsmark_policy, NULL); - if (err < 0) - goto errout; - - if (tb[TCA_DSMARK_VALUE]) - p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]); - - if (tb[TCA_DSMARK_MASK]) - p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - - err = 0; - -errout: - return err; -} - -static int dsmark_delete(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - if (!dsmark_valid_index(p, arg)) - return -EINVAL; - - p->mv[arg - 1].mask = 0xff; - p->mv[arg - 1].value = 0; - - return 0; -} - -static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - int i; - - pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", - __func__, sch, p, walker); - - if (walker->stop) - return; - - for (i = 0; i < p->indices; i++) { - if (p->mv[i].mask == 0xff && !p->mv[i].value) { - walker->count++; - continue; - } - if 
(!tc_qdisc_stats_dump(sch, i + 1, walker)) - break; - } -} - -static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - return p->block; -} - -/* --------------------------- Qdisc operations ---------------------------- */ - -static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - unsigned int len = qdisc_pkt_len(skb); - struct dsmark_qdisc_data *p = qdisc_priv(sch); - int err; - - pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); - - if (p->set_tc_index) { - int wlen = skb_network_offset(skb); - - switch (skb_protocol(skb, true)) { - case htons(ETH_P_IP): - wlen += sizeof(struct iphdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) - goto drop; - - skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) - & ~INET_ECN_MASK; - break; - - case htons(ETH_P_IPV6): - wlen += sizeof(struct ipv6hdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) - goto drop; - - skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) - & ~INET_ECN_MASK; - break; - default: - skb->tc_index = 0; - break; - } - } - - if (TC_H_MAJ(skb->priority) == sch->handle) - skb->tc_index = TC_H_MIN(skb->priority); - else { - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tcf_classify(skb, NULL, fl, &res, false); - - pr_debug("result %d class 0x%04x\n", result, res.classid); - - switch (result) { -#ifdef CONFIG_NET_CLS_ACT - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - __qdisc_drop(skb, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - - case TC_ACT_SHOT: - goto drop; -#endif - case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - break; - - default: - if (p->default_index != NO_DEFAULT_INDEX) - skb->tc_index = p->default_index; - break; - } - } - - err = qdisc_enqueue(skb, p->q, to_free); - if (err != NET_XMIT_SUCCESS) { - if (net_xmit_drop_count(err)) - qdisc_qstats_drop(sch); - return err; - } - - sch->qstats.backlog += len; - sch->q.qlen++; - - return NET_XMIT_SUCCESS; - -drop: - qdisc_drop(skb, sch, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; -} - -static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct sk_buff *skb; - u32 index; - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - skb = qdisc_dequeue_peeked(p->q); - if (skb == NULL) - return NULL; - - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - index = skb->tc_index & (p->indices - 1); - pr_debug("index %d->%d\n", skb->tc_index, index); - - switch (skb_protocol(skb, true)) { - case htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, - p->mv[index].value); - break; - case htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask, - p->mv[index].value); - break; - default: - /* - * Only complain if a change was actually attempted. - * This way, we can send non-IP traffic through dsmark - * and don't need yet another qdisc as a bypass. 
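The remarking step in dsmark_dequeue() above applies the per-index pair as new_ds = (old_ds & mask) | value via ipv4_change_dsfield()/ipv6_change_dsfield(), which is why the default entry of mask 0xff and value 0 leaves packets untouched, and why a mask of 0x03 keeps the two ECN bits while replacing the DSCP. A quick user-space illustration of just the byte arithmetic, with example values; the real helpers also fix up the IPv4 checksum:

    #include <stdio.h>

    /* Same arithmetic as ipv4_change_dsfield()/ipv6_change_dsfield(),
     * without the checksum fixup: new DS byte = (old & mask) | value.
     */
    static unsigned char remark(unsigned char old, unsigned char mask,
                                unsigned char value)
    {
        return (old & mask) | value;
    }

    int main(void)
    {
        unsigned char tos = 0x2b;       /* DSCP 10 (AF11), ECN bits 11 */

        /* default table entry, mask 0xff / value 0: unchanged */
        printf("0x%02x\n", remark(tos, 0xff, 0x00));     /* 0x2b */

        /* remark to DSCP 46 (EF, 46 << 2 = 0xb8), keep the ECN bits */
        printf("0x%02x\n", remark(tos, 0x03, 46 << 2));  /* 0xbb */

        return 0;
    }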
- */ - if (p->mv[index].mask != 0xff || p->mv[index].value) - pr_warn("%s: unsupported protocol %d\n", - __func__, ntohs(skb_protocol(skb, true))); - break; - } - - return skb; -} - -static struct sk_buff *dsmark_peek(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - return p->q->ops->peek(p->q); -} - -static int dsmark_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *tb[TCA_DSMARK_MAX + 1]; - int err = -EINVAL; - u32 default_index = NO_DEFAULT_INDEX; - u16 indices; - int i; - - pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); - - if (!opt) - goto errout; - - err = tcf_block_get(&p->block, &p->filter_list, sch, extack); - if (err) - return err; - - err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt, - dsmark_policy, NULL); - if (err < 0) - goto errout; - - err = -EINVAL; - if (!tb[TCA_DSMARK_INDICES]) - goto errout; - indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); - - if (hweight32(indices) != 1) - goto errout; - - if (tb[TCA_DSMARK_DEFAULT_INDEX]) - default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); - - if (indices <= DSMARK_EMBEDDED_SZ) - p->mv = p->embedded; - else - p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL); - if (!p->mv) { - err = -ENOMEM; - goto errout; - } - for (i = 0; i < indices; i++) { - p->mv[i].mask = 0xff; - p->mv[i].value = 0; - } - p->indices = indices; - p->default_index = default_index; - p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); - - p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, - NULL); - if (p->q == NULL) - p->q = &noop_qdisc; - else - qdisc_hash_add(p->q, true); - - pr_debug("%s: qdisc %p\n", __func__, p->q); - - err = 0; -errout: - return err; -} - -static void dsmark_reset(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - if (p->q) - qdisc_reset(p->q); -} - -static void dsmark_destroy(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - tcf_block_put(p->block); - qdisc_put(p->q); - if (p->mv != p->embedded) - kfree(p->mv); -} - -static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opts = NULL; - - pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl); - - if (!dsmark_valid_index(p, cl)) - return -EINVAL; - - tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1); - tcm->tcm_info = p->q->handle; - - opts = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (opts == NULL) - goto nla_put_failure; - if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) || - nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value)) - goto nla_put_failure; - - return nla_nest_end(skb, opts); - -nla_put_failure: - nla_nest_cancel(skb, opts); - return -EMSGSIZE; -} - -static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opts = NULL; - - opts = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (opts == NULL) - goto nla_put_failure; - if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) - goto nla_put_failure; - - if (p->default_index != NO_DEFAULT_INDEX && - nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index)) - goto nla_put_failure; - - if 
(p->set_tc_index && - nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX)) - goto nla_put_failure; - - return nla_nest_end(skb, opts); - -nla_put_failure: - nla_nest_cancel(skb, opts); - return -EMSGSIZE; -} - -static const struct Qdisc_class_ops dsmark_class_ops = { - .graft = dsmark_graft, - .leaf = dsmark_leaf, - .find = dsmark_find, - .change = dsmark_change, - .delete = dsmark_delete, - .walk = dsmark_walk, - .tcf_block = dsmark_tcf_block, - .bind_tcf = dsmark_bind_filter, - .unbind_tcf = dsmark_unbind_filter, - .dump = dsmark_dump_class, -}; - -static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { - .next = NULL, - .cl_ops = &dsmark_class_ops, - .id = "dsmark", - .priv_size = sizeof(struct dsmark_qdisc_data), - .enqueue = dsmark_enqueue, - .dequeue = dsmark_dequeue, - .peek = dsmark_peek, - .init = dsmark_init, - .reset = dsmark_reset, - .destroy = dsmark_destroy, - .change = NULL, - .dump = dsmark_dump, - .owner = THIS_MODULE, -}; - -static int __init dsmark_module_init(void) -{ - return register_qdisc(&dsmark_qdisc_ops); -} - -static void __exit dsmark_module_exit(void) -{ - unregister_qdisc(&dsmark_qdisc_ops); -} - -module_init(dsmark_module_init) -module_exit(dsmark_module_exit) - -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index a661b062cca8..872d127c9db4 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -377,6 +377,7 @@ static int gred_offload_dump_stats(struct Qdisc *sch) /* Even if driver returns failure adjust the stats - in case offload * ended but driver still wants to adjust the values. */ + sch_tree_lock(sch); for (i = 0; i < MAX_DPs; i++) { if (!table->tab[i]) continue; @@ -393,6 +394,7 @@ static int gred_offload_dump_stats(struct Qdisc *sch) sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; } _bstats_update(&sch->bstats, bytes, packets); + sch_tree_unlock(sch); kfree(hw_stats); return ret; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index e5b4bbf3ce3d..92f2975b6a82 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -199,8 +199,14 @@ static unsigned long htb_search(struct Qdisc *sch, u32 handle) { return (unsigned long)htb_find(handle, sch); } + +#define HTB_DIRECT ((struct htb_class *)-1L) + /** * htb_classify - classify a packet into class + * @skb: the socket buffer + * @sch: the active queue discipline + * @qerr: pointer for returned status code * * It returns NULL if the packet should be dropped or -1 if the packet * should be passed directly thru. In all other cases leaf class is returned. @@ -211,8 +217,6 @@ static unsigned long htb_search(struct Qdisc *sch, u32 handle) * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful * then finish and return direct queue. 
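The return convention documented above for htb_classify() is a tri-state: NULL means drop, the HTB_DIRECT sentinel (-1 cast to a pointer) means bypass the class tree and use the direct queue, and anything else is a leaf class; htb_enqueue() dispatches on exactly those three cases. A generic user-space sketch of the sentinel-pointer pattern, with invented names, showing why -1 can never collide with a real object:

    #include <stdio.h>

    struct item { int id; };

    #define DIRECT ((struct item *)-1L)     /* same trick as HTB_DIRECT */

    static struct item leaf = { 42 };

    /* NULL = drop, DIRECT = bypass, anything else = a real leaf object */
    static struct item *classify(int verdict)
    {
        switch (verdict) {
        case 0:  return NULL;
        case 1:  return DIRECT;
        default: return &leaf;
        }
    }

    int main(void)
    {
        int v;

        for (v = 0; v < 3; v++) {
            struct item *it = classify(v);

            if (it == DIRECT)
                printf("verdict %d: bypass to the direct queue\n", v);
            else if (!it)
                printf("verdict %d: drop\n", v);
            else
                printf("verdict %d: leaf %d\n", v, it->id);
        }
        return 0;
    }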
*/ -#define HTB_DIRECT ((struct htb_class *)-1L) - static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { @@ -427,7 +431,10 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) while (cl->cmode == HTB_MAY_BORROW && p && mask) { m = mask; while (m) { - int prio = ffz(~m); + unsigned int prio = ffz(~m); + + if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio))) + break; m &= ~(1 << prio); if (p->inner.clprio[prio].feed.rb_node) @@ -1545,7 +1552,7 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, struct tc_htb_qopt_offload offload_opt; struct netdev_queue *dev_queue; struct Qdisc *q = cl->leaf.q; - struct Qdisc *old = NULL; + struct Qdisc *old; int err; if (cl->level) @@ -1553,14 +1560,17 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, WARN_ON(!q); dev_queue = htb_offload_get_queue(cl); - old = htb_graft_helper(dev_queue, NULL); - if (destroying) - /* Before HTB is destroyed, the kernel grafts noop_qdisc to - * all queues. + /* When destroying, caller qdisc_graft grafts the new qdisc and invokes + * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload + * does not need to graft or qdisc_put the qdisc being destroyed. + */ + if (!destroying) { + old = htb_graft_helper(dev_queue, NULL); + /* Last qdisc grafted should be the same as cl->leaf.q when + * calling htb_delete. */ - WARN_ON(!(old->flags & TCQ_F_BUILTIN)); - else WARN_ON(old != q); + } if (cl->parent) { _bstats_update(&cl->parent->bstats_bias, @@ -1577,10 +1587,12 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, }; err = htb_offload(qdisc_dev(sch), &offload_opt); - if (!err || destroying) - qdisc_put(old); - else - htb_graft_helper(dev_queue, old); + if (!destroying) { + if (!err) + qdisc_put(old); + else + htb_graft_helper(dev_queue, old); + } if (last_child) return err; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 4c68abaa289b..48ed87b91086 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -17,6 +17,8 @@ #include <net/sch_generic.h> #include <net/pkt_cls.h> +#include "sch_mqprio_lib.h" + struct mqprio_sched { struct Qdisc **qdiscs; u16 mode; @@ -27,6 +29,62 @@ struct mqprio_sched { u64 max_rate[TC_QOPT_MAX_QUEUE]; }; +static int mqprio_enable_offload(struct Qdisc *sch, + const struct tc_mqprio_qopt *qopt, + struct netlink_ext_ack *extack) +{ + struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; + struct mqprio_sched *priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err, i; + + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + if (priv->shaper != TC_MQPRIO_SHAPER_DCB) + return -EINVAL; + break; + case TC_MQPRIO_MODE_CHANNEL: + mqprio.flags = priv->flags; + if (priv->flags & TC_MQPRIO_F_MODE) + mqprio.mode = priv->mode; + if (priv->flags & TC_MQPRIO_F_SHAPER) + mqprio.shaper = priv->shaper; + if (priv->flags & TC_MQPRIO_F_MIN_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.min_rate[i] = priv->min_rate[i]; + if (priv->flags & TC_MQPRIO_F_MAX_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.max_rate[i] = priv->max_rate[i]; + break; + default: + return -EINVAL; + } + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, + &mqprio); + if (err) + return err; + + priv->hw_offload = mqprio.qopt.hw; + + return 0; +} + +static void mqprio_disable_offload(struct Qdisc *sch) +{ + struct tc_mqprio_qopt_offload mqprio = { { 0 } }; + struct mqprio_sched *priv = qdisc_priv(sch); + 
struct net_device *dev = qdisc_dev(sch); + + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + case TC_MQPRIO_MODE_CHANNEL: + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, + &mqprio); + break; + } +} + static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); @@ -41,37 +99,17 @@ static void mqprio_destroy(struct Qdisc *sch) kfree(priv->qdiscs); } - if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { - struct tc_mqprio_qopt_offload mqprio = { { 0 } }; - - switch (priv->mode) { - case TC_MQPRIO_MODE_DCB: - case TC_MQPRIO_MODE_CHANNEL: - dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_QDISC_MQPRIO, - &mqprio); - break; - default: - return; - } - } else { + if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) + mqprio_disable_offload(sch); + else netdev_set_num_tc(dev, 0); - } } -static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + const struct tc_mqprio_caps *caps, + struct netlink_ext_ack *extack) { - int i, j; - - /* Verify num_tc is not out of max range */ - if (qopt->num_tc > TC_MAX_QUEUE) - return -EINVAL; - - /* Verify priority mapping uses valid tcs */ - for (i = 0; i < TC_BITMASK + 1; i++) { - if (qopt->prio_tc_map[i] >= qopt->num_tc) - return -EINVAL; - } + int err; /* Limit qopt->hw to maximum supported offload value. Drivers have * the option of overriding this later if they don't support the a @@ -80,31 +118,23 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX) qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX; - /* If hardware offload is requested we will leave it to the device - * to either populate the queue counts itself or to validate the - * provided queue counts. If ndo_setup_tc is not present then - * hardware doesn't support offload and we should return an error. + /* If hardware offload is requested, we will leave 3 options to the + * device driver: + * - populate the queue counts itself (and ignore what was requested) + * - validate the provided queue counts by itself (and apply them) + * - request queue count validation here (and apply them) */ - if (qopt->hw) - return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL; - - for (i = 0; i < qopt->num_tc; i++) { - unsigned int last = qopt->offset[i] + qopt->count[i]; - - /* Verify the queue count is in tx range being equal to the - * real_num_tx_queues indicates the last queue is in use. - */ - if (qopt->offset[i] >= dev->real_num_tx_queues || - !qopt->count[i] || - last > dev->real_num_tx_queues) - return -EINVAL; - - /* Verify that the offset and counts do not overlap */ - for (j = i + 1; j < qopt->num_tc; j++) { - if (last > qopt->offset[j]) - return -EINVAL; - } - } + err = mqprio_validate_qopt(dev, qopt, + !qopt->hw || caps->validate_queue_counts, + false, extack); + if (err) + return err; + + /* If ndo_setup_tc is not present then hardware doesn't support offload + * and we should return an error. 
+ */ + if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) + return -EINVAL; return 0; } @@ -130,6 +160,67 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } +static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt, + struct nlattr *opt) +{ + struct mqprio_sched *priv = qdisc_priv(sch); + struct nlattr *tb[TCA_MQPRIO_MAX + 1]; + struct nlattr *attr; + int i, rem, err; + + err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, + sizeof(*qopt)); + if (err < 0) + return err; + + if (!qopt->hw) + return -EINVAL; + + if (tb[TCA_MQPRIO_MODE]) { + priv->flags |= TC_MQPRIO_F_MODE; + priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); + } + + if (tb[TCA_MQPRIO_SHAPER]) { + priv->flags |= TC_MQPRIO_F_SHAPER; + priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); + } + + if (tb[TCA_MQPRIO_MIN_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->min_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MIN_RATE; + } + + if (tb[TCA_MQPRIO_MAX_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->max_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MAX_RATE; + } + + return 0; +} + static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -139,9 +230,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; - struct nlattr *tb[TCA_MQPRIO_MAX + 1]; - struct nlattr *attr; - int rem; + struct tc_mqprio_caps caps; int len; BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); @@ -160,61 +249,18 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; + qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO, + &caps, sizeof(caps)); + qopt = nla_data(opt); - if (mqprio_parse_opt(dev, qopt)) + if (mqprio_parse_opt(dev, qopt, &caps, extack)) return -EINVAL; len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); if (len > 0) { - err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, - sizeof(*qopt)); - if (err < 0) + err = mqprio_parse_nlattr(sch, qopt, opt); + if (err) return err; - - if (!qopt->hw) - return -EINVAL; - - if (tb[TCA_MQPRIO_MODE]) { - priv->flags |= TC_MQPRIO_F_MODE; - priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); - } - - if (tb[TCA_MQPRIO_SHAPER]) { - priv->flags |= TC_MQPRIO_F_SHAPER; - priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); - } - - if (tb[TCA_MQPRIO_MIN_RATE64]) { - if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) - return -EINVAL; - i = 0; - nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], - rem) { - if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) - return -EINVAL; - if (i >= qopt->num_tc) - break; - priv->min_rate[i] = *(u64 *)nla_data(attr); - i++; - } - priv->flags |= TC_MQPRIO_F_MIN_RATE; - } - - if (tb[TCA_MQPRIO_MAX_RATE64]) { - if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) - return -EINVAL; - i = 0; - nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], - rem) { - if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) - return -EINVAL; - if (i >= qopt->num_tc) - break; - priv->max_rate[i] 
= *(u64 *)nla_data(attr); - i++; - } - priv->flags |= TC_MQPRIO_F_MAX_RATE; - } } /* pre-allocate qdisc, attachment can't fail */ @@ -241,36 +287,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, * supplied and verified mapping */ if (qopt->hw) { - struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; - - switch (priv->mode) { - case TC_MQPRIO_MODE_DCB: - if (priv->shaper != TC_MQPRIO_SHAPER_DCB) - return -EINVAL; - break; - case TC_MQPRIO_MODE_CHANNEL: - mqprio.flags = priv->flags; - if (priv->flags & TC_MQPRIO_F_MODE) - mqprio.mode = priv->mode; - if (priv->flags & TC_MQPRIO_F_SHAPER) - mqprio.shaper = priv->shaper; - if (priv->flags & TC_MQPRIO_F_MIN_RATE) - for (i = 0; i < mqprio.qopt.num_tc; i++) - mqprio.min_rate[i] = priv->min_rate[i]; - if (priv->flags & TC_MQPRIO_F_MAX_RATE) - for (i = 0; i < mqprio.qopt.num_tc; i++) - mqprio.max_rate[i] = priv->max_rate[i]; - break; - default: - return -EINVAL; - } - err = dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_QDISC_MQPRIO, - &mqprio); + err = mqprio_enable_offload(sch, qopt, extack); if (err) return err; - - priv->hw_offload = mqprio.qopt.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -387,7 +406,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; - unsigned int ntx, tc; + unsigned int ntx; sch->q.qlen = 0; gnet_stats_basic_sync_init(&sch->bstats); @@ -411,15 +430,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) spin_unlock_bh(qdisc_lock(qdisc)); } - opt.num_tc = netdev_get_num_tc(dev); - memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + mqprio_qopt_reconstruct(dev, &opt); opt.hw = priv->hw_offload; - for (tc = 0; tc < netdev_get_num_tc(dev); tc++) { - opt.count[tc] = dev->tc_to_txq[tc].count; - opt.offset[tc] = dev->tc_to_txq[tc].offset; - } - if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c new file mode 100644 index 000000000000..c58a533b8ec5 --- /dev/null +++ b/net/sched/sch_mqprio_lib.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/types.h> +#include <net/pkt_sched.h> + +#include "sch_mqprio_lib.h" + +/* Returns true if the intervals [a, b) and [c, d) overlap. */ +static bool intervals_overlap(int a, int b, int c, int d) +{ + int left = max(a, c), right = min(b, d); + + return left < right; +} + +static int mqprio_validate_queue_counts(struct net_device *dev, + const struct tc_mqprio_qopt *qopt, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack) +{ + int i, j; + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + if (!qopt->count[i]) { + NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d", + i); + return -EINVAL; + } + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. 
+ */ + if (qopt->offset[i] >= dev->real_num_tx_queues || + last > dev->real_num_tx_queues) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Queues %d:%d for TC %d exceed the %d TX queues available", + qopt->count[i], qopt->offset[i], + i, dev->real_num_tx_queues); + return -EINVAL; + } + + if (allow_overlapping_txqs) + continue; + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (intervals_overlap(qopt->offset[i], last, + qopt->offset[j], + qopt->offset[j] + + qopt->count[j])) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "TC %d queues %d@%d overlap with TC %d queues %d@%d", + i, qopt->count[i], qopt->offset[i], + j, qopt->count[j], qopt->offset[j]); + return -EINVAL; + } + } + } + + return 0; +} + +int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + bool validate_queue_counts, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack) +{ + int i, err; + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) { + NL_SET_ERR_MSG(extack, + "Number of traffic classes is outside valid range"); + return -EINVAL; + } + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i <= TC_BITMASK; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) { + NL_SET_ERR_MSG(extack, + "Invalid traffic class in priority to traffic class mapping"); + return -EINVAL; + } + } + + if (validate_queue_counts) { + err = mqprio_validate_queue_counts(dev, qopt, + allow_overlapping_txqs, + extack); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mqprio_validate_qopt); + +void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt) +{ + int tc, num_tc = netdev_get_num_tc(dev); + + qopt->num_tc = num_tc; + memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map)); + + for (tc = 0; tc < num_tc; tc++) { + qopt->count[tc] = dev->tc_to_txq[tc].count; + qopt->offset[tc] = dev->tc_to_txq[tc].offset; + } +} +EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct); + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h new file mode 100644 index 000000000000..63f725ab8761 --- /dev/null +++ b/net/sched/sch_mqprio_lib.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SCH_MQPRIO_LIB_H +#define __SCH_MQPRIO_LIB_H + +#include <linux/types.h> + +struct net_device; +struct netlink_ext_ack; +struct tc_mqprio_qopt; + +int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + bool validate_queue_counts, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack); +void mqprio_qopt_reconstruct(struct net_device *dev, + struct tc_mqprio_qopt *qopt); + +#endif diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index fb00ac40ecb7..6ef3021e1169 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -513,8 +513,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto finish_segs; } - skb->data[prandom_u32_max(skb_headlen(skb))] ^= - 1<<prandom_u32_max(8); + skb->data[get_random_u32_below(skb_headlen(skb))] ^= + 1<<get_random_u32_below(8); } if (unlikely(sch->q.qlen >= sch->limit)) { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 570389f6cdd7..1f469861eae3 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -26,7 +26,11 @@ #include <net/sock.h> #include <net/tcp.h> +#include "sch_mqprio_lib.h" + static LIST_HEAD(taprio_list); +static struct static_key_false taprio_have_broken_mqprio; +static struct static_key_false taprio_have_working_mqprio; #define 
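sch_mqprio_lib.c validates the per-TC queue ranges as half-open intervals [offset, offset + count) and rejects overlapping ranges unless the caller explicitly allows them. A small standalone C rendering of the same interval test (not the library code itself):

#include <stdio.h>

/* Half-open interval overlap, in the spirit of intervals_overlap() above */
static int overlaps(int a, int b, int c, int d)
{
	int left = a > c ? a : c;
	int right = b < d ? b : d;

	return left < right;
}

int main(void)
{
	/* TC0 uses queues 2@0 ([0,2)), TC1 uses 2@2 ([2,4)): no overlap */
	printf("%d\n", overlaps(0, 2, 2, 4));	/* 0 */
	/* TC0 uses 3@0 ([0,3)), TC1 uses 2@2 ([2,4)): both claim queue 2 */
	printf("%d\n", overlaps(0, 3, 2, 4));	/* 1 */
	return 0;
}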
TAPRIO_ALL_GATES_OPEN -1 @@ -35,15 +39,19 @@ static LIST_HEAD(taprio_list); #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { - struct list_head list; - - /* The instant that this entry "closes" and the next one - * should open, the qdisc will make some effort so that no - * packet leaves after this time. + /* Durations between this GCL entry and the GCL entry where the + * respective traffic class gate closes */ - ktime_t close_time; + u64 gate_duration[TC_MAX_QUEUE]; + atomic_t budget[TC_MAX_QUEUE]; + /* The qdisc makes some effort so that no packet leaves + * after this time + */ + ktime_t gate_close_time[TC_MAX_QUEUE]; + struct list_head list; + /* Used to calculate when to advance the schedule */ + ktime_t end_time; ktime_t next_txtime; - atomic_t budget; int index; u32 gate_mask; u32 interval; @@ -51,10 +59,16 @@ struct sched_entry { }; struct sched_gate_list { + /* Longest non-zero contiguous gate durations per traffic class, + * or 0 if a traffic class gate never opens during the schedule. + */ + u64 max_open_gate_duration[TC_MAX_QUEUE]; + u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ + u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; - ktime_t cycle_close_time; + ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; @@ -67,6 +81,8 @@ struct taprio_sched { enum tk_offsets tk_offset; int clockid; bool offloaded; + bool detected_mqprio; + bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ @@ -78,8 +94,8 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; - u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ - u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */ + int cur_txq[TC_MAX_QUEUE]; + u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 txtime_delay; }; @@ -88,6 +104,57 @@ struct __tc_taprio_qopt_offload { struct tc_taprio_qopt_offload offload; }; +static void taprio_calculate_gate_durations(struct taprio_sched *q, + struct sched_gate_list *sched) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + struct sched_entry *entry, *cur; + int tc; + + list_for_each_entry(entry, &sched->entries, list) { + u32 gates_still_open = entry->gate_mask; + + /* For each traffic class, calculate each open gate duration, + * starting at this schedule entry and ending at the schedule + * entry containing a gate close event for that TC. + */ + cur = entry; + + do { + if (!gates_still_open) + break; + + for (tc = 0; tc < num_tc; tc++) { + if (!(gates_still_open & BIT(tc))) + continue; + + if (cur->gate_mask & BIT(tc)) + entry->gate_duration[tc] += cur->interval; + else + gates_still_open &= ~BIT(tc); + } + + cur = list_next_entry_circular(cur, &sched->entries, list); + } while (cur != entry); + + /* Keep track of the maximum gate duration for each traffic + * class, taking care to not confuse a traffic class which is + * temporarily closed with one that is always closed. 
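taprio_calculate_gate_durations() walks the gate control list circularly and, for each entry and each traffic class, accumulates how long that TC's gate stays open starting at the entry. A self-contained userspace rendering of that accumulation, assuming a three-entry schedule and three traffic classes:

#include <stdint.h>
#include <stdio.h>

#define NUM_TC 3

struct entry { uint32_t gate_mask; uint32_t interval; };

static void calc_gate_durations(const struct entry *gcl, int n,
				uint64_t dur[][NUM_TC])
{
	for (int i = 0; i < n; i++) {
		uint32_t still_open = gcl[i].gate_mask;

		/* walk at most once around the circular schedule */
		for (int k = 0, cur = i; k < n && still_open;
		     k++, cur = (cur + 1) % n) {
			for (int tc = 0; tc < NUM_TC; tc++) {
				if (!(still_open & (1u << tc)))
					continue;
				if (gcl[cur].gate_mask & (1u << tc))
					dur[i][tc] += gcl[cur].interval;
				else
					still_open &= ~(1u << tc);
			}
		}
	}
}

int main(void)
{
	/* TC0 open in entries 0 and 1, TC1 only in entry 1, TC2 always open */
	struct entry gcl[] = {
		{ .gate_mask = 0x5, .interval = 100 },	/* TC0 + TC2 */
		{ .gate_mask = 0x7, .interval = 200 },	/* all open */
		{ .gate_mask = 0x4, .interval = 300 },	/* TC2 only */
	};
	uint64_t dur[3][NUM_TC] = { { 0 } };

	calc_gate_durations(gcl, 3, dur);
	/* entry 0: TC0=300 TC1=0 TC2=600 (TC2 equals the cycle time) */
	printf("entry 0: TC0=%llu TC1=%llu TC2=%llu\n",
	       (unsigned long long)dur[0][0], (unsigned long long)dur[0][1],
	       (unsigned long long)dur[0][2]);
	return 0;
}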
+ */ + for (tc = 0; tc < num_tc; tc++) + if (entry->gate_duration[tc] && + sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) + sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; + } +} + +static bool taprio_entry_allows_tx(ktime_t skb_end_time, + struct sched_entry *entry, int tc) +{ + return ktime_before(skb_end_time, entry->gate_close_time[tc]); +} + static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) @@ -180,6 +247,63 @@ static int length_to_duration(struct taprio_sched *q, int len) return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } +static int duration_to_length(struct taprio_sched *q, u64 duration) +{ + return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); +} + +/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the + * q->max_sdu[] requested by the user and the max_sdu dynamically determined by + * the maximum open gate durations at the given link speed. + */ +static void taprio_update_queue_max_sdu(struct taprio_sched *q, + struct sched_gate_list *sched, + struct qdisc_size_table *stab) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + u32 max_sdu_from_user; + u32 max_sdu_dynamic; + u32 max_sdu; + int tc; + + for (tc = 0; tc < num_tc; tc++) { + max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; + + /* TC gate never closes => keep the queueMaxSDU + * selected by the user + */ + if (sched->max_open_gate_duration[tc] == sched->cycle_time) { + max_sdu_dynamic = U32_MAX; + } else { + u32 max_frm_len; + + max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); + /* Compensate for L1 overhead from size table, + * but don't let the frame size go negative + */ + if (stab) { + max_frm_len -= stab->szopts.overhead; + max_frm_len = max_t(int, max_frm_len, + dev->hard_header_len + 1); + } + max_sdu_dynamic = max_frm_len - dev->hard_header_len; + if (max_sdu_dynamic > dev->max_mtu) + max_sdu_dynamic = U32_MAX; + } + + max_sdu = min(max_sdu_dynamic, max_sdu_from_user); + + if (max_sdu != U32_MAX) { + sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; + sched->max_sdu[tc] = max_sdu; + } else { + sched->max_frm_len[tc] = U32_MAX; /* never oversized */ + sched->max_sdu[tc] = 0; + } + } +} + /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. @@ -413,14 +537,33 @@ done: return txtime; } -static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, - struct Qdisc *child, struct sk_buff **to_free) +/* Devices with full offload are expected to honor this in hardware */ +static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, + struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct sched_gate_list *sched; int prio = skb->priority; + bool exceeds = false; u8 tc; + tc = netdev_get_prio_tc_map(dev, prio); + + rcu_read_lock(); + sched = rcu_dereference(q->oper_sched); + if (sched && skb->len > sched->max_frm_len[tc]) + exceeds = true; + rcu_read_unlock(); + + return exceeds; +} + +static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, struct sk_buff **to_free) +{ + struct taprio_sched *q = qdisc_priv(sch); + /* sk_flags are only safe to use on full sockets. 
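duration_to_length() and taprio_update_queue_max_sdu() turn the longest open-gate duration into a byte limit: at a given link speed only so many bytes fit through a gate before it closes. A worked example of that arithmetic, assuming 1 Gbps (8000 ps per byte) and a plain Ethernet header:

#include <stdint.h>
#include <stdio.h>

/* bytes = duration_ns * 1000 / picoseconds_per_byte */
static uint32_t duration_to_length(uint64_t duration_ns, uint64_t picos_per_byte)
{
	return (uint32_t)(duration_ns * 1000 / picos_per_byte);
}

int main(void)
{
	uint64_t picos_per_byte = 8000;		/* 1 Gbps: one byte takes 8 ns */
	uint64_t gate_ns = 20000;		/* 20 us open gate */
	uint32_t hard_header_len = 14;		/* Ethernet header */

	uint32_t max_frm_len = duration_to_length(gate_ns, picos_per_byte);
	uint32_t max_sdu = max_frm_len - hard_header_len;

	/* 20000 ns * 1000 / 8000 = 2500 bytes on the wire, 2486 bytes of SDU */
	printf("max_frm_len=%u max_sdu=%u\n", max_frm_len, max_sdu);
	return 0;
}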
*/ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) @@ -431,17 +574,53 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); } - /* Devices with full offload are expected to honor this in hardware */ - tc = netdev_get_prio_tc_map(dev, prio); - if (skb->len > q->max_frm_len[tc]) - return qdisc_drop(skb, sch, to_free); - qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } +static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, + struct sk_buff **to_free) +{ + unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); + netdev_features_t features = netif_skb_features(skb); + struct sk_buff *segs, *nskb; + int ret; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) + return qdisc_drop(skb, sch, to_free); + + skb_list_walk_safe(segs, segs, nskb) { + skb_mark_not_on_list(segs); + qdisc_skb_cb(segs)->pkt_len = segs->len; + slen += segs->len; + + /* FIXME: we should be segmenting to a smaller size + * rather than dropping these + */ + if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) + ret = qdisc_drop(segs, sch, to_free); + else + ret = taprio_enqueue_one(segs, sch, child, to_free); + + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + qdisc_qstats_drop(sch); + } else { + numsegs++; + } + } + + if (numsegs > 1) + qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); + consume_skb(skb); + + return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ @@ -458,97 +637,190 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); - /* Large packets might not be transmitted when the transmission duration - * exceeds any configured interval. Therefore, segment the skb into - * smaller chunks. Drivers with full offload are expected to handle - * this in hardware. - */ - if (skb_is_gso(skb)) { - unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); - netdev_features_t features = netif_skb_features(skb); - struct sk_buff *segs, *nskb; - int ret; - - segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); - if (IS_ERR_OR_NULL(segs)) - return qdisc_drop(skb, sch, to_free); + if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { + /* Large packets might not be transmitted when the transmission + * duration exceeds any configured interval. Therefore, segment + * the skb into smaller chunks. Drivers with full offload are + * expected to handle this in hardware. 
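In taprio_enqueue_segmented(), once one GSO skb has been split into numsegs segments totalling slen bytes, the parent qdisc's counters are corrected with qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen): the negative arguments add back the extra packets and per-segment header bytes that segmentation produced. The arithmetic, spelled out with illustrative numbers:

#include <stdio.h>

int main(void)
{
	unsigned int len = 4500;	/* original GSO skb length */
	unsigned int slen = 4620;	/* sum of three segments, each with its own headers */
	unsigned int numsegs = 3;

	int pkt_arg = 1 - (int)numsegs;		/* -2: queue length grows by 2 */
	int byte_arg = (int)len - (int)slen;	/* -120: backlog grows by 120 bytes */

	printf("reduce_backlog args: %d packets, %d bytes\n", pkt_arg, byte_arg);
	return 0;
}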
+ */ + if (skb_is_gso(skb)) + return taprio_enqueue_segmented(skb, sch, child, + to_free); - skb_list_walk_safe(segs, segs, nskb) { - skb_mark_not_on_list(segs); - qdisc_skb_cb(segs)->pkt_len = segs->len; - slen += segs->len; + return qdisc_drop(skb, sch, to_free); + } - ret = taprio_enqueue_one(segs, sch, child, to_free); - if (ret != NET_XMIT_SUCCESS) { - if (net_xmit_drop_count(ret)) - qdisc_qstats_drop(sch); - } else { - numsegs++; - } - } + return taprio_enqueue_one(skb, sch, child, to_free); +} - if (numsegs > 1) - qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); - consume_skb(skb); +static struct sk_buff *taprio_peek(struct Qdisc *sch) +{ + WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); + return NULL; +} + +static void taprio_set_budgets(struct taprio_sched *q, + struct sched_gate_list *sched, + struct sched_entry *entry) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + int tc, budget; + + for (tc = 0; tc < num_tc; tc++) { + /* Traffic classes which never close have infinite budget */ + if (entry->gate_duration[tc] == sched->cycle_time) + budget = INT_MAX; + else + budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, + atomic64_read(&q->picos_per_byte)); + + atomic_set(&entry->budget[tc], budget); + } +} - return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +/* When an skb is sent, it consumes from the budget of all traffic classes */ +static int taprio_update_budgets(struct sched_entry *entry, size_t len, + int tc_consumed, int num_tc) +{ + int tc, budget, new_budget = 0; + + for (tc = 0; tc < num_tc; tc++) { + budget = atomic_read(&entry->budget[tc]); + /* Don't consume from infinite budget */ + if (budget == INT_MAX) { + if (tc == tc_consumed) + new_budget = budget; + continue; + } + + if (tc == tc_consumed) + new_budget = atomic_sub_return(len, &entry->budget[tc]); + else + atomic_sub(len, &entry->budget[tc]); } - return taprio_enqueue_one(skb, sch, child, to_free); + return new_budget; } -/* Will not be called in the full offload case, since the TX queues are - * attached to the Qdisc created using qdisc_create_dflt() - */ -static struct sk_buff *taprio_peek(struct Qdisc *sch) +static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, + struct sched_entry *entry, + u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct sched_entry *entry; + struct Qdisc *child = q->qdiscs[txq]; + int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; - u32 gate_mask; - int i; + ktime_t guard; + int prio; + int len; + u8 tc; - rcu_read_lock(); - entry = rcu_dereference(q->current_entry); - gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; - rcu_read_unlock(); + if (unlikely(!child)) + return NULL; - if (!gate_mask) + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) + goto skip_peek_checks; + + skb = child->ops->peek(child); + if (!skb) return NULL; - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - int prio; - u8 tc; + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); - if (unlikely(!child)) - continue; + if (!(gate_mask & BIT(tc))) + return NULL; - skb = child->ops->peek(child); - if (!skb) - continue; + len = qdisc_pkt_len(skb); + guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); - if (TXTIME_ASSIST_IS_ENABLED(q->flags)) - return skb; + /* In the case that there's no gate entry, there's no + * guard band ... 
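taprio_set_budgets() converts each TC's open-gate duration into a byte budget for the entry, and taprio_update_budgets() charges a transmitted frame against every finite budget, since all gates share the same wire time. A compact userspace model of the two helpers, assuming two traffic classes and a 1 Gbps link:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_TC 2

/* A gate that never closes within the cycle gets an "infinite" budget */
static void set_budgets(int budget[NUM_TC], const uint64_t gate_dur_ns[NUM_TC],
			uint64_t cycle_ns, uint64_t picos_per_byte)
{
	for (int tc = 0; tc < NUM_TC; tc++)
		budget[tc] = gate_dur_ns[tc] == cycle_ns ?
			     INT_MAX :
			     (int)(gate_dur_ns[tc] * 1000 / picos_per_byte);
}

/* Sending len bytes consumes from all finite budgets; return the new value
 * of the budget for the TC that actually transmitted */
static int consume(int budget[NUM_TC], int len, int tc_consumed)
{
	int new_budget = 0;

	for (int tc = 0; tc < NUM_TC; tc++) {
		if (budget[tc] == INT_MAX) {
			if (tc == tc_consumed)
				new_budget = budget[tc];
			continue;
		}
		budget[tc] -= len;
		if (tc == tc_consumed)
			new_budget = budget[tc];
	}
	return new_budget;
}

int main(void)
{
	uint64_t gate_dur_ns[NUM_TC] = { 10000, 50000 };	/* TC1 open all cycle */
	int budget[NUM_TC];

	set_budgets(budget, gate_dur_ns, 50000, 8000);		/* 1 Gbps */
	printf("budgets: %d %d\n", budget[0], budget[1]);	/* 1250 and INT_MAX */
	printf("after 1000B on TC0: %d\n", consume(budget, 1000, 0));	/* 250 */
	return 0;
}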
+ */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + !taprio_entry_allows_tx(guard, entry, tc)) + return NULL; + + /* ... and no budget. */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + taprio_update_budgets(entry, len, tc, num_tc) < 0) + return NULL; + +skip_peek_checks: + skb = child->ops->dequeue(child); + if (unlikely(!skb)) + return NULL; + + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + return skb; +} + +static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) +{ + int offset = dev->tc_to_txq[tc].offset; + int count = dev->tc_to_txq[tc].count; - prio = skb->priority; - tc = netdev_get_prio_tc_map(dev, prio); + (*txq)++; + if (*txq == offset + count) + *txq = offset; +} + +/* Prioritize higher traffic classes, and select among TXQs belonging to the + * same TC using round robin + */ +static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, + struct sched_entry *entry, + u32 gate_mask) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int num_tc = netdev_get_num_tc(dev); + struct sk_buff *skb; + int tc; + + for (tc = num_tc - 1; tc >= 0; tc--) { + int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; - return skb; + do { + skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], + entry, gate_mask); + + taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); + + if (skb) + return skb; + } while (q->cur_txq[tc] != first_txq); } return NULL; } -static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) +/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic + * class other than to determine whether the gate is open or not + */ +static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, + struct sched_entry *entry, + u32 gate_mask) { - atomic_set(&entry->budget, - div64_u64((u64)entry->interval * PSEC_PER_NSEC, - atomic64_read(&q->picos_per_byte))); + struct net_device *dev = qdisc_dev(sch); + struct sk_buff *skb; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); + if (skb) + return skb; + } + + return NULL; } /* Will not be called in the full offload case, since the TX queues are @@ -557,11 +829,9 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; - int i; rcu_read_lock(); entry = rcu_dereference(q->current_entry); @@ -571,69 +841,23 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; - if (!gate_mask) goto done; - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - ktime_t guard; - int prio; - int len; - u8 tc; - - if (unlikely(!child)) - continue; - - if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { - skb = child->ops->dequeue(child); - if (!skb) - continue; - goto skb_found; - } - - skb = child->ops->peek(child); - if (!skb) - continue; - - prio = skb->priority; - tc = netdev_get_prio_tc_map(dev, prio); - - if (!(gate_mask & BIT(tc))) { - skb = NULL; - continue; - } - - len = qdisc_pkt_len(skb); - guard = ktime_add_ns(taprio_get_time(q), - length_to_duration(q, len)); - - /* In the case that there's no gate entry, there's no - * guard band ... 
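taprio_dequeue_tc_priority() serves traffic classes from highest to lowest and, within a TC, rotates over its TXQ range so no queue of the class is starved. The rotation in taprio_next_tc_txq() is just a wrap within [offset, offset + count); for example:

#include <stdio.h>

/* Advance to the next TXQ of the same traffic class, wrapping inside the
 * class's queue range */
static void next_tc_txq(int offset, int count, int *txq)
{
	(*txq)++;
	if (*txq == offset + count)
		*txq = offset;
}

int main(void)
{
	int txq = 4;		/* the TC owns queues 4, 5, 6 (count 3 @ offset 4) */

	for (int i = 0; i < 5; i++) {
		next_tc_txq(4, 3, &txq);
		printf("%d ", txq);	/* 5 6 4 5 6 */
	}
	printf("\n");
	return 0;
}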
- */ - if (gate_mask != TAPRIO_ALL_GATES_OPEN && - ktime_after(guard, entry->close_time)) { - skb = NULL; - continue; - } - - /* ... and no budget. */ - if (gate_mask != TAPRIO_ALL_GATES_OPEN && - atomic_sub_return(len, &entry->budget) < 0) { - skb = NULL; - continue; - } - - skb = child->ops->dequeue(child); - if (unlikely(!skb)) - goto done; - -skb_found: - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - goto done; + if (static_branch_unlikely(&taprio_have_broken_mqprio) && + !static_branch_likely(&taprio_have_working_mqprio)) { + /* Single NIC kind which is broken */ + skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); + } else if (static_branch_likely(&taprio_have_working_mqprio) && + !static_branch_unlikely(&taprio_have_broken_mqprio)) { + /* Single NIC kind which prioritizes properly */ + skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); + } else { + /* Mixed NIC kinds present in system, need dynamic testing */ + if (q->broken_mqprio) + skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); + else + skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: @@ -648,7 +872,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper, if (list_is_last(&entry->list, &oper->entries)) return true; - if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0) + if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; @@ -656,7 +880,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper, static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, - ktime_t close_time) + ktime_t end_time) { ktime_t next_base_time, extension_time; @@ -665,18 +889,18 @@ static bool should_change_schedules(const struct sched_gate_list *admin, next_base_time = sched_base_time(admin); - /* This is the simple case, the close_time would fall after + /* This is the simple case, the end_time would fall after * the next schedule base_time. */ - if (ktime_compare(next_base_time, close_time) <= 0) + if (ktime_compare(next_base_time, end_time) <= 0) return true; - /* This is the cycle_time_extension case, if the close_time + /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ - extension_time = ktime_add_ns(close_time, oper->cycle_time_extension); + extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after @@ -692,10 +916,13 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); + struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; + int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; - ktime_t close_time; + ktime_t end_time; + int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, @@ -714,41 +941,49 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) * entry of all schedules are pre-calculated during the * schedule initialization. 
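The two static keys let taprio pick the dequeue policy without touching per-qdisc state in the common case: if every mqprio-capable NIC in the system is of one kind, the branch is effectively fixed, and only a mixed system falls back to the per-qdisc broken_mqprio flag. A plain-C analogue of that decision, with ordinary counters standing in for static_branch_inc()/static_branch_dec():

#include <stdio.h>

static int have_broken_mqprio;
static int have_working_mqprio;

enum dequeue_policy { DEQ_TXQ_PRIO, DEQ_TC_PRIO };

static enum dequeue_policy pick_policy(int this_qdisc_broken)
{
	if (have_broken_mqprio && !have_working_mqprio)
		return DEQ_TXQ_PRIO;		/* only broken NICs present */
	if (have_working_mqprio && !have_broken_mqprio)
		return DEQ_TC_PRIO;		/* only well-behaved NICs present */
	/* mixed system: decide per qdisc */
	return this_qdisc_broken ? DEQ_TXQ_PRIO : DEQ_TC_PRIO;
}

int main(void)
{
	have_working_mqprio++;			/* one well-behaved NIC */
	printf("%d\n", pick_policy(0));		/* 1 = DEQ_TC_PRIO */

	have_broken_mqprio++;			/* a broken NIC appears */
	printf("%d\n", pick_policy(1));		/* 0 = DEQ_TXQ_PRIO */
	return 0;
}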
*/ - if (unlikely(!entry || entry->close_time == oper->base_time)) { + if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); - close_time = next->close_time; + end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); - oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time, - oper->cycle_time); + oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, + oper->cycle_time); } else { next = list_next_entry(entry, list); } - close_time = ktime_add_ns(entry->close_time, next->interval); - close_time = min_t(ktime_t, close_time, oper->cycle_close_time); + end_time = ktime_add_ns(entry->end_time, next->interval); + end_time = min_t(ktime_t, end_time, oper->cycle_end_time); + + for (tc = 0; tc < num_tc; tc++) { + if (next->gate_duration[tc] == oper->cycle_time) + next->gate_close_time[tc] = KTIME_MAX; + else + next->gate_close_time[tc] = ktime_add_ns(entry->end_time, + next->gate_duration[tc]); + } - if (should_change_schedules(admin, oper, close_time)) { + if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ - close_time = sched_base_time(admin); + end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } - next->close_time = close_time; - taprio_set_budget(q, next); + next->end_time = end_time; + taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); - hrtimer_set_expires(&q->advance_timer, close_time); + hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); @@ -916,6 +1151,8 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, new->cycle_time = cycle; } + taprio_calculate_gate_durations(q, new); + return 0; } @@ -924,7 +1161,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, struct netlink_ext_ack *extack, u32 taprio_flags) { - int i, j; + bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt && !dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); @@ -937,52 +1174,17 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, if (dev->num_tc) return 0; - /* Verify num_tc is not out of max range */ - if (qopt->num_tc > TC_MAX_QUEUE) { - NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); - return -EINVAL; - } - /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } - /* Verify priority mapping uses valid tcs */ - for (i = 0; i <= TC_BITMASK; i++) { - if (qopt->prio_tc_map[i] >= qopt->num_tc) { - NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); - return -EINVAL; - } - } - - for (i = 0; i < qopt->num_tc; i++) { - unsigned int last = qopt->offset[i] + qopt->count[i]; - - /* Verify the queue count is in tx range being equal to the - * real_num_tx_queues indicates the last queue is in use. 
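advance_sched() now tracks a per-TC gate_close_time alongside the entry's end_time: the close time is the previous entry's end time plus the TC's contiguous open-gate duration, or KTIME_MAX when the gate stays open for the whole cycle. A short numeric illustration of that rule:

#include <stdint.h>
#include <stdio.h>

#define KTIME_MAX INT64_MAX

static int64_t gate_close_time(int64_t entry_end_ns, uint64_t gate_dur_ns,
			       uint64_t cycle_ns)
{
	if (gate_dur_ns == cycle_ns)
		return KTIME_MAX;	/* gate never closes within the cycle */
	return entry_end_ns + (int64_t)gate_dur_ns;
}

int main(void)
{
	/* the previous entry ends at t = 1,000,000 ns; TC0 stays open another
	 * 300,000 ns, TC1 is open for the full 500,000 ns cycle */
	printf("%lld\n", (long long)gate_close_time(1000000, 300000, 500000));
	printf("%lld\n", (long long)gate_close_time(1000000, 500000, 500000));
	return 0;
}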
- */ - if (qopt->offset[i] >= dev->num_tx_queues || - !qopt->count[i] || - last > dev->real_num_tx_queues) { - NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping"); - return -EINVAL; - } - - if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) - continue; - - /* Verify that the offset and counts do not overlap */ - for (j = i + 1; j < qopt->num_tc; j++) { - if (last > qopt->offset[j]) { - NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping"); - return -EINVAL; - } - } - } - - return 0; + /* For some reason, in txtime-assist mode, we allow TXQ ranges for + * different TCs to overlap, and just validate the TXQ ranges. + */ + return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, + extack); } static int taprio_get_start_time(struct Qdisc *sch, @@ -1019,11 +1221,14 @@ static int taprio_get_start_time(struct Qdisc *sch, return 0; } -static void setup_first_close_time(struct taprio_sched *q, - struct sched_gate_list *sched, ktime_t base) +static void setup_first_end_time(struct taprio_sched *q, + struct sched_gate_list *sched, ktime_t base) { + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; + int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); @@ -1031,10 +1236,18 @@ static void setup_first_close_time(struct taprio_sched *q, cycle = sched->cycle_time; /* FIXME: find a better place to do this */ - sched->cycle_close_time = ktime_add_ns(base, cycle); + sched->cycle_end_time = ktime_add_ns(base, cycle); + + first->end_time = ktime_add_ns(base, first->interval); + taprio_set_budgets(q, sched, first); + + for (tc = 0; tc < num_tc; tc++) { + if (first->gate_duration[tc] == sched->cycle_time) + first->gate_close_time[tc] = KTIME_MAX; + else + first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); + } - first->close_time = ktime_add_ns(base, first->interval); - taprio_set_budget(q, first); rcu_assign_pointer(q->current_entry, NULL); } @@ -1088,6 +1301,8 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct sched_gate_list *oper, *admin; + struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); @@ -1100,6 +1315,17 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, continue; taprio_set_picos_per_byte(dev, q); + + stab = rtnl_dereference(q->root->stab); + + oper = rtnl_dereference(q->oper_sched); + if (oper) + taprio_update_queue_max_sdu(q, oper, stab); + + admin = rtnl_dereference(q->admin_sched); + if (admin) + taprio_update_queue_max_sdu(q, admin, stab); + break; } @@ -1203,7 +1429,8 @@ static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, - struct tc_taprio_qopt_offload *offload) + struct tc_taprio_qopt_offload *offload, + const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; @@ -1217,7 +1444,11 @@ static void taprio_sched_to_offload(struct net_device *dev, e->command = entry->command; e->interval = entry->interval; - e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); + if (caps->gate_mask_per_txq) + e->gate_mask = tc_map_to_queue_mask(dev, + entry->gate_mask); + else + e->gate_mask = entry->gate_mask; i++; } @@ -1225,6 +1456,34 @@ static void taprio_sched_to_offload(struct net_device *dev, offload->num_entries = i; } +static void 
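taprio_sched_to_offload() now consults caps->gate_mask_per_txq: drivers that think in TXQs get each TC bit expanded into the bits of the queues mapped to that TC, in the spirit of tc_map_to_queue_mask(). A standalone sketch of that expansion, with an illustrative two-TC mapping:

#include <stdint.h>
#include <stdio.h>

struct tc_txq { int offset; int count; };

static uint32_t tc_mask_to_queue_mask(const struct tc_txq *map, int num_tc,
				      uint32_t tc_mask)
{
	uint32_t queue_mask = 0;

	for (int tc = 0; tc < num_tc; tc++) {
		if (!(tc_mask & (1u << tc)))
			continue;
		for (int q = map[tc].offset; q < map[tc].offset + map[tc].count; q++)
			queue_mask |= 1u << q;
	}
	return queue_mask;
}

int main(void)
{
	struct tc_txq map[2] = { { .offset = 0, .count = 2 },
				 { .offset = 2, .count = 2 } };

	/* TC1's gate open -> queues 2 and 3 open */
	printf("0x%x\n", tc_mask_to_queue_mask(map, 2, 0x2));	/* 0xc */
	return 0;
}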
taprio_detect_broken_mqprio(struct taprio_sched *q) +{ + struct net_device *dev = qdisc_dev(q->root); + struct tc_taprio_caps caps; + + qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, + &caps, sizeof(caps)); + + q->broken_mqprio = caps.broken_mqprio; + if (q->broken_mqprio) + static_branch_inc(&taprio_have_broken_mqprio); + else + static_branch_inc(&taprio_have_working_mqprio); + + q->detected_mqprio = true; +} + +static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) +{ + if (!q->detected_mqprio) + return; + + if (q->broken_mqprio) + static_branch_dec(&taprio_have_broken_mqprio); + else + static_branch_dec(&taprio_have_working_mqprio); +} + static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, @@ -1261,7 +1520,8 @@ static int taprio_enable_offload(struct net_device *dev, return -ENOMEM; } offload->enable = 1; - taprio_sched_to_offload(dev, sched, offload); + mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); + taprio_sched_to_offload(dev, sched, offload, &caps); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; @@ -1452,7 +1712,6 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; unsigned long seen_tcs = 0; struct nlattr *n; @@ -1466,18 +1725,14 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) continue; - err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack); + err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, + extack); if (err) goto out; } - for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->max_sdu[tc] = max_sdu[tc]; - if (max_sdu[tc]) - q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len; - else - q->max_frm_len[tc] = U32_MAX; /* never oversized */ - } out: return err; @@ -1533,6 +1788,7 @@ static int taprio_new_flags(const struct nlattr *attr, u32 old, static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); @@ -1585,6 +1841,23 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto free_sched; } + if (mqprio) { + err = netdev_set_num_tc(dev, mqprio->num_tc); + if (err) + goto free_sched; + for (i = 0; i < mqprio->num_tc; i++) { + netdev_set_tc_queue(dev, i, + mqprio->count[i], + mqprio->offset[i]); + q->cur_txq[i] = mqprio->offset[i]; + } + + /* Always use supplied priority mappings */ + for (i = 0; i <= TC_BITMASK; i++) + netdev_set_prio_tc_map(dev, i, + mqprio->prio_tc_map[i]); + } + err = parse_taprio_schedule(q, tb, new_admin, extack); if (err < 0) goto free_sched; @@ -1600,21 +1873,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto free_sched; taprio_set_picos_per_byte(dev, q); - - if (mqprio) { - err = netdev_set_num_tc(dev, mqprio->num_tc); - if (err) - goto free_sched; - for (i = 0; i < mqprio->num_tc; i++) - netdev_set_tc_queue(dev, i, - mqprio->count[i], - mqprio->offset[i]); - - /* Always use supplied priority mappings */ - for (i = 0; i <= TC_BITMASK; i++) - netdev_set_prio_tc_map(dev, i, - mqprio->prio_tc_map[i]); - } + taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = 
taprio_enable_offload(dev, q, new_admin, extack); @@ -1663,7 +1922,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { - setup_first_close_time(q, new_admin, start); + setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); @@ -1683,6 +1942,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, new_admin = NULL; err = 0; + if (!stab) + NL_SET_ERR_MSG_MOD(extack, + "Size table not specified, frame length estimations may be inaccurate"); + unlock: spin_unlock_bh(qdisc_lock(sch)); @@ -1700,6 +1963,7 @@ static void taprio_reset(struct Qdisc *sch) int i; hrtimer_cancel(&q->advance_timer); + if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) if (q->qdiscs[i]) @@ -1720,6 +1984,7 @@ static void taprio_destroy(struct Qdisc *sch) * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); + qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); @@ -1741,6 +2006,8 @@ static void taprio_destroy(struct Qdisc *sch) if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); + + taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, @@ -1805,6 +2072,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, q->qdiscs[i] = qdisc; } + taprio_detect_broken_mqprio(q); + return taprio_change(sch, opt, extack); } @@ -1945,7 +2214,8 @@ error_nest: return -1; } -static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb) +static int taprio_dump_tc_entries(struct sk_buff *skb, + struct sched_gate_list *sched) { struct nlattr *n; int tc; @@ -1959,7 +2229,7 @@ static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, - q->max_sdu[tc])) + sched->max_sdu[tc])) goto nla_put_failure; nla_nest_end(skb, n); @@ -1979,18 +2249,11 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; - unsigned int i; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); - opt.num_tc = netdev_get_num_tc(dev); - memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); - - for (i = 0; i < netdev_get_num_tc(dev); i++) { - opt.count[i] = dev->tc_to_txq[i].count; - opt.offset[i] = dev->tc_to_txq[i].offset; - } + mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) @@ -2010,7 +2273,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; - if (taprio_dump_tc_entries(q, skb)) + if (oper && taprio_dump_tc_entries(skb, oper)) goto options_error; if (oper && dump_schedule(skb, oper)) diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 59e653b528b1..6b95d3ba8fe1 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -73,6 +73,12 @@ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, } } + /* If somehow no addresses were found that can be used with this + * scope, it's an error. 
+ */ + if (list_empty(&dest->address_list)) + error = -ENETUNREACH; + out: if (error) sctp_bind_addr_clean(dest); diff --git a/net/sctp/diag.c b/net/sctp/diag.c index a557009e9832..c3d6b92dd386 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -343,11 +343,9 @@ static int sctp_sock_filter(struct sctp_endpoint *ep, struct sctp_transport *tsp struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc = - list_entry(ep->asocs.next, struct sctp_association, asocs); /* find the ep only once through the transports by this condition */ - if (tsp->asoc != assoc) + if (!list_is_first(&tsp->asoc->asocs, &ep->asocs)) return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 097bd60ce964..62b436a2c8fe 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -807,8 +807,6 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - sk_refcnt_debug_inc(newsk); - if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 909a89a1cff4..c365df24ad33 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -601,8 +601,6 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; - sk_refcnt_debug_inc(newsk); - if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 5acbdf0d38f3..b91616f819de 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -59,6 +59,7 @@ #include <net/ipv6.h> #include <net/inet_common.h> #include <net/busy_poll.h> +#include <trace/events/sock.h> #include <linux/socket.h> /* for sa_family_t */ #include <linux/export.h> @@ -8321,9 +8322,9 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) int low, high, remaining, index; unsigned int rover; - inet_get_local_port_range(net, &low, &high); + inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32_max(remaining) + low; + rover = get_random_u32_below(remaining) + low; do { rover++; @@ -9244,6 +9245,8 @@ void sctp_data_ready(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 7f40ed117fc7..a7a9136198fd 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -84,17 +84,18 @@ static struct ctl_table sctp_table[] = { { /* sentinel */ } }; +/* The following index defines are used in sctp_sysctl_net_register(). + * If you add new items to the sctp_net_table, please ensure that + * the index values of these defines hold the same meaning indicated by + * their macro names when they appear in sctp_net_table. 
+ */ +#define SCTP_RTO_MIN_IDX 0 +#define SCTP_RTO_MAX_IDX 1 +#define SCTP_PF_RETRANS_IDX 2 +#define SCTP_PS_RETRANS_IDX 3 + static struct ctl_table sctp_net_table[] = { - { - .procname = "rto_initial", - .data = &init_net.sctp.rto_initial, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = &timer_max - }, - { + [SCTP_RTO_MIN_IDX] = { .procname = "rto_min", .data = &init_net.sctp.rto_min, .maxlen = sizeof(unsigned int), @@ -103,7 +104,7 @@ static struct ctl_table sctp_net_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &init_net.sctp.rto_max }, - { + [SCTP_RTO_MAX_IDX] = { .procname = "rto_max", .data = &init_net.sctp.rto_max, .maxlen = sizeof(unsigned int), @@ -112,6 +113,33 @@ static struct ctl_table sctp_net_table[] = { .extra1 = &init_net.sctp.rto_min, .extra2 = &timer_max }, + [SCTP_PF_RETRANS_IDX] = { + .procname = "pf_retrans", + .data = &init_net.sctp.pf_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &init_net.sctp.ps_retrans, + }, + [SCTP_PS_RETRANS_IDX] = { + .procname = "ps_retrans", + .data = &init_net.sctp.ps_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.sctp.pf_retrans, + .extra2 = &ps_retrans_max, + }, + { + .procname = "rto_initial", + .data = &init_net.sctp.rto_initial, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &timer_max + }, { .procname = "rto_alpha_exp_divisor", .data = &init_net.sctp.rto_alpha, @@ -208,24 +236,6 @@ static struct ctl_table sctp_net_table[] = { .extra2 = SYSCTL_INT_MAX, }, { - .procname = "pf_retrans", - .data = &init_net.sctp.pf_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &init_net.sctp.ps_retrans, - }, - { - .procname = "ps_retrans", - .data = &init_net.sctp.ps_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &init_net.sctp.pf_retrans, - .extra2 = &ps_retrans_max, - }, - { .procname = "sndbuf_policy", .data = &init_net.sctp.sndbuf_policy, .maxlen = sizeof(int), @@ -597,6 +607,11 @@ int sctp_sysctl_net_register(struct net *net) for (i = 0; table[i].data; i++) table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; + table[SCTP_RTO_MIN_IDX].extra2 = &net->sctp.rto_max; + table[SCTP_RTO_MAX_IDX].extra1 = &net->sctp.rto_min; + table[SCTP_PF_RETRANS_IDX].extra2 = &net->sctp.ps_retrans; + table[SCTP_PS_RETRANS_IDX].extra1 = &net->sctp.pf_retrans; + net->sctp.sysctl_header = register_net_sysctl(net, "net/sctp", table); if (net->sctp.sysctl_header == NULL) { kfree(table); diff --git a/net/sctp/transport.c b/net/sctp/transport.c index f8fd98784977..2f66a2006517 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -196,10 +196,8 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport) /* When a data chunk is sent, reset the heartbeat interval. 
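The new SCTP_*_IDX defines matter because each namespace registers a copied table whose .data pointers are rebased, while .extra1/.extra2 would otherwise keep pointing at init_net's limits; sctp_sysctl_net_register() uses the indices to repoint those min/max references at the same namespace's values. A simplified userspace sketch of the pattern, with a hypothetical two-entry table:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry { int *data; int *min; int *max; };
struct ns { int rto_min; int rto_max; };

#define RTO_MIN_IDX 0
#define RTO_MAX_IDX 1

static struct ns init_ns = { .rto_min = 1000, .rto_max = 60000 };

static struct entry template[] = {
	[RTO_MIN_IDX] = { &init_ns.rto_min, NULL, &init_ns.rto_max },
	[RTO_MAX_IDX] = { &init_ns.rto_max, &init_ns.rto_min, NULL },
};

static struct entry *register_ns(struct ns *ns)
{
	struct entry *t = malloc(sizeof(template));

	memcpy(t, template, sizeof(template));
	/* rebase .data and the cross-referencing limits onto this namespace */
	t[RTO_MIN_IDX].data = &ns->rto_min;
	t[RTO_MIN_IDX].max  = &ns->rto_max;
	t[RTO_MAX_IDX].data = &ns->rto_max;
	t[RTO_MAX_IDX].min  = &ns->rto_min;
	return t;
}

int main(void)
{
	struct ns other = { .rto_min = 2000, .rto_max = 30000 };
	struct entry *t = register_ns(&other);

	printf("%d %d\n", *t[RTO_MIN_IDX].data, *t[RTO_MIN_IDX].max);	/* 2000 30000 */
	free(t);
	return 0;
}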
*/ expires = jiffies + sctp_transport_timeout(transport); - if ((time_before(transport->hb_timer.expires, expires) || - !timer_pending(&transport->hb_timer)) && - !mod_timer(&transport->hb_timer, - expires + prandom_u32_max(transport->rto))) + if (!mod_timer(&transport->hb_timer, + expires + get_random_u32_below(transport->rto))) sctp_transport_hold(transport); } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e12d4fa5aece..d7a7420e81ec 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,6 +27,7 @@ #include <linux/if_vlan.h> #include <linux/rcupdate_wait.h> #include <linux/ctype.h> +#include <linux/splice.h> #include <net/sock.h> #include <net/tcp.h> @@ -359,8 +360,6 @@ static void smc_destruct(struct sock *sk) return; if (!sock_flag(sk, SOCK_DEAD)) return; - - sk_refcnt_debug_dec(sk); } static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, @@ -389,7 +388,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, spin_lock_init(&smc->accept_q_lock); spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); - sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); @@ -501,7 +499,7 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, return -EINVAL; /* protect against parallel smcr_link_reg_buf() */ - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; @@ -509,7 +507,7 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, if (rc) break; } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); return rc; } @@ -518,15 +516,30 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { struct smc_link_group *lgr = link->lgr; + bool do_slow = false; int i, rc = 0; rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; + + down_read(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + if (!rmb_desc->is_reg_mr[link->link_idx]) { + up_read(&lgr->llc_conf_mutex); + goto slow_path; + } + } + /* mr register already */ + goto fast_path; +slow_path: + do_slow = true; /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; @@ -534,7 +547,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, if (rc) goto out; } - +fast_path: /* exchange confirm_rkey msg with peer */ rc = smc_llc_do_confirm_rkey(link, rmb_desc); if (rc) { @@ -543,7 +556,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, } rmb_desc->is_conf_rkey = true; out: - mutex_unlock(&lgr->llc_conf_mutex); + do_slow ? 
up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } @@ -3382,12 +3395,14 @@ static int __init smc_init(void) if (rc) goto out_pernet_subsys; - smc_ism_init(); + rc = smc_ism_init(); + if (rc) + goto out_pernet_subsys_stat; smc_clc_init(); rc = smc_nl_init(); if (rc) - goto out_pernet_subsys_stat; + goto out_ism; rc = smc_pnet_init(); if (rc) @@ -3480,6 +3495,8 @@ out_pnet: smc_pnet_exit(); out_nl: smc_nl_exit(); +out_ism: + smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: @@ -3495,6 +3512,7 @@ static void __exit smc_exit(void) sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); + smc_ism_exit(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 1472f31480d8..b9b8b07aa702 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -673,7 +673,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, */ krflags = MSG_PEEK | MSG_WAITALL; clc_sk->sk_rcvtimeo = timeout; - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); if (signal_pending(current)) { @@ -720,7 +720,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } else { recvlen = datlen; } - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); krflags = MSG_WAITALL; len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, check_trl)) { @@ -737,7 +737,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, /* receive remaining proposal message */ recvlen = datlen > SMC_CLC_RECV_BUF_LEN ? 
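Converting llc_conf_mutex to a rw_semaphore lets smcr_lgr_reg_rmbs() keep the common already-registered case on a shared lock and only escalate to the exclusive lock when a memory region still has to be registered. A rough pthread analogue of that fast/slow split (not the SMC code itself):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t conf_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool registered;

static void reg_rmb(void)
{
	pthread_rwlock_rdlock(&conf_lock);
	if (registered) {
		puts("fast path: already registered");
	} else {
		/* slow path: drop the shared lock, re-take it exclusively */
		pthread_rwlock_unlock(&conf_lock);
		pthread_rwlock_wrlock(&conf_lock);
		if (!registered) {	/* re-check after reacquiring */
			registered = true;
			puts("slow path: registered buffer");
		}
	}
	/* unlike up_read()/up_write(), one unlock call covers both modes */
	pthread_rwlock_unlock(&conf_lock);
}

int main(void)
{
	reg_rmb();	/* slow path */
	reg_rmb();	/* fast path */
	return 0;
}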
SMC_CLC_RECV_BUF_LEN : datlen; - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); datlen -= len; } @@ -813,6 +813,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) struct smc_clc_v2_extension *v2_ext; struct smc_clc_msg_smcd *pclc_smcd; struct smc_clc_msg_trail *trl; + struct smcd_dev *smcd; int len, i, plen, rc; int reason_code = 0; struct kvec vec[8]; @@ -868,7 +869,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (smcd_indicated(ini->smc_type_v1)) { /* add SMC-D specifics */ if (ini->ism_dev[0]) { - pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid); + smcd = ini->ism_dev[0]; + pclc_smcd->ism.gid = + htonll(smcd->ops->get_local_gid(smcd)); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); } @@ -914,8 +917,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) plen += sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { + smcd = ini->ism_dev[i]; gidchids[i - 1].gid = - htonll(ini->ism_dev[i]->local_gid); + htonll(smcd->ops->get_local_gid(smcd)); gidchids[i - 1].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); } @@ -1000,7 +1004,8 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); clc->hdr.typev1 = SMC_TYPE_D; - clc->d0.gid = conn->lgr->smcd->local_gid; + clc->d0.gid = + conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd); clc->d0.token = conn->rmb_desc->token; clc->d0.dmbe_size = conn->rmbe_size_short; clc->d0.dmbe_idx = 0; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index c305d8dd23f8..b330a1fa453e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -500,6 +500,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smcd_dev *smcd = lgr->smcd; struct nlattr *attrs; void *nlh; @@ -515,8 +516,9 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid, - SMC_NLA_LGR_D_PAD)) + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, + smcd->ops->get_local_gid(smcd), + SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, SMC_NLA_LGR_D_PAD)) @@ -820,6 +822,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct list_head *lgr_list; + struct smcd_dev *smcd; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; @@ -851,8 +854,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ - mutex_init(&lgr->sndbufs_lock); - mutex_init(&lgr->rmbs_lock); + init_rwsem(&lgr->sndbufs_lock); + init_rwsem(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); @@ -866,7 +869,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ - get_device(&ini->ism_dev[ini->ism_selected]->dev); + smcd = ini->ism_dev[ini->ism_selected]; + get_device(smcd->ops->get_dev(smcd)); lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected]; lgr->smcd = 
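The CLC and core code stop reading smcd->local_gid directly and call smcd->ops->get_local_gid() instead, so alternative SMC-D device implementations can supply their own GID. A minimal sketch of that ops indirection, with a deliberately simplified device structure (the field layout here is illustrative):

#include <stdint.h>
#include <stdio.h>

struct smcd_dev;

struct smcd_ops {
	uint64_t (*get_local_gid)(struct smcd_dev *dev);
};

struct smcd_dev {
	const struct smcd_ops *ops;
	uint64_t local_gid;	/* implementation-private in the sketch */
};

static uint64_t ism_get_local_gid(struct smcd_dev *dev)
{
	return dev->local_gid;
}

static const struct smcd_ops ism_ops = { .get_local_gid = ism_get_local_gid };

int main(void)
{
	struct smcd_dev dev = { .ops = &ism_ops, .local_gid = 0x1234 };

	/* callers only ever go through the ops table */
	printf("0x%llx\n", (unsigned long long)dev.ops->get_local_gid(&dev));
	return 0;
}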
ini->ism_dev[ini->ism_selected]; lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; @@ -1094,7 +1098,7 @@ err_out: static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { - struct mutex *lock; /* lock buffer list */ + struct rw_semaphore *lock; /* lock buffer list */ int rc; if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { @@ -1102,10 +1106,10 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ - mutex_lock(&lgr->llc_conf_mutex); + down_read(&lgr->llc_conf_mutex); smc_llc_do_delete_rkey(lgr, buf_desc); buf_desc->is_conf_rkey = false; - mutex_unlock(&lgr->llc_conf_mutex); + up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } @@ -1114,9 +1118,9 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, /* buf registration failed, reuse not possible */ lock = is_rmb ? &lgr->rmbs_lock : &lgr->sndbufs_lock; - mutex_lock(lock); + down_write(lock); list_del(&buf_desc->list); - mutex_unlock(lock); + up_write(lock); smc_buf_free(lgr, is_rmb, buf_desc); } else { @@ -1220,15 +1224,16 @@ static void smcr_buf_unmap_lgr(struct smc_link *lnk) int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) smcr_buf_unmap_link(buf_desc, true, lnk); - mutex_unlock(&lgr->rmbs_lock); - mutex_lock(&lgr->sndbufs_lock); + up_write(&lgr->rmbs_lock); + + down_write(&lgr->sndbufs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) smcr_buf_unmap_link(buf_desc, false, lnk); - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); } } @@ -1373,19 +1378,19 @@ static void smc_lgr_free(struct smc_link_group *lgr) int i; if (!lgr->is_smcd) { - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i], false); } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); smc_llc_lgr_clear(lgr); } destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); + put_device(lgr->smcd->ops->get_dev(lgr->smcd)); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } @@ -1692,12 +1697,12 @@ static void smcr_link_down(struct smc_link *lnk) } else { if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), SMC_LLC_WAIT_TIME); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); } if (!list_empty(&lgr->list)) { smc_llc_send_delete_link(to_lnk, del_link_id, @@ -1757,9 +1762,9 @@ static void smc_link_down_work(struct work_struct *work) if (list_empty(&lgr->list)) return; wake_up_all(&lgr->llc_msg_waiter); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); smcr_link_down(link); - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, @@ -1986,19 +1991,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - struct mutex *lock, + 
struct rw_semaphore *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - mutex_lock(lock); + down_read(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - mutex_unlock(lock); + up_read(lock); return buf_slot; } } - mutex_unlock(lock); + up_read(lock); return NULL; } @@ -2107,13 +2112,13 @@ int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) return 0; } -static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock, struct list_head *lst, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf; int rc = 0; - mutex_lock(lock); + down_write(lock); list_for_each_entry_safe(buf_desc, bf, lst, list) { if (!buf_desc->used) continue; @@ -2122,7 +2127,7 @@ static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, goto out; } out: - mutex_unlock(lock); + up_write(lock); return rc; } @@ -2155,37 +2160,37 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) int i, rc = 0; /* reg all RMBs for a new link */ - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } } } - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) return rc; /* reg all vzalloced sndbufs for a new link */ - mutex_lock(&lgr->sndbufs_lock); + down_write(&lgr->sndbufs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { if (!buf_desc->used || !buf_desc->is_vm) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); return rc; } } } - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); return rc; } @@ -2243,7 +2248,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ - mutex_lock(&lgr->llc_conf_mutex); + down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; @@ -2256,7 +2261,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, cnt++; } out: - mutex_unlock(&lgr->llc_conf_mutex); + up_read(&lgr->llc_conf_mutex); if (!rc && !cnt) rc = -EINVAL; return rc; @@ -2305,8 +2310,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; - struct mutex *lock; /* lock buffer list */ int sk_buf_size; if (is_rmb) @@ -2354,9 +2359,9 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); buf_desc->used = 1; - mutex_lock(lock); + down_write(lock); list_add(&buf_desc->list, buf_list); - mutex_unlock(lock); + up_write(lock); break; /* found */ } @@ -2430,9 +2435,9 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); if (rc) { - mutex_lock(&smc->conn.lgr->sndbufs_lock); + down_write(&smc->conn.lgr->sndbufs_lock); list_del(&smc->conn.sndbuf_desc->list); - mutex_unlock(&smc->conn.lgr->sndbufs_lock); + 
up_write(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); smc->conn.sndbuf_desc = NULL; } @@ -2595,6 +2600,7 @@ static int smc_core_reboot_event(struct notifier_block *this, { smc_lgrs_shutdown(); smc_ib_unregister_client(); + smc_ism_exit(); return 0; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 285f9bd8e232..08b457c2d294 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -252,9 +252,9 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - struct mutex sndbufs_lock; /* protects tx buffers */ + struct rw_semaphore sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - struct mutex rmbs_lock; /* protects rx buffers */ + struct rw_semaphore rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ @@ -298,7 +298,7 @@ struct smc_link_group { /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ - struct mutex llc_conf_mutex; + struct rw_semaphore llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; struct work_struct llc_del_link_work; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 80ea7d954ece..7ff2152971a5 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -167,12 +167,13 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; + struct smcd_dev *smcd = conn->lgr->smcd; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid; - dinfo.my_gid = conn->lgr->smcd->local_gid; + dinfo.my_gid = smcd->ops->get_local_gid(smcd); dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 911fe08bc54b..3b0b7710c6b0 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,6 +17,7 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" +#include "linux/ism.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -26,6 +27,22 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +#if IS_ENABLED(CONFIG_ISM) +static void smcd_register_dev(struct ism_dev *ism); +static void smcd_unregister_dev(struct ism_dev *ism); +static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); +static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, + u16 dmbemask); + +static struct ism_client smc_ism_client = { + .name = "SMC-D", + .add = smcd_register_dev, + .remove = smcd_unregister_dev, + .handle_event = smcd_handle_event, + .handle_irq = smcd_handle_irq, +}; +#endif + /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { @@ -183,6 +200,7 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { +#if IS_ENABLED(CONFIG_ISM) struct smcd_dmb dmb; int rc; @@ -191,7 +209,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, 
&dmb); + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; @@ -200,6 +218,9 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb_desc->len = dmb.dmb_len; } return rc; +#else + return 0; +#endif } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -210,9 +231,11 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; struct nlattr *attrs; + struct ism_dev *ism; int use_cnt = 0; void *nlh; + ism = smcd->priv; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -227,7 +250,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev); + smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -293,10 +316,11 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct smcd_event event; + struct ism_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -336,24 +360,6 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) } } -int smc_ism_signal_shutdown(struct smc_link_group *lgr) -{ - int rc; - union smcd_sw_event_info ev_info; - - if (lgr->peer_shutdown) - return 0; - - memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); - ev_info.vlan_id = lgr->vlan_id; - ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_SHUTDOWN, - ev_info.info); - return rc; -} - /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { @@ -373,44 +379,25 @@ static void smc_ism_event_work(struct work_struct *work) kfree(wrk); } -static void smcd_release(struct device *dev) -{ - struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev); - - kfree(smcd->conn); - kfree(smcd); -} - -struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs) { struct smcd_dev *smcd; - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) { - kfree(smcd); + smcd->conn = devm_kcalloc(parent, max_dmbs, + sizeof(struct smc_connection *), GFP_KERNEL); + if (!smcd->conn) return NULL; - } smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); - if (!smcd->event_wq) { - kfree(smcd->conn); - kfree(smcd); + if (!smcd->event_wq) return NULL; - } - smcd->dev.parent = parent; - smcd->dev.release = smcd_release; - device_initialize(&smcd->dev); - dev_set_name(&smcd->dev, name); smcd->ops = ops; - if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); @@ -419,11 +406,23 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, 
const char *name, init_waitqueue_head(&smcd->lgrs_deleted); return smcd; } -EXPORT_SYMBOL_GPL(smcd_alloc_dev); -int smcd_register_dev(struct smcd_dev *smcd) +static void smcd_register_dev(struct ism_dev *ism) { - int rc; + const struct smcd_ops *ops = ism_get_smcd_ops(); + struct smcd_dev *smcd; + + if (!ops) + return; + + smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, + ISM_NR_DMBS); + if (!smcd) + return; + smcd->priv = ism; + ism_set_priv(ism, &smc_ism_client, smcd); + if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); mutex_lock(&smcd_dev_list.mutex); if (list_empty(&smcd_dev_list.list)) { @@ -444,43 +443,28 @@ int smcd_register_dev(struct smcd_dev *smcd) mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&smcd->dev), smcd->pnetid, + dev_name(&ism->dev), smcd->pnetid, smcd->pnetid_by_user ? " (user defined)" : ""); - rc = device_add(&smcd->dev); - if (rc) { - mutex_lock(&smcd_dev_list.mutex); - list_del(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - } - - return rc; + return; } -EXPORT_SYMBOL_GPL(smcd_register_dev); -void smcd_unregister_dev(struct smcd_dev *smcd) +static void smcd_unregister_dev(struct ism_dev *ism) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&smcd->dev)); + dev_name(&ism->dev)); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); destroy_workqueue(smcd->event_wq); - - device_del(&smcd->dev); -} -EXPORT_SYMBOL_GPL(smcd_unregister_dev); - -void smcd_free_dev(struct smcd_dev *smcd) -{ - put_device(&smcd->dev); } -EXPORT_SYMBOL_GPL(smcd_free_dev); /* SMCD Device event handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer, + * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), * - event->code (event code), * - event->tok (either DMB token when event type 0, or GID when event type 1) @@ -490,8 +474,9 @@ EXPORT_SYMBOL_GPL(smcd_free_dev); * Context: * - Function called in IRQ context from ISM device driver event handler. */ -void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) +static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -505,17 +490,18 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } -EXPORT_SYMBOL_GPL(smcd_handle_event); /* SMCD Device interrupt handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer, DMB number, and the DMBE bitmask. + * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. 
*/ -void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) +static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, + u16 dmbemask) { + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -525,10 +511,44 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -EXPORT_SYMBOL_GPL(smcd_handle_irq); +#endif + +int smc_ism_signal_shutdown(struct smc_link_group *lgr) +{ + int rc = 0; +#if IS_ENABLED(CONFIG_ISM) + union smcd_sw_event_info ev_info; + + if (lgr->peer_shutdown) + return 0; + + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); + ev_info.vlan_id = lgr->vlan_id; + ev_info.code = ISM_EVENT_REQUEST; + rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_SHUTDOWN, + ev_info.info); +#endif + return rc; +} -void __init smc_ism_init(void) +int smc_ism_init(void) { + int rc = 0; + +#if IS_ENABLED(CONFIG_ISM) smc_ism_v2_capable = false; memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); + + rc = ism_register_client(&smc_ism_client); +#endif + return rc; +} + +void smc_ism_exit(void) +{ +#if IS_ENABLED(CONFIG_ISM) + ism_unregister_client(&smc_ism_client); +#endif } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index d6b2db604fe8..832b2f42d79f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -42,7 +42,8 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); -void smc_ism_init(void); +int smc_ism_init(void); +void smc_ism_exit(void); int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 524649d0ab65..a0840b8c935b 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -608,7 +608,7 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, prim_lnk_idx = link->link_idx; lnk_idx = link_new->link_idx; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); ext->num_rkeys = lgr->conns_num; if (!ext->num_rkeys) goto out; @@ -628,7 +628,7 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, } len += i * sizeof(ext->rt[0]); out: - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return len; } @@ -889,7 +889,7 @@ static int smc_llc_cli_rkey_exchange(struct smc_link *link, int rc = 0; int i; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); num_rkeys_send = lgr->conns_num; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); do { @@ -916,7 +916,7 @@ static int smc_llc_cli_rkey_exchange(struct smc_link *link, break; } while (num_rkeys_send || num_rkeys_recv); - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } @@ -999,14 +999,14 @@ static void smc_llc_save_add_link_rkeys(struct smc_link *link, ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + SMC_WR_TX_SIZE); max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); for (i = 0; i < max; i++) { smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, ext->rt[i].rmb_key, ext->rt[i].rmb_vaddr_new, ext->rt[i].rmb_key_new); } - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); } static void smc_llc_save_add_link_info(struct smc_link *link, @@ -1202,12 
+1202,12 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); if (smc_llc_is_local_add_link(&qentry->msg)) smc_llc_cli_add_link_invite(qentry->link, qentry); else smc_llc_cli_add_link(qentry->link, qentry); - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); } static int smc_llc_active_link_count(struct smc_link_group *lgr) @@ -1313,7 +1313,7 @@ static int smc_llc_srv_rkey_exchange(struct smc_link *link, int rc = 0; int i; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); num_rkeys_send = lgr->conns_num; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); do { @@ -1338,7 +1338,7 @@ static int smc_llc_srv_rkey_exchange(struct smc_link *link, smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); } while (num_rkeys_send || num_rkeys_recv); out: - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } @@ -1509,13 +1509,13 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); rc = smc_llc_srv_add_link(link, qentry); if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { /* delete any asymmetric link */ smc_llc_delete_asym_link(lgr); } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); kfree(qentry); } @@ -1582,7 +1582,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_lgr_terminate_sched(lgr); goto out; } - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); /* delete single link */ for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) @@ -1616,7 +1616,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_lgr_terminate_sched(lgr); } out_unlock: - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); out: kfree(qentry); } @@ -1652,7 +1652,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) int active_links; int i; - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); lnk = qentry->link; del_llc = &qentry->msg.delete_link; @@ -1708,7 +1708,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) smc_llc_add_link_local(lnk); } out: - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); kfree(qentry); } @@ -2126,7 +2126,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) spin_lock_init(&lgr->llc_flow_lock); init_waitqueue_head(&lgr->llc_flow_waiter); init_waitqueue_head(&lgr->llc_msg_waiter); - mutex_init(&lgr->llc_conf_mutex); + init_rwsem(&lgr->llc_conf_mutex); lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); } diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 25fb2fd186e2..11775401df68 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -103,7 +103,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; + struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; @@ -162,16 +162,17 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); - 
list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user && + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->pnetid_by_user && (!pnet_name || - smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " - "%.16s\n", dev_name(&smcd_dev->dev), - smcd_dev->pnetid); - memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = false; + "%.16s\n", + dev_name(smcd->ops->get_dev(smcd)), + smcd->pnetid); + memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); + smcd->pnetid_by_user = false; rc = 0; } } @@ -331,8 +332,8 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(&smcd_dev->dev), smcd_name, - IB_DEVICE_NAME_MAX - 1)) + if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), + smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } smcd_dev = NULL; @@ -411,7 +412,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; - struct smcd_dev *smcd_dev; + struct smcd_dev *smcd; + struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -425,14 +427,16 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, ib_port, ib_dev->pnetid[ib_port - 1]); } - smcd_dev = smc_pnet_find_smcd(ib_name); - if (smcd_dev) { - smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); - if (smcddev_applied) + smcd = smc_pnet_find_smcd(ib_name); + if (smcd) { + smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); + if (smcddev_applied) { + dev = smcd->ops->get_dev(smcd); pr_warn_ratelimited("smc: smcd device %s " "applied user defined pnetid " - "%.16s\n", dev_name(&smcd_dev->dev), - smcd_dev->pnetid); + "%.16s\n", dev_name(dev), + smcd->pnetid); + } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. 
@@ -1181,7 +1185,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(&smcddev->dev); + const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 17c5aee7ee4f..4380d32f5a5f 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -13,8 +13,10 @@ #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/sched/signal.h> +#include <linux/splice.h> #include <net/sock.h> +#include <trace/events/sock.h> #include "smc.h" #include "smc_core.h" @@ -31,6 +33,8 @@ static void smc_rx_wake_up(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + /* derived from sock_def_readable() */ /* called already in smc_listen_work() */ rcu_read_lock(); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 64dedffe9d26..f4b6a71ac488 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -308,7 +308,7 @@ int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, iov.iov_base = kaddr + offset; iov.iov_len = size; - iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, size); rc = smc_tx_sendmsg(smc, &msg, size); kunmap(page); return rc; diff --git a/net/socket.c b/net/socket.c index 55c5d536e5f6..4080b4ba7daf 100644 --- a/net/socket.c +++ b/net/socket.c @@ -106,6 +106,7 @@ #include <net/busy_poll.h> #include <linux/errqueue.h> #include <linux/ptp_clock_kernel.h> +#include <trace/events/sock.h> #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; @@ -709,12 +710,22 @@ INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, size_t)); + +static noinline void call_trace_sock_send_length(struct sock *sk, int ret, + int flags) +{ + trace_sock_send_length(sk, ret, 0); +} + static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); + + if (trace_sock_send_length_enabled()) + call_trace_sock_send_length(sock->sk, ret, 0); return ret; } @@ -750,7 +761,7 @@ EXPORT_SYMBOL(sock_sendmsg); int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { - iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size); + iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); @@ -776,7 +787,7 @@ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, if (!sock->ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); - iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size); + iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } @@ -971,9 +982,12 @@ static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, static void sock_recv_mark(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - if (sock_flag(sk, SOCK_RCVMARK) && skb) - put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), - &skb->mark); + if (sock_flag(sk, SOCK_RCVMARK) && skb) { + /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ + __u32 mark = skb->mark; + + put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark); + } } 
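/*
 * Illustrative sketch (not taken from the patch itself): the sock_recv_mark()
 * hunk above copies skb->mark into a local variable before calling put_cmsg().
 * With CONFIG_HARDENED_USERCOPY=y, copies toward user space are checked
 * against the bounds of the source object, and handing put_cmsg() a pointer
 * straight into the slab-allocated struct sk_buff can trip those checks,
 * hence the stack "bounce buffer" in the patch. A minimal rendering of the
 * same pattern follows; the helper name recv_mark_example() is hypothetical,
 * and the only assumption is that put_cmsg() copies sizeof(__u32) bytes from
 * the supplied pointer into the user-visible control-message area.
 */
static void recv_mark_example(struct msghdr *msg, struct sock *sk,
			      struct sk_buff *skb)
{
	if (sock_flag(sk, SOCK_RCVMARK) && skb) {
		__u32 mark = skb->mark;	/* stack copy, never &skb->mark */

		put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(mark), &mark);
	}
}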
void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, @@ -989,12 +1003,21 @@ INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, size_t, int)); INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, size_t, int)); + +static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags) +{ + trace_sock_recv_length(sk, ret, flags); +} + static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, - inet_recvmsg, sock, msg, msg_data_left(msg), - flags); + int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + inet_recvmsg, sock, msg, + msg_data_left(msg), flags); + if (trace_sock_recv_length_enabled()) + call_trace_sock_recv_length(sock->sk, ret, flags); + return ret; } /** @@ -1034,7 +1057,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { msg->msg_control_is_user = false; - iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); + iov_iter_kvec(&msg->msg_iter, ITER_DEST, vec, num, size); return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); @@ -1044,6 +1067,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, { struct socket *sock; int flags; + int ret; sock = file->private_data; @@ -1051,7 +1075,11 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ flags |= more; - return kernel_sendpage(sock, page, offset, size, flags); + ret = kernel_sendpage(sock, page, offset, size, flags); + + if (trace_sock_send_length_enabled()) + call_trace_sock_send_length(sock->sk, ret, 0); + return ret; } static ssize_t sock_splice_read(struct file *file, loff_t *ppos, @@ -2092,7 +2120,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct iovec iov; int fput_needed; - err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter); + err = import_single_range(ITER_SOURCE, buff, len, &iov, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); @@ -2157,7 +2185,7 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, int err, err2; int fput_needed; - err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter); + err = import_single_range(ITER_DEST, ubuf, size, &iov, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); @@ -2411,7 +2439,7 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, if (err) return err; - err = import_iovec(save_addr ? READ : WRITE, + err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? 
err : 0; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 7bb247c51e2f..2d7b1e03110a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -302,7 +302,7 @@ __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; - if (auth && pos->auth->service != auth->service) + if (pos->auth->service != auth->service) continue; refcount_inc(&pos->count); return pos; @@ -686,6 +686,21 @@ out: return err; } +static struct gss_upcall_msg * +gss_find_downcall(struct rpc_pipe *pipe, kuid_t uid) +{ + struct gss_upcall_msg *pos; + list_for_each_entry(pos, &pipe->in_downcall, list) { + if (!uid_eq(pos->uid, uid)) + continue; + if (!rpc_msg_is_inflight(&pos->msg)) + continue; + refcount_inc(&pos->count); + return pos; + } + return NULL; +} + #define MSG_BUF_MAXSIZE 1024 static ssize_t @@ -732,7 +747,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); - gss_msg = __gss_find_upcall(pipe, uid, NULL); + gss_msg = gss_find_downcall(pipe, uid); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index bcd74dddbe2d..acb822b23af1 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -49,11 +49,36 @@ #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/cache.h> +#include <linux/sunrpc/gss_krb5.h> #include <trace/events/rpcgss.h> #include "gss_rpc_upcall.h" +/* + * Unfortunately there isn't a maximum checksum size exported via the + * GSS API. Manufacture one based on GSS mechanisms supported by this + * implementation. + */ +#define GSS_MAX_CKSUMSIZE (GSS_KRB5_TOK_HDR_LEN + GSS_KRB5_MAX_CKSUM_LEN) + +/* + * This value may be increased in the future to accommodate other + * usage of the scratch buffer. + */ +#define GSS_SCRATCH_SIZE GSS_MAX_CKSUMSIZE + +struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; + /* save a pointer to the beginning of the encoded verifier, + * for use in encryption/checksumming in svcauth_gss_release: */ + __be32 *verf_start; + struct rsc *rsci; + + /* for temporary results */ + u8 gsd_scratch[GSS_SCRATCH_SIZE]; +}; /* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests * into replies. @@ -887,20 +912,18 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) static int unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) { + struct gss_svc_data *gsd = rqstp->rq_auth_data; u32 integ_len, rseqno, maj_stat; - int stat = -EINVAL; struct xdr_netobj mic; struct xdr_buf integ_buf; - mic.data = NULL; - /* NFS READ normally uses splice to send data in-place. However * the data in cache can change after the reply's MIC is computed * but before the RPC reply is sent. To prevent the client from * rejecting the server-computed MIC in this somewhat rare case, * do not use splice with the GSS integrity service. */ - __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) @@ -917,11 +940,9 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g /* copy out mic... 
*/ if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) goto unwrap_failed; - if (mic.len > RPC_MAX_AUTH_SIZE) - goto unwrap_failed; - mic.data = kmalloc(mic.len, GFP_KERNEL); - if (!mic.data) + if (mic.len > sizeof(gsd->gsd_scratch)) goto unwrap_failed; + mic.data = gsd->gsd_scratch; if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); @@ -932,20 +953,17 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g goto bad_seqno; /* trim off the mic and padding at the end before returning */ xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4); - stat = 0; -out: - kfree(mic.data); - return stat; + return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); - goto out; + return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); - goto out; + return -EINVAL; bad_mic: trace_rpcgss_svc_mic(rqstp, maj_stat); - goto out; + return -EINVAL; } static inline int @@ -972,7 +990,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs int pad, remaining_len, offset; u32 rseqno; - __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); priv_len = svc_getnl(&buf->head[0]); if (rqstp->rq_deferred) { @@ -1023,15 +1041,6 @@ bad_unwrap: return -EINVAL; } -struct gss_svc_data { - /* decoded gss client cred: */ - struct rpc_gss_wire_cred clcred; - /* save a pointer to the beginning of the encoded verifier, - * for use in encryption/checksumming in svcauth_gss_release: */ - __be32 *verf_start; - struct rsc *rsci; -}; - static int svcauth_gss_set_client(struct svc_rqst *rqstp) { @@ -1162,18 +1171,23 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, return res; inlen = svc_getnl(argv); - if (inlen > (argv->iov_len + rqstp->rq_arg.page_len)) + if (inlen > (argv->iov_len + rqstp->rq_arg.page_len)) { + kfree(in_handle->data); return SVC_DENIED; + } pages = DIV_ROUND_UP(inlen, PAGE_SIZE); in_token->pages = kcalloc(pages, sizeof(struct page *), GFP_KERNEL); - if (!in_token->pages) + if (!in_token->pages) { + kfree(in_handle->data); return SVC_DENIED; + } in_token->page_base = 0; in_token->page_len = inlen; for (i = 0; i < pages; i++) { in_token->pages[i] = alloc_page(GFP_KERNEL); if (!in_token->pages[i]) { + kfree(in_handle->data); gss_free_in_token_pages(in_token); return SVC_DENIED; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index f075a9fb5ccc..95ff74706104 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -677,7 +677,7 @@ static void cache_limit_defers(void) /* Consider removing either the first or the last */ if (cache_defer_cnt > DFR_MAX) { - if (prandom_u32_max(2)) + if (get_random_u32_below(2)) discard = list_entry(cache_defer_list.next, struct cache_deferred_req, recent); else diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 993acf38af87..0b0b9f1eed46 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1442,7 +1442,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, break; default: err = -EAFNOSUPPORT; - goto out; + goto out_release; } if (err < 0) { dprintk("RPC: can't bind UDP socket (%d)\n", err); diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 71ba4cf513bc..1b2b84feeec6 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -214,14 +214,14 @@ static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg, static int xprt_send_kvec(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t seek) { - 
iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len); + iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, 1, vec->iov_len); return xprt_sendmsg(sock, msg, seek); } static int xprt_send_pagedata(struct socket *sock, struct msghdr *msg, struct xdr_buf *xdr, size_t base) { - iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec, xdr_buf_pagecount(xdr), + iov_iter_bvec(&msg->msg_iter, ITER_SOURCE, xdr->bvec, xdr_buf_pagecount(xdr), xdr->page_len + xdr->page_base); return xprt_sendmsg(sock, msg, base + xdr->page_base); } @@ -244,7 +244,7 @@ static int xprt_send_rm_and_kvec(struct socket *sock, struct msghdr *msg, }; size_t len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len); + iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, iov, 2, len); return xprt_sendmsg(sock, msg, base); } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 149171774bc6..f06622814a95 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -567,7 +567,7 @@ svc_destroy(struct kref *ref) struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt); dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name); - del_timer_sync(&serv->sv_temptimer); + timer_shutdown_sync(&serv->sv_temptimer); /* * The last user is gone and thus all sockets have to be destroyed to @@ -638,7 +638,6 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) return rqstp; __set_bit(RQ_BUSY, &rqstp->rq_flags); - spin_lock_init(&rqstp->rq_lock); rqstp->rq_server = serv; rqstp->rq_pool = pool; @@ -1244,10 +1243,10 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_short_len; /* Will be turned off by GSS integrity and privacy services */ - __set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ - __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); - __clear_bit(RQ_DROPME, &rqstp->rq_flags); + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + clear_bit(RQ_DROPME, &rqstp->rq_flags); svc_putu32(resv, rqstp->rq_xid); @@ -1281,8 +1280,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) /* Also give the program a chance to reject this call: */ if (auth_res == SVC_OK && progp) auth_res = progp->pg_authenticate(rqstp); - if (auth_res != SVC_OK) - trace_svc_authenticate(rqstp, auth_res); + trace_svc_authenticate(rqstp, auth_res); switch (auth_res) { case SVC_OK: break; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 2106003645a7..c2ce12538008 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1238,7 +1238,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; - __set_bit(RQ_DROPME, &rqstp->rq_flags); + set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; return &dr->handle; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2fc98fea59b4..99eafe87b1d5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -55,6 +55,7 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xprt.h> +#include <trace/events/sock.h> #include <trace/events/sunrpc.h> #include "socklib.h" @@ -260,7 +261,7 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, rqstp->rq_respages = &rqstp->rq_pages[i]; rqstp->rq_next_page = rqstp->rq_respages + 1; - iov_iter_bvec(&msg.msg_iter, READ, bvec, i, buflen); + iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen); if (seek) { 
iov_iter_advance(&msg.msg_iter, seek); buflen -= seek; @@ -298,9 +299,9 @@ static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs) static void svc_sock_secure_port(struct svc_rqst *rqstp) { if (svc_port_is_privileged(svc_addr(rqstp))) - __set_bit(RQ_SECURE, &rqstp->rq_flags); + set_bit(RQ_SECURE, &rqstp->rq_flags); else - __clear_bit(RQ_SECURE, &rqstp->rq_flags); + clear_bit(RQ_SECURE, &rqstp->rq_flags); } /* @@ -310,6 +311,8 @@ static void svc_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + trace_sk_data_ready(sk); + if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); @@ -687,6 +690,8 @@ static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + trace_sk_data_ready(sk); + if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); @@ -874,7 +879,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, want); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want); len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); if (len < 0) return len; @@ -1008,9 +1013,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags)) - __set_bit(RQ_LOCAL, &rqstp->rq_flags); + set_bit(RQ_LOCAL, &rqstp->rq_flags); else - __clear_bit(RQ_LOCAL, &rqstp->rq_flags); + clear_bit(RQ_LOCAL, &rqstp->rq_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index c1f559892ae8..1e05a2d723f4 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -31,7 +31,7 @@ static void rpc_sysfs_object_release(struct kobject *kobj) } static const struct kobj_ns_type_operations * -rpc_sysfs_object_child_ns_type(struct kobject *kobj) +rpc_sysfs_object_child_ns_type(const struct kobject *kobj) { return &net_ns_type_operations; } @@ -381,17 +381,17 @@ static void rpc_sysfs_xprt_release(struct kobject *kobj) kfree(xprt); } -static const void *rpc_sysfs_client_namespace(struct kobject *kobj) +static const void *rpc_sysfs_client_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_client, kobject)->net; } -static const void *rpc_sysfs_xprt_switch_namespace(struct kobject *kobj) +static const void *rpc_sysfs_xprt_switch_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; } -static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj) +static const void *rpc_sysfs_xprt_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_xprt, kobject)->xprt->xprt_net; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 336a7c7833e4..f7767bf22406 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1224,30 +1224,34 @@ EXPORT_SYMBOL(xdr_restrict_buflen); /** * xdr_write_pages - Insert a list of pages into an XDR buffer for sending * @xdr: pointer to xdr_stream - * @pages: list of pages - * @base: offset of first byte - * @len: length of data in bytes + * @pages: array of pages to insert + * @base: starting offset of first data byte in @pages + * @len: number of data bytes in @pages to insert * + * After the @pages are added, the tail iovec is instantiated pointing to + * end of the head buffer, and the stream is set up to encode 
subsequent + * items into the tail. */ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len) { struct xdr_buf *buf = xdr->buf; - struct kvec *iov = buf->tail; + struct kvec *tail = buf->tail; + buf->pages = pages; buf->page_base = base; buf->page_len = len; - iov->iov_base = (char *)xdr->p; - iov->iov_len = 0; - xdr->iov = iov; + tail->iov_base = xdr->p; + tail->iov_len = 0; + xdr->iov = tail; if (len & 3) { unsigned int pad = 4 - (len & 3); BUG_ON(xdr->p >= xdr->end); - iov->iov_base = (char *)xdr->p + (len & 3); - iov->iov_len += pad; + tail->iov_base = (char *)xdr->p + (len & 3); + tail->iov_len += pad; len += pad; *xdr->p++ = 0; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 656cec208371..ab453ede54f0 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1164,7 +1164,7 @@ xprt_request_enqueue_receive(struct rpc_task *task) spin_unlock(&xprt->queue_lock); /* Turn off autodisconnect */ - del_singleshot_timer_sync(&xprt->timer); + del_timer_sync(&xprt->timer); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 199fa012f18a..94b20fb47135 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -602,7 +602,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) static void svc_rdma_secure_port(struct svc_rqst *rqstp) { - __set_bit(RQ_SECURE, &rqstp->rq_flags); + set_bit(RQ_SECURE, &rqstp->rq_flags); } static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 44b87e4274b4..b098fde373ab 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -831,7 +831,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, return req; out3: - kfree(req->rl_sendbuf); + rpcrdma_regbuf_free(req->rl_sendbuf); out2: kfree(req); out1: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 915b9902f673..adcbedc244d6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -52,6 +52,7 @@ #include <linux/uio.h> #include <linux/sched/mm.h> +#include <trace/events/sock.h> #include <trace/events/sunrpc.h> #include "socklib.h" @@ -364,7 +365,7 @@ static ssize_t xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, struct kvec *kvec, size_t count, size_t seek) { - iov_iter_kvec(&msg->msg_iter, READ, kvec, 1, count); + iov_iter_kvec(&msg->msg_iter, ITER_DEST, kvec, 1, count); return xs_sock_recvmsg(sock, msg, flags, seek); } @@ -373,7 +374,7 @@ xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags, struct bio_vec *bvec, unsigned long nr, size_t count, size_t seek) { - iov_iter_bvec(&msg->msg_iter, READ, bvec, nr, count); + iov_iter_bvec(&msg->msg_iter, ITER_DEST, bvec, nr, count); return xs_sock_recvmsg(sock, msg, flags, seek); } @@ -381,7 +382,7 @@ static ssize_t xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { - iov_iter_discard(&msg->msg_iter, READ, count); + iov_iter_discard(&msg->msg_iter, ITER_DEST, count); return sock_recvmsg(sock, msg, flags); } @@ -1378,6 +1379,8 @@ static void xs_data_ready(struct sock *sk) { struct rpc_xprt *xprt; + trace_sk_data_ready(sk); + xprt = xprt_from_sock(sk); if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, @@ -1619,7 +1622,7 @@ static int xs_get_random_port(void) if (max < min) return -EADDRINUSE; range = max - min + 1; - rand = prandom_u32_max(range); + rand = get_random_u32_below(range); return 
rand + min; } @@ -1882,6 +1885,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_write_space = xs_udp_write_space; sk->sk_state_change = xs_local_state_change; sk->sk_error_report = xs_error_report; + sk->sk_use_task_frag = false; xprt_clear_connected(xprt); @@ -2082,6 +2086,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_use_task_frag = false; xprt_set_connected(xprt); @@ -2249,6 +2254,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; + sk->sk_use_task_frag = false; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index e8dcdf267c0c..685389d4b245 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -388,7 +388,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b, */ void tipc_disc_delete(struct tipc_discoverer *d) { - del_timer_sync(&d->timer); + timer_shutdown_sync(&d->timer); kfree_skb(d->skb); kfree(d); } diff --git a/net/tipc/link.c b/net/tipc/link.c index e260c0d557f5..b3ce24823f50 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2224,7 +2224,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); - skb_linearize(skb); + if (skb_linearize(skb)) + goto exit; + hdr = buf_msg(skb); data = msg_data(hdr); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 9618e4429f0f..77a3d016cade 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -700,7 +700,7 @@ void tipc_mon_delete(struct net *net, int bearer_id) } mon->self = NULL; write_unlock_bh(&mon->lock); - del_timer_sync(&mon->timer); + timer_shutdown_sync(&mon->timer); kfree(self->domain); kfree(self); kfree(mon); diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index dfea27a906f2..9b47c8409231 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -39,6 +39,7 @@ #include "node.h" #include "net.h" #include <net/genetlink.h> +#include <linux/string_helpers.h> #include <linux/tipc_config.h> /* The legacy API had an artificial message length limit called @@ -173,11 +174,6 @@ static struct sk_buff *tipc_get_err_tlv(char *str) return buf; } -static inline bool string_is_valid(char *s, int len) -{ - return memchr(s, '\0', len) ? 
true : false; -} - static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg, struct sk_buff *arg) @@ -445,7 +441,7 @@ static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, return -EINVAL; len = min_t(int, len, TIPC_MAX_BEARER_NAME); - if (!string_is_valid(b->name, len)) + if (!string_is_terminated(b->name, len)) return -EINVAL; if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, b->name)) @@ -486,7 +482,7 @@ static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd, return -EINVAL; len = min_t(int, len, TIPC_MAX_BEARER_NAME); - if (!string_is_valid(name, len)) + if (!string_is_terminated(name, len)) return -EINVAL; if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, name)) @@ -584,7 +580,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; len = min_t(int, len, TIPC_MAX_LINK_NAME); - if (!string_is_valid(name, len)) + if (!string_is_terminated(name, len)) return -EINVAL; if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0) @@ -819,7 +815,7 @@ static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd, return -EINVAL; len = min_t(int, len, TIPC_MAX_LINK_NAME); - if (!string_is_valid(lc->name, len)) + if (!string_is_terminated(lc->name, len)) return -EINVAL; media = tipc_media_find(lc->name); @@ -856,7 +852,7 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, return -EINVAL; len = min_t(int, len, TIPC_MAX_LINK_NAME); - if (!string_is_valid(name, len)) + if (!string_is_terminated(name, len)) return -EINVAL; if (nla_put_string(skb, TIPC_NLA_LINK_NAME, name)) diff --git a/net/tipc/node.c b/net/tipc/node.c index b48d97cbbe29..5e000fde8067 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1179,8 +1179,9 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool addr_match = false; bool sign_match = false; bool link_up = false; + bool link_is_reset = false; bool accept_addr = false; - bool reset = true; + bool reset = false; char *if_name; unsigned long intv; u16 session; @@ -1200,14 +1201,14 @@ void tipc_node_check_dest(struct net *net, u32 addr, /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); + link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { - /* All is fine. Do nothing. */ - reset = false; + /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; @@ -1232,6 +1233,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, */ accept_addr = true; *respond = true; + reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. 
Two possibilities: * - Delayed re-discovery; this link endpoint has already @@ -1263,6 +1265,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, n->signature = signature; accept_addr = true; *respond = true; + reset = true; } if (!accept_addr) @@ -1291,6 +1294,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); + link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); @@ -1303,7 +1307,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); - if (reset && l && !tipc_link_is_reset(l)) + if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } @@ -1689,6 +1693,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_node *n; struct sk_buff_head xmitq; bool node_up = false; + struct net *peer_net; int bearer_id; int rc; @@ -1705,18 +1710,23 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, return -EHOSTUNREACH; } + rcu_read_lock(); tipc_node_read_lock(n); node_up = node_is_up(n); - if (node_up && n->peer_net && check_net(n->peer_net)) { + peer_net = n->peer_net; + tipc_node_read_unlock(n); + if (node_up && peer_net && check_net(peer_net)) { /* xmit inner linux container */ - tipc_lxc_xmit(n->peer_net, list); + tipc_lxc_xmit(peer_net, list); if (likely(skb_queue_empty(list))) { - tipc_node_read_unlock(n); + rcu_read_unlock(); tipc_node_put(n); return 0; } } + rcu_read_unlock(); + tipc_node_read_lock(n); bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e902b01ea3cb..37edfe10f8c6 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -37,6 +37,7 @@ #include <linux/rhashtable.h> #include <linux/sched/signal.h> +#include <trace/events/sock.h> #include "core.h" #include "name_table.h" @@ -2130,6 +2131,8 @@ static void tipc_data_ready(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) @@ -2614,6 +2617,7 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, /* Send a 'SYN-' to destination */ m.msg_name = dest; m.msg_namelen = destlen; + iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0); /* If connect is in non-blocking case, set MSG_DONTWAIT to * indicate send_msg() is never blocked. 
@@ -2776,6 +2780,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, __skb_queue_head(&new_sk->sk_receive_queue, buf); skb_set_owner_r(buf, new_sk); } + iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0); __tipc_sendstream(new_sock, &m, 0); release_sock(new_sk); exit: @@ -3010,7 +3015,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk) struct net *net = sock_net(sk); struct tipc_net *tn = net_generic(net, tipc_net_id); u32 remaining = (TIPC_MAX_PORT - TIPC_MIN_PORT) + 1; - u32 portid = prandom_u32_max(remaining) + TIPC_MIN_PORT; + u32 portid = get_random_u32_below(remaining) + TIPC_MIN_PORT; while (remaining--) { portid++; diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index e3b427a70398..8ee0c07d00e9 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -43,6 +43,7 @@ #include "bearer.h" #include <net/sock.h> #include <linux/module.h> +#include <trace/events/sock.h> /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 @@ -396,7 +397,7 @@ static int tipc_conn_rcv_from_sock(struct tipc_conn *con) iov.iov_base = &s; iov.iov_len = sizeof(s); msg.msg_name = NULL; - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, iov.iov_len); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, iov.iov_len); ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); if (ret == -EWOULDBLOCK) return -EWOULDBLOCK; @@ -439,6 +440,8 @@ static void tipc_conn_data_ready(struct sock *sk) { struct tipc_conn *con; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); con = sk->sk_user_data; if (connected(con)) { @@ -496,6 +499,8 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) { struct tipc_topsrv *srv; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); srv = sk->sk_user_data; if (srv) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a03d66046ca3..6c593788dc25 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -620,7 +620,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; - iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); + iov_iter_kvec(&msg_iter, ITER_SOURCE, &iov, 1, size); iter_offset.msg_iter = &msg_iter; rc = tls_push_data(sk, iter_offset, size, flags, TLS_RECORD_TYPE_DATA, NULL); @@ -697,7 +697,7 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) union tls_iter_offset iter; struct iov_iter msg_iter; - iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); + iov_iter_kvec(&msg_iter, ITER_SOURCE, NULL, 0, 0); iter.msg_iter = &msg_iter; return tls_push_data(sk, iter, 0, flags, TLS_RECORD_TYPE_DATA, NULL); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 264cf367e265..6d0a534b7baa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -43,6 +43,7 @@ #include <net/strparser.h> #include <net/tls.h> +#include <trace/events/sock.h> #include "tls.h" @@ -792,7 +793,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; - bool enospc, policy; + bool enospc, policy, redir_ingress; int err = 0, send; u32 delta = 0; @@ -837,6 +838,7 @@ more_data: } break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; memcpy(&msg_redir, msg, sizeof(*msg)); if (msg->apply_bytes < send) @@ -846,7 +848,8 @@ more_data: sk_msg_return_zero(sk, msg, send); msg->sg.size -= send; release_sock(sk); - err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + err = 
tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + &msg_redir, send, flags); lock_sock(sk); if (err < 0) { *copied -= sk_msg_free_nocharge(sk, &msg_redir); @@ -2282,6 +2285,8 @@ static void tls_data_ready(struct sock *sk) struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_psock *psock; + trace_sk_data_ready(sk); + tls_strp_data_ready(&ctx->strp); psock = sk_psock_get(sk); @@ -2425,7 +2430,7 @@ static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx) { struct tls_rec *rec; - rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); + rec = list_first_entry_or_null(&ctx->tx_list, struct tls_rec, list); if (!rec) return false; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b3545fc68097..0be25e712c28 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -112,6 +112,7 @@ #include <linux/mount.h> #include <net/checksum.h> #include <linux/security.h> +#include <linux/splice.h> #include <linux/freezer.h> #include <linux/file.h> #include <linux/btf_ids.h> @@ -807,23 +808,23 @@ static int unix_count_nr_fds(struct sock *sk) static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; + unsigned char s_state; struct unix_sock *u; - int nr_fds; + int nr_fds = 0; if (sk) { + s_state = READ_ONCE(sk->sk_state); u = unix_sk(sk); - if (sock->type == SOCK_DGRAM) { - nr_fds = atomic_read(&u->scm_stat.nr_fds); - goto out_print; - } - unix_state_lock(sk); - if (sk->sk_state != TCP_LISTEN) + /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their + * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. + * SOCK_DGRAM is ordinary. So, no lock is needed. + */ + if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) nr_fds = atomic_read(&u->scm_stat.nr_fds); - else + else if (s_state == TCP_LISTEN) nr_fds = unix_count_nr_fds(sk); - unix_state_unlock(sk); -out_print: + seq_printf(m, "scm_fds: %u\n", nr_fds); } } @@ -1999,13 +2000,20 @@ restart_locked: unix_state_lock(sk); err = 0; - if (unix_peer(sk) == other) { + if (sk->sk_type == SOCK_SEQPACKET) { + /* We are here only when racing with unix_release_sock() + * is clearing @other. Never change state to TCP_CLOSE + * unlike SOCK_DGRAM wants. 
+ */ + unix_state_unlock(sk); + err = -EPIPE; + } else if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); + sk->sk_state = TCP_CLOSE; unix_state_unlock(sk); - sk->sk_state = TCP_CLOSE; unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; @@ -3738,6 +3746,7 @@ static int __init af_unix_init(void) rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); + proto_unregister(&unix_dgram_proto); goto out; } diff --git a/net/unix/diag.c b/net/unix/diag.c index 105f522a89fe..616b55c5b890 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -114,14 +114,16 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql); } -static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb) +static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb, + struct user_namespace *user_ns) { - uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk)); + uid_t uid = from_kuid_munged(user_ns, sock_i_uid(sk)); return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, - u32 portid, u32 seq, u32 flags, int sk_ino) + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags, int sk_ino) { struct nlmsghdr *nlh; struct unix_diag_msg *rep; @@ -167,7 +169,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_UID) && - sk_diag_dump_uid(sk, skb)) + sk_diag_dump_uid(sk, skb, user_ns)) goto out_nlmsg_trim; nlmsg_end(skb, nlh); @@ -179,7 +181,8 @@ out_nlmsg_trim: } static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, - u32 portid, u32 seq, u32 flags) + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags) { int sk_ino; @@ -190,7 +193,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r if (!sk_ino) return 0; - return sk_diag_fill(sk, skb, req, portid, seq, flags, sk_ino); + return sk_diag_fill(sk, skb, req, user_ns, portid, seq, flags, sk_ino); } static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) @@ -214,7 +217,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto next; if (!(req->udiag_states & (1 << sk->sk_state))) goto next; - if (sk_diag_dump(sk, skb, req, + if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) { @@ -282,7 +285,8 @@ again: if (!rep) goto out; - err = sk_diag_fill(sk, rep, req, NETLINK_CB(in_skb).portid, + err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, req->udiag_ino); if (err < 0) { nlmsg_free(rep); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 884eca7f6743..19aea7cba26e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -626,8 +626,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, struct sockaddr_vm new_addr; if (!port) - port = LAST_RESERVED_PORT + 1 + - prandom_u32_max(U32_MAX - LAST_RESERVED_PORT); + port = get_random_u32_above(LAST_RESERVED_PORT); vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); @@ -1862,8 +1861,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, written = transport->stream_enqueue(vsk, msg, len - 
total_written); } + if (written < 0) { - err = -ENOMEM; + err = written; goto out_err; } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index ad64f403536a..28b5a8e8e094 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -42,8 +42,7 @@ struct virtio_vsock { bool tx_run; struct work_struct send_pkt_work; - spinlock_t send_pkt_list_lock; - struct list_head send_pkt_list; + struct sk_buff_head send_pkt_queue; atomic_t queued_replies; @@ -101,41 +100,31 @@ virtio_transport_send_pkt_work(struct work_struct *work) vq = vsock->vqs[VSOCK_VQ_TX]; for (;;) { - struct virtio_vsock_pkt *pkt; struct scatterlist hdr, buf, *sgs[2]; int ret, in_sg = 0, out_sg = 0; + struct sk_buff *skb; bool reply; - spin_lock_bh(&vsock->send_pkt_list_lock); - if (list_empty(&vsock->send_pkt_list)) { - spin_unlock_bh(&vsock->send_pkt_list_lock); + skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); + if (!skb) break; - } - - pkt = list_first_entry(&vsock->send_pkt_list, - struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - spin_unlock_bh(&vsock->send_pkt_list_lock); - virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_deliver_tap_pkt(skb); + reply = virtio_vsock_skb_reply(skb); - reply = pkt->reply; - - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); + sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb))); sgs[out_sg++] = &hdr; - if (pkt->buf) { - sg_init_one(&buf, pkt->buf, pkt->len); + if (skb->len > 0) { + sg_init_one(&buf, skb->data, skb->len); sgs[out_sg++] = &buf; } - ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL); + ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL); /* Usually this means that there is no more space available in * the vq */ if (ret < 0) { - spin_lock_bh(&vsock->send_pkt_list_lock); - list_add(&pkt->list, &vsock->send_pkt_list); - spin_unlock_bh(&vsock->send_pkt_list_lock); + virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); break; } @@ -164,32 +153,32 @@ out: } static int -virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) +virtio_transport_send_pkt(struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr; struct virtio_vsock *vsock; - int len = pkt->len; + int len = skb->len; + + hdr = virtio_vsock_hdr(skb); rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); if (!vsock) { - virtio_transport_free_pkt(pkt); + kfree_skb(skb); len = -ENODEV; goto out_rcu; } - if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) { - virtio_transport_free_pkt(pkt); + if (le64_to_cpu(hdr->dst_cid) == vsock->guest_cid) { + kfree_skb(skb); len = -ENODEV; goto out_rcu; } - if (pkt->reply) + if (virtio_vsock_skb_reply(skb)) atomic_inc(&vsock->queued_replies); - spin_lock_bh(&vsock->send_pkt_list_lock); - list_add_tail(&pkt->list, &vsock->send_pkt_list); - spin_unlock_bh(&vsock->send_pkt_list_lock); - + virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); out_rcu: @@ -201,9 +190,7 @@ static int virtio_transport_cancel_pkt(struct vsock_sock *vsk) { struct virtio_vsock *vsock; - struct virtio_vsock_pkt *pkt, *n; int cnt = 0, ret; - LIST_HEAD(freeme); rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); @@ -212,20 +199,7 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk) goto out_rcu; } - spin_lock_bh(&vsock->send_pkt_list_lock); - list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { - if (pkt->vsk != vsk) - continue; - list_move(&pkt->list, &freeme); - } - 
spin_unlock_bh(&vsock->send_pkt_list_lock); - - list_for_each_entry_safe(pkt, n, &freeme, list) { - if (pkt->reply) - cnt++; - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue); if (cnt) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; @@ -246,38 +220,28 @@ out_rcu: static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) { - int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - struct virtio_vsock_pkt *pkt; - struct scatterlist hdr, buf, *sgs[2]; + int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM; + struct scatterlist pkt, *p; struct virtqueue *vq; + struct sk_buff *skb; int ret; vq = vsock->vqs[VSOCK_VQ_RX]; do { - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) + skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL); + if (!skb) break; - pkt->buf = kmalloc(buf_len, GFP_KERNEL); - if (!pkt->buf) { - virtio_transport_free_pkt(pkt); + memset(skb->head, 0, VIRTIO_VSOCK_SKB_HEADROOM); + sg_init_one(&pkt, virtio_vsock_hdr(skb), total_len); + p = &pkt; + ret = virtqueue_add_sgs(vq, &p, 0, 1, skb, GFP_KERNEL); + if (ret < 0) { + kfree_skb(skb); break; } - pkt->buf_len = buf_len; - pkt->len = buf_len; - - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); - sgs[0] = &hdr; - - sg_init_one(&buf, pkt->buf, buf_len); - sgs[1] = &buf; - ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); - if (ret) { - virtio_transport_free_pkt(pkt); - break; - } vsock->rx_buf_nr++; } while (vq->num_free); if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) @@ -299,12 +263,12 @@ static void virtio_transport_tx_work(struct work_struct *work) goto out; do { - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; unsigned int len; virtqueue_disable_cb(vq); - while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { + consume_skb(skb); added = true; } } while (!virtqueue_enable_cb(vq)); @@ -529,7 +493,7 @@ static void virtio_transport_rx_work(struct work_struct *work) do { virtqueue_disable_cb(vq); for (;;) { - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; unsigned int len; if (!virtio_transport_more_replies(vsock)) { @@ -540,23 +504,22 @@ static void virtio_transport_rx_work(struct work_struct *work) goto out; } - pkt = virtqueue_get_buf(vq, &len); - if (!pkt) { + skb = virtqueue_get_buf(vq, &len); + if (!skb) break; - } vsock->rx_buf_nr--; /* Drop short/long packets */ - if (unlikely(len < sizeof(pkt->hdr) || - len > sizeof(pkt->hdr) + pkt->len)) { - virtio_transport_free_pkt(pkt); + if (unlikely(len < sizeof(struct virtio_vsock_hdr) || + len > virtio_vsock_skb_len(skb))) { + kfree_skb(skb); continue; } - pkt->len = len - sizeof(pkt->hdr); - virtio_transport_deliver_tap_pkt(pkt); - virtio_transport_recv_pkt(&virtio_transport, pkt); + virtio_vsock_skb_rx_put(skb); + virtio_transport_deliver_tap_pkt(skb); + virtio_transport_recv_pkt(&virtio_transport, skb); } } while (!virtqueue_enable_cb(vq)); @@ -610,7 +573,7 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) { struct virtio_device *vdev = vsock->vdev; - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; /* Reset all connected sockets when the VQs disappear */ vsock_for_each_connected_socket(&virtio_transport.transport, @@ -637,23 +600,16 @@ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) virtio_reset_device(vdev); mutex_lock(&vsock->rx_lock); - while ((pkt = 
virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) + kfree_skb(skb); mutex_unlock(&vsock->rx_lock); mutex_lock(&vsock->tx_lock); - while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) + kfree_skb(skb); mutex_unlock(&vsock->tx_lock); - spin_lock_bh(&vsock->send_pkt_list_lock); - while (!list_empty(&vsock->send_pkt_list)) { - pkt = list_first_entry(&vsock->send_pkt_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - spin_unlock_bh(&vsock->send_pkt_list_lock); + virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue); /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); @@ -690,8 +646,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev) mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); mutex_init(&vsock->event_lock); - spin_lock_init(&vsock->send_pkt_list_lock); - INIT_LIST_HEAD(&vsock->send_pkt_list); + skb_queue_head_init(&vsock->send_pkt_queue); INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); INIT_WORK(&vsock->event_work, virtio_transport_event_work); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a9980e9b9304..a1581c77cf84 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -37,53 +37,56 @@ virtio_transport_get_ops(struct vsock_sock *vsk) return container_of(t, struct virtio_transport, transport); } -static struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, +/* Returns a new packet on success, otherwise returns NULL. + * + * If NULL is returned, errp is set to a negative errno. 
+ */ +static struct sk_buff * +virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, size_t len, u32 src_cid, u32 src_port, u32 dst_cid, u32 dst_port) { - struct virtio_vsock_pkt *pkt; + const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len; + struct virtio_vsock_hdr *hdr; + struct sk_buff *skb; + void *payload; int err; - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) + skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); + if (!skb) return NULL; - pkt->hdr.type = cpu_to_le16(info->type); - pkt->hdr.op = cpu_to_le16(info->op); - pkt->hdr.src_cid = cpu_to_le64(src_cid); - pkt->hdr.dst_cid = cpu_to_le64(dst_cid); - pkt->hdr.src_port = cpu_to_le32(src_port); - pkt->hdr.dst_port = cpu_to_le32(dst_port); - pkt->hdr.flags = cpu_to_le32(info->flags); - pkt->len = len; - pkt->hdr.len = cpu_to_le32(len); - pkt->reply = info->reply; - pkt->vsk = info->vsk; + hdr = virtio_vsock_hdr(skb); + hdr->type = cpu_to_le16(info->type); + hdr->op = cpu_to_le16(info->op); + hdr->src_cid = cpu_to_le64(src_cid); + hdr->dst_cid = cpu_to_le64(dst_cid); + hdr->src_port = cpu_to_le32(src_port); + hdr->dst_port = cpu_to_le32(dst_port); + hdr->flags = cpu_to_le32(info->flags); + hdr->len = cpu_to_le32(len); if (info->msg && len > 0) { - pkt->buf = kmalloc(len, GFP_KERNEL); - if (!pkt->buf) - goto out_pkt; - - pkt->buf_len = len; - - err = memcpy_from_msg(pkt->buf, info->msg, len); + payload = skb_put(skb, len); + err = memcpy_from_msg(payload, info->msg, len); if (err) goto out; if (msg_data_left(info->msg) == 0 && info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); if (info->msg->msg_flags & MSG_EOR) - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); } } + if (info->reply) + virtio_vsock_skb_set_reply(skb); + trace_virtio_transport_alloc_pkt(src_cid, src_port, dst_cid, dst_port, len, @@ -91,19 +94,18 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, info->op, info->flags); - return pkt; + return skb; out: - kfree(pkt->buf); -out_pkt: - kfree(pkt); + kfree_skb(skb); return NULL; } /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { - struct virtio_vsock_pkt *pkt = opaque; + struct virtio_vsock_hdr *pkt_hdr; + struct sk_buff *pkt = opaque; struct af_vsockmon_hdr *hdr; struct sk_buff *skb; size_t payload_len; @@ -113,10 +115,11 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) * the payload length from the header and the buffer pointer taking * care of the offset in the original packet. 
*/ - payload_len = le32_to_cpu(pkt->hdr.len); - payload_buf = pkt->buf + pkt->off; + pkt_hdr = virtio_vsock_hdr(pkt); + payload_len = pkt->len; + payload_buf = pkt->data; - skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len, + skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); if (!skb) return NULL; @@ -124,16 +127,16 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) hdr = skb_put(skb, sizeof(*hdr)); /* pkt->hdr is little-endian so no need to byteswap here */ - hdr->src_cid = pkt->hdr.src_cid; - hdr->src_port = pkt->hdr.src_port; - hdr->dst_cid = pkt->hdr.dst_cid; - hdr->dst_port = pkt->hdr.dst_port; + hdr->src_cid = pkt_hdr->src_cid; + hdr->src_port = pkt_hdr->src_port; + hdr->dst_cid = pkt_hdr->dst_cid; + hdr->dst_port = pkt_hdr->dst_port; hdr->transport = cpu_to_le16(AF_VSOCK_TRANSPORT_VIRTIO); - hdr->len = cpu_to_le16(sizeof(pkt->hdr)); + hdr->len = cpu_to_le16(sizeof(*pkt_hdr)); memset(hdr->reserved, 0, sizeof(hdr->reserved)); - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(pkt_hdr->op)) { case VIRTIO_VSOCK_OP_REQUEST: case VIRTIO_VSOCK_OP_RESPONSE: hdr->op = cpu_to_le16(AF_VSOCK_OP_CONNECT); @@ -154,7 +157,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) break; } - skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); + skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { skb_put_data(skb, payload_buf, payload_len); @@ -163,13 +166,13 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) return skb; } -void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) +void virtio_transport_deliver_tap_pkt(struct sk_buff *skb) { - if (pkt->tap_delivered) + if (virtio_vsock_skb_tap_delivered(skb)) return; - vsock_deliver_tap(virtio_transport_build_skb, pkt); - pkt->tap_delivered = true; + vsock_deliver_tap(virtio_transport_build_skb, skb); + virtio_vsock_skb_set_tap_delivered(skb); } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); @@ -192,8 +195,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, u32 src_cid, src_port, dst_cid, dst_port; const struct virtio_transport *t_ops; struct virtio_vsock_sock *vvs; - struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; + struct sk_buff *skb; info->type = virtio_transport_get_type(sk_vsock(vsk)); @@ -224,42 +227,47 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) return pkt_len; - pkt = virtio_transport_alloc_pkt(info, pkt_len, + skb = virtio_transport_alloc_skb(info, pkt_len, src_cid, src_port, dst_cid, dst_port); - if (!pkt) { + if (!skb) { virtio_transport_put_credit(vvs, pkt_len); return -ENOMEM; } - virtio_transport_inc_tx_pkt(vvs, pkt); + virtio_transport_inc_tx_pkt(vvs, skb); - return t_ops->send_pkt(pkt); + return t_ops->send_pkt(skb); } static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - if (vvs->rx_bytes + pkt->len > vvs->buf_alloc) + if (vvs->rx_bytes + skb->len > vvs->buf_alloc) return false; - vvs->rx_bytes += pkt->len; + vvs->rx_bytes += skb->len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - vvs->rx_bytes -= pkt->len; - vvs->fwd_cnt += pkt->len; + int len; + + len = skb_headroom(skb) - sizeof(struct virtio_vsock_hdr) - skb->len; + vvs->rx_bytes -= len; + vvs->fwd_cnt += len; } -void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct 
virtio_vsock_pkt *pkt) +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + spin_lock_bh(&vvs->rx_lock); vvs->last_fwd_cnt = vvs->fwd_cnt; - pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); - pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); + hdr->fwd_cnt = cpu_to_le32(vvs->fwd_cnt); + hdr->buf_alloc = cpu_to_le32(vvs->buf_alloc); spin_unlock_bh(&vvs->rx_lock); } EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); @@ -303,29 +311,29 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; size_t bytes, total = 0, off; + struct sk_buff *skb, *tmp; int err = -EFAULT; spin_lock_bh(&vvs->rx_lock); - list_for_each_entry(pkt, &vvs->rx_queue, list) { - off = pkt->off; + skb_queue_walk_safe(&vvs->rx_queue, skb, tmp) { + off = 0; if (total == len) break; - while (total < len && off < pkt->len) { + while (total < len && off < skb->len) { bytes = len - total; - if (bytes > pkt->len - off) - bytes = pkt->len - off; + if (bytes > skb->len - off) + bytes = skb->len - off; /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since memcpy_to_msg() may sleep. */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf + off, bytes); + err = memcpy_to_msg(msg, skb->data + off, bytes); if (err) goto out; @@ -352,37 +360,38 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; size_t bytes, total = 0; - u32 free_space; + struct sk_buff *skb; int err = -EFAULT; + u32 free_space; spin_lock_bh(&vvs->rx_lock); - while (total < len && !list_empty(&vvs->rx_queue)) { - pkt = list_first_entry(&vvs->rx_queue, - struct virtio_vsock_pkt, list); + while (total < len && !skb_queue_empty(&vvs->rx_queue)) { + skb = __skb_dequeue(&vvs->rx_queue); bytes = len - total; - if (bytes > pkt->len - pkt->off) - bytes = pkt->len - pkt->off; + if (bytes > skb->len) + bytes = skb->len; /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since memcpy_to_msg() may sleep. 
*/ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + err = memcpy_to_msg(msg, skb->data, bytes); if (err) goto out; spin_lock_bh(&vvs->rx_lock); total += bytes; - pkt->off += bytes; - if (pkt->off == pkt->len) { - virtio_transport_dec_rx_pkt(vvs, pkt); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); + skb_pull(skb, bytes); + + if (skb->len == 0) { + virtio_transport_dec_rx_pkt(vvs, skb); + consume_skb(skb); + } else { + __skb_queue_head(&vvs->rx_queue, skb); } } @@ -414,10 +423,10 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, int flags) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; int dequeued_len = 0; size_t user_buf_len = msg_data_left(msg); bool msg_ready = false; + struct sk_buff *skb; spin_lock_bh(&vvs->rx_lock); @@ -427,13 +436,18 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, } while (!msg_ready) { - pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list); + struct virtio_vsock_hdr *hdr; + + skb = __skb_dequeue(&vvs->rx_queue); + if (!skb) + break; + hdr = virtio_vsock_hdr(skb); if (dequeued_len >= 0) { size_t pkt_len; size_t bytes_to_copy; - pkt_len = (size_t)le32_to_cpu(pkt->hdr.len); + pkt_len = (size_t)le32_to_cpu(hdr->len); bytes_to_copy = min(user_buf_len, pkt_len); if (bytes_to_copy) { @@ -444,7 +458,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy); + err = memcpy_to_msg(msg, skb->data, bytes_to_copy); if (err) { /* Copy of message failed. Rest of * fragments will be freed without copy. @@ -452,6 +466,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len = err; } else { user_buf_len -= bytes_to_copy; + skb_pull(skb, bytes_to_copy); } spin_lock_bh(&vvs->rx_lock); @@ -461,17 +476,16 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len += pkt_len; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { msg_ready = true; vvs->msg_count--; - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) msg->msg_flags |= MSG_EOR; } - virtio_transport_dec_rx_pkt(vvs, pkt); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); + virtio_transport_dec_rx_pkt(vvs, skb); + kfree_skb(skb); } spin_unlock_bh(&vvs->rx_lock); @@ -609,7 +623,7 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, spin_lock_init(&vvs->rx_lock); spin_lock_init(&vvs->tx_lock); - INIT_LIST_HEAD(&vvs->rx_queue); + skb_queue_head_init(&vvs->rx_queue); return 0; } @@ -806,16 +820,16 @@ void virtio_transport_destruct(struct vsock_sock *vsk) EXPORT_SYMBOL_GPL(virtio_transport_destruct); static int virtio_transport_reset(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .reply = !!pkt, + .reply = !!skb, .vsk = vsk, }; /* Send RST only if the original pkt is not a RST pkt */ - if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (skb && le16_to_cpu(virtio_vsock_hdr(skb)->op) == VIRTIO_VSOCK_OP_RST) return 0; return virtio_transport_send_pkt_info(vsk, &info); @@ -825,29 +839,30 @@ static int virtio_transport_reset(struct vsock_sock *vsk, * attempt was made to connect to a socket that does not exist. 
*/ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - struct virtio_vsock_pkt *reply; + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .type = le16_to_cpu(pkt->hdr.type), + .type = le16_to_cpu(hdr->type), .reply = true, }; + struct sk_buff *reply; /* Send RST only if the original pkt is not a RST pkt */ - if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) return 0; - reply = virtio_transport_alloc_pkt(&info, 0, - le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port), - le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); + reply = virtio_transport_alloc_skb(&info, 0, + le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port), + le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); if (!reply) return -ENOMEM; if (!t) { - virtio_transport_free_pkt(reply); + kfree_skb(reply); return -ENOTCONN; } @@ -858,16 +873,11 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, static void virtio_transport_remove_sock(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt, *tmp; /* We don't need to take rx_lock, as the socket is closing and we are * removing it. */ - list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - + __skb_queue_purge(&vvs->rx_queue); vsock_remove_sock(vsk); } @@ -981,13 +991,14 @@ EXPORT_SYMBOL_GPL(virtio_transport_release); static int virtio_transport_recv_connecting(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); - int err; int skerr; + int err; - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RESPONSE: sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; @@ -1008,7 +1019,7 @@ virtio_transport_recv_connecting(struct sock *sk, return 0; destroy: - virtio_transport_reset(vsk, pkt); + virtio_transport_reset(vsk, skb); sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk_error_report(sk); @@ -1017,34 +1028,37 @@ destroy: static void virtio_transport_recv_enqueue(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { struct virtio_vsock_sock *vvs = vsk->trans; bool can_enqueue, free_pkt = false; + struct virtio_vsock_hdr *hdr; + u32 len; - pkt->len = le32_to_cpu(pkt->hdr.len); - pkt->off = 0; + hdr = virtio_vsock_hdr(skb); + len = le32_to_cpu(hdr->len); spin_lock_bh(&vvs->rx_lock); - can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt); + can_enqueue = virtio_transport_inc_rx_pkt(vvs, skb); if (!can_enqueue) { free_pkt = true; goto out; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count++; /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small * payload. 
*/ - if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) { - struct virtio_vsock_pkt *last_pkt; + if (len <= GOOD_COPY_LEN && !skb_queue_empty(&vvs->rx_queue)) { + struct virtio_vsock_hdr *last_hdr; + struct sk_buff *last_skb; - last_pkt = list_last_entry(&vvs->rx_queue, - struct virtio_vsock_pkt, list); + last_skb = skb_peek_tail(&vvs->rx_queue); + last_hdr = virtio_vsock_hdr(last_skb); /* If there is space in the last packet queued, we copy the * new packet in its buffer. We avoid this if the last packet @@ -1052,35 +1066,35 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, * delimiter of SEQPACKET message, so 'pkt' is the first packet * of a new message. */ - if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && - !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) { - memcpy(last_pkt->buf + last_pkt->len, pkt->buf, - pkt->len); - last_pkt->len += pkt->len; + if (skb->len < skb_tailroom(last_skb) && + !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) { + memcpy(skb_put(last_skb, skb->len), skb->data, skb->len); free_pkt = true; - last_pkt->hdr.flags |= pkt->hdr.flags; + last_hdr->flags |= hdr->flags; + last_hdr->len = cpu_to_le32(last_skb->len); goto out; } } - list_add_tail(&pkt->list, &vvs->rx_queue); + __skb_queue_tail(&vvs->rx_queue, skb); out: spin_unlock_bh(&vvs->rx_lock); if (free_pkt) - virtio_transport_free_pkt(pkt); + kfree_skb(skb); } static int virtio_transport_recv_connected(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); int err = 0; - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RW: - virtio_transport_recv_enqueue(vsk, pkt); + virtio_transport_recv_enqueue(vsk, skb); vsock_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_REQUEST: @@ -1090,18 +1104,17 @@ virtio_transport_recv_connected(struct sock *sk, sk->sk_write_space(sk); break; case VIRTIO_VSOCK_OP_SHUTDOWN: - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) vsk->peer_shutdown |= RCV_SHUTDOWN; - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset(vsk, NULL); - virtio_transport_do_close(vsk, true); } - if (le32_to_cpu(pkt->hdr.flags)) + if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) sk->sk_state_change(sk); break; case VIRTIO_VSOCK_OP_RST: @@ -1112,28 +1125,30 @@ virtio_transport_recv_connected(struct sock *sk, break; } - virtio_transport_free_pkt(pkt); + kfree_skb(skb); return err; } static void virtio_transport_recv_disconnecting(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); - if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) virtio_transport_do_close(vsk, true); } static int virtio_transport_send_response(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RESPONSE, - .remote_cid = le64_to_cpu(pkt->hdr.src_cid), - .remote_port = le32_to_cpu(pkt->hdr.src_port), + .remote_cid = 
le64_to_cpu(hdr->src_cid), + .remote_port = le32_to_cpu(hdr->src_port), .reply = true, .vsk = vsk, }; @@ -1142,8 +1157,9 @@ virtio_transport_send_response(struct vsock_sock *vsk, } static bool virtio_transport_space_update(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); struct virtio_vsock_sock *vvs = vsk->trans; bool space_available; @@ -1158,8 +1174,8 @@ static bool virtio_transport_space_update(struct sock *sk, /* buf_alloc and fwd_cnt is always included in the hdr */ spin_lock_bh(&vvs->tx_lock); - vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); - vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + vvs->peer_buf_alloc = le32_to_cpu(hdr->buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(hdr->fwd_cnt); space_available = virtio_transport_has_space(vsk); spin_unlock_bh(&vvs->tx_lock); return space_available; @@ -1167,27 +1183,28 @@ static bool virtio_transport_space_update(struct sock *sk, /* Handle server socket */ static int -virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, +virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, struct virtio_transport *t) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); struct vsock_sock *vchild; struct sock *child; int ret; - if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { - virtio_transport_reset_no_sock(t, pkt); + if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) { + virtio_transport_reset_no_sock(t, skb); return -EINVAL; } if (sk_acceptq_is_full(sk)) { - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); return -ENOMEM; } child = vsock_create_connected(sk); if (!child) { - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); return -ENOMEM; } @@ -1198,10 +1215,10 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); - vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port)); - vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&vchild->local_addr, le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port)); + vsock_addr_init(&vchild->remote_addr, le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); ret = vsock_assign_transport(vchild, vsk); /* Transport assigned (looking at remote_addr) must be the same @@ -1209,17 +1226,17 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, */ if (ret || vchild->transport != &t->transport) { release_sock(child); - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); sock_put(child); return ret; } - if (virtio_transport_space_update(child, pkt)) + if (virtio_transport_space_update(child, skb)) child->sk_write_space(child); vsock_insert_connected(vchild); vsock_enqueue_accept(sk, child); - virtio_transport_send_response(vchild, pkt); + virtio_transport_send_response(vchild, skb); release_sock(child); @@ -1237,29 +1254,30 @@ static bool virtio_transport_valid_type(u16 type) * lock. 
*/ void virtio_transport_recv_pkt(struct virtio_transport *t, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct sockaddr_vm src, dst; struct vsock_sock *vsk; struct sock *sk; bool space_available; - vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); - vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&src, le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); + vsock_addr_init(&dst, le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port)); trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, dst.svm_cid, dst.svm_port, - le32_to_cpu(pkt->hdr.len), - le16_to_cpu(pkt->hdr.type), - le16_to_cpu(pkt->hdr.op), - le32_to_cpu(pkt->hdr.flags), - le32_to_cpu(pkt->hdr.buf_alloc), - le32_to_cpu(pkt->hdr.fwd_cnt)); - - if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) { - (void)virtio_transport_reset_no_sock(t, pkt); + le32_to_cpu(hdr->len), + le16_to_cpu(hdr->type), + le16_to_cpu(hdr->op), + le32_to_cpu(hdr->flags), + le32_to_cpu(hdr->buf_alloc), + le32_to_cpu(hdr->fwd_cnt)); + + if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) { + (void)virtio_transport_reset_no_sock(t, skb); goto free_pkt; } @@ -1270,13 +1288,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, if (!sk) { sk = vsock_find_bound_socket(&dst); if (!sk) { - (void)virtio_transport_reset_no_sock(t, pkt); + (void)virtio_transport_reset_no_sock(t, skb); goto free_pkt; } } - if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) { - (void)virtio_transport_reset_no_sock(t, pkt); + if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) { + (void)virtio_transport_reset_no_sock(t, skb); sock_put(sk); goto free_pkt; } @@ -1287,13 +1305,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, /* Check if sk has been closed before lock_sock */ if (sock_flag(sk, SOCK_DONE)) { - (void)virtio_transport_reset_no_sock(t, pkt); + (void)virtio_transport_reset_no_sock(t, skb); release_sock(sk); sock_put(sk); goto free_pkt; } - space_available = virtio_transport_space_update(sk, pkt); + space_available = virtio_transport_space_update(sk, skb); /* Update CID in case it has changed after a transport reset event */ if (vsk->local_addr.svm_cid != VMADDR_CID_ANY) @@ -1304,23 +1322,23 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, switch (sk->sk_state) { case TCP_LISTEN: - virtio_transport_recv_listen(sk, pkt, t); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_listen(sk, skb, t); + kfree_skb(skb); break; case TCP_SYN_SENT: - virtio_transport_recv_connecting(sk, pkt); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_connecting(sk, skb); + kfree_skb(skb); break; case TCP_ESTABLISHED: - virtio_transport_recv_connected(sk, pkt); + virtio_transport_recv_connected(sk, skb); break; case TCP_CLOSING: - virtio_transport_recv_disconnecting(sk, pkt); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_disconnecting(sk, skb); + kfree_skb(skb); break; default: - (void)virtio_transport_reset_no_sock(t, pkt); - virtio_transport_free_pkt(pkt); + (void)virtio_transport_reset_no_sock(t, skb); + kfree_skb(skb); break; } @@ -1333,16 +1351,42 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, return; free_pkt: - virtio_transport_free_pkt(pkt); + kfree_skb(skb); } EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +/* Remove skbs found in a queue that 
have a vsk that matches. + * + * Each skb is freed. + * + * Returns the count of skbs that were reply packets. + */ +int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue) { - kvfree(pkt->buf); - kfree(pkt); + struct sk_buff_head freeme; + struct sk_buff *skb, *tmp; + int cnt = 0; + + skb_queue_head_init(&freeme); + + spin_lock_bh(&queue->lock); + skb_queue_walk_safe(queue, skb, tmp) { + if (vsock_sk(skb->sk) != vsk) + continue; + + __skb_unlink(skb, queue); + __skb_queue_tail(&freeme, skb); + + if (virtio_vsock_skb_reply(skb)) + cnt++; + } + spin_unlock_bh(&queue->lock); + + __skb_queue_purge(&freeme); + + return cnt; } -EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); +EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 842c94286d31..36eb16a40745 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1711,7 +1711,11 @@ static int vmci_transport_dgram_enqueue( if (!dg) return -ENOMEM; - memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len); + err = memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len); + if (err) { + kfree(dg); + return err; + } dg->dst = vmci_make_handle(remote_addr->svm_cid, remote_addr->svm_port); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 169a8cf65b39..671e03240fc5 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -16,7 +16,7 @@ struct vsock_loopback { struct workqueue_struct *workqueue; spinlock_t pkt_list_lock; /* protects pkt_list */ - struct list_head pkt_list; + struct sk_buff_head pkt_queue; struct work_struct pkt_work; }; @@ -27,13 +27,13 @@ static u32 vsock_loopback_get_local_cid(void) return VMADDR_CID_LOCAL; } -static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) +static int vsock_loopback_send_pkt(struct sk_buff *skb) { struct vsock_loopback *vsock = &the_vsock_loopback; - int len = pkt->len; + int len = skb->len; spin_lock_bh(&vsock->pkt_list_lock); - list_add_tail(&pkt->list, &vsock->pkt_list); + skb_queue_tail(&vsock->pkt_queue, skb); spin_unlock_bh(&vsock->pkt_list_lock); queue_work(vsock->workqueue, &vsock->pkt_work); @@ -44,21 +44,8 @@ static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) { struct vsock_loopback *vsock = &the_vsock_loopback; - struct virtio_vsock_pkt *pkt, *n; - LIST_HEAD(freeme); - spin_lock_bh(&vsock->pkt_list_lock); - list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) { - if (pkt->vsk != vsk) - continue; - list_move(&pkt->list, &freeme); - } - spin_unlock_bh(&vsock->pkt_list_lock); - - list_for_each_entry_safe(pkt, n, &freeme, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + virtio_transport_purge_skbs(vsk, &vsock->pkt_queue); return 0; } @@ -121,20 +108,18 @@ static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = container_of(work, struct vsock_loopback, pkt_work); - LIST_HEAD(pkts); + struct sk_buff_head pkts; + struct sk_buff *skb; + + skb_queue_head_init(&pkts); spin_lock_bh(&vsock->pkt_list_lock); - list_splice_init(&vsock->pkt_list, &pkts); + skb_queue_splice_init(&vsock->pkt_queue, &pkts); spin_unlock_bh(&vsock->pkt_list_lock); - while (!list_empty(&pkts)) { - struct virtio_vsock_pkt *pkt; - - pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - virtio_transport_deliver_tap_pkt(pkt); - 
virtio_transport_recv_pkt(&loopback_transport, pkt); + while ((skb = __skb_dequeue(&pkts))) { + virtio_transport_deliver_tap_pkt(skb); + virtio_transport_recv_pkt(&loopback_transport, skb); } } @@ -148,7 +133,7 @@ static int __init vsock_loopback_init(void) return -ENOMEM; spin_lock_init(&vsock->pkt_list_lock); - INIT_LIST_HEAD(&vsock->pkt_list); + skb_queue_head_init(&vsock->pkt_queue); INIT_WORK(&vsock->pkt_work, vsock_loopback_work); ret = vsock_core_register(&loopback_transport.transport, @@ -166,19 +151,13 @@ out_wq: static void __exit vsock_loopback_exit(void) { struct vsock_loopback *vsock = &the_vsock_loopback; - struct virtio_vsock_pkt *pkt; vsock_core_unregister(&loopback_transport.transport); flush_work(&vsock->pkt_work); spin_lock_bh(&vsock->pkt_list_lock); - while (!list_empty(&vsock->pkt_list)) { - pkt = list_first_entry(&vsock->pkt_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + virtio_vsock_skb_queue_purge(&vsock->pkt_queue); spin_unlock_bh(&vsock->pkt_list_lock); destroy_workqueue(vsock->workqueue); diff --git a/net/wireless/ap.c b/net/wireless/ap.c index e68923200018..0962770303b2 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -39,7 +39,7 @@ static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev, wdev->u.ap.ssid_len = 0; rdev_set_qos_map(rdev, dev, NULL); if (notify) - nl80211_send_ap_stopped(wdev); + nl80211_send_ap_stopped(wdev, link_id); /* Should we apply the grace period during beaconing interface * shutdown also? diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 0e5835cd8c61..0b7e81db383d 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -1460,3 +1460,72 @@ struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev, } } EXPORT_SYMBOL(wdev_chandef); + +struct cfg80211_per_bw_puncturing_values { + u8 len; + const u16 *valid_values; +}; + +static const u16 puncturing_values_80mhz[] = { + 0x8, 0x4, 0x2, 0x1 +}; + +static const u16 puncturing_values_160mhz[] = { + 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1, 0xc0, 0x30, 0xc, 0x3 +}; + +static const u16 puncturing_values_320mhz[] = { + 0xc000, 0x3000, 0xc00, 0x300, 0xc0, 0x30, 0xc, 0x3, 0xf000, 0xf00, + 0xf0, 0xf, 0xfc00, 0xf300, 0xf0c0, 0xf030, 0xf00c, 0xf003, 0xc00f, + 0x300f, 0xc0f, 0x30f, 0xcf, 0x3f +}; + +#define CFG80211_PER_BW_VALID_PUNCTURING_VALUES(_bw) \ + { \ + .len = ARRAY_SIZE(puncturing_values_ ## _bw ## mhz), \ + .valid_values = puncturing_values_ ## _bw ## mhz \ + } + +static const struct cfg80211_per_bw_puncturing_values per_bw_puncturing[] = { + CFG80211_PER_BW_VALID_PUNCTURING_VALUES(80), + CFG80211_PER_BW_VALID_PUNCTURING_VALUES(160), + CFG80211_PER_BW_VALID_PUNCTURING_VALUES(320) +}; + +bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap, + const struct cfg80211_chan_def *chandef) +{ + u32 idx, i, start_freq; + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_80: + idx = 0; + start_freq = chandef->center_freq1 - 40; + break; + case NL80211_CHAN_WIDTH_160: + idx = 1; + start_freq = chandef->center_freq1 - 80; + break; + case NL80211_CHAN_WIDTH_320: + idx = 2; + start_freq = chandef->center_freq1 - 160; + break; + default: + *bitmap = 0; + break; + } + + if (!*bitmap) + return true; + + /* check if primary channel is punctured */ + if (*bitmap & (u16)BIT((chandef->chan->center_freq - start_freq) / 20)) + return false; + + for (i = 0; i < per_bw_puncturing[idx].len; i++) + if (per_bw_puncturing[idx].valid_values[i] == *bitmap) + return true; + + return false; +} 
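/* Worked example (illustrative, not part of this patch): for an 80 MHz
 * chandef the table above only allows 0x1, 0x2, 0x4 and 0x8, i.e. exactly
 * one punctured 20 MHz subchannel. With center_freq1 = 5210 the block
 * starts at 5170 MHz, so bit n covers 5170 + 20 * n .. 5190 + 20 * n MHz,
 * and a bitmap whose set bit overlaps the primary channel is rejected.
 * A hypothetical caller would validate before applying:
 *
 *	u16 punct = 0x8;	// puncture the highest 20 MHz subchannel
 *
 *	if (!cfg80211_valid_disable_subchannel_bitmap(&punct, &chandef))
 *		return -EINVAL;
 *
 * A bitmap of 0 (no puncturing) is always accepted.
 */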
+EXPORT_SYMBOL(cfg80211_valid_disable_subchannel_bitmap); diff --git a/net/wireless/core.h b/net/wireless/core.h index af85d8909935..7c61752f6d83 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -278,8 +278,8 @@ struct cfg80211_event { }; struct cfg80211_cached_keys { - struct key_params params[CFG80211_MAX_WEP_KEYS]; - u8 data[CFG80211_MAX_WEP_KEYS][WLAN_KEY_LEN_WEP104]; + struct key_params params[4]; + u8 data[4][WLAN_KEY_LEN_WEP104]; int def; }; diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index edd062f104f4..e6fdb0b8187d 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -45,8 +45,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, cfg80211_hold_bss(bss_from_pub(bss)); wdev->u.ibss.current_bss = bss_from_pub(bss); - if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) - cfg80211_upload_connect_keys(wdev); + cfg80211_upload_connect_keys(wdev); nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid, GFP_KERNEL); @@ -294,7 +293,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) + for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } err = __cfg80211_join_ibss(rdev, wdev->netdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 58e1fb18f85a..81d3f40d6235 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -742,7 +742,10 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP_VLAN: - if (!ether_addr_equal(mgmt->bssid, wdev_address(wdev))) + if (!ether_addr_equal(mgmt->bssid, wdev_address(wdev)) && + (params->link_id < 0 || + !ether_addr_equal(mgmt->bssid, + wdev->links[params->link_id].addr))) err = -EINVAL; break; case NL80211_IFTYPE_MESH_POINT: diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 33a82ecab9d5..112b4bb009c8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -805,6 +805,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN), [NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, [NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT }, + [NL80211_ATTR_PUNCT_BITMAP] = NLA_POLICY_RANGE(NLA_U8, 0, 0xffff), }; /* policy for the key attributes */ @@ -883,7 +884,7 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { }, [NL80211_REKEY_DATA_KCK] = { .type = NLA_BINARY, - .len = NL80211_KCK_EXT_LEN + .len = NL80211_KCK_EXT_LEN_32 }, [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN(NL80211_REPLAY_CTR_LEN), [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 }, @@ -1548,10 +1549,14 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) if (wdev->connected) return 0; return -ENOLINK; + case NL80211_IFTYPE_NAN: + if (wiphy_ext_feature_isset(wdev->wiphy, + NL80211_EXT_FEATURE_SECURE_NAN)) + return 0; + return -EINVAL; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_MONITOR: - case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: @@ -3173,6 +3178,21 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev) wdev->iftype == NL80211_IFTYPE_P2P_GO; } +static int nl80211_parse_punct_bitmap(struct cfg80211_registered_device *rdev, + struct genl_info *info, + const struct cfg80211_chan_def *chandef, + u16 *punct_bitmap) +{ + if (!wiphy_ext_feature_isset(&rdev->wiphy, 
NL80211_EXT_FEATURE_PUNCT)) + return -EINVAL; + + *punct_bitmap = nla_get_u32(info->attrs[NL80211_ATTR_PUNCT_BITMAP]); + if (!cfg80211_valid_disable_subchannel_bitmap(punct_bitmap, chandef)) + return -EINVAL; + + return 0; +} + int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, struct genl_info *info, struct cfg80211_chan_def *chandef) @@ -3181,8 +3201,11 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, struct nlattr **attrs = info->attrs; u32 control_freq; - if (!attrs[NL80211_ATTR_WIPHY_FREQ]) + if (!attrs[NL80211_ATTR_WIPHY_FREQ]) { + NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], + "Frequency is missing"); return -EINVAL; + } control_freq = MHZ_TO_KHZ( nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); @@ -5770,6 +5793,42 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, } } +static void nl80211_send_ap_started(struct wireless_dev *wdev, + unsigned int link_id) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_START_AP); + if (!hdr) + goto out; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + (wdev->u.ap.ssid_len && + nla_put(msg, NL80211_ATTR_SSID, wdev->u.ap.ssid_len, + wdev->u.ap.ssid)) || + (wdev->valid_links && + nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id))) + goto out; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), msg, 0, + NL80211_MCGRP_MLME, GFP_KERNEL); + return; +out: + nlmsg_free(msg); +} + static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -5918,6 +5977,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_PUNCT_BITMAP]) { + err = nl80211_parse_punct_bitmap(rdev, info, + ¶ms->chandef, + ¶ms->punct_bitmap); + if (err) + goto out; + } + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, ¶ms->chandef, wdev->iftype)) { err = -EINVAL; @@ -6050,6 +6117,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) wdev->conn_owner_nlportid = info->snd_portid; + + nl80211_send_ap_started(wdev, link_id); } out_unlock: wdev_unlock(wdev); @@ -6527,6 +6596,22 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, sinfo->assoc_req_ies)) goto nla_put_failure; + if (sinfo->assoc_resp_ies_len && + nla_put(msg, NL80211_ATTR_RESP_IE, sinfo->assoc_resp_ies_len, + sinfo->assoc_resp_ies)) + goto nla_put_failure; + + if (sinfo->mlo_params_valid) { + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, + sinfo->assoc_link_id)) + goto nla_put_failure; + + if (!is_zero_ether_addr(sinfo->mld_addr) && + nla_put(msg, NL80211_ATTR_MLD_ADDR, ETH_ALEN, + sinfo->mld_addr)) + goto nla_put_failure; + } + cfg80211_sinfo_release_content(sinfo); genlmsg_end(msg, hdr); return 0; @@ -10057,6 +10142,14 @@ skip_beacons: if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX]) params.block_tx = true; + if (info->attrs[NL80211_ATTR_PUNCT_BITMAP]) { + err = nl80211_parse_punct_bitmap(rdev, info, + ¶ms.chandef, + ¶ms.punct_bitmap); + if (err) + goto free; + } + wdev_lock(wdev); err = 
rdev_channel_switch(rdev, dev, ¶ms); wdev_unlock(wdev); @@ -12253,6 +12346,10 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_P2P_DEVICE: break; case NL80211_IFTYPE_NAN: + if (!wiphy_ext_feature_isset(wdev->wiphy, + NL80211_EXT_FEATURE_SECURE_NAN)) + return -EOPNOTSUPP; + break; default: return -EOPNOTSUPP; } @@ -12310,6 +12407,10 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_P2P_GO: break; case NL80211_IFTYPE_NAN: + if (!wiphy_ext_feature_isset(wdev->wiphy, + NL80211_EXT_FEATURE_SECURE_NAN)) + return -EOPNOTSUPP; + break; default: return -EOPNOTSUPP; } @@ -12447,6 +12548,10 @@ static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *in case NL80211_IFTYPE_P2P_DEVICE: break; case NL80211_IFTYPE_NAN: + if (!wiphy_ext_feature_isset(wdev->wiphy, + NL80211_EXT_FEATURE_SECURE_NAN)) + return -EOPNOTSUPP; + break; default: return -EOPNOTSUPP; } @@ -13809,7 +13914,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN && !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && - nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KCK_EXT_LEN)) + nla_len(tb[NL80211_REKEY_DATA_KCK]) == NL80211_KCK_EXT_LEN) && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KCK_32 && + nla_len(tb[NL80211_REKEY_DATA_KCK]) == NL80211_KCK_EXT_LEN_32)) return -ERANGE; rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); @@ -18954,7 +19061,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef, gfp_t gfp, enum nl80211_commands notif, - u8 count, bool quiet) + u8 count, bool quiet, u16 punct_bitmap) { struct wireless_dev *wdev = netdev->ieee80211_ptr; struct sk_buff *msg; @@ -18988,6 +19095,9 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, goto nla_put_failure; } + if (nla_put_u32(msg, NL80211_ATTR_PUNCT_BITMAP, punct_bitmap)) + goto nla_put_failure; + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -19000,7 +19110,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, void cfg80211_ch_switch_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - unsigned int link_id) + unsigned int link_id, u16 punct_bitmap) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -19009,7 +19119,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev, ASSERT_WDEV_LOCK(wdev); WARN_INVALID_LINK_ID(wdev, link_id); - trace_cfg80211_ch_switch_notify(dev, chandef, link_id); + trace_cfg80211_ch_switch_notify(dev, chandef, link_id, punct_bitmap); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: @@ -19037,14 +19147,15 @@ void cfg80211_ch_switch_notify(struct net_device *dev, cfg80211_sched_dfs_chan_update(rdev); nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL, - NL80211_CMD_CH_SWITCH_NOTIFY, 0, false); + NL80211_CMD_CH_SWITCH_NOTIFY, 0, false, + punct_bitmap); } EXPORT_SYMBOL(cfg80211_ch_switch_notify); void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, unsigned int link_id, u8 count, - bool quiet) + bool quiet, u16 punct_bitmap) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -19053,15 +19164,17 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, ASSERT_WDEV_LOCK(wdev); 
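/* Hedged usage sketch (hypothetical driver code, not from this patch):
 * with the added punct_bitmap argument a driver reports the currently
 * punctured subchannels together with the new chandef, or 0 when nothing
 * is punctured (link_id 0 here stands for a non-MLO interface). The wdev
 * lock must already be held, as asserted in these notifiers:
 *
 *	wdev_lock(dev->ieee80211_ptr);
 *	cfg80211_ch_switch_notify(dev, &chandef, 0, punct_bitmap);
 *	wdev_unlock(dev->ieee80211_ptr);
 *
 * nl80211 then forwards the value to userspace as
 * NL80211_ATTR_PUNCT_BITMAP in the channel-switch notification.
 */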
WARN_INVALID_LINK_ID(wdev, link_id); - trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id); + trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id, + punct_bitmap); + nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL, NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, - count, quiet); + count, quiet, punct_bitmap); } EXPORT_SYMBOL(cfg80211_ch_switch_started_notify); -int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp, +int cfg80211_bss_color_notify(struct net_device *dev, enum nl80211_commands cmd, u8 count, u64 color_bitmap) { @@ -19075,7 +19188,7 @@ int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp, trace_cfg80211_bss_color_notify(dev, cmd, count, color_bitmap); - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -19098,7 +19211,7 @@ int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp, genlmsg_end(msg, hdr); return genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), - msg, 0, NL80211_MCGRP_MLME, gfp); + msg, 0, NL80211_MCGRP_MLME, GFP_KERNEL); nla_put_failure: nlmsg_free(msg); @@ -19661,7 +19774,7 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) } EXPORT_SYMBOL(cfg80211_crit_proto_stopped); -void nl80211_send_ap_stopped(struct wireless_dev *wdev) +void nl80211_send_ap_stopped(struct wireless_dev *wdev, unsigned int link_id) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -19679,7 +19792,9 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev) if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), - NL80211_ATTR_PAD)) + NL80211_ATTR_PAD) || + (wdev->valid_links && + nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id))) goto out; genlmsg_end(msg, hdr); @@ -19718,7 +19833,9 @@ int cfg80211_external_auth_request(struct net_device *dev, params->action) || nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, params->bssid) || nla_put(msg, NL80211_ATTR_SSID, params->ssid.ssid_len, - params->ssid.ssid)) + params->ssid.ssid) || + (!is_zero_ether_addr(params->mld_addr) && + nla_put(msg, NL80211_ATTR_MLD_ADDR, ETH_ALEN, params->mld_addr))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -19760,6 +19877,17 @@ void cfg80211_update_owe_info_event(struct net_device *netdev, nla_put(msg, NL80211_ATTR_IE, owe_info->ie_len, owe_info->ie)) goto nla_put_failure; + if (owe_info->assoc_link_id != -1) { + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, + owe_info->assoc_link_id)) + goto nla_put_failure; + + if (!is_zero_ether_addr(owe_info->peer_mld_addr) && + nla_put(msg, NL80211_ATTR_MLD_ADDR, ETH_ALEN, + owe_info->peer_mld_addr)) + goto nla_put_failure; + } + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index ba9457e94c43..0278d817bb02 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -114,7 +114,7 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, enum nl80211_radar_event event, struct net_device *netdev, gfp_t gfp); -void nl80211_send_ap_stopped(struct wireless_dev *wdev); +void nl80211_send_ap_stopped(struct wireless_dev *wdev, unsigned int link_id); void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 4f3f31244e8b..0d40d6af7e10 100644 --- 
a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -737,51 +737,9 @@ static bool valid_country(const u8 *data, unsigned int size, } #ifdef CONFIG_CFG80211_REQUIRE_SIGNED_REGDB -static struct key *builtin_regdb_keys; - -static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen) -{ - const u8 *end = p + buflen; - size_t plen; - key_ref_t key; +#include <keys/asymmetric-type.h> - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(builtin_regdb_keys, 1), - "asymmetric", NULL, p, plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_BUILT_IN | - KEY_ALLOC_BYPASS_RESTRICTION); - if (IS_ERR(key)) { - pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - } else { - pr_notice("Loaded X.509 cert '%s'\n", - key_ref_to_ptr(key)->description); - key_ref_put(key); - } - p += plen; - } - - return; - -dodgy_cert: - pr_err("Problem parsing in-kernel X.509 certificate list\n"); -} +static struct key *builtin_regdb_keys; static int __init load_builtin_regdb_keys(void) { @@ -797,11 +755,15 @@ static int __init load_builtin_regdb_keys(void) pr_notice("Loading compiled-in X.509 certificates for regulatory database\n"); #ifdef CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS - load_keys_from_buffer(shipped_regdb_certs, shipped_regdb_certs_len); + x509_load_certificate_list(shipped_regdb_certs, + shipped_regdb_certs_len, + builtin_regdb_keys); #endif #ifdef CONFIG_CFG80211_EXTRA_REGDB_KEYDIR if (CONFIG_CFG80211_EXTRA_REGDB_KEYDIR[0] != '\0') - load_keys_from_buffer(extra_regdb_certs, extra_regdb_certs_len); + x509_load_certificate_list(extra_regdb_certs, + extra_regdb_certs_len, + builtin_regdb_keys); #endif return 0; @@ -3198,6 +3160,9 @@ static void reg_process_self_managed_hint(struct wiphy *wiphy) request.alpha2[1] = regd->alpha2[1]; request.initiator = NL80211_REGDOM_SET_BY_DRIVER; + if (wiphy->flags & WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER) + reg_call_notifier(wiphy, &request); + nl80211_send_wiphy_reg_change_event(&request); } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 4b5b6ee0fe01..28ce13840a88 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -285,6 +285,15 @@ void cfg80211_conn_work(struct work_struct *work) wiphy_unlock(&rdev->wiphy); } +static void cfg80211_step_auth_next(struct cfg80211_conn *conn, + struct cfg80211_bss *bss) +{ + memcpy(conn->bssid, bss->bssid, ETH_ALEN); + conn->params.bssid = conn->bssid; + conn->params.channel = bss->channel; + conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; +} + /* Returned bss is reference counted and must be cleaned up appropriately. 
*/ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { @@ -302,10 +311,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) if (!bss) return NULL; - memcpy(wdev->conn->bssid, bss->bssid, ETH_ALEN); - wdev->conn->params.bssid = wdev->conn->bssid; - wdev->conn->params.channel = bss->channel; - wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; + cfg80211_step_auth_next(wdev->conn, bss); schedule_work(&rdev->conn_work); return bss; @@ -597,7 +603,12 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, wdev->conn->params.ssid_len = wdev->u.client.ssid_len; /* see if we have the bss already */ - bss = cfg80211_get_conn_bss(wdev); + bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, + wdev->conn->params.bssid, + wdev->conn->params.ssid, + wdev->conn->params.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (prev_bssid) { memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN); @@ -608,6 +619,7 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, if (bss) { enum nl80211_timeout_reason treason; + cfg80211_step_auth_next(wdev->conn, bss); err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { @@ -724,6 +736,7 @@ void __cfg80211_connect_result(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; const struct element *country_elem = NULL; + const struct element *ssid; const u8 *country_data; u8 country_datalen; #ifdef CONFIG_CFG80211_WEXT @@ -855,8 +868,7 @@ void __cfg80211_connect_result(struct net_device *dev, ETH_ALEN); } - if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) - cfg80211_upload_connect_keys(wdev); + cfg80211_upload_connect_keys(wdev); rcu_read_lock(); for_each_valid_link(cr, link) { @@ -883,6 +895,22 @@ void __cfg80211_connect_result(struct net_device *dev, country_data, country_datalen); kfree(country_data); + if (!wdev->u.client.ssid_len) { + rcu_read_lock(); + for_each_valid_link(cr, link) { + ssid = ieee80211_bss_get_elem(cr->links[link].bss, + WLAN_EID_SSID); + + if (!ssid || !ssid->datalen) + continue; + + memcpy(wdev->u.client.ssid, ssid->data, ssid->datalen); + wdev->u.client.ssid_len = ssid->datalen; + break; + } + rcu_read_unlock(); + } + return; out: for_each_valid_link(cr, link) @@ -1462,12 +1490,18 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, connect->crypto.ciphers_pairwise[0] = cipher; } } - - connect->crypto.wep_keys = connkeys->params; - connect->crypto.wep_tx_key = connkeys->def; } else { if (WARN_ON(connkeys)) return -EINVAL; + + /* connect can point to wdev->wext.connect which + * can hold key data from a previous connection + */ + connect->key = NULL; + connect->key_len = 0; + connect->key_idx = 0; + connect->crypto.cipher_group = 0; + connect->crypto.n_ciphers_pairwise = 0; } wdev->connect_keys = connkeys; diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 0c3f05c9be27..cdb638647e0b 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -148,7 +148,7 @@ static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); #define WIPHY_PM_OPS NULL #endif -static const void *wiphy_namespace(struct device *d) +static const void *wiphy_namespace(const struct device *d) { struct wiphy *wiphy = container_of(d, struct wiphy, dev); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index a405c3edbc47..ca7474eec723 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -19,8 +19,6 @@ else \ eth_zero_addr(__entry->entry_mac); \ } while 
(0) -#define MAC_PR_FMT "%pM" -#define MAC_PR_ARG(entry_mac) (__entry->entry_mac) #define MAXNAME 32 #define WIPHY_ENTRY __array(char, wiphy_name, 32) @@ -454,10 +452,10 @@ DECLARE_EVENT_CLASS(key_handle, __entry->pairwise = pairwise; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " - "key_index: %u, pairwise: %s, mac addr: " MAC_PR_FMT, + "key_index: %u, pairwise: %s, mac addr: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, __entry->key_index, BOOL_TO_STR(__entry->pairwise), - MAC_PR_ARG(mac_addr)) + __entry->mac_addr) ); DEFINE_EVENT(key_handle, rdev_get_key, @@ -496,10 +494,10 @@ TRACE_EVENT(rdev_add_key, ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " "key_index: %u, mode: %u, pairwise: %s, " - "mac addr: " MAC_PR_FMT, + "mac addr: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, __entry->key_index, __entry->mode, - BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) + BOOL_TO_STR(__entry->pairwise), __entry->mac_addr) ); TRACE_EVENT(rdev_set_default_key, @@ -813,11 +811,11 @@ DECLARE_EVENT_CLASS(station_add_change, __entry->opmode_notif_used = params->link_sta_params.opmode_notif_used; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" ", station flags mask: %u, station flags set: %u, " "station modify mask: %u, listen interval: %d, aid: %u, " "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac), + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, __entry->sta_flags_mask, __entry->sta_flags_set, __entry->sta_modify_mask, __entry->listen_interval, __entry->aid, __entry->plink_action, __entry->plink_state, @@ -849,8 +847,8 @@ DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt, NETDEV_ASSIGN; MAC_ASSIGN(sta_mac, mac); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac) ); DECLARE_EVENT_CLASS(station_del, @@ -871,9 +869,9 @@ DECLARE_EVENT_CLASS(station_del, __entry->subtype = params->subtype; __entry->reason_code = params->reason_code; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" ", subtype: %u, reason_code: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac), + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, __entry->subtype, __entry->reason_code) ); @@ -909,8 +907,8 @@ TRACE_EVENT(rdev_dump_station, MAC_ASSIGN(sta_mac, mac); __entry->idx = _idx; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", idx: %d", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM, idx: %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, __entry->idx) ); @@ -947,9 +945,9 @@ DECLARE_EVENT_CLASS(mpath_evt, MAC_ASSIGN(dst, dst); MAC_ASSIGN(next_hop, next_hop); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: " MAC_PR_FMT ", next hop: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dst), - MAC_PR_ARG(next_hop)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: %pM, next hop: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dst, + __entry->next_hop) ); DEFINE_EVENT(mpath_evt, rdev_add_mpath, @@ -988,10 +986,9 @@ TRACE_EVENT(rdev_dump_mpath, MAC_ASSIGN(next_hop, next_hop); __entry->idx = _idx; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: 
%d, destination: " - MAC_PR_FMT ", next hop: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, MAC_PR_ARG(dst), - MAC_PR_ARG(next_hop)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: %pM, next hop: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, __entry->dst, + __entry->next_hop) ); TRACE_EVENT(rdev_get_mpp, @@ -1010,9 +1007,9 @@ TRACE_EVENT(rdev_get_mpp, MAC_ASSIGN(dst, dst); MAC_ASSIGN(mpp, mpp); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: " MAC_PR_FMT - ", mpp: " MAC_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, - MAC_PR_ARG(dst), MAC_PR_ARG(mpp)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: %pM" + ", mpp: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->dst, __entry->mpp) ); TRACE_EVENT(rdev_dump_mpp, @@ -1033,10 +1030,9 @@ TRACE_EVENT(rdev_dump_mpp, MAC_ASSIGN(mpp, mpp); __entry->idx = _idx; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: " - MAC_PR_FMT ", mpp: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, MAC_PR_ARG(dst), - MAC_PR_ARG(mpp)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: %pM, mpp: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, __entry->dst, + __entry->mpp) ); TRACE_EVENT(rdev_return_int_mpath_info, @@ -1243,9 +1239,9 @@ TRACE_EVENT(rdev_auth, eth_zero_addr(__entry->bssid); __entry->auth_type = req->auth_type; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", auth type: %d, bssid: " MAC_PR_FMT, + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", auth type: %d, bssid: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->auth_type, - MAC_PR_ARG(bssid)) + __entry->bssid) ); TRACE_EVENT(rdev_assoc, @@ -1294,10 +1290,10 @@ TRACE_EVENT(rdev_assoc, memcpy(__get_dynamic_array(fils_nonces), req->fils_nonces, 2 * FILS_NONCE_LEN); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT - ", previous bssid: " MAC_PR_FMT ", use mfp: %s, flags: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), - MAC_PR_ARG(prev_bssid), BOOL_TO_STR(__entry->use_mfp), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM" + ", previous bssid: %pM, use mfp: %s, flags: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, + __entry->prev_bssid, BOOL_TO_STR(__entry->use_mfp), __entry->flags) ); @@ -1317,8 +1313,8 @@ TRACE_EVENT(rdev_deauth, MAC_ASSIGN(bssid, req->bssid); __entry->reason_code = req->reason_code; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", reason: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM, reason: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->reason_code) ); @@ -1340,9 +1336,9 @@ TRACE_EVENT(rdev_disassoc, __entry->reason_code = req->reason_code; __entry->local_state_change = req->local_state_change; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM" ", reason: %u, local state change: %s", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->reason_code, BOOL_TO_STR(__entry->local_state_change)) ); @@ -1413,12 +1409,12 @@ TRACE_EVENT(rdev_connect, __entry->flags = sme->flags; MAC_ASSIGN(prev_bssid, sme->prev_bssid); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM" ", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, " - "flags: %u, previous bssid: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid, + "flags: 
%u, previous bssid: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid, __entry->auth_type, BOOL_TO_STR(__entry->privacy), - __entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid)) + __entry->wpa_versions, __entry->flags, __entry->prev_bssid) ); TRACE_EVENT(rdev_update_connect_params, @@ -1542,8 +1538,8 @@ TRACE_EVENT(rdev_join_ibss, memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1); memcpy(__entry->ssid, params->ssid, params->ssid_len); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", ssid: %s", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM, ssid: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid) ); TRACE_EVENT(rdev_join_ocb, @@ -1664,9 +1660,9 @@ TRACE_EVENT(rdev_set_bitrate_mask, __entry->link_id = link_id; MAC_ASSIGN(peer, peer); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, peer: " MAC_PR_FMT, + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, peer: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, - MAC_PR_ARG(peer)) + __entry->peer) ); TRACE_EVENT(rdev_update_mgmt_frame_registrations, @@ -1810,10 +1806,10 @@ TRACE_EVENT(rdev_tdls_mgmt, __entry->initiator = initiator; memcpy(__get_dynamic_array(buf), buf, len); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", action_code: %u, " + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, action_code: %u, " "dialog_token: %u, status_code: %u, peer_capability: %u " "initiator: %s buf: %#.2x ", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->action_code, __entry->dialog_token, __entry->status_code, __entry->peer_capability, BOOL_TO_STR(__entry->initiator), @@ -1893,8 +1889,8 @@ TRACE_EVENT(rdev_tdls_oper, MAC_ASSIGN(peer, peer); __entry->oper = oper; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", oper: %d", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->oper) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, oper: %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->oper) ); DECLARE_EVENT_CLASS(rdev_pmksa, @@ -1911,8 +1907,8 @@ DECLARE_EVENT_CLASS(rdev_pmksa, NETDEV_ASSIGN; MAC_ASSIGN(bssid, pmksa->bssid); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid) ); TRACE_EVENT(rdev_probe_client, @@ -1929,8 +1925,8 @@ TRACE_EVENT(rdev_probe_client, NETDEV_ASSIGN; MAC_ASSIGN(peer, peer); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer) ); DEFINE_EVENT(rdev_pmksa, rdev_set_pmksa, @@ -2051,9 +2047,9 @@ TRACE_EVENT(rdev_tx_control_port, __entry->unencrypted = unencrypted; __entry->link_id = link_id; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT "," + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM," " proto: 0x%x, unencrypted: %s, link: %d", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest), + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dest, be16_to_cpu(__entry->proto), BOOL_TO_STR(__entry->unencrypted), __entry->link_id) @@ -2392,8 +2388,8 @@ TRACE_EVENT(rdev_add_tx_ts, __entry->user_prio = user_prio; __entry->admitted_time = admitted_time; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d, UP %d, time %d", 
- WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, TSID %d, UP %d, time %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->tsid, __entry->user_prio, __entry->admitted_time) ); @@ -2413,8 +2409,8 @@ TRACE_EVENT(rdev_del_tx_ts, MAC_ASSIGN(peer, peer); __entry->tsid = tsid; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, TSID %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->tsid) ); TRACE_EVENT(rdev_tdls_channel_switch, @@ -2435,9 +2431,9 @@ TRACE_EVENT(rdev_tdls_channel_switch, MAC_ASSIGN(addr, addr); CHAN_DEF_ASSIGN(chandef); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM" " oper class %d, " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr), + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->addr, __entry->oper_class, CHAN_DEF_PR_ARG) ); @@ -2455,8 +2451,8 @@ TRACE_EVENT(rdev_tdls_cancel_channel_switch, NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->addr) ); TRACE_EVENT(rdev_set_pmk, @@ -2488,9 +2484,9 @@ TRACE_EVENT(rdev_set_pmk, pmk_conf->pmk_r0_name ? WLAN_PMK_NAME_LEN : 0); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM" "pmk_len=%u, pmk: %s pmk_r0_name: %s", WIPHY_PR_ARG, - NETDEV_PR_ARG, MAC_PR_ARG(aa), __entry->pmk_len, + NETDEV_PR_ARG, __entry->aa, __entry->pmk_len, __print_array(__get_dynamic_array(pmk), __get_dynamic_array_len(pmk), 1), __entry->pmk_r0_name_len ? 
@@ -2515,8 +2511,8 @@ TRACE_EVENT(rdev_del_pmk, MAC_ASSIGN(aa, aa); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(aa)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->aa) ); TRACE_EVENT(rdev_external_auth, @@ -2528,6 +2524,7 @@ TRACE_EVENT(rdev_external_auth, MAC_ENTRY(bssid) __array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1) __field(u16, status) + MAC_ENTRY(mld_addr) ), TP_fast_assign(WIPHY_ASSIGN; NETDEV_ASSIGN; @@ -2536,10 +2533,12 @@ TRACE_EVENT(rdev_external_auth, memcpy(__entry->ssid, params->ssid.ssid, params->ssid.ssid_len); __entry->status = params->status; + MAC_ASSIGN(mld_addr, params->mld_addr); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT - ", ssid: %s, status: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, - __entry->bssid, __entry->ssid, __entry->status) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM" + ", ssid: %s, status: %u, mld_addr: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, + __entry->ssid, __entry->status, __entry->mld_addr) ); TRACE_EVENT(rdev_start_radar_detection, @@ -2720,8 +2719,8 @@ TRACE_EVENT(rdev_update_owe_info, __entry->status = owe_info->status; memcpy(__get_dynamic_array(ie), owe_info->ie, owe_info->ie_len);), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT - " status %d", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM" + " status %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->status) ); @@ -2739,8 +2738,8 @@ TRACE_EVENT(rdev_probe_mesh_link, NETDEV_ASSIGN; MAC_ASSIGN(dest, dest); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dest) ); TRACE_EVENT(rdev_set_tid_config, @@ -2757,8 +2756,8 @@ TRACE_EVENT(rdev_set_tid_config, NETDEV_ASSIGN; MAC_ASSIGN(peer, tid_conf->peer); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer) ); TRACE_EVENT(rdev_reset_tid_config, @@ -2777,8 +2776,8 @@ TRACE_EVENT(rdev_reset_tid_config, MAC_ASSIGN(peer, peer); __entry->tids = tids; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tids: 0x%x", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tids) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM, tids: 0x%x", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->tids) ); TRACE_EVENT(rdev_set_sar_specs, @@ -2881,8 +2880,8 @@ DECLARE_EVENT_CLASS(cfg80211_netdev_mac_evt, NETDEV_ASSIGN; MAC_ASSIGN(macaddr, macaddr); ), - TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT, - NETDEV_PR_ARG, MAC_PR_ARG(macaddr)) + TP_printk(NETDEV_PR_FMT ", mac: %pM", + NETDEV_PR_ARG, __entry->macaddr) ); DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate, @@ -2920,8 +2919,8 @@ TRACE_EVENT(cfg80211_send_rx_assoc, MAC_ASSIGN(ap_addr, data->ap_mld_addr ?: data->links[0].bss->bssid); ), - TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT, - NETDEV_PR_ARG, MAC_PR_ARG(ap_addr)) + TP_printk(NETDEV_PR_FMT ", %pM", + NETDEV_PR_ARG, __entry->ap_addr) ); DECLARE_EVENT_CLASS(netdev_frame_event, @@ -2981,8 +2980,8 @@ DECLARE_EVENT_CLASS(netdev_mac_evt, NETDEV_ASSIGN; MAC_ASSIGN(mac, mac) ), - TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT, - NETDEV_PR_ARG, MAC_PR_ARG(mac)) + 
TP_printk(NETDEV_PR_FMT ", mac: %pM", + NETDEV_PR_ARG, __entry->mac) ); DEFINE_EVENT(netdev_mac_evt, cfg80211_send_auth_timeout, @@ -3004,8 +3003,8 @@ TRACE_EVENT(cfg80211_send_assoc_failure, MAC_ASSIGN(ap_addr, data->ap_mld_addr ?: data->bss[0]->bssid); __entry->timeout = data->timeout; ), - TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT ", timeout: %d", - NETDEV_PR_ARG, MAC_PR_ARG(ap_addr), __entry->timeout) + TP_printk(NETDEV_PR_FMT ", mac: %pM, timeout: %d", + NETDEV_PR_ARG, __entry->ap_addr, __entry->timeout) ); TRACE_EVENT(cfg80211_michael_mic_failure, @@ -3027,8 +3026,8 @@ TRACE_EVENT(cfg80211_michael_mic_failure, if (tsc) memcpy(__entry->tsc, tsc, 6); ), - TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", key type: %d, key id: %d, tsc: %pm", - NETDEV_PR_ARG, MAC_PR_ARG(addr), __entry->key_type, + TP_printk(NETDEV_PR_FMT ", %pM, key type: %d, key id: %d, tsc: %pm", + NETDEV_PR_ARG, __entry->addr, __entry->key_type, __entry->key_id, __entry->tsc) ); @@ -3104,8 +3103,8 @@ TRACE_EVENT(cfg80211_new_sta, MAC_ASSIGN(mac_addr, mac_addr); SINFO_ASSIGN; ), - TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT, - NETDEV_PR_ARG, MAC_PR_ARG(mac_addr)) + TP_printk(NETDEV_PR_FMT ", %pM", + NETDEV_PR_ARG, __entry->mac_addr) ); DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta, @@ -3182,8 +3181,8 @@ TRACE_EVENT(cfg80211_rx_control_port, __entry->proto = be16_to_cpu(skb->protocol); __entry->unencrypted = unencrypted; ), - TP_printk(NETDEV_PR_FMT ", len=%d, " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s", - NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(from), + TP_printk(NETDEV_PR_FMT ", len=%d, %pM, proto: 0x%x, unencrypted: %s", + NETDEV_PR_ARG, __entry->len, __entry->from, __entry->proto, BOOL_TO_STR(__entry->unencrypted)) ); @@ -3245,39 +3244,47 @@ TRACE_EVENT(cfg80211_chandef_dfs_required, TRACE_EVENT(cfg80211_ch_switch_notify, TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef, - unsigned int link_id), - TP_ARGS(netdev, chandef, link_id), + unsigned int link_id, + u16 punct_bitmap), + TP_ARGS(netdev, chandef, link_id, punct_bitmap), TP_STRUCT__entry( NETDEV_ENTRY CHAN_DEF_ENTRY __field(unsigned int, link_id) + __field(u16, punct_bitmap) ), TP_fast_assign( NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); __entry->link_id = link_id; + __entry->punct_bitmap = punct_bitmap; ), - TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", - NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id) + TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d, punct_bitmap:%u", + NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id, + __entry->punct_bitmap) ); TRACE_EVENT(cfg80211_ch_switch_started_notify, TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef, - unsigned int link_id), - TP_ARGS(netdev, chandef, link_id), + unsigned int link_id, + u16 punct_bitmap), + TP_ARGS(netdev, chandef, link_id, punct_bitmap), TP_STRUCT__entry( NETDEV_ENTRY CHAN_DEF_ENTRY __field(unsigned int, link_id) + __field(u16, punct_bitmap) ), TP_fast_assign( NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); __entry->link_id = link_id; + __entry->punct_bitmap = punct_bitmap; ), - TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", - NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id) + TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d, punct_bitmap:%u", + NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id, + __entry->punct_bitmap) ); TRACE_EVENT(cfg80211_radar_event, @@ -3324,7 +3331,7 @@ DECLARE_EVENT_CLASS(cfg80211_rx_evt, NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); ), - TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT, NETDEV_PR_ARG, 
MAC_PR_ARG(addr)) + TP_printk(NETDEV_PR_FMT ", %pM", NETDEV_PR_ARG, __entry->addr) ); DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame, @@ -3351,8 +3358,8 @@ TRACE_EVENT(cfg80211_ibss_joined, MAC_ASSIGN(bssid, bssid); CHAN_ASSIGN(channel); ), - TP_printk(NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", " CHAN_PR_FMT, - NETDEV_PR_ARG, MAC_PR_ARG(bssid), CHAN_PR_ARG) + TP_printk(NETDEV_PR_FMT ", bssid: %pM, " CHAN_PR_FMT, + NETDEV_PR_ARG, __entry->bssid, CHAN_PR_ARG) ); TRACE_EVENT(cfg80211_probe_status, @@ -3371,8 +3378,8 @@ TRACE_EVENT(cfg80211_probe_status, __entry->cookie = cookie; __entry->acked = acked; ), - TP_printk(NETDEV_PR_FMT " addr:" MAC_PR_FMT ", cookie: %llu, acked: %s", - NETDEV_PR_ARG, MAC_PR_ARG(addr), __entry->cookie, + TP_printk(NETDEV_PR_FMT " addr:%pM, cookie: %llu, acked: %s", + NETDEV_PR_ARG, __entry->addr, __entry->cookie, BOOL_TO_STR(__entry->acked)) ); @@ -3389,8 +3396,8 @@ TRACE_EVENT(cfg80211_cqm_pktloss_notify, MAC_ASSIGN(peer, peer); __entry->num_packets = num_packets; ), - TP_printk(NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", num of lost packets: %u", - NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->num_packets) + TP_printk(NETDEV_PR_FMT ", peer: %pM, num of lost packets: %u", + NETDEV_PR_ARG, __entry->peer, __entry->num_packets) ); DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_gtk_rekey_notify, @@ -3414,8 +3421,8 @@ TRACE_EVENT(cfg80211_pmksa_candidate_notify, MAC_ASSIGN(bssid, bssid); __entry->preauth = preauth; ), - TP_printk(NETDEV_PR_FMT ", index:%d, bssid: " MAC_PR_FMT ", pre auth: %s", - NETDEV_PR_ARG, __entry->index, MAC_PR_ARG(bssid), + TP_printk(NETDEV_PR_FMT ", index:%d, bssid: %pM, pre auth: %s", + NETDEV_PR_ARG, __entry->index, __entry->bssid, BOOL_TO_STR(__entry->preauth)) ); @@ -3455,8 +3462,8 @@ TRACE_EVENT(cfg80211_tdls_oper_request, __entry->oper = oper; __entry->reason_code = reason_code; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", oper: %d, reason_code %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->oper, + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM, oper: %d, reason_code %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->oper, __entry->reason_code) ); @@ -3494,10 +3501,10 @@ TRACE_EVENT(cfg80211_scan_done, MAC_ASSIGN(tsf_bssid, info->tsf_bssid); } ), - TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: " MAC_PR_FMT, + TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: %pM", BOOL_TO_STR(__entry->aborted), (unsigned long long)__entry->scan_start_tsf, - MAC_PR_ARG(tsf_bssid)) + __entry->tsf_bssid) ); DECLARE_EVENT_CLASS(wiphy_id_evt, @@ -3546,9 +3553,9 @@ TRACE_EVENT(cfg80211_get_bss, __entry->bss_type = bss_type; __entry->privacy = privacy; ), - TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", " MAC_PR_FMT + TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", %pM" ", buf: %#.2x, bss_type: %d, privacy: %d", - WIPHY_PR_ARG, CHAN_PR_ARG, MAC_PR_ARG(bssid), + WIPHY_PR_ARG, CHAN_PR_ARG, __entry->bssid, ((u8 *)__get_dynamic_array(ssid))[0], __entry->bss_type, __entry->privacy) ); @@ -3579,11 +3586,11 @@ TRACE_EVENT(cfg80211_inform_bss_frame, MAC_ASSIGN(parent_bssid, data->parent_bssid); ), TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT - "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: " - MAC_PR_FMT, WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, + "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: %pM", + WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, __entry->signal, (unsigned long long)__entry->ts_boottime, (unsigned long long)__entry->parent_tsf, - 
MAC_PR_ARG(parent_bssid)) + __entry->parent_bssid) ); DECLARE_EVENT_CLASS(cfg80211_bss_evt, @@ -3597,7 +3604,7 @@ DECLARE_EVENT_CLASS(cfg80211_bss_evt, MAC_ASSIGN(bssid, pub->bssid); CHAN_ASSIGN(pub->channel); ), - TP_printk(MAC_PR_FMT ", " CHAN_PR_FMT, MAC_PR_ARG(bssid), CHAN_PR_ARG) + TP_printk("%pM, " CHAN_PR_FMT, __entry->bssid, CHAN_PR_ARG) ); DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss, @@ -3689,8 +3696,8 @@ TRACE_EVENT(cfg80211_ft_event, memcpy(__get_dynamic_array(ric_ies), ft_event->ric_ies, ft_event->ric_ies_len); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", target_ap: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(target_ap)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", target_ap: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->target_ap) ); TRACE_EVENT(cfg80211_stop_iface, @@ -3724,10 +3731,10 @@ TRACE_EVENT(cfg80211_pmsr_report, __entry->cookie = cookie; MAC_ASSIGN(addr, addr); ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld, " MAC_PR_FMT, + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld, %pM", WIPHY_PR_ARG, WDEV_PR_ARG, (unsigned long long)__entry->cookie, - MAC_PR_ARG(addr)) + __entry->addr) ); TRACE_EVENT(cfg80211_pmsr_complete, @@ -3749,20 +3756,30 @@ TRACE_EVENT(cfg80211_pmsr_complete, ); TRACE_EVENT(cfg80211_update_owe_info_event, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_update_owe_info *owe_info), - TP_ARGS(wiphy, netdev, owe_info), - TP_STRUCT__entry(WIPHY_ENTRY - NETDEV_ENTRY - MAC_ENTRY(peer) - __dynamic_array(u8, ie, owe_info->ie_len)), - TP_fast_assign(WIPHY_ASSIGN; - NETDEV_ASSIGN; - MAC_ASSIGN(peer, owe_info->peer); - memcpy(__get_dynamic_array(ie), owe_info->ie, - owe_info->ie_len);), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info), + TP_ARGS(wiphy, netdev, owe_info), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __dynamic_array(u8, ie, owe_info->ie_len) + __field(int, assoc_link_id) + MAC_ENTRY(peer_mld_addr) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, owe_info->peer); + memcpy(__get_dynamic_array(ie), owe_info->ie, + owe_info->ie_len); + __entry->assoc_link_id = owe_info->assoc_link_id; + MAC_ASSIGN(peer_mld_addr, owe_info->peer_mld_addr); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM," + " assoc_link_id: %d, peer_mld_addr: %pM", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, + __entry->assoc_link_id, __entry->peer_mld_addr) ); TRACE_EVENT(cfg80211_bss_color_notify, @@ -3800,8 +3817,8 @@ TRACE_EVENT(cfg80211_assoc_comeback, MAC_ASSIGN(ap_addr, ap_addr); __entry->timeout = timeout; ), - TP_printk(WDEV_PR_FMT ", " MAC_PR_FMT ", timeout: %u TUs", - WDEV_PR_ARG, MAC_PR_ARG(ap_addr), __entry->timeout) + TP_printk(WDEV_PR_FMT ", %pM, timeout: %u TUs", + WDEV_PR_ARG, __entry->ap_addr, __entry->timeout) ); DECLARE_EVENT_CLASS(link_station_add_mod, @@ -3859,10 +3876,10 @@ DECLARE_EVENT_CLASS(link_station_add_mod, memcpy(__get_dynamic_array(eht_capa), params->eht_capa, params->eht_capa_len); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT - ", link mac: " MAC_PR_FMT ", link id: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(mld_mac), - MAC_PR_ARG(link_mac), __entry->link_id) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" + ", link mac: %pM, link id: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mld_mac, + 
__entry->link_mac, __entry->link_id) ); DEFINE_EVENT(link_station_add_mod, rdev_add_link_station, @@ -3895,9 +3912,9 @@ TRACE_EVENT(rdev_del_link_station, memcpy(__entry->mld_mac, params->mld_mac, 6); __entry->link_id = params->link_id; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" ", link id: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(mld_mac), + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mld_mac, __entry->link_id) ); diff --git a/net/wireless/util.c b/net/wireless/util.c index 8f403f9fe816..d1a89e82ead0 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -542,6 +542,64 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); +bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto) +{ + const __be16 *hdr_proto = hdr + ETH_ALEN; + + if (!(ether_addr_equal(hdr, rfc1042_header) && + *hdr_proto != htons(ETH_P_AARP) && + *hdr_proto != htons(ETH_P_IPX)) && + !ether_addr_equal(hdr, bridge_tunnel_header)) + return false; + + *proto = *hdr_proto; + + return true; +} +EXPORT_SYMBOL(ieee80211_get_8023_tunnel_proto); + +int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb) +{ + const void *mesh_addr; + struct { + struct ethhdr eth; + u8 flags; + } payload; + int hdrlen; + int ret; + + ret = skb_copy_bits(skb, 0, &payload, sizeof(payload)); + if (ret) + return ret; + + hdrlen = sizeof(payload.eth) + __ieee80211_get_mesh_hdrlen(payload.flags); + + if (likely(pskb_may_pull(skb, hdrlen + 8) && + ieee80211_get_8023_tunnel_proto(skb->data + hdrlen, + &payload.eth.h_proto))) + hdrlen += ETH_ALEN + 2; + else if (!pskb_may_pull(skb, hdrlen)) + return -EINVAL; + + mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN; + switch (payload.flags & MESH_FLAGS_AE) { + case MESH_FLAGS_AE_A4: + memcpy(&payload.eth.h_source, mesh_addr, ETH_ALEN); + break; + case MESH_FLAGS_AE_A5_A6: + memcpy(&payload.eth, mesh_addr, 2 * ETH_ALEN); + break; + default: + break; + } + + pskb_pull(skb, hdrlen - sizeof(payload.eth)); + memcpy(skb->data, &payload.eth, sizeof(payload.eth)); + + return 0; +} +EXPORT_SYMBOL(ieee80211_strip_8023_mesh_hdr); + int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, u8 data_offset, bool is_amsdu) @@ -553,7 +611,6 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, } payload; struct ethhdr tmp; u16 hdrlen; - u8 mesh_flags = 0; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; @@ -574,12 +631,6 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); - if (iftype == NL80211_IFTYPE_MESH_POINT && - skb_copy_bits(skb, hdrlen, &mesh_flags, 1) < 0) - return -1; - - mesh_flags &= MESH_FLAGS_AE; - switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): @@ -593,17 +644,6 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; - if (iftype == NL80211_IFTYPE_MESH_POINT) { - if (mesh_flags == MESH_FLAGS_AE_A4) - return -1; - if (mesh_flags == MESH_FLAGS_AE_A5_A6 && - skb_copy_bits(skb, hdrlen + - offsetof(struct ieee80211s_hdr, eaddr1), - tmp.h_dest, 2 * ETH_ALEN) < 0) - return -1; - - hdrlen += 
__ieee80211_get_mesh_hdrlen(mesh_flags); - } break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && @@ -612,16 +652,6 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, (is_multicast_ether_addr(tmp.h_dest) && ether_addr_equal(tmp.h_source, addr))) return -1; - if (iftype == NL80211_IFTYPE_MESH_POINT) { - if (mesh_flags == MESH_FLAGS_AE_A5_A6) - return -1; - if (mesh_flags == MESH_FLAGS_AE_A4 && - skb_copy_bits(skb, hdrlen + - offsetof(struct ieee80211s_hdr, eaddr1), - tmp.h_source, ETH_ALEN) < 0) - return -1; - hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); - } break; case cpu_to_le16(0): if (iftype != NL80211_IFTYPE_ADHOC && @@ -631,15 +661,11 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, break; } - if (likely(skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && - ((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) && - payload.proto != htons(ETH_P_AARP) && - payload.proto != htons(ETH_P_IPX)) || - ether_addr_equal(payload.hdr, bridge_tunnel_header)))) { - /* remove RFC1042 or Bridge-Tunnel encapsulation and - * replace EtherType */ + if (likely(!is_amsdu && iftype != NL80211_IFTYPE_MESH_POINT && + skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && + ieee80211_get_8023_tunnel_proto(&payload, &tmp.h_proto))) { + /* remove RFC1042 or Bridge-Tunnel encapsulation */ hdrlen += ETH_ALEN + 2; - tmp.h_proto = payload.proto; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); @@ -711,7 +737,8 @@ __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, static struct sk_buff * __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, - int offset, int len, bool reuse_frag) + int offset, int len, bool reuse_frag, + int min_len) { struct sk_buff *frame; int cur_len = len; @@ -725,7 +752,7 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, * in the stack later. 
*/ if (reuse_frag) - cur_len = min_t(int, len, 32); + cur_len = min_t(int, len, min_len); /* * Allocate and reserve two bytes more for payload @@ -735,6 +762,7 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, if (!frame) return NULL; + frame->priority = skb->priority; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); @@ -748,28 +776,72 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, return frame; } +bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr) +{ + int offset = 0, remaining, subframe_len, padding; + + for (offset = 0; offset < skb->len; offset += subframe_len + padding) { + struct { + __be16 len; + u8 mesh_flags; + } hdr; + u16 len; + + if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0) + return false; + + if (mesh_hdr) + len = le16_to_cpu(*(__le16 *)&hdr.len) + + __ieee80211_get_mesh_hdrlen(hdr.mesh_flags); + else + len = ntohs(hdr.len); + + subframe_len = sizeof(struct ethhdr) + len; + padding = (4 - subframe_len) & 0x3; + remaining = skb->len - offset; + + if (subframe_len > remaining) + return false; + } + + return true; +} +EXPORT_SYMBOL(ieee80211_is_valid_amsdu); + void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, - const u8 *check_da, const u8 *check_sa) + const u8 *check_da, const u8 *check_sa, + bool mesh_control) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; - u16 ethertype; - u8 *payload; int offset = 0, remaining; - struct ethhdr eth; + struct { + struct ethhdr eth; + uint8_t flags; + } hdr; bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); bool reuse_skb = false; bool last = false; + int copy_len = sizeof(hdr.eth); + + if (iftype == NL80211_IFTYPE_MESH_POINT) + copy_len = sizeof(hdr); while (!last) { unsigned int subframe_len; - int len; + int len, mesh_len = 0; u8 padding; - skb_copy_bits(skb, offset, &eth, sizeof(eth)); - len = ntohs(eth.h_proto); + skb_copy_bits(skb, offset, &hdr, copy_len); + if (iftype == NL80211_IFTYPE_MESH_POINT) + mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags); + if (mesh_control) + len = le16_to_cpu(*(__le16 *)&hdr.eth.h_proto) + mesh_len; + else + len = ntohs(hdr.eth.h_proto); + subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; @@ -778,16 +850,16 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, if (subframe_len > remaining) goto purge; /* mitigate A-MSDU aggregation injection attacks */ - if (ether_addr_equal(eth.h_dest, rfc1042_header)) + if (ether_addr_equal(hdr.eth.h_dest, rfc1042_header)) goto purge; offset += sizeof(struct ethhdr); last = remaining <= subframe_len + padding; /* FIXME: should we really accept multicast DA?
*/ if ((check_da && !is_multicast_ether_addr(eth.h_dest) && - !ether_addr_equal(check_da, eth.h_dest)) || - (check_sa && !ether_addr_equal(check_sa, eth.h_source))) { + if ((check_da && !is_multicast_ether_addr(hdr.eth.h_dest) && + !ether_addr_equal(check_da, hdr.eth.h_dest)) || + (check_sa && !ether_addr_equal(check_sa, hdr.eth.h_source))) { offset += len + padding; continue; } @@ -799,7 +871,7 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, reuse_skb = true; } else { frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, - reuse_frag); + reuse_frag, 32 + mesh_len); if (!frame) goto purge; @@ -810,16 +882,11 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, frame->dev = skb->dev; frame->priority = skb->priority; - payload = frame->data; - ethertype = (payload[6] << 8) | payload[7]; - if (likely((ether_addr_equal(payload, rfc1042_header) && - ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || - ether_addr_equal(payload, bridge_tunnel_header))) { - eth.h_proto = htons(ethertype); + if (likely(iftype != NL80211_IFTYPE_MESH_POINT && + ieee80211_get_8023_tunnel_proto(frame->data, &hdr.eth.h_proto))) skb_pull(frame, ETH_ALEN + 2); - } - memcpy(skb_push(frame, sizeof(eth)), &eth, sizeof(eth)); + memcpy(skb_push(frame, sizeof(hdr.eth)), &hdr.eth, sizeof(hdr.eth)); __skb_queue_tail(list, frame); } @@ -934,7 +1001,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) if (!wdev->connect_keys) return; - for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) { + for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, -1, i, false, NULL, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 8a24dfca75af..e3acfac7430a 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -439,7 +439,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, GFP_KERNEL); if (!wdev->wext.keys) return -ENOMEM; - for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) + for (i = 0; i < 4; i++) wdev->wext.keys->params[i].key = wdev->wext.keys->data[i]; } diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index fe8765c4075d..13a72b17248e 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -636,7 +636,15 @@ void wireless_send_event(struct net_device * dev, } EXPORT_SYMBOL(wireless_send_event); +#ifdef CONFIG_CFG80211_WEXT +static void wireless_warn_cfg80211_wext(void) +{ + char name[sizeof(current->comm)]; + pr_warn_ratelimited("warning: `%s' uses wireless extensions that are deprecated for modern drivers; use nl80211\n", + get_task_comm(name, current)); +} +#endif /* IW handlers */ @@ -652,8 +660,12 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev) if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy && dev->ieee80211_ptr->wiphy->wext && - dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) + dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) { + wireless_warn_cfg80211_wext(); + if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) + return NULL; return dev->ieee80211_ptr->wiphy->wext->get_wireless_stats(dev); + } #endif /* not found */ @@ -690,8 +702,12 @@ static iw_handler get_handler(struct net_device *dev, unsigned int cmd) const struct iw_handler_def *handlers = NULL; #ifdef CONFIG_CFG80211_WEXT - if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) + if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) { + wireless_warn_cfg80211_wext(); + if (dev->ieee80211_ptr->wiphy->flags &
WIPHY_FLAG_SUPPORTS_MLO) + return NULL; handlers = dev->ieee80211_ptr->wiphy->wext; + } #endif #ifdef CONFIG_WIRELESS_EXT if (dev->wireless_handlers) diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 191c6d98c700..f231207ca210 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -47,7 +47,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) + for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 3b55502b2965..5c7ad301d742 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -482,6 +482,12 @@ static int x25_listen(struct socket *sock, int backlog) int rc = -EOPNOTSUPP; lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + rc = -EINVAL; + release_sock(sk); + return rc; + } + if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9f0561b67c12..45eef5af0a51 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -511,7 +511,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, return skb; } -static int xsk_generic_xmit(struct sock *sk) +static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); u32 max_batch = TX_BATCH_SIZE; @@ -594,22 +594,13 @@ out: return err; } -static int xsk_xmit(struct sock *sk) +static int xsk_generic_xmit(struct sock *sk) { - struct xdp_sock *xs = xdp_sk(sk); int ret; - if (unlikely(!(xs->dev->flags & IFF_UP))) - return -ENETDOWN; - if (unlikely(!xs->tx)) - return -ENOBUFS; - - if (xs->zc) - return xsk_wakeup(xs, XDP_WAKEUP_TX); - /* Drop the RCU lock since the SKB path might sleep. */ rcu_read_unlock(); - ret = xsk_generic_xmit(sk); + ret = __xsk_generic_xmit(sk); /* Reaquire RCU lock before going into common code. 
*/ rcu_read_lock(); @@ -627,17 +618,31 @@ static bool xsk_no_wakeup(struct sock *sk) #endif } +static int xsk_check_common(struct xdp_sock *xs) +{ + if (unlikely(!xsk_is_bound(xs))) + return -ENXIO; + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + + return 0; +} + static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; + int err; - if (unlikely(!xsk_is_bound(xs))) - return -ENXIO; + err = xsk_check_common(xs); + if (err) + return err; if (unlikely(need_wait)) return -EOPNOTSUPP; + if (unlikely(!xs->tx)) + return -ENOBUFS; if (sk_can_busy_loop(sk)) { if (xs->zc) @@ -649,8 +654,11 @@ static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len return 0; pool = xs->pool; - if (pool->cached_need_wakeup & XDP_WAKEUP_TX) - return xsk_xmit(sk); + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { + if (xs->zc) + return xsk_wakeup(xs, XDP_WAKEUP_TX); + return xsk_generic_xmit(sk); + } return 0; } @@ -670,11 +678,11 @@ static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int bool need_wait = !(flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + int err; - if (unlikely(!xsk_is_bound(xs))) - return -ENXIO; - if (unlikely(!(xs->dev->flags & IFF_UP))) - return -ENETDOWN; + err = xsk_check_common(xs); + if (err) + return err; if (unlikely(!xs->rx)) return -ENOBUFS; if (unlikely(need_wait)) @@ -713,21 +721,20 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, sock_poll_wait(file, sock, wait); rcu_read_lock(); - if (unlikely(!xsk_is_bound(xs))) { - rcu_read_unlock(); - return mask; - } + if (xsk_check_common(xs)) + goto skip_tx; pool = xs->pool; if (pool->cached_need_wakeup) { if (xs->zc) xsk_wakeup(xs, pool->cached_need_wakeup); - else + else if (xs->tx) /* Poll needs to drive Tx also in copy mode */ - xsk_xmit(sk); + xsk_generic_xmit(sk); } +skip_tx: if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && xsk_tx_writeable(xs)) @@ -845,7 +852,6 @@ static int xsk_release(struct socket *sock) sock_orphan(sk); sock->sk = NULL; - sk_refcnt_debug_release(sk); sock_put(sk); return 0; @@ -1295,8 +1301,6 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; - unsigned long pfn; - struct page *qpg; if (READ_ONCE(xs->state) != XSK_READY) return -EBUSY; @@ -1319,13 +1323,10 @@ static int xsk_mmap(struct file *file, struct socket *sock, /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); - qpg = virt_to_head_page(q->ring); - if (size > page_size(qpg)) + if (size > q->ring_vmalloc_size) return -EINVAL; - pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, - size, vma->vm_page_prot); + return remap_vmalloc_range(vma, q->ring, 0); } static int xsk_notifier(struct notifier_block *this, @@ -1396,8 +1397,6 @@ static void xsk_destruct(struct sock *sk) if (!xp_put_pool(xs->pool)) xdp_put_umem(xs->umem, !xs->pool); - - sk_refcnt_debug_dec(sk); } static int xsk_create(struct net *net, struct socket *sock, int protocol, @@ -1427,7 +1426,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_family = PF_XDP; sk->sk_destruct = xsk_destruct; - sk_refcnt_debug_inc(sk); sock_set_flag(sk, SOCK_RCU_FREE); diff --git 
a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index ed6c71826d31..b2df1e0f8153 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -140,6 +140,10 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) } } +#define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \ + NETDEV_XDP_ACT_REDIRECT | \ + NETDEV_XDP_ACT_XSK_ZEROCOPY) + int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev, u16 queue_id, u16 flags) { @@ -178,8 +182,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, /* For copy-mode, we are done. */ return 0; - if (!netdev->netdev_ops->ndo_bpf || - !netdev->netdev_ops->ndo_xsk_wakeup) { + if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) { err = -EOPNOTSUPP; goto err_unreg_pool; } diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 6cf9586e5027..f8905400ee07 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -6,6 +6,7 @@ #include <linux/log2.h> #include <linux/slab.h> #include <linux/overflow.h> +#include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include "xsk_queue.h" @@ -23,7 +24,6 @@ static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) { struct xsk_queue *q; - gfp_t gfp_flags; size_t size; q = kzalloc(sizeof(*q), GFP_KERNEL); @@ -33,17 +33,16 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) q->nentries = nentries; q->ring_mask = nentries - 1; - gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | - __GFP_COMP | __GFP_NORETRY; size = xskq_get_ring_size(q, umem_queue); + size = PAGE_ALIGN(size); - q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, - get_order(size)); + q->ring = vmalloc_user(size); if (!q->ring) { kfree(q); return NULL; } + q->ring_vmalloc_size = size; return q; } @@ -52,6 +51,6 @@ void xskq_destroy(struct xsk_queue *q) if (!q) return; - page_frag_free(q->ring); + vfree(q->ring); kfree(q); } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index c6fb6b763658..bfb2a7e50c26 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -45,6 +45,7 @@ struct xsk_queue { struct xdp_ring *ring; u64 invalid_descs; u64 queue_empty_descs; + size_t ring_vmalloc_size; }; /* The structure of the shared state of the rings are a simple diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 494aa744bfb9..cd47f88921f5 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -3,6 +3,14 @@ # Makefile for the XFRM subsystem. 
# +xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o + +ifeq ($(CONFIG_XFRM_INTERFACE),m) +xfrm_interface-$(CONFIG_DEBUG_INFO_BTF_MODULES) += xfrm_interface_bpf.o +else ifeq ($(CONFIG_XFRM_INTERFACE),y) +xfrm_interface-$(CONFIG_DEBUG_INFO_BTF) += xfrm_interface_bpf.o +endif + obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o xfrm_device.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 29a540dcb5a7..872b80188e83 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -6,6 +6,7 @@ #include <net/espintcp.h> #include <linux/skmsg.h> #include <net/inet_common.h> +#include <trace/events/sock.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif @@ -354,7 +355,7 @@ static int espintcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) *((__be16 *)buf) = cpu_to_be16(msglen); pfx_iov.iov_base = buf; pfx_iov.iov_len = sizeof(buf); - iov_iter_kvec(&pfx_iter, WRITE, &pfx_iov, 1, pfx_iov.iov_len); + iov_iter_kvec(&pfx_iter, ITER_SOURCE, &pfx_iov, 1, pfx_iov.iov_len); err = sk_msg_memcopy_from_iter(sk, &pfx_iter, &emsg->skmsg, pfx_iov.iov_len); @@ -397,6 +398,8 @@ static void espintcp_data_ready(struct sock *sk) { struct espintcp_ctx *ctx = espintcp_getctx(sk); + trace_sk_data_ready(sk); + strp_data_ready(&ctx->strp); } @@ -489,6 +492,7 @@ static int espintcp_init_sk(struct sock *sk) /* avoid using task_frag */ sk->sk_allocation = GFP_ATOMIC; + sk->sk_use_task_frag = false; return 0; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index a0f62fa02e06..8cbf45a8bcdc 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -5,6 +5,7 @@ * Based on code and translator idea by: Florian Westphal <fw@strlen.de> */ #include <linux/compat.h> +#include <linux/nospec.h> #include <linux/xfrm.h> #include <net/xfrm.h> @@ -302,7 +303,7 @@ static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src) nla_for_each_attr(nla, attrs, len, remaining) { int err; - switch (type) { + switch (nlh_src->nlmsg_type) { case XFRM_MSG_NEWSPDINFO: err = xfrm_nla_cpy(dst, nla, nla_len(nla)); break; @@ -437,6 +438,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } + type = array_index_nospec(type, XFRMA_MAX + 1); if (nla_len(nla) < compat_policy[type].len) { NL_SET_ERR_MSG(extack, "Attribute bad length"); return -EOPNOTSUPP; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 21269e8f2db4..95f1436bf6a2 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -132,6 +132,16 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN) return skb; + /* The packet was sent to HW IPsec packet offload engine, + * but to wrong device. Drop the packet, so it won't skip + * XFRM stack. 
+ */ + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->xso.dev != dev) { + kfree_skb(skb); + dev_core_stats_tx_dropped_inc(dev); + return NULL; + } + /* This skb was already validated on the upper/virtual dev */ if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) return skb; @@ -229,6 +239,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_dev_offload *xso = &x->xso; xfrm_address_t *saddr; xfrm_address_t *daddr; + bool is_packet_offload; if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); @@ -241,11 +252,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) { + if (xuo->flags & + ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; } + is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -260,7 +273,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, x->props.family, xfrm_smark_get(0, x)); if (IS_ERR(dst)) - return 0; + return (is_packet_offload) ? -EINVAL : 0; dev = dst->dev; @@ -271,7 +284,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { xso->dev = NULL; dev_put(dev); - return 0; + return (is_packet_offload) ? -EINVAL : 0; } if (x->props.flags & XFRM_STATE_ESN && @@ -291,15 +304,29 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->dir = XFRM_DEV_OFFLOAD_OUT; - err = dev->xfrmdev_ops->xdo_dev_state_add(x); + if (is_packet_offload) + xso->type = XFRM_DEV_OFFLOAD_PACKET; + else + xso->type = XFRM_DEV_OFFLOAD_CRYPTO; + + err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack); if (err) { xso->dev = NULL; xso->dir = 0; xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); - - if (err != -EOPNOTSUPP) { - NL_SET_ERR_MSG(extack, "Device failed to offload this state"); + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + + /* User explicitly requested packet offload mode and configured + * policy in addition to the XFRM state. So be civil to users, + * and return an error instead of taking fallback path. + * + * This WARN_ON() can be seen as a documentation for driver + * authors to do not return -EOPNOTSUPP in packet offload mode. + */ + WARN_ON(err == -EOPNOTSUPP && is_packet_offload); + if (err != -EOPNOTSUPP || is_packet_offload) { + NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state"); return err; } } @@ -308,6 +335,69 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } EXPORT_SYMBOL_GPL(xfrm_dev_state_add); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + struct xfrm_dev_offload *xdo = &xp->xdo; + struct net_device *dev; + int err; + + if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) { + /* We support only packet offload mode and it means + * that user must set XFRM_OFFLOAD_PACKET bit. 
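The xfrm_dev_state_add() changes above also alter the driver contract: xdo_dev_state_add() now takes an extack, and the WARN_ON() documents that packet-offload requests must not be failed with -EOPNOTSUPP because there is no software fallback for them. A hypothetical driver callback honouring that contract might look like this (only the callback signature comes from the diff):

#include <net/xfrm.h>

static int example_xdo_dev_state_add(struct xfrm_state *x,
				     struct netlink_ext_ack *extack)
{
	if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) {
		/* per the WARN_ON() above, do not return -EOPNOTSUPP here:
		 * packet offload has no software fallback path */
		NL_SET_ERR_MSG(extack, "Packet offload is not supported");
		return -EINVAL;
	}

	/* ... program the crypto offload engine for this SA ... */
	return 0;
}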
+ */ + NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); + return -EINVAL; + } + + dev = dev_get_by_index(net, xuo->ifindex); + if (!dev) + return -EINVAL; + + if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) { + xdo->dev = NULL; + dev_put(dev); + NL_SET_ERR_MSG(extack, "Policy offload is not supported"); + return -EINVAL; + } + + xdo->dev = dev; + netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); + xdo->real_dev = dev; + xdo->type = XFRM_DEV_OFFLOAD_PACKET; + switch (dir) { + case XFRM_POLICY_IN: + xdo->dir = XFRM_DEV_OFFLOAD_IN; + break; + case XFRM_POLICY_OUT: + xdo->dir = XFRM_DEV_OFFLOAD_OUT; + break; + case XFRM_POLICY_FWD: + xdo->dir = XFRM_DEV_OFFLOAD_FWD; + break; + default: + xdo->dev = NULL; + dev_put(dev); + NL_SET_ERR_MSG(extack, "Unrecognized offload direction"); + return -EINVAL; + } + + err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack); + if (err) { + xdo->dev = NULL; + xdo->real_dev = NULL; + xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + xdo->dir = 0; + netdev_put(dev, &xdo->dev_tracker); + NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this policy"); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xfrm_dev_policy_add); + bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { int mtu; @@ -318,8 +408,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) if (!x->type_offload || x->encap) return false; - if ((!dev || (dev == xfrm_dst_path(dst)->dev)) && - (!xdst->child->xfrm)) { + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET || + ((!dev || (dev == xfrm_dst_path(dst)->dev)) && + !xdst->child->xfrm)) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; @@ -410,8 +501,10 @@ static int xfrm_api_check(struct net_device *dev) static int xfrm_dev_down(struct net_device *dev) { - if (dev->features & NETIF_F_HW_ESP) + if (dev->features & NETIF_F_HW_ESP) { xfrm_dev_state_flush(dev_net(dev), dev, true); + xfrm_dev_policy_flush(dev_net(dev), dev, true); + } return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c06e54a10540..436d29640ac2 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -279,8 +279,7 @@ static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), - ipipv6_hdr(skb)); + ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c new file mode 100644 index 000000000000..d74f3fd20f2b --- /dev/null +++ b/net/xfrm/xfrm_interface_bpf.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable XFRM Helpers for TC-BPF hook + * + * These are called from SCHED_CLS BPF programs. Note that it is + * allowed to break compatibility for these functions since the interface they + * are exposed through to BPF programs is explicitly unstable. 
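Stepping back to xfrm_dev_policy_add() above: policy offload only works if the driver provides the new xdo_dev_policy_add callback, otherwise the function returns -EINVAL before touching the device. A sketch of the corresponding driver glue; only the ops structure and the member names visible in this diff are taken from the source, the example_* functions are placeholders (the state_add one matches the earlier sketch, and a real driver also wires up the delete/free counterparts):

#include <net/xfrm.h>

static int example_xdo_dev_state_add(struct xfrm_state *x,
				     struct netlink_ext_ack *extack);
static void example_xdo_dev_state_delete(struct xfrm_state *x);
static int example_xdo_dev_policy_add(struct xfrm_policy *xp,
				      struct netlink_ext_ack *extack);

static const struct xfrmdev_ops example_xfrmdev_ops = {
	.xdo_dev_state_add	= example_xdo_dev_state_add,
	.xdo_dev_state_delete	= example_xdo_dev_state_delete,
	.xdo_dev_policy_add	= example_xdo_dev_policy_add,
};

Note also that the xfrm_dev_down() hunk above flushes offloaded policies, not just states, when a NETIF_F_HW_ESP device goes down.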
+ */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> + +#include <net/dst_metadata.h> +#include <net/xfrm.h> + +/* bpf_xfrm_info - XFRM metadata information + * + * Members: + * @if_id - XFRM if_id: + * Transmit: if_id to be used in policy and state lookups + * Receive: if_id of the state matched for the incoming packet + * @link - Underlying device ifindex: + * Transmit: used as the underlying device in VRF routing + * Receive: the device on which the packet had been received + */ +struct bpf_xfrm_info { + u32 if_id; + int link; +}; + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in xfrm_interface BTF"); + +/* bpf_skb_get_xfrm_info - Get XFRM metadata + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @to - Pointer to memory to which the metadata will be copied + * Cannot be NULL + */ +__bpf_kfunc int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct xfrm_md_info *info; + + info = skb_xfrm_md_info(skb); + if (!info) + return -EINVAL; + + to->if_id = info->if_id; + to->link = info->link; + return 0; +} + +/* bpf_skb_get_xfrm_info - Set XFRM metadata + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @from - Pointer to memory from which the metadata will be copied + * Cannot be NULL + */ +__bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bpf_xfrm_info *from) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct metadata_dst *md_dst; + struct xfrm_md_info *info; + + if (unlikely(skb_metadata_dst(skb))) + return -EINVAL; + + if (!xfrm_bpf_md_dst) { + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(0, METADATA_XFRM, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + if (cmpxchg(&xfrm_bpf_md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); + } + md_dst = this_cpu_ptr(xfrm_bpf_md_dst); + + info = &md_dst->u.xfrm_info; + + info->if_id = from->if_id; + info->link = from->link; + skb_dst_force(skb); + info->dst_orig = skb_dst(skb); + + dst_hold((struct dst_entry *)md_dst); + skb_dst_set(skb, (struct dst_entry *)md_dst); + return 0; +} + +__diag_pop() + +BTF_SET8_START(xfrm_ifc_kfunc_set) +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info) +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info) +BTF_SET8_END(xfrm_ifc_kfunc_set) + +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = { + .owner = THIS_MODULE, + .set = &xfrm_ifc_kfunc_set, +}; + +int __init register_xfrm_interface_bpf(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &xfrm_interface_kfunc_set); +} diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface_core.c index 5a67b120c4db..35279c220bd7 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface_core.c @@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } +static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type, unsigned short family) +{ + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (sp && (sp->len || sp->olen) && + !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + goto discard; + + XFRM_SPI_SKB_CB(skb)->family = family; + if (family == AF_INET) { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + } else { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + 
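Back to the kfuncs defined in xfrm_interface_bpf.c above: a minimal SCHED_CLS consumer mirrors struct bpf_xfrm_info, declares the kfunc with __ksym, and sets the metadata before the packet reaches an xfrm interface. This is a loose sketch in the usual libbpf style, not copied from the kernel selftests; the section name and if_id value are illustrative:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

struct bpf_xfrm_info {
	__u32 if_id;
	int link;
};

int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
			  const struct bpf_xfrm_info *from) __ksym;

SEC("tc")
int steer_to_xfrm_if(struct __sk_buff *skb)
{
	struct bpf_xfrm_info info = {
		.if_id = 42,	/* if_id used for the policy/state lookup */
		.link  = 0,
	};

	if (bpf_skb_set_xfrm_info(skb, &info))
		return TC_ACT_SHOT;
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";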
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + } + + return xfrm_input(skb, nexthdr, spi, encap_type); +discard: + kfree_skb(skb); + return 0; +} + +static int xfrmi4_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); +} + +static int xfrmi6_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0, 0, AF_INET6); +} + +static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); +} + +static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); +} + static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -396,6 +442,14 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if_id = md_info->if_id; fl->flowi_oif = md_info->link; + if (md_info->dst_orig) { + struct dst_entry *tmp_dst = dst; + + dst = md_info->dst_orig; + skb_dst_set(skb, dst); + md_info->dst_orig = NULL; + dst_release(tmp_dst); + } } else { if_id = xi->p.if_id; } @@ -937,8 +991,8 @@ static struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, + .handler = xfrmi6_rcv, + .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -988,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, + .handler = xfrmi4_rcv, + .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, @@ -1162,12 +1216,18 @@ static int __init xfrmi_init(void) if (err < 0) goto rtnl_link_failed; + err = register_xfrm_interface_bpf(); + if (err < 0) + goto kfunc_failed; + lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; +kfunc_failed: + rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 78cb8d0a6a18..ff114d68cc43 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -492,7 +492,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) struct xfrm_state *x = dst->xfrm; struct net *net = xs_net(x); - if (err <= 0) + if (err <= 0 || x->xso.type == XFRM_DEV_OFFLOAD_PACKET) goto resume; do { @@ -717,6 +717,16 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) break; } + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { + if (!xfrm_dev_offload_ok(skb, x)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -EHOSTUNREACH; + } + + return xfrm_output_resume(sk, skb, 0); + } + secpath_reset(skb); if (xfrm_dev_offload_ok(skb, x)) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9b9e2765363d..5c61ec04b839 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -336,7 +336,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? 
: xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -354,7 +354,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; @@ -425,6 +425,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); + xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); @@ -535,7 +536,7 @@ redo: __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); - if (!entry0) { + if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; @@ -866,7 +867,7 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, &n->hhead); @@ -1347,7 +1348,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) else break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); @@ -1524,7 +1525,7 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); else hlist_add_head_rcu(&policy->bydst_inexact_list, chain); @@ -1561,9 +1562,12 @@ static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else + /* Packet offload policies enter to the head + * to speed-up lookups. 
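The bydst-insertion hunks above all implement the same rule, summarised here in one place: software policies keep their priority ordering, packet-offload policies are always placed at the head, so each hash chain becomes a hardware prefix followed by a software tail and lookups can stop early. Illustrative consolidation:

#include <net/xfrm.h>

static void example_policy_insert(struct xfrm_policy *policy,
				  struct hlist_node *newpos,
				  struct hlist_head *chain)
{
	if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
		hlist_add_behind_rcu(&policy->bydst, newpos);
	else
		hlist_add_head_rcu(&policy->bydst, chain);
}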
+ */ hlist_add_head_rcu(&policy->bydst, chain); return delpol; @@ -1769,12 +1773,41 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) } return err; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + struct xfrm_policy *pol; + int err = 0; + + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead || + xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + err = security_xfrm_policy_delete(pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, task_valid); + return err; + } + } + return err; +} #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + return 0; +} #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) @@ -1814,6 +1847,44 @@ out: } EXPORT_SYMBOL(xfrm_policy_flush); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid) +{ + int dir, err = 0, cnt = 0; + struct xfrm_policy *pol; + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + + err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); + if (err) + goto out; + +again: + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + dir = xfrm_policy_id2dir(pol->index); + if (pol->walk.dead || + dir >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + __xfrm_policy_unlink(pol, dir); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + cnt++; + xfrm_audit_policy_delete(pol, 1, task_valid); + xfrm_policy_kill(pol); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + goto again; + } + if (cnt) + __xfrm_policy_inexact_flush(net); + else + err = -ESRCH; +out: + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return err; +} +EXPORT_SYMBOL(xfrm_dev_policy_flush); + int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) @@ -2113,6 +2184,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, break; } } + if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET) + goto skip_inexact; + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, daddr)) @@ -2245,6 +2319,7 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { + xfrm_dev_policy_delete(pol); xfrm_policy_kill(pol); return 0; } @@ -3586,7 +3661,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = ktime_get_real_seconds(); + /* This lockless write can happen from different cpus. */ + WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; @@ -3601,7 +3677,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, xfrm_pol_put(pols[0]); return 0; } - pols[1]->curlft.use_time = ktime_get_real_seconds(); + /* This write can happen from different cpus. 
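The WRITE_ONCE() stores here pair with the READ_ONCE() reads added to xfrm_policy_timer() further above: curlft.use_time is touched locklessly from the data path on any CPU and sampled by the timer, so both sides are annotated to keep the race intentional and visible to tooling. A compact sketch of the pairing (helper names are mine):

#include <net/xfrm.h>

static void example_mark_used(struct xfrm_policy *pol)
{
	WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds());
}

static time64_t example_effective_use_time(struct xfrm_policy *pol)
{
	time64_t t = READ_ONCE(pol->curlft.use_time);

	return t ? : pol->curlft.add_time;	/* never used: fall back to add time */
}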
*/ + WRITE_ONCE(pols[1]->curlft.use_time, + ktime_get_real_seconds()); npols++; } } @@ -3667,6 +3745,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, goto reject; } + if (if_id) + secpath_reset(skb); + xfrm_pols_put(pols, npols); return 1; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9ec481fbfb63..2ab3e09e2227 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -84,6 +84,25 @@ static unsigned int xfrm_seq_hash(struct net *net, u32 seq) return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } +#define XFRM_STATE_INSERT(by, _n, _h, _type) \ + { \ + struct xfrm_state *_x = NULL; \ + \ + if (_type != XFRM_DEV_OFFLOAD_PACKET) { \ + hlist_for_each_entry_rcu(_x, _h, by) { \ + if (_x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ + continue; \ + break; \ + } \ + } \ + \ + if (!_x || _x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ + /* SAD is empty or consist from HW SAs only */ \ + hlist_add_head_rcu(_n, _h); \ + else \ + hlist_add_before_rcu(_n, &_x->by); \ + } + static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, struct hlist_head *nsrctable, @@ -100,23 +119,25 @@ static void xfrm_hash_transfer(struct hlist_head *list, h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family, nhashmask); - hlist_add_head_rcu(&x->bydst, ndsttable + h); + XFRM_STATE_INSERT(bydst, &x->bydst, ndsttable + h, x->xso.type); h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family, nhashmask); - hlist_add_head_rcu(&x->bysrc, nsrctable + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, nsrctable + h, x->xso.type); if (x->id.spi) { h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family, nhashmask); - hlist_add_head_rcu(&x->byspi, nspitable + h); + XFRM_STATE_INSERT(byspi, &x->byspi, nspitable + h, + x->xso.type); } if (x->km.seq) { h = __xfrm_seq_hash(x->km.seq, nhashmask); - hlist_add_head_rcu(&x->byseq, nseqtable + h); + XFRM_STATE_INSERT(byseq, &x->byseq, nseqtable + h, + x->xso.type); } } } @@ -549,12 +570,14 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) int err = 0; spin_lock(&x->lock); + xfrm_dev_state_update_curlft(x); + if (x->km.state == XFRM_STATE_DEAD) goto out; if (x->km.state == XFRM_STATE_EXPIRED) goto expired; if (x->lft.hard_add_expires_seconds) { - long tmo = x->lft.hard_add_expires_seconds + + time64_t tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { if (x->xflags & XFRM_SOFT_EXPIRE) { @@ -571,8 +594,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) next = tmo; } if (x->lft.hard_use_expires_seconds) { - long tmo = x->lft.hard_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + time64_t tmo = x->lft.hard_use_expires_seconds + + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -581,7 +604,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->km.dying) goto resched; if (x->lft.soft_add_expires_seconds) { - long tmo = x->lft.soft_add_expires_seconds + + time64_t tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { warn = 1; @@ -593,8 +616,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } } if (x->lft.soft_use_expires_seconds) { - long tmo = x->lft.soft_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + time64_t tmo = x->lft.soft_use_expires_seconds + + (READ_ONCE(x->curlft.use_time) ? 
: now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) @@ -951,6 +974,49 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, x->props.family = tmpl->encap_family; } +static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family, + struct xfrm_dev_offload *xdo) +{ + unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + struct xfrm_state *x; + + hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { +#ifdef CONFIG_XFRM_OFFLOAD + if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. + */ + if (xdo->dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif + if (x->props.family != family || + x->id.spi != spi || + x->id.proto != proto || + !xfrm_addr_equal(&x->id.daddr, daddr, family)) + continue; + + if ((mark & x->mark.m) != x->mark.v) + continue; + if (!xfrm_state_hold_rcu(x)) + continue; + return x; + } + + return NULL; +} + static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, @@ -1092,6 +1158,23 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. + */ + if (pol->xdo.dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -1109,6 +1192,23 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. 
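The XFRM_STATE_INSERT() macro introduced in xfrm_state.c above is easier to read written out as a function for a single chain; the real code stays a macro only because it has to operate on different hlist_node members (bydst, bysrc, byspi, byseq). A faithful sketch for the bydst case:

#include <net/xfrm.h>

static void example_state_insert(struct xfrm_state *xnew, struct hlist_head *h)
{
	struct xfrm_state *x = NULL;

	if (xnew->xso.type != XFRM_DEV_OFFLOAD_PACKET) {
		hlist_for_each_entry_rcu(x, h, bydst) {
			if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
				continue;	/* skip the HW prefix */
			break;			/* first software entry */
		}
	}

	if (!x || x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
		hlist_add_head_rcu(&xnew->bydst, h);	/* empty or HW-only chain */
	else
		hlist_add_before_rcu(&xnew->bydst, &x->bydst);
}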
+ */ + if (pol->xdo.dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -1126,8 +1226,10 @@ found: x = best; if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && - (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi, - tmpl->id.proto, encap_family)) != NULL) { + (x0 = __xfrm_state_lookup_all(net, mark, daddr, + tmpl->id.spi, tmpl->id.proto, + encap_family, + &pol->xdo)) != NULL) { to_put = x0; error = -EEXIST; goto out; @@ -1161,21 +1263,53 @@ found: x = NULL; goto out; } - +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + struct xfrm_dev_offload *xdo = &pol->xdo; + struct xfrm_dev_offload *xso = &x->xso; + + xso->type = XFRM_DEV_OFFLOAD_PACKET; + xso->dir = xdo->dir; + xso->dev = xdo->dev; + xso->real_dev = xdo->real_dev; + netdev_tracker_alloc(xso->dev, &xso->dev_tracker, + GFP_ATOMIC); + error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL); + if (error) { + xso->dir = 0; + netdev_put(xso->dev, &xso->dev_tracker); + xso->dev = NULL; + xso->real_dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + x->km.state = XFRM_STATE_DEAD; + to_put = x; + x = NULL; + goto out; + } + } +#endif if (km_query(x, tmpl, pol) == 0) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, + net->xfrm.state_bydst + h, + x->xso.type); h = xfrm_src_hash(net, daddr, saddr, encap_family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, + net->xfrm.state_bysrc + h, + x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); - hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); + XFRM_STATE_INSERT(byspi, &x->byspi, + net->xfrm.state_byspi + h, + x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); - hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + XFRM_STATE_INSERT(byseq, &x->byseq, + net->xfrm.state_byseq + h, + x->xso.type); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; hrtimer_start(&x->mtimer, @@ -1185,6 +1319,18 @@ found: xfrm_hash_grow_check(net, x->bydst.next != NULL); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { +#ifdef CONFIG_XFRM_OFFLOAD + struct xfrm_dev_offload *xso = &x->xso; + + if (xso->type == XFRM_DEV_OFFLOAD_PACKET) { + xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); + xso->dir = 0; + netdev_put(xso->dev, &xso->dev_tracker); + xso->dev = NULL; + xso->real_dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + } +#endif x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; @@ -1280,22 +1426,26 @@ static void __xfrm_state_insert(struct xfrm_state *x) h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, + x->xso.type); h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, + x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + 
h); + XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, + x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); - hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h, + x->xso.type); } hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); @@ -1409,9 +1559,11 @@ static struct xfrm_state *__find_acq_core(struct net *net, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, + x->xso.type); h = xfrm_src_hash(net, daddr, saddr, family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, + x->xso.type); net->xfrm.state_num++; @@ -1754,7 +1906,7 @@ out: hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); - if (x1->curlft.use_time) + if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { @@ -1786,8 +1938,10 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { - if (!x->curlft.use_time) - x->curlft.use_time = ktime_get_real_seconds(); + xfrm_dev_state_update_curlft(x); + + if (!READ_ONCE(x->curlft.use_time)) + WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { @@ -2081,7 +2235,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, } else { u32 spi = 0; for (h = 0; h < high-low+1; h++) { - spi = low + prandom_u32_max(high - low + 1); + spi = get_random_u32_inclusive(low, high); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { newspi = htonl(spi); @@ -2094,7 +2248,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, spin_lock_bh(&net->xfrm.xfrm_state_lock); x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); + XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, + x->xso.type); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0eb4696661c8..cf5172d4ce68 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -956,6 +956,8 @@ static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb) xuo->ifindex = xso->dev->ifindex; if (xso->dir == XFRM_DEV_OFFLOAD_IN) xuo->flags = XFRM_OFFLOAD_INBOUND; + if (xso->type == XFRM_DEV_OFFLOAD_PACKET) + xuo->flags |= XFRM_OFFLOAD_PACKET; return 0; } @@ -1890,6 +1892,15 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + /* configure the hardware if offload is requested */ + if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_policy_add(net, xp, + nla_data(attrs[XFRMA_OFFLOAD_DEV]), + p->dir, extack); + if (err) + goto error; + } + return xp; error: *errp = err; @@ -1929,6 +1940,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_audit_policy_add(xp, err ? 
0 : 1, true); if (err) { + xfrm_dev_policy_delete(xp); security_xfrm_policy_free(xp->security); kfree(xp); return err; @@ -2041,6 +2053,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3379,6 +3393,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3497,6 +3513,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3580,6 +3598,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) goto out_free_skb; |
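On the uapi side the xfrm_user.c hunks above come down to one extra flag bit: a packet-offload request or dump is the existing XFRMA_OFFLOAD_DEV attribute carrying XFRM_OFFLOAD_PACKET, and copy_user_offload() reports the same bit back so userspace can tell packet and crypto offloads apart. A userspace-side sketch of filling that attribute payload (the ifindex value is illustrative):

#include <linux/xfrm.h>

static void example_fill_offload(struct xfrm_user_offload *xuo, int ifindex)
{
	xuo->ifindex = ifindex;			/* device owning the offload */
	xuo->flags   = XFRM_OFFLOAD_PACKET;	/* as opposed to crypto-only offload */
}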