diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-26 16:38:19 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-26 16:38:19 +0200 |
commit | da19a102ce87bf3e0a7fe277a659d1fc35330d6d (patch) | |
tree | a6c1d40ef544e812b31f4b5f497c20d449d45ec3 /include/rdma | |
parent | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc (diff) | |
parent | IB/mlx5: Add support for extended atomic operations (diff) | |
download | linux-da19a102ce87bf3e0a7fe277a659d1fc35330d6d.tar.xz linux-da19a102ce87bf3e0a7fe277a659d1fc35330d6d.zip |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"This has been a smaller cycle with many of the commits being smallish
code fixes and improvements across the drivers.
- Driver updates for bnxt_re, cxgb4, hfi1, hns, mlx5, nes, qedr, and
rxe
- Memory window support in hns
- mlx5 user API 'flow mutate/steering' allows accessing the full
packet mangling and matching machinery from user space
- Support inter-working with verbs API calls in the 'devx' mlx5 user
API, and provide options to use devx with less privilege
- Modernize the use of syfs and the device interface to use attribute
groups and cdev properly for uverbs, and clean up some of the core
code's device list management
- More progress on net namespaces for RDMA devices
- Consolidate driver BAR mmapping support into core code helpers and
rework how RDMA holds poitners to mm_struct for get_user_pages
cases
- First pass to use 'dev_name' instead of ib_device->name
- Device renaming for RDMA devices"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (242 commits)
IB/mlx5: Add support for extended atomic operations
RDMA/core: Fix comment for hw stats init for port == 0
RDMA/core: Refactor ib_register_device() function
RDMA/core: Fix unwinding flow in case of error to register device
ib_srp: Remove WARN_ON in srp_terminate_io()
IB/mlx5: Allow scatter to CQE without global signaled WRs
IB/mlx5: Verify that driver supports user flags
IB/mlx5: Support scatter to CQE for DC transport type
RDMA/drivers: Use core provided API for registering device attributes
RDMA/core: Allow existing drivers to set one sysfs group per device
IB/rxe: Remove unnecessary enum values
RDMA/umad: Use kernel API to allocate umad indexes
RDMA/uverbs: Use kernel API to allocate uverbs indexes
RDMA/core: Increase total number of RDMA ports across all devices
IB/mlx4: Add port and TID to MAD debug print
IB/mlx4: Enable debug print of SMPs
RDMA/core: Rename ports_parent to ports_kobj
RDMA/core: Do not expose unsupported counters
IB/mlx4: Refer to the device kobject instead of ports_parent
RDMA/nldev: Allow IB device rename through RDMA netlink
...
Diffstat (limited to 'include/rdma')
-rw-r--r-- | include/rdma/ib_addr.h | 11 | ||||
-rw-r--r-- | include/rdma/ib_cm.h | 2 | ||||
-rw-r--r-- | include/rdma/ib_sa.h | 38 | ||||
-rw-r--r-- | include/rdma/ib_umem.h | 9 | ||||
-rw-r--r-- | include/rdma/ib_umem_odp.h | 75 | ||||
-rw-r--r-- | include/rdma/ib_verbs.h | 149 | ||||
-rw-r--r-- | include/rdma/rdma_cm.h | 11 | ||||
-rw-r--r-- | include/rdma/rdma_netlink.h | 4 | ||||
-rw-r--r-- | include/rdma/rdma_vt.h | 51 | ||||
-rw-r--r-- | include/rdma/rdmavt_qp.h | 7 | ||||
-rw-r--r-- | include/rdma/restrack.h | 12 | ||||
-rw-r--r-- | include/rdma/uverbs_ioctl.h | 111 | ||||
-rw-r--r-- | include/rdma/uverbs_std_types.h | 51 |
13 files changed, 385 insertions, 146 deletions
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 77c7908b7d73..2734c895c1bf 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -46,7 +46,6 @@ #include <net/ip.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> -#include <net/ipv6.h> #include <net/net_namespace.h> /** @@ -95,20 +94,18 @@ int rdma_translate_ip(const struct sockaddr *addr, * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. + * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from + * rdma_dev_addr. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - void *context); + bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr); - int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index c10f4b5ea8ab..49f4f75499b3 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -583,7 +583,7 @@ struct ib_cm_sidr_req_param { struct sa_path_rec *path; const struct ib_gid_attr *sgid_attr; __be64 service_id; - int timeout_ms; + unsigned long timeout_ms; const void *private_data; u8 private_data_len; u8 max_cm_retries; diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index b6ddf2a1b9d8..19520979b84c 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -449,28 +449,23 @@ struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); -int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct sa_path_rec *resp, +int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, + u8 port_num, struct sa_path_rec *rec, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, struct sa_path_rec *resp, void *context), - void *context, - struct ib_sa_query **query); + void *context, struct ib_sa_query **query); int ib_sa_service_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, - struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_service_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); + struct ib_device *device, u8 port_num, u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, struct ib_sa_query **sa_query); struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; @@ -573,12 +568,11 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, struct ib_device *device, diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a1fd63871d17..5d3755ec5afa 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -42,15 +42,14 @@ struct ib_umem_odp; struct ib_umem { struct ib_ucontext *context; + struct mm_struct *owning_mm; size_t length; unsigned long address; int page_shift; - int writable; - int hugetlb; + u32 writable : 1; + u32 hugetlb : 1; + u32 is_odp : 1; struct work_struct work; - struct mm_struct *mm; - unsigned long diff; - struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; int npages; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 381cdf5a9bd1..0b1446fe2fab 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -43,6 +43,9 @@ struct umem_odp_node { }; struct ib_umem_odp { + struct ib_umem umem; + struct ib_ucontext_per_mm *per_mm; + /* * An array of the pages included in the on-demand paging umem. * Indices of pages that are currently not mapped into the device will @@ -64,16 +67,9 @@ struct ib_umem_odp { struct mutex umem_mutex; void *private; /* for the HW driver to use. */ - /* When false, use the notifier counter in the ucontext struct. */ - bool mn_counters_active; int notifiers_seq; int notifiers_count; - /* A linked list of umems that don't have private mmu notifier - * counters yet. */ - struct list_head no_private_counters; - struct ib_umem *umem; - /* Tree tracking */ struct umem_odp_node interval_tree; @@ -82,15 +78,34 @@ struct ib_umem_odp { struct work_struct work; }; +static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) +{ + return container_of(umem, struct ib_umem_odp, umem); +} + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access); -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size); +struct ib_ucontext_per_mm { + struct ib_ucontext *context; + struct mm_struct *mm; + struct pid *tgid; + bool active; + + struct rb_root_cached umem_tree; + /* Protects umem_tree */ + struct rw_semaphore umem_rwsem; -void ib_umem_odp_release(struct ib_umem *umem); + struct mmu_notifier mn; + unsigned int odp_mrs_count; + + struct list_head ucontext_list; + struct rcu_head rcu; +}; + +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, + unsigned long addr, size_t size); +void ib_umem_odp_release(struct ib_umem_odp *umem_odp); /* * The lower 2 bits of the DMA address signal the R/W permissions for @@ -105,13 +120,14 @@ void ib_umem_odp_release(struct ib_umem *umem); #define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, - u64 access_mask, unsigned long current_seq); +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, + u64 bcnt, u64 access_mask, + unsigned long current_seq); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, u64 bound); -typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, +typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end, void *cookie); /* * Call the callback on each ib_umem in the range. Returns the logical or of @@ -129,46 +145,37 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length); -static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, +static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) { /* * This code is strongly based on the KVM code from * mmu_notifier_retry. Should be called with - * the relevant locks taken (item->odp_data->umem_mutex + * the relevant locks taken (umem_odp->umem_mutex * and the ucontext umem_mutex semaphore locked for read). */ - /* Do not allow page faults while the new ib_umem hasn't seen a state - * with zero notifiers yet, and doesn't have its own valid set of - * private counters. */ - if (!item->odp_data->mn_counters_active) - return 1; - - if (unlikely(item->odp_data->notifiers_count)) + if (unlikely(umem_odp->notifiers_count)) return 1; - if (item->odp_data->notifiers_seq != mmu_seq) + if (umem_odp->notifiers_seq != mmu_seq) return 1; return 0; } #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -static inline int ib_umem_odp_get(struct ib_ucontext *context, - struct ib_umem *umem, - int access) +static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { return -EINVAL; } -static inline struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +static inline struct ib_umem_odp * +ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { return ERR_PTR(-EINVAL); } -static inline void ib_umem_odp_release(struct ib_umem *umem) {} +static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0ed5d913a492..9c0c2132a2d6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -69,8 +69,11 @@ #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN +struct ib_umem_odp; + extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; +extern struct workqueue_struct *ib_comp_unbound_wq; union ib_gid { u8 raw[16]; @@ -1137,7 +1140,9 @@ enum ib_qp_create_flags { */ struct ib_qp_init_attr { + /* Consumer's event_handler callback must not block */ void (*event_handler)(struct ib_event *, void *); + void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; @@ -1146,7 +1151,7 @@ struct ib_qp_init_attr { struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; - enum ib_qp_create_flags create_flags; + u32 create_flags; /* * Only needed for special QP types, or when using the RW API. @@ -1278,21 +1283,27 @@ struct ib_qp_attr { }; enum ib_wr_opcode { - IB_WR_RDMA_WRITE, - IB_WR_RDMA_WRITE_WITH_IMM, - IB_WR_SEND, - IB_WR_SEND_WITH_IMM, - IB_WR_RDMA_READ, - IB_WR_ATOMIC_CMP_AND_SWP, - IB_WR_ATOMIC_FETCH_AND_ADD, - IB_WR_LSO, - IB_WR_SEND_WITH_INV, - IB_WR_RDMA_READ_WITH_INV, - IB_WR_LOCAL_INV, - IB_WR_REG_MR, - IB_WR_MASKED_ATOMIC_CMP_AND_SWP, - IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, + /* These are shared with userspace */ + IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE, + IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM, + IB_WR_SEND = IB_UVERBS_WR_SEND, + IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM, + IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ, + IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP, + IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD, + IB_WR_LSO = IB_UVERBS_WR_TSO, + IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV, + IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV, + IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV, + IB_WR_MASKED_ATOMIC_CMP_AND_SWP = + IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, + IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = + IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + + /* These are kernel only and can not be issued by userspace */ + IB_WR_REG_MR = 0x20, IB_WR_REG_SIG_MR, + /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ @@ -1485,26 +1496,15 @@ struct ib_ucontext { * it is set when we are closing the file descriptor and indicates * that mm_sem may be locked. */ - int closing; + bool closing; bool cleanup_retryable; - struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct rb_root_cached umem_tree; - /* - * Protects .umem_rbroot and tree, as well as odp_mrs_count and - * mmu notifiers registration. - */ - struct rw_semaphore umem_rwsem; - void (*invalidate_range)(struct ib_umem *umem, + void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); - - struct mmu_notifier mn; - atomic_t notifier_count; - /* A list of umems that don't have private mmu notifier counters yet. */ - struct list_head no_private_counters; - int odp_mrs_count; + struct mutex per_mm_list_lock; + struct list_head per_mm_list; #endif struct ib_rdmacg_object cg_obj; @@ -1570,9 +1570,10 @@ struct ib_ah { typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { - IB_POLL_DIRECT, /* caller context, no hw completions */ - IB_POLL_SOFTIRQ, /* poll from softirq context */ - IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ }; struct ib_cq { @@ -1589,6 +1590,7 @@ struct ib_cq { struct irq_poll iop; struct work_struct work; }; + struct workqueue_struct *comp_wq; /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -2263,10 +2265,11 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; - spinlock_t client_data_lock; + rwlock_t client_data_lock; struct list_head core_list; /* Access to the client_data_list is protected by the client_data_lock - * spinlock and the lists_rwsem read-write semaphore */ + * rwlock and the lists_rwsem read-write semaphore + */ struct list_head client_data_list; struct ib_cache cache; @@ -2550,7 +2553,13 @@ struct ib_device { struct module *owner; struct device dev; - struct kobject *ports_parent; + /* First group for device attributes, + * Second group for driver provided attributes (optional). + * It is NULL terminated array. + */ + const struct attribute_group *groups[3]; + + struct kobject *ports_kobj; struct list_head port_list; enum { @@ -2633,9 +2642,9 @@ void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); @@ -2645,6 +2654,28 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot); +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size); +#else +static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, + pgprot_t prot) +{ + return -EINVAL; +} +static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + return -EINVAL; +} +#endif + static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; @@ -2728,7 +2759,6 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes - * @ll : link layer of port * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It @@ -2737,8 +2767,7 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll); + enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); @@ -4167,20 +4196,6 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) } -static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, - struct ib_qp *qp, struct ib_device *device) -{ - uobj->object = ibflow; - ibflow->uobject = uobj; - - if (qp) { - atomic_inc(&qp->usecnt); - ibflow->qp = qp; - } - - ibflow->device = device; -} - /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. @@ -4205,4 +4220,26 @@ int rdma_init_netdev(struct ib_device *device, u8 port_num, void (*setup)(struct net_device *), struct net_device *netdev); +/** + * rdma_set_device_sysfs_group - Set device attributes group to have + * driver specific sysfs entries at + * for infiniband class. + * + * @device: device pointer for which attributes to be created + * @group: Pointer to group which should be added when device + * is registered with sysfs. + * rdma_set_device_sysfs_group() allows existing drivers to expose one + * group per device to have sysfs attributes. + * + * NOTE: New drivers should not make use of this API; instead new device + * parameter should be exposed via netlink command. This API and mechanism + * exist only for existing drivers. + */ +static inline void +rdma_set_device_sysfs_group(struct ib_device *dev, + const struct attribute_group *group) +{ + dev->groups[1] = group; +} + #endif /* IB_VERBS_H */ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 5d71a7f51a9f..60987a5903b7 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -152,7 +152,11 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, * @ps: RDMA port space. * @qp_type: type of queue pair associated with the id. * - * The id holds a reference on the network namespace until it is destroyed. + * Returns a new rdma_cm_id. The id holds a reference on the network + * namespace until it is destroyed. + * + * The event handler callback serializes on the id's mutex and is + * allowed to sleep. */ #define rdma_create_id(net, event_handler, context, ps, qp_type) \ __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ @@ -192,7 +196,8 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); * @timeout_ms: Time to wait for resolution to complete. */ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms); + const struct sockaddr *dst_addr, + unsigned long timeout_ms); /** * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier @@ -202,7 +207,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, * Users must have first called rdma_resolve_addr to resolve a dst_addr * into an RDMA address before calling this routine. */ -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms); /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c369703fcd69..70218e6b5187 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -96,7 +96,7 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); /** * Check if there are any listeners to the netlink group * @group: the netlink group ID - * Returns 0 on success or a negative for no listeners. + * Returns true on success or false if no listeners. */ -int rdma_nl_chk_listeners(unsigned int group); +bool rdma_nl_chk_listeners(unsigned int group); #endif /* _RDMA_NETLINK_H */ diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index e79229a0cf01..3584d0816fcd 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -149,6 +149,10 @@ struct rvt_ibport { #define RVT_CQN_MAX 16 /* maximum length of cq name */ +#define RVT_SGE_COPY_MEMCPY 0 +#define RVT_SGE_COPY_CACHELESS 1 +#define RVT_SGE_COPY_ADAPTIVE 2 + /* * Things that are driver specific, module parameters in hfi1 and qib */ @@ -161,6 +165,9 @@ struct rvt_driver_params { */ unsigned int lkey_table_size; unsigned int qp_table_size; + unsigned int sge_copy_mode; + unsigned int wss_threshold; + unsigned int wss_clean_period; int qpn_start; int qpn_inc; int qpn_res_start; @@ -193,6 +200,19 @@ struct rvt_ah { u8 log_pmtu; }; +/* memory working set size */ +struct rvt_wss { + unsigned long *entries; + atomic_t total_count; + atomic_t clean_counter; + atomic_t clean_entry; + + int threshold; + int num_entries; + long pages_mask; + unsigned int clean_period; +}; + struct rvt_dev_info; struct rvt_swqe; struct rvt_driver_provided { @@ -211,11 +231,18 @@ struct rvt_driver_provided { * version requires the s_lock not to be held. The other assumes the * s_lock is held. */ - void (*schedule_send)(struct rvt_qp *qp); - void (*schedule_send_no_lock)(struct rvt_qp *qp); + bool (*schedule_send)(struct rvt_qp *qp); + bool (*schedule_send_no_lock)(struct rvt_qp *qp); - /* Driver specific work request checking */ - int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + /* + * Driver specific work request setup and checking. + * This function is allowed to perform any setup, checks, or + * adjustments required to the SWQE in order to be usable by + * underlying protocols. This includes private data structure + * allocations. + */ + int (*setup_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); /* * Sometimes rdmavt needs to kick the driver's send progress. That is @@ -371,6 +398,9 @@ struct rvt_dev_info { /* post send table */ const struct rvt_operation_params *post_parms; + /* opcode translation table */ + const enum ib_wc_opcode *wc_opcode; + /* Driver specific helper functions */ struct rvt_driver_provided driver_f; @@ -411,6 +441,8 @@ struct rvt_dev_info { u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ spinlock_t n_mcast_grps_lock; + /* Memory Working Set Size */ + struct rvt_wss *wss; }; /** @@ -423,7 +455,14 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, const char *fmt, const char *name, const int unit) { - snprintf(rdi->ibdev.name, sizeof(rdi->ibdev.name), fmt, name, unit); + /* + * FIXME: rvt and its users want to touch the ibdev before + * registration and have things like the name work. We don't have the + * infrastructure in the core to support this directly today, hack it + * to work by setting the name manually here. + */ + dev_set_name(&rdi->ibdev.dev, fmt, name, unit); + strlcpy(rdi->ibdev.name, dev_name(&rdi->ibdev.dev), IB_DEVICE_NAME_MAX); } /** @@ -434,7 +473,7 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, */ static inline const char *rvt_get_ibdev_name(const struct rvt_dev_info *rdi) { - return rdi->ibdev.name; + return dev_name(&rdi->ibdev.dev); } static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 927f6d5b6d0f..cbafb1878669 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -678,6 +678,13 @@ void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); void rvt_add_retry_timer(struct rvt_qp *qp); +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last); +void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status); +void rvt_ruc_loopback(struct rvt_qp *qp); + /** * struct rvt_qp_iter - the iterator for QPs * @qp - the current QP diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 9654d33edd98..2638fa7cd702 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -173,16 +173,10 @@ int rdma_restrack_put(struct rdma_restrack_entry *res); /** * rdma_restrack_set_task() - set the task for this resource * @res: resource entry - * @task: task struct + * @caller: kernel name, the current task will be used if the caller is NULL. */ -static inline void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task) -{ - if (res->task) - put_task_struct(res->task); - get_task_struct(task); - res->task = task; -} +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + const char *caller); /* * Helper functions for rdma drivers when filling out diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 9e997c3c2f04..84d3d15f1f38 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -52,6 +52,7 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, UVERBS_ATTR_TYPE_ENUM_IN, + UVERBS_ATTR_TYPE_IDRS_ARRAY, }; enum uverbs_obj_access { @@ -101,7 +102,7 @@ struct uverbs_attr_spec { } enum_def; } u; - /* This weird split of the enum lets us remove some padding */ + /* This weird split lets us remove some padding */ union { struct { /* @@ -111,6 +112,17 @@ struct uverbs_attr_spec { */ const struct uverbs_attr_spec *ids; } enum_def; + + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u16 min_len; + u16 max_len; + u8 access; + } objs_arr; } u2; }; @@ -251,6 +263,11 @@ static inline __attribute_const__ u32 uapi_bkey_attr(u32 attr_key) return attr_key - 1; } +static inline __attribute_const__ u32 uapi_bkey_to_key_attr(u32 attr_bkey) +{ + return attr_bkey + 1; +} + /* * ======================================= * Verbs definitions @@ -323,6 +340,27 @@ struct uverbs_object_tree_def { #define UA_MANDATORY .mandatory = 1 #define UA_OPTIONAL .mandatory = 0 +/* + * min_len must be bigger than 0 and _max_len must be smaller than 4095. Only + * READ\WRITE accesses are supported. + */ +#define UVERBS_ATTR_IDRS_ARR(_attr_id, _idr_type, _access, _min_len, _max_len, \ + ...) \ + (&(const struct uverbs_attr_def){ \ + .id = (_attr_id) + \ + BUILD_BUG_ON_ZERO((_min_len) == 0 || \ + (_max_len) > \ + PAGE_SIZE / sizeof(void *) || \ + (_min_len) > (_max_len) || \ + (_access) == UVERBS_ACCESS_NEW || \ + (_access) == UVERBS_ACCESS_DESTROY), \ + .attr = { .type = UVERBS_ATTR_TYPE_IDRS_ARRAY, \ + .u2.objs_arr.obj_type = _idr_type, \ + .u2.objs_arr.access = _access, \ + .u2.objs_arr.min_len = _min_len, \ + .u2.objs_arr.max_len = _max_len, \ + __VA_ARGS__ } }) + #define UVERBS_ATTR_IDR(_attr_id, _idr_type, _access, ...) \ (&(const struct uverbs_attr_def){ \ .id = _attr_id, \ @@ -365,6 +403,15 @@ struct uverbs_object_tree_def { __VA_ARGS__ }, \ }) +/* An input value that is a member in the enum _enum_type. */ +#define UVERBS_ATTR_CONST_IN(_attr_id, _enum_type, ...) \ + UVERBS_ATTR_PTR_IN( \ + _attr_id, \ + UVERBS_ATTR_SIZE( \ + sizeof(u64) + BUILD_BUG_ON_ZERO(!sizeof(_enum_type)), \ + sizeof(u64)), \ + __VA_ARGS__) + /* * An input value that is a bitwise combination of values of _enum_type. * This permits the flag value to be passed as either a u32 or u64, it must @@ -431,10 +478,16 @@ struct uverbs_obj_attr { const struct uverbs_api_attr *attr_elm; }; +struct uverbs_objs_arr_attr { + struct ib_uobject **uobjects; + u16 len; +}; + struct uverbs_attr { union { struct uverbs_ptr_attr ptr_attr; struct uverbs_obj_attr obj_attr; + struct uverbs_objs_arr_attr objs_arr_attr; }; }; @@ -507,6 +560,31 @@ uverbs_attr_get_len(const struct uverbs_attr_bundle *attrs_bundle, u16 idx) return attr->ptr_attr.len; } +/** + * uverbs_attr_get_uobjs_arr() - Provides array's properties for attribute for + * UVERBS_ATTR_TYPE_IDRS_ARRAY. + * @arr: Returned pointer to array of pointers for uobjects or NULL if + * the attribute isn't provided. + * + * Return: The array length or 0 if no attribute was provided. + */ +static inline int uverbs_attr_get_uobjs_arr( + const struct uverbs_attr_bundle *attrs_bundle, u16 attr_idx, + struct ib_uobject ***arr) +{ + const struct uverbs_attr *attr = + uverbs_attr_get(attrs_bundle, attr_idx); + + if (IS_ERR(attr)) { + *arr = NULL; + return 0; + } + + *arr = attr->objs_arr_attr.uobjects; + + return attr->objs_arr_attr.len; +} + static inline bool uverbs_attr_ptr_is_inline(const struct uverbs_attr *attr) { return attr->ptr_attr.len <= sizeof(attr->ptr_attr.data); @@ -603,6 +681,9 @@ static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle, { return _uverbs_alloc(bundle, size, GFP_KERNEL | __GFP_ZERO); } +int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val); #else static inline int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, @@ -631,6 +712,34 @@ static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle, { return ERR_PTR(-EINVAL); } +static inline int +_uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + return -EINVAL; +} #endif +#define uverbs_get_const(_to, _attrs_bundle, _idx) \ + ({ \ + s64 _val; \ + int _ret = _uverbs_get_const(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*_to)), \ + type_max(typeof(*_to)), NULL); \ + (*_to) = _val; \ + _ret; \ + }) + +#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default) \ + ({ \ + s64 _val; \ + s64 _def_val = _default; \ + int _ret = \ + _uverbs_get_const(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*_to)), \ + type_max(typeof(*_to)), &_def_val); \ + (*_to) = _val; \ + _ret; \ + }) #endif diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 3b00231cc084..3db2802fbc68 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -140,5 +140,56 @@ __uobj_alloc(const struct uverbs_api_object *obj, struct ib_uverbs_file *ufile, #define uobj_alloc(_type, _ufile, _ib_dev) \ __uobj_alloc(uobj_get_type(_ufile, _type), _ufile, _ib_dev) +static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action, + struct ib_uobject *uobj, + struct ib_device *ib_dev, + enum ib_flow_action_type type) +{ + atomic_set(&action->usecnt, 0); + action->device = ib_dev; + action->type = type; + action->uobject = uobj; + uobj->object = action; +} + +struct ib_uflow_resources { + size_t max; + size_t num; + size_t collection_num; + size_t counters_num; + struct ib_counters **counters; + struct ib_flow_action **collection; +}; + +struct ib_uflow_object { + struct ib_uobject uobject; + struct ib_uflow_resources *resources; +}; + +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs); +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj); +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); + +static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, + struct ib_qp *qp, struct ib_device *device, + struct ib_uflow_resources *uflow_res) +{ + struct ib_uflow_object *uflow; + + uobj->object = ibflow; + ibflow->uobject = uobj; + + if (qp) { + atomic_inc(&qp->usecnt); + ibflow->qp = qp; + } + + ibflow->device = device; + uflow = container_of(uobj, typeof(*uflow), uobject); + uflow->resources = uflow_res; +} + #endif |