 Documentation/driver-api/vfio.rst             |  48
 drivers/vfio/fsl-mc/vfio_fsl_mc.c             | 127
 drivers/vfio/fsl-mc/vfio_fsl_mc_private.h     |   1
 drivers/vfio/mdev/mdev_private.h              |   5
 drivers/vfio/mdev/vfio_mdev.c                 |  53
 drivers/vfio/pci/Kconfig                      |   6
 drivers/vfio/pci/Makefile                     |   1
 drivers/vfio/pci/vfio_pci.c                   | 271
 drivers/vfio/pci/vfio_pci_nvlink2.c           | 490
 drivers/vfio/pci/vfio_pci_private.h           |  15
 drivers/vfio/platform/vfio_amba.c             |   8
 drivers/vfio/platform/vfio_platform.c         |  20
 drivers/vfio/platform/vfio_platform_common.c  |  56
 drivers/vfio/platform/vfio_platform_private.h |   5
 drivers/vfio/vfio.c                           | 210
 include/linux/vfio.h                          |  37
 include/uapi/linux/vfio.h                     |  38
 17 files changed, 421 insertions(+), 970 deletions(-)
diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst
index f1a4d3c3ba0b..decc68cb8114 100644
--- a/Documentation/driver-api/vfio.rst
+++ b/Documentation/driver-api/vfio.rst
@@ -249,35 +249,41 @@ VFIO bus driver API
 
 VFIO bus drivers, such as vfio-pci make use of only a few interfaces
 into VFIO core.  When devices are bound and unbound to the driver,
-the driver should call vfio_add_group_dev() and vfio_del_group_dev()
-respectively::
-
-	extern int vfio_add_group_dev(struct device *dev,
-				      const struct vfio_device_ops *ops,
-				      void *device_data);
-
-	extern void *vfio_del_group_dev(struct device *dev);
-
-vfio_add_group_dev() indicates to the core to begin tracking the
-iommu_group of the specified dev and register the dev as owned by
-a VFIO bus driver.  The driver provides an ops structure for callbacks
+the driver should call vfio_register_group_dev() and
+vfio_unregister_group_dev() respectively::
+
+	void vfio_init_group_dev(struct vfio_device *device,
+				struct device *dev,
+				const struct vfio_device_ops *ops);
+	int vfio_register_group_dev(struct vfio_device *device);
+	void vfio_unregister_group_dev(struct vfio_device *device);
+
+The driver should embed the vfio_device in its own structure and call
+vfio_init_group_dev() to pre-configure it before going to registration.
+vfio_register_group_dev() indicates to the core to begin tracking the
+iommu_group of the specified dev and register the dev as owned by a VFIO bus
+driver. Once vfio_register_group_dev() returns it is possible for userspace to
+start accessing the driver, thus the driver should ensure it is completely
+ready before calling it. The driver provides an ops structure for callbacks
 similar to a file operations structure::
 
 	struct vfio_device_ops {
-		int	(*open)(void *device_data);
-		void	(*release)(void *device_data);
-		ssize_t	(*read)(void *device_data, char __user *buf,
+		int	(*open)(struct vfio_device *vdev);
+		void	(*release)(struct vfio_device *vdev);
+		ssize_t	(*read)(struct vfio_device *vdev, char __user *buf,
 				size_t count, loff_t *ppos);
-		ssize_t	(*write)(void *device_data, const char __user *buf,
+		ssize_t	(*write)(struct vfio_device *vdev,
+				 const char __user *buf,
 				 size_t size, loff_t *ppos);
-		long	(*ioctl)(void *device_data, unsigned int cmd,
+		long	(*ioctl)(struct vfio_device *vdev, unsigned int cmd,
 				 unsigned long arg);
-		int	(*mmap)(void *device_data, struct vm_area_struct *vma);
+		int	(*mmap)(struct vfio_device *vdev,
+				struct vm_area_struct *vma);
 	};
 
-Each function is passed the device_data that was originally registered
-in the vfio_add_group_dev() call above.  This allows the bus driver
-an easy place to store its opaque, private data.  The open/release
+Each function is passed the vdev that was originally registered
+in the vfio_register_group_dev() call above.  This allows the bus driver
+to obtain its private data using container_of().  The open/release
 callbacks are issued when a new file descriptor is created for a
 device (via VFIO_GROUP_GET_DEVICE_FD).  The ioctl interface provides
 a direct pass through for VFIO_DEVICE_* ioctls.
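To make the converted flow concrete, here is a minimal, hypothetical bus-driver sketch (the foo_* names are illustrative and not part of this patch): the driver embeds struct vfio_device in its own state, recovers that state with container_of() in every callback, and registers only once the device is fully ready.

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/vfio.h>

struct foo_vfio_device {
	struct vfio_device vdev;	/* embedded, not pointed to */
	void __iomem *regs;		/* driver-private state */
};

static int foo_vfio_open(struct vfio_device *core_vdev)
{
	/* device_data is gone; private data comes from container_of() */
	struct foo_vfio_device *foo =
		container_of(core_vdev, struct foo_vfio_device, vdev);

	return foo->regs ? 0 : -ENXIO;
}

static const struct vfio_device_ops foo_vfio_ops = {
	.open	= foo_vfio_open,
	/* .release/.read/.write/.ioctl/.mmap follow the same pattern */
};

static int foo_probe(struct device *dev)
{
	struct foo_vfio_device *foo;
	int ret;

	foo = kzalloc(sizeof(*foo), GFP_KERNEL);
	if (!foo)
		return -ENOMEM;

	vfio_init_group_dev(&foo->vdev, dev, &foo_vfio_ops);

	/*
	 * Userspace can open the device as soon as this returns, so all
	 * driver state must be initialized before registering.
	 */
	ret = vfio_register_group_dev(&foo->vdev);
	if (ret) {
		kfree(foo);
		return ret;
	}
	dev_set_drvdata(dev, foo);
	return 0;
}

static void foo_remove(struct device *dev)
{
	struct foo_vfio_device *foo = dev_get_drvdata(dev);

	vfio_unregister_group_dev(&foo->vdev);	/* may block on open fds */
	kfree(foo);
}

Compared to the old flow, registration is now the very last step of probe, and any failure after it must unwind with vfio_unregister_group_dev() — the ordering the vfio_fsl_mc and vfio_pci probe paths in this patch follow.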
The read/write/mmap diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index f27e25112c40..980e59551301 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -75,7 +75,8 @@ static int vfio_fsl_mc_reflck_attach(struct vfio_fsl_mc_device *vdev) goto unlock; } - cont_vdev = vfio_device_data(device); + cont_vdev = + container_of(device, struct vfio_fsl_mc_device, vdev); if (!cont_vdev || !cont_vdev->reflck) { vfio_device_put(device); ret = -ENODEV; @@ -135,9 +136,10 @@ static void vfio_fsl_mc_regions_cleanup(struct vfio_fsl_mc_device *vdev) kfree(vdev->regions); } -static int vfio_fsl_mc_open(void *device_data) +static int vfio_fsl_mc_open(struct vfio_device *core_vdev) { - struct vfio_fsl_mc_device *vdev = device_data; + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); int ret; if (!try_module_get(THIS_MODULE)) @@ -161,9 +163,10 @@ err_reg_init: return ret; } -static void vfio_fsl_mc_release(void *device_data) +static void vfio_fsl_mc_release(struct vfio_device *core_vdev) { - struct vfio_fsl_mc_device *vdev = device_data; + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); int ret; mutex_lock(&vdev->reflck->lock); @@ -197,11 +200,12 @@ static void vfio_fsl_mc_release(void *device_data) module_put(THIS_MODULE); } -static long vfio_fsl_mc_ioctl(void *device_data, unsigned int cmd, - unsigned long arg) +static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) { unsigned long minsz; - struct vfio_fsl_mc_device *vdev = device_data; + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); struct fsl_mc_device *mc_dev = vdev->mc_dev; switch (cmd) { @@ -327,10 +331,11 @@ static long vfio_fsl_mc_ioctl(void *device_data, unsigned int cmd, } } -static ssize_t vfio_fsl_mc_read(void *device_data, char __user *buf, +static ssize_t vfio_fsl_mc_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { - struct vfio_fsl_mc_device *vdev = device_data; + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); unsigned int index = VFIO_FSL_MC_OFFSET_TO_INDEX(*ppos); loff_t off = *ppos & VFIO_FSL_MC_OFFSET_MASK; struct fsl_mc_device *mc_dev = vdev->mc_dev; @@ -404,10 +409,12 @@ static int vfio_fsl_mc_send_command(void __iomem *ioaddr, uint64_t *cmd_data) return 0; } -static ssize_t vfio_fsl_mc_write(void *device_data, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_fsl_mc_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos) { - struct vfio_fsl_mc_device *vdev = device_data; + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); unsigned int index = VFIO_FSL_MC_OFFSET_TO_INDEX(*ppos); loff_t off = *ppos & VFIO_FSL_MC_OFFSET_MASK; struct fsl_mc_device *mc_dev = vdev->mc_dev; @@ -468,9 +475,11 @@ static int vfio_fsl_mc_mmap_mmio(struct vfio_fsl_mc_region region, size, vma->vm_page_prot); } -static int vfio_fsl_mc_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_fsl_mc_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma) { - struct vfio_fsl_mc_device *vdev = device_data; + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); struct fsl_mc_device *mc_dev = vdev->mc_dev; unsigned int index; @@ -568,23 +577,39 @@ static int 
vfio_fsl_mc_init_device(struct vfio_fsl_mc_device *vdev) dev_err(&mc_dev->dev, "VFIO_FSL_MC: Failed to setup DPRC (%d)\n", ret); goto out_nc_unreg; } + return 0; + +out_nc_unreg: + bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb); + return ret; +} +static int vfio_fsl_mc_scan_container(struct fsl_mc_device *mc_dev) +{ + int ret; + + /* non dprc devices do not scan for other devices */ + if (!is_fsl_mc_bus_dprc(mc_dev)) + return 0; ret = dprc_scan_container(mc_dev, false); if (ret) { - dev_err(&mc_dev->dev, "VFIO_FSL_MC: Container scanning failed (%d)\n", ret); - goto out_dprc_cleanup; + dev_err(&mc_dev->dev, + "VFIO_FSL_MC: Container scanning failed (%d)\n", ret); + dprc_remove_devices(mc_dev, NULL, 0); + return ret; } - return 0; +} + +static void vfio_fsl_uninit_device(struct vfio_fsl_mc_device *vdev) +{ + struct fsl_mc_device *mc_dev = vdev->mc_dev; + + if (!is_fsl_mc_bus_dprc(mc_dev)) + return; -out_dprc_cleanup: - dprc_remove_devices(mc_dev, NULL, 0); dprc_cleanup(mc_dev); -out_nc_unreg: bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb); - vdev->nb.notifier_call = NULL; - - return ret; } static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) @@ -600,36 +625,50 @@ static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) return -EINVAL; } - vdev = devm_kzalloc(dev, sizeof(*vdev), GFP_KERNEL); + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) { ret = -ENOMEM; goto out_group_put; } + vfio_init_group_dev(&vdev->vdev, dev, &vfio_fsl_mc_ops); vdev->mc_dev = mc_dev; - - ret = vfio_add_group_dev(dev, &vfio_fsl_mc_ops, vdev); - if (ret) { - dev_err(dev, "VFIO_FSL_MC: Failed to add to vfio group\n"); - goto out_group_put; - } + mutex_init(&vdev->igate); ret = vfio_fsl_mc_reflck_attach(vdev); if (ret) - goto out_group_dev; + goto out_kfree; ret = vfio_fsl_mc_init_device(vdev); if (ret) goto out_reflck; - mutex_init(&vdev->igate); + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) { + dev_err(dev, "VFIO_FSL_MC: Failed to add to vfio group\n"); + goto out_device; + } + /* + * This triggers recursion into vfio_fsl_mc_probe() on another device + * and the vfio_fsl_mc_reflck_attach() must succeed, which relies on the + * vfio_add_group_dev() above. It has no impact on this vdev, so it is + * safe to be after the vfio device is made live. 
+ */ + ret = vfio_fsl_mc_scan_container(mc_dev); + if (ret) + goto out_group_dev; + dev_set_drvdata(dev, vdev); return 0; +out_group_dev: + vfio_unregister_group_dev(&vdev->vdev); +out_device: + vfio_fsl_uninit_device(vdev); out_reflck: vfio_fsl_mc_reflck_put(vdev->reflck); -out_group_dev: - vfio_del_group_dev(dev); +out_kfree: + kfree(vdev); out_group_put: vfio_iommu_group_put(group, dev); return ret; @@ -637,25 +676,17 @@ out_group_put: static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) { - struct vfio_fsl_mc_device *vdev; struct device *dev = &mc_dev->dev; + struct vfio_fsl_mc_device *vdev = dev_get_drvdata(dev); - vdev = vfio_del_group_dev(dev); - if (!vdev) - return -EINVAL; - + vfio_unregister_group_dev(&vdev->vdev); mutex_destroy(&vdev->igate); + dprc_remove_devices(mc_dev, NULL, 0); + vfio_fsl_uninit_device(vdev); vfio_fsl_mc_reflck_put(vdev->reflck); - if (is_fsl_mc_bus_dprc(mc_dev)) { - dprc_remove_devices(mc_dev, NULL, 0); - dprc_cleanup(mc_dev); - } - - if (vdev->nb.notifier_call) - bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb); - + kfree(vdev); vfio_iommu_group_put(mc_dev->dev.iommu_group, dev); return 0; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h index a97ee691ed47..89700e00e77d 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h @@ -36,6 +36,7 @@ struct vfio_fsl_mc_region { }; struct vfio_fsl_mc_device { + struct vfio_device vdev; struct fsl_mc_device *mc_dev; struct notifier_block nb; int refcnt; diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 4d62b76c4734..fdbb3bee99a9 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -35,7 +35,10 @@ struct mdev_device { bool active; }; -#define to_mdev_device(dev) container_of(dev, struct mdev_device, dev) +static inline struct mdev_device *to_mdev_device(struct device *dev) +{ + return container_of(dev, struct mdev_device, dev); +} #define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) struct mdev_type { diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index b52eea128549..ae7e322fbe3c 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -21,10 +21,11 @@ #define DRIVER_AUTHOR "NVIDIA Corporation" #define DRIVER_DESC "VFIO based driver for Mediated device" -static int vfio_mdev_open(void *device_data) +static int vfio_mdev_open(struct vfio_device *core_vdev) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; + int ret; if (unlikely(!parent->ops->open)) @@ -40,9 +41,9 @@ static int vfio_mdev_open(void *device_data) return ret; } -static void vfio_mdev_release(void *device_data) +static void vfio_mdev_release(struct vfio_device *core_vdev) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (likely(parent->ops->release)) @@ -51,10 +52,10 @@ static void vfio_mdev_release(void *device_data) module_put(THIS_MODULE); } -static long vfio_mdev_unlocked_ioctl(void *device_data, +static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->ioctl)) @@ -63,10 +64,10 @@ static long 
vfio_mdev_unlocked_ioctl(void *device_data, return parent->ops->ioctl(mdev, cmd, arg); } -static ssize_t vfio_mdev_read(void *device_data, char __user *buf, +static ssize_t vfio_mdev_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->read)) @@ -75,10 +76,11 @@ static ssize_t vfio_mdev_read(void *device_data, char __user *buf, return parent->ops->read(mdev, buf, count, ppos); } -static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_mdev_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->write)) @@ -87,9 +89,10 @@ static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, return parent->ops->write(mdev, buf, count, ppos); } -static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_mdev_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->mmap)) @@ -98,9 +101,9 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) return parent->ops->mmap(mdev, vma); } -static void vfio_mdev_request(void *device_data, unsigned int count) +static void vfio_mdev_request(struct vfio_device *core_vdev, unsigned int count) { - struct mdev_device *mdev = device_data; + struct mdev_device *mdev = to_mdev_device(core_vdev->dev); struct mdev_parent *parent = mdev->parent; if (parent->ops->request) @@ -124,13 +127,29 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = { static int vfio_mdev_probe(struct device *dev) { struct mdev_device *mdev = to_mdev_device(dev); + struct vfio_device *vdev; + int ret; - return vfio_add_group_dev(dev, &vfio_mdev_dev_ops, mdev); + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + + vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops); + ret = vfio_register_group_dev(vdev); + if (ret) { + kfree(vdev); + return ret; + } + dev_set_drvdata(&mdev->dev, vdev); + return 0; } static void vfio_mdev_remove(struct device *dev) { - vfio_del_group_dev(dev); + struct vfio_device *vdev = dev_get_drvdata(dev); + + vfio_unregister_group_dev(vdev); + kfree(vdev); } static struct mdev_driver vfio_mdev_driver = { diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 4abddbebd4b2..53ce78d7d07b 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -39,9 +39,3 @@ config VFIO_PCI_IGD and LPC bridge config space. To enable Intel IGD assignment through vfio-pci, say Y. 
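For contrast with embedding, the vfio_mdev conversion above keeps a standalone vfio_device and reaches the mediated device through core_vdev->dev. A hypothetical sketch of that shape (the bar_* names and helper are illustrative, not part of this patch):

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/vfio.h>

struct bar_device {
	struct device dev;		/* the bus device itself */
	/* bus-specific state lives here */
};

/* Hypothetical helper, mirroring to_mdev_device() in mdev_private.h */
static inline struct bar_device *to_bar_device(struct device *dev)
{
	return container_of(dev, struct bar_device, dev);
}

static long bar_vfio_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
			   unsigned long arg)
{
	struct bar_device *bar = to_bar_device(core_vdev->dev);

	/* dispatch against bar's state; nothing is handled in this sketch */
	return -ENOTTY;
}

static const struct vfio_device_ops bar_vfio_ops = {
	.ioctl = bar_vfio_ioctl,
};

static int bar_vfio_probe(struct device *dev)
{
	struct vfio_device *vdev;
	int ret;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;

	vfio_init_group_dev(vdev, dev, &bar_vfio_ops);
	ret = vfio_register_group_dev(vdev);
	if (ret) {
		kfree(vdev);
		return ret;
	}
	dev_set_drvdata(dev, vdev);
	return 0;
}

static void bar_vfio_remove(struct device *dev)
{
	struct vfio_device *vdev = dev_get_drvdata(dev);

	/* Unregister first: it blocks until the last vfio_device_put() */
	vfio_unregister_group_dev(vdev);
	kfree(vdev);
}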
- -config VFIO_PCI_NVLINK2 - def_bool y - depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU - help - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index eff97a7cd9f1..3ff42093962f 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,7 +2,6 @@ vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o -vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 37a4a4824e30..882e890dca54 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -388,24 +388,6 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) } } - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_nvdia_v100_nvlink2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); - goto disable_exit; - } - } - - if (pdev->vendor == PCI_VENDOR_ID_IBM && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_ibm_npu2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); - goto disable_exit; - } - } - vfio_pci_probe_mmaps(vdev); return 0; @@ -516,30 +498,29 @@ out: static struct pci_driver vfio_pci_driver; -static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev, - struct vfio_device **pf_dev) +static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev) { struct pci_dev *physfn = pci_physfn(vdev->pdev); + struct vfio_device *pf_dev; if (!vdev->pdev->is_virtfn) return NULL; - *pf_dev = vfio_device_get_from_dev(&physfn->dev); - if (!*pf_dev) + pf_dev = vfio_device_get_from_dev(&physfn->dev); + if (!pf_dev) return NULL; if (pci_dev_driver(physfn) != &vfio_pci_driver) { - vfio_device_put(*pf_dev); + vfio_device_put(pf_dev); return NULL; } - return vfio_device_data(*pf_dev); + return container_of(pf_dev, struct vfio_pci_device, vdev); } static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) { - struct vfio_device *pf_dev; - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); + struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); if (!pf_vdev) return; @@ -549,12 +530,13 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) WARN_ON(pf_vdev->vf_token->users < 0); mutex_unlock(&pf_vdev->vf_token->lock); - vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); } -static void vfio_pci_release(void *device_data) +static void vfio_pci_release(struct vfio_device *core_vdev) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); mutex_lock(&vdev->reflck->lock); @@ -580,9 +562,10 @@ static void vfio_pci_release(void *device_data) module_put(THIS_MODULE); } -static int vfio_pci_open(void *device_data) +static int vfio_pci_open(struct vfio_device *core_vdev) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -791,15 +774,16 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, } struct vfio_devices { - struct vfio_device **devices; + struct vfio_pci_device **devices; int cur_index; int max_index; }; -static long 
vfio_pci_ioctl(void *device_data, +static long vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); unsigned long minsz; if (cmd == VFIO_DEVICE_GET_INFO) { @@ -1279,9 +1263,7 @@ reset_info_exit: goto hot_reset_release; for (; mem_idx < devs.cur_index; mem_idx++) { - struct vfio_pci_device *tmp; - - tmp = vfio_device_data(devs.devices[mem_idx]); + struct vfio_pci_device *tmp = devs.devices[mem_idx]; ret = down_write_trylock(&tmp->memory_lock); if (!ret) { @@ -1296,17 +1278,13 @@ reset_info_exit: hot_reset_release: for (i = 0; i < devs.cur_index; i++) { - struct vfio_device *device; - struct vfio_pci_device *tmp; - - device = devs.devices[i]; - tmp = vfio_device_data(device); + struct vfio_pci_device *tmp = devs.devices[i]; if (i < mem_idx) up_write(&tmp->memory_lock); else mutex_unlock(&tmp->vma_lock); - vfio_device_put(device); + vfio_device_put(&tmp->vdev); } kfree(devs.devices); @@ -1401,11 +1379,10 @@ hot_reset_release: return -ENOTTY; } -static ssize_t vfio_pci_rw(void *device_data, char __user *buf, +static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - struct vfio_pci_device *vdev = device_data; if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; @@ -1433,22 +1410,28 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, return -EINVAL; } -static ssize_t vfio_pci_read(void *device_data, char __user *buf, +static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); + if (!count) return 0; - return vfio_pci_rw(device_data, buf, count, ppos, false); + return vfio_pci_rw(vdev, buf, count, ppos, false); } -static ssize_t vfio_pci_write(void *device_data, const char __user *buf, +static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); + if (!count) return 0; - return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); + return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); } /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ @@ -1645,9 +1628,10 @@ static const struct vm_operations_struct vfio_pci_mmap_ops = { .fault = vfio_pci_mmap_fault, }; -static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); struct pci_dev *pdev = vdev->pdev; unsigned int index; u64 phys_len, req_len, pgoff, req_start; @@ -1713,9 +1697,10 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) return 0; } -static void vfio_pci_request(void *device_data, unsigned int count) +static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); struct pci_dev *pdev = vdev->pdev; mutex_lock(&vdev->igate); @@ -1766,8 +1751,7 @@ static int vfio_pci_validate_vf_token(struct 
vfio_pci_device *vdev, return 0; /* No VF token provided or required */ if (vdev->pdev->is_virtfn) { - struct vfio_device *pf_dev; - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); + struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); bool match; if (!pf_vdev) { @@ -1780,7 +1764,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, } if (!vf_token) { - vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); return -EACCES; @@ -1790,7 +1774,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); mutex_unlock(&pf_vdev->vf_token->lock); - vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); if (!match) { pci_info_ratelimited(vdev->pdev, @@ -1829,9 +1813,10 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, #define VF_TOKEN_ARG "vf_token=" -static int vfio_pci_match(void *device_data, char *buf) +static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); bool vf_token = false; uuid_t uuid; int ret; @@ -1921,6 +1906,68 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb, return 0; } +static int vfio_pci_vf_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!pdev->is_physfn) + return 0; + + vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); + if (!vdev->vf_token) + return -ENOMEM; + + mutex_init(&vdev->vf_token->lock); + uuid_gen(&vdev->vf_token->uuid); + + vdev->nb.notifier_call = vfio_pci_bus_notifier; + ret = bus_register_notifier(&pci_bus_type, &vdev->nb); + if (ret) { + kfree(vdev->vf_token); + return ret; + } + return 0; +} + +static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev) +{ + if (!vdev->vf_token) + return; + + bus_unregister_notifier(&pci_bus_type, &vdev->nb); + WARN_ON(vdev->vf_token->users); + mutex_destroy(&vdev->vf_token->lock); + kfree(vdev->vf_token); +} + +static int vfio_pci_vga_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!vfio_pci_is_vga(pdev)) + return 0; + + ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); + if (ret) + return ret; + vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false)); + return 0; +} + +static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + + if (!vfio_pci_is_vga(pdev)) + return; + vga_client_register(pdev, NULL, NULL, NULL); + vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | + VGA_RSRC_LEGACY_IO | + VGA_RSRC_LEGACY_MEM); +} + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_device *vdev; @@ -1956,6 +2003,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto out_group_put; } + vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops); vdev->pdev = pdev; vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); @@ -1967,35 +2015,15 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&vdev->vma_list); init_rwsem(&vdev->memory_lock); - ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + ret = vfio_pci_reflck_attach(vdev); if (ret) goto out_free; - - ret = vfio_pci_reflck_attach(vdev); + ret = vfio_pci_vf_init(vdev); if (ret) - goto 
out_del_group_dev; - - if (pdev->is_physfn) { - vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); - if (!vdev->vf_token) { - ret = -ENOMEM; - goto out_reflck; - } - - mutex_init(&vdev->vf_token->lock); - uuid_gen(&vdev->vf_token->uuid); - - vdev->nb.notifier_call = vfio_pci_bus_notifier; - ret = bus_register_notifier(&pci_bus_type, &vdev->nb); - if (ret) - goto out_vf_token; - } - - if (vfio_pci_is_vga(pdev)) { - vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); - vga_set_legacy_decoding(pdev, - vfio_pci_set_vga_decode(vdev, false)); - } + goto out_reflck; + ret = vfio_pci_vga_init(vdev); + if (ret) + goto out_vf; vfio_pci_probe_power_state(vdev); @@ -2013,15 +2041,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vfio_pci_set_power_state(vdev, PCI_D3hot); } - return ret; + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_power; + dev_set_drvdata(&pdev->dev, vdev); + return 0; -out_vf_token: - kfree(vdev->vf_token); +out_power: + if (!disable_idle_d3) + vfio_pci_set_power_state(vdev, PCI_D0); +out_vf: + vfio_pci_vf_uninit(vdev); out_reflck: vfio_pci_reflck_put(vdev->reflck); -out_del_group_dev: - vfio_del_group_dev(&pdev->dev); out_free: + kfree(vdev->pm_save); kfree(vdev); out_group_put: vfio_iommu_group_put(group, &pdev->dev); @@ -2030,41 +2064,25 @@ out_group_put: static void vfio_pci_remove(struct pci_dev *pdev) { - struct vfio_pci_device *vdev; + struct vfio_pci_device *vdev = dev_get_drvdata(&pdev->dev); pci_disable_sriov(pdev); - vdev = vfio_del_group_dev(&pdev->dev); - if (!vdev) - return; - - if (vdev->vf_token) { - WARN_ON(vdev->vf_token->users); - mutex_destroy(&vdev->vf_token->lock); - kfree(vdev->vf_token); - } - - if (vdev->nb.notifier_call) - bus_unregister_notifier(&pci_bus_type, &vdev->nb); + vfio_unregister_group_dev(&vdev->vdev); + vfio_pci_vf_uninit(vdev); vfio_pci_reflck_put(vdev->reflck); + vfio_pci_vga_uninit(vdev); vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); - kfree(vdev->region); - mutex_destroy(&vdev->ioeventfds_lock); if (!disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D0); + mutex_destroy(&vdev->ioeventfds_lock); + kfree(vdev->region); kfree(vdev->pm_save); kfree(vdev); - - if (vfio_pci_is_vga(pdev)) { - vga_client_register(pdev, NULL, NULL, NULL); - vga_set_legacy_decoding(pdev, - VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | - VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM); - } } static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, @@ -2077,11 +2095,7 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, if (device == NULL) return PCI_ERS_RESULT_DISCONNECT; - vdev = vfio_device_data(device); - if (vdev == NULL) { - vfio_device_put(device); - return PCI_ERS_RESULT_DISCONNECT; - } + vdev = container_of(device, struct vfio_pci_device, vdev); mutex_lock(&vdev->igate); @@ -2097,7 +2111,6 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) { - struct vfio_pci_device *vdev; struct vfio_device *device; int ret = 0; @@ -2110,12 +2123,6 @@ static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) if (!device) return -ENODEV; - vdev = vfio_device_data(device); - if (!vdev) { - vfio_device_put(device); - return -ENODEV; - } - if (nr_virtfn == 0) pci_disable_sriov(pdev); else @@ -2175,7 +2182,7 @@ static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data) return 0; } - vdev = vfio_device_data(device); + vdev = 
container_of(device, struct vfio_pci_device, vdev); if (vdev->reflck) { vfio_pci_reflck_get(vdev->reflck); @@ -2237,7 +2244,7 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) return -EBUSY; } - vdev = vfio_device_data(device); + vdev = container_of(device, struct vfio_pci_device, vdev); /* Fault if the device is not unused */ if (vdev->refcnt) { @@ -2245,7 +2252,7 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) return -EBUSY; } - devs->devices[devs->cur_index++] = device; + devs->devices[devs->cur_index++] = vdev; return 0; } @@ -2267,7 +2274,7 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) return -EBUSY; } - vdev = vfio_device_data(device); + vdev = container_of(device, struct vfio_pci_device, vdev); /* * Locking multiple devices is prone to deadlock, runaway and @@ -2278,7 +2285,7 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) return -EBUSY; } - devs->devices[devs->cur_index++] = device; + devs->devices[devs->cur_index++] = vdev; return 0; } @@ -2326,7 +2333,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) /* Does at least one need a reset? */ for (i = 0; i < devs.cur_index; i++) { - tmp = vfio_device_data(devs.devices[i]); + tmp = devs.devices[i]; if (tmp->needs_reset) { ret = pci_reset_bus(vdev->pdev); break; @@ -2335,7 +2342,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) put_devs: for (i = 0; i < devs.cur_index; i++) { - tmp = vfio_device_data(devs.devices[i]); + tmp = devs.devices[i]; /* * If reset was successful, affected devices no longer need @@ -2351,7 +2358,7 @@ put_devs: vfio_pci_set_power_state(tmp, PCI_D3hot); } - vfio_device_put(devs.devices[i]); + vfio_device_put(&tmp->vdev); } kfree(devs.devices); diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c deleted file mode 100644 index f276624fec79..000000000000 --- a/drivers/vfio/pci/vfio_pci_nvlink2.c +++ /dev/null @@ -1,490 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. - * - * Copyright (C) 2018 IBM Corp. All rights reserved. - * Author: Alexey Kardashevskiy <aik@ozlabs.ru> - * - * Register an on-GPU RAM region for cacheable access. - * - * Derived from original vfio_pci_igd.c: - * Copyright (C) 2016 Red Hat, Inc. All rights reserved. - * Author: Alex Williamson <alex.williamson@redhat.com> - */ - -#include <linux/io.h> -#include <linux/pci.h> -#include <linux/uaccess.h> -#include <linux/vfio.h> -#include <linux/sched/mm.h> -#include <linux/mmu_context.h> -#include <asm/kvm_ppc.h> -#include "vfio_pci_private.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); -EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); - -struct vfio_pci_nvgpu_data { - unsigned long gpu_hpa; /* GPU RAM physical address */ - unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ - unsigned long useraddr; /* GPU RAM userspace address */ - unsigned long size; /* Size of the GPU RAM window (usually 128GB) */ - struct mm_struct *mm; - struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. 
*/ - struct pci_dev *gpdev; - struct notifier_block group_notifier; -}; - -static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev, - char __user *buf, size_t count, loff_t *ppos, bool iswrite) -{ - unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; - struct vfio_pci_nvgpu_data *data = vdev->region[i].data; - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK; - size_t sizealigned; - void __iomem *ptr; - - if (pos >= vdev->region[i].size) - return -EINVAL; - - count = min(count, (size_t)(vdev->region[i].size - pos)); - - /* - * We map only a bit of GPU RAM for a short time instead of mapping it - * for the guest lifetime as: - * - * 1) we do not know GPU RAM size, only aperture which is 4-8 times - * bigger than actual RAM size (16/32GB RAM vs. 128GB aperture); - * 2) mapping GPU RAM allows CPU to prefetch and if this happens - * before NVLink bridge is reset (which fences GPU RAM), - * hardware management interrupts (HMI) might happen, this - * will freeze NVLink bridge. - * - * This is not fast path anyway. - */ - sizealigned = ALIGN(posoff + count, PAGE_SIZE); - ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned); - if (!ptr) - return -EFAULT; - - if (iswrite) { - if (copy_from_user(ptr + posoff, buf, count)) - count = -EFAULT; - else - *ppos += count; - } else { - if (copy_to_user(buf, ptr + posoff, count)) - count = -EFAULT; - else - *ppos += count; - } - - iounmap(ptr); - - return count; -} - -static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev, - struct vfio_pci_region *region) -{ - struct vfio_pci_nvgpu_data *data = region->data; - long ret; - - /* If there were any mappings at all... */ - if (data->mm) { - if (data->mem) { - ret = mm_iommu_put(data->mm, data->mem); - WARN_ON(ret); - } - - mmdrop(data->mm); - } - - vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, - &data->group_notifier); - - pnv_npu2_unmap_lpar_dev(data->gpdev); - - kfree(data); -} - -static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf) -{ - vm_fault_t ret; - struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_region *region = vma->vm_private_data; - struct vfio_pci_nvgpu_data *data = region->data; - unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT; - unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT; - unsigned long vm_pgoff = vma->vm_pgoff & - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); - unsigned long pfn = nv2pg + vm_pgoff + vmf_off; - - ret = vmf_insert_pfn(vma, vmf->address, pfn); - trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT, - vmf->address, ret); - - return ret; -} - -static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = { - .fault = vfio_pci_nvgpu_mmap_fault, -}; - -static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, struct vm_area_struct *vma) -{ - int ret; - struct vfio_pci_nvgpu_data *data = region->data; - - if (data->useraddr) - return -EPERM; - - if (vma->vm_end - vma->vm_start > data->size) - return -EINVAL; - - vma->vm_private_data = region; - vma->vm_flags |= VM_PFNMAP; - vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops; - - /* - * Calling mm_iommu_newdev() here once as the region is not - * registered yet and therefore right initialization will happen now. - * Other places will use mm_iommu_find() which returns - * registered @mem and does not go gup(). 
- */ - data->useraddr = vma->vm_start; - data->mm = current->mm; - - mmgrab(data->mm); - ret = (int) mm_iommu_newdev(data->mm, data->useraddr, - vma_pages(vma), data->gpu_hpa, &data->mem); - - trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr, - vma->vm_end - vma->vm_start, ret); - - return ret; -} - -static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, struct vfio_info_cap *caps) -{ - struct vfio_pci_nvgpu_data *data = region->data; - struct vfio_region_info_cap_nvlink2_ssatgt cap = { - .header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, - .header.version = 1, - .tgt = data->gpu_tgt - }; - - return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); -} - -static const struct vfio_pci_regops vfio_pci_nvgpu_regops = { - .rw = vfio_pci_nvgpu_rw, - .release = vfio_pci_nvgpu_release, - .mmap = vfio_pci_nvgpu_mmap, - .add_capability = vfio_pci_nvgpu_add_capability, -}; - -static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb, - unsigned long action, void *opaque) -{ - struct kvm *kvm = opaque; - struct vfio_pci_nvgpu_data *data = container_of(nb, - struct vfio_pci_nvgpu_data, - group_notifier); - - if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm && - pnv_npu2_map_lpar_dev(data->gpdev, - kvm->arch.lpid, MSR_DR | MSR_PR)) - return NOTIFY_BAD; - - return NOTIFY_OK; -} - -int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) -{ - int ret; - u64 reg[2]; - u64 tgt = 0; - struct device_node *npu_node, *mem_node; - struct pci_dev *npu_dev; - struct vfio_pci_nvgpu_data *data; - uint32_t mem_phandle = 0; - unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM; - - /* - * PCI config space does not tell us about NVLink presence but - * platform does, use this. - */ - npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0); - if (!npu_dev) - return -ENODEV; - - npu_node = pci_device_to_OF_node(npu_dev); - if (!npu_node) - return -EINVAL; - - if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) - return -ENODEV; - - mem_node = of_find_node_by_phandle(mem_phandle); - if (!mem_node) - return -EINVAL; - - if (of_property_read_variable_u64_array(mem_node, "reg", reg, - ARRAY_SIZE(reg), ARRAY_SIZE(reg)) != - ARRAY_SIZE(reg)) - return -EINVAL; - - if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { - dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); - return -EFAULT; - } - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->gpu_hpa = reg[0]; - data->gpu_tgt = tgt; - data->size = reg[1]; - - dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa, - data->gpu_hpa + data->size - 1); - - data->gpdev = vdev->pdev; - data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier; - - ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, - &events, &data->group_notifier); - if (ret) - goto free_exit; - - /* - * We have just set KVM, we do not need the listener anymore. - * Also, keeping it registered means that if more than one GPU is - * assigned, we will get several similar notifiers notifying about - * the same device again which does not help with anything. 
- */ - vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, - &data->group_notifier); - - ret = vfio_pci_register_dev_region(vdev, - PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, - VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, - &vfio_pci_nvgpu_regops, - data->size, - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP, - data); - if (ret) - goto free_exit; - - return 0; -free_exit: - kfree(data); - - return ret; -} - -/* - * IBM NPU2 bridge - */ -struct vfio_pci_npu2_data { - void *base; /* ATSD register virtual address, for emulated access */ - unsigned long mmio_atsd; /* ATSD physical address */ - unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ - unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */ -}; - -static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev, - char __user *buf, size_t count, loff_t *ppos, bool iswrite) -{ - unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; - struct vfio_pci_npu2_data *data = vdev->region[i].data; - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - - if (pos >= vdev->region[i].size) - return -EINVAL; - - count = min(count, (size_t)(vdev->region[i].size - pos)); - - if (iswrite) { - if (copy_from_user(data->base + pos, buf, count)) - return -EFAULT; - } else { - if (copy_to_user(buf, data->base + pos, count)) - return -EFAULT; - } - *ppos += count; - - return count; -} - -static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, struct vm_area_struct *vma) -{ - int ret; - struct vfio_pci_npu2_data *data = region->data; - unsigned long req_len = vma->vm_end - vma->vm_start; - - if (req_len != PAGE_SIZE) - return -EINVAL; - - vma->vm_flags |= VM_PFNMAP; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT, - req_len, vma->vm_page_prot); - trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start, - vma->vm_end - vma->vm_start, ret); - - return ret; -} - -static void vfio_pci_npu2_release(struct vfio_pci_device *vdev, - struct vfio_pci_region *region) -{ - struct vfio_pci_npu2_data *data = region->data; - - memunmap(data->base); - kfree(data); -} - -static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, struct vfio_info_cap *caps) -{ - struct vfio_pci_npu2_data *data = region->data; - struct vfio_region_info_cap_nvlink2_ssatgt captgt = { - .header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, - .header.version = 1, - .tgt = data->gpu_tgt - }; - struct vfio_region_info_cap_nvlink2_lnkspd capspd = { - .header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, - .header.version = 1, - .link_speed = data->link_speed - }; - int ret; - - ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt)); - if (ret) - return ret; - - return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd)); -} - -static const struct vfio_pci_regops vfio_pci_npu2_regops = { - .rw = vfio_pci_npu2_rw, - .mmap = vfio_pci_npu2_mmap, - .release = vfio_pci_npu2_release, - .add_capability = vfio_pci_npu2_add_capability, -}; - -int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) -{ - int ret; - struct vfio_pci_npu2_data *data; - struct device_node *nvlink_dn; - u32 nvlink_index = 0, mem_phandle = 0; - struct pci_dev *npdev = vdev->pdev; - struct device_node *npu_node = pci_device_to_OF_node(npdev); - struct pci_controller *hose = pci_bus_to_host(npdev->bus); - u64 mmio_atsd = 0; - u64 tgt = 0; - 
u32 link_speed = 0xff; - - /* - * PCI config space does not tell us about NVLink presence but - * platform does, use this. - */ - if (!pnv_pci_get_gpu_dev(vdev->pdev)) - return -ENODEV; - - if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) - return -ENODEV; - - /* - * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links - * so we can allocate one register per link, using nvlink index as - * a key. - * There is always at least one ATSD register so as long as at least - * NVLink bridge #0 is passed to the guest, ATSD will be available. - */ - nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); - if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", - &nvlink_index))) - return -ENODEV; - - if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index, - &mmio_atsd)) { - if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0, - &mmio_atsd)) { - dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); - mmio_atsd = 0; - } else { - dev_warn(&vdev->pdev->dev, - "Using fallback ibm,mmio-atsd[0] for ATSD.\n"); - } - } - - if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { - dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); - return -EFAULT; - } - - if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) { - dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n"); - return -EFAULT; - } - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->mmio_atsd = mmio_atsd; - data->gpu_tgt = tgt; - data->link_speed = link_speed; - if (data->mmio_atsd) { - data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT); - if (!data->base) { - ret = -ENOMEM; - goto free_exit; - } - } - - /* - * We want to expose the capability even if this specific NVLink - * did not get its own ATSD register because capabilities - * belong to VFIO regions and normally there will be ATSD register - * assigned to the NVLink bridge. - */ - ret = vfio_pci_register_dev_region(vdev, - PCI_VENDOR_ID_IBM | - VFIO_REGION_TYPE_PCI_VENDOR_TYPE, - VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, - &vfio_pci_npu2_regops, - data->mmio_atsd ? 
PAGE_SIZE : 0, - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP, - data); - if (ret) - goto free_exit; - - return 0; - -free_exit: - if (data->base) - memunmap(data->base); - kfree(data); - - return ret; -} diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 9cd1882a05af..5a36272cecbf 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -100,6 +100,7 @@ struct vfio_pci_mmap_vma { }; struct vfio_pci_device { + struct vfio_device vdev; struct pci_dev *pdev; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; @@ -199,20 +200,6 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif -#ifdef CONFIG_VFIO_PCI_NVLINK2 -extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev); -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev); -#else -static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) -{ - return -ENODEV; -} - -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) -{ - return -ENODEV; -} -#endif #ifdef CONFIG_S390 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 3626c2150101..f970eb2a999f 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -66,16 +66,18 @@ static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) if (ret) { kfree(vdev->name); kfree(vdev); + return ret; } - return ret; + dev_set_drvdata(&adev->dev, vdev); + return 0; } static void vfio_amba_remove(struct amba_device *adev) { - struct vfio_platform_device *vdev = - vfio_platform_remove_common(&adev->dev); + struct vfio_platform_device *vdev = dev_get_drvdata(&adev->dev); + vfio_platform_remove_common(vdev); kfree(vdev->name); kfree(vdev); } diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 9fb6818cea12..e4027799a154 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -54,23 +54,21 @@ static int vfio_platform_probe(struct platform_device *pdev) vdev->reset_required = reset_required; ret = vfio_platform_probe_common(vdev, &pdev->dev); - if (ret) + if (ret) { kfree(vdev); - - return ret; + return ret; + } + dev_set_drvdata(&pdev->dev, vdev); + return 0; } static int vfio_platform_remove(struct platform_device *pdev) { - struct vfio_platform_device *vdev; - - vdev = vfio_platform_remove_common(&pdev->dev); - if (vdev) { - kfree(vdev); - return 0; - } + struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev); - return -EINVAL; + vfio_platform_remove_common(vdev); + kfree(vdev); + return 0; } static struct platform_driver vfio_platform_driver = { diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index fb4b385191f2..361e5b57e369 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -218,9 +218,10 @@ static int vfio_platform_call_reset(struct vfio_platform_device *vdev, return -EINVAL; } -static void vfio_platform_release(void *device_data) +static void vfio_platform_release(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); mutex_lock(&driver_lock); @@ -244,9 +245,10 
@@ static void vfio_platform_release(void *device_data) module_put(vdev->parent_module); } -static int vfio_platform_open(void *device_data) +static int vfio_platform_open(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); int ret; if (!try_module_get(vdev->parent_module)) @@ -293,10 +295,12 @@ err_reg: return ret; } -static long vfio_platform_ioctl(void *device_data, +static long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + unsigned long minsz; if (cmd == VFIO_DEVICE_GET_INFO) { @@ -455,10 +459,11 @@ err: return -EFAULT; } -static ssize_t vfio_platform_read(void *device_data, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t vfio_platform_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos); loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK; @@ -531,10 +536,11 @@ err: return -EFAULT; } -static ssize_t vfio_platform_write(void *device_data, const char __user *buf, +static ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos); loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK; @@ -573,9 +579,10 @@ static int vfio_platform_mmap_mmio(struct vfio_platform_region region, req_len, vma->vm_page_prot); } -static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { - struct vfio_platform_device *vdev = device_data; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); unsigned int index; index = vma->vm_pgoff >> (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT); @@ -659,8 +666,7 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct iommu_group *group; int ret; - if (!vdev) - return -EINVAL; + vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops); ret = vfio_platform_acpi_probe(vdev, dev); if (ret) @@ -685,13 +691,13 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, goto put_reset; } - ret = vfio_add_group_dev(dev, &vfio_platform_ops, vdev); + ret = vfio_register_group_dev(&vdev->vdev); if (ret) goto put_iommu; mutex_init(&vdev->igate); - pm_runtime_enable(vdev->device); + pm_runtime_enable(dev); return 0; put_iommu: @@ -702,19 +708,13 @@ put_reset: } EXPORT_SYMBOL_GPL(vfio_platform_probe_common); -struct vfio_platform_device *vfio_platform_remove_common(struct device *dev) +void vfio_platform_remove_common(struct vfio_platform_device *vdev) { - struct vfio_platform_device *vdev; - - vdev = vfio_del_group_dev(dev); + vfio_unregister_group_dev(&vdev->vdev); - if (vdev) { - pm_runtime_disable(vdev->device); - vfio_platform_put_reset(vdev); - vfio_iommu_group_put(dev->iommu_group, dev); - } - - return vdev; + pm_runtime_disable(vdev->device); + vfio_platform_put_reset(vdev); + 
vfio_iommu_group_put(vdev->vdev.dev->iommu_group, vdev->vdev.dev); } EXPORT_SYMBOL_GPL(vfio_platform_remove_common); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 289089910643..a5ba82c8cbc3 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/interrupt.h> +#include <linux/vfio.h> #define VFIO_PLATFORM_OFFSET_SHIFT 40 #define VFIO_PLATFORM_OFFSET_MASK (((u64)(1) << VFIO_PLATFORM_OFFSET_SHIFT) - 1) @@ -42,6 +43,7 @@ struct vfio_platform_region { }; struct vfio_platform_device { + struct vfio_device vdev; struct vfio_platform_region *regions; u32 num_regions; struct vfio_platform_irq *irqs; @@ -80,8 +82,7 @@ struct vfio_platform_reset_node { extern int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct device *dev); -extern struct vfio_platform_device *vfio_platform_remove_common - (struct device *dev); +void vfio_platform_remove_common(struct vfio_platform_device *vdev); extern int vfio_platform_irq_init(struct vfio_platform_device *vdev); extern void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev); diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 38779e6fd80c..5e631c359ef2 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -46,7 +46,6 @@ static struct vfio { struct mutex group_lock; struct cdev group_cdev; dev_t group_devt; - wait_queue_head_t release_q; } vfio; struct vfio_iommu_driver { @@ -90,15 +89,6 @@ struct vfio_group { struct blocking_notifier_head notifier; }; -struct vfio_device { - struct kref kref; - struct device *dev; - const struct vfio_device_ops *ops; - struct vfio_group *group; - struct list_head group_next; - void *device_data; -}; - #ifdef CONFIG_VFIO_NOIOMMU static bool noiommu __read_mostly; module_param_named(enable_unsafe_noiommu_mode, @@ -109,8 +99,8 @@ MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. Thi /* * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe * and remove functions, any use cases other than acquiring the first - * reference for the purpose of calling vfio_add_group_dev() or removing - * that symmetric reference after vfio_del_group_dev() should use the raw + * reference for the purpose of calling vfio_register_group_dev() or removing + * that symmetric reference after vfio_unregister_group_dev() should use the raw * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() * removes the device from the dummy group and cannot be nested. 
*/ @@ -532,67 +522,17 @@ static struct vfio_group *vfio_group_get_from_dev(struct device *dev) /** * Device objects - create, release, get, put, search */ -static -struct vfio_device *vfio_group_create_device(struct vfio_group *group, - struct device *dev, - const struct vfio_device_ops *ops, - void *device_data) -{ - struct vfio_device *device; - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return ERR_PTR(-ENOMEM); - - kref_init(&device->kref); - device->dev = dev; - device->group = group; - device->ops = ops; - device->device_data = device_data; - dev_set_drvdata(dev, device); - - /* No need to get group_lock, caller has group reference */ - vfio_group_get(group); - - mutex_lock(&group->device_lock); - list_add(&device->group_next, &group->device_list); - group->dev_counter++; - mutex_unlock(&group->device_lock); - - return device; -} - -static void vfio_device_release(struct kref *kref) -{ - struct vfio_device *device = container_of(kref, - struct vfio_device, kref); - struct vfio_group *group = device->group; - - list_del(&device->group_next); - group->dev_counter--; - mutex_unlock(&group->device_lock); - - dev_set_drvdata(device->dev, NULL); - - kfree(device); - - /* vfio_del_group_dev may be waiting for this device */ - wake_up(&vfio.release_q); -} - /* Device reference always implies a group reference */ void vfio_device_put(struct vfio_device *device) { - struct vfio_group *group = device->group; - kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); - vfio_group_put(group); + if (refcount_dec_and_test(&device->refcount)) + complete(&device->comp); } EXPORT_SYMBOL_GPL(vfio_device_put); -static void vfio_device_get(struct vfio_device *device) +static bool vfio_device_try_get(struct vfio_device *device) { - vfio_group_get(device->group); - kref_get(&device->kref); + return refcount_inc_not_zero(&device->refcount); } static struct vfio_device *vfio_group_get_device(struct vfio_group *group, @@ -602,8 +542,7 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, mutex_lock(&group->device_lock); list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev) { - vfio_device_get(device); + if (device->dev == dev && vfio_device_try_get(device)) { mutex_unlock(&group->device_lock); return device; } @@ -801,14 +740,22 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, /** * VFIO driver API */ -int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, void *device_data) +void vfio_init_group_dev(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops) +{ + init_completion(&device->comp); + device->dev = dev; + device->ops = ops; +} +EXPORT_SYMBOL_GPL(vfio_init_group_dev); + +int vfio_register_group_dev(struct vfio_device *device) { + struct vfio_device *existing_device; struct iommu_group *iommu_group; struct vfio_group *group; - struct vfio_device *device; - iommu_group = iommu_group_get(dev); + iommu_group = iommu_group_get(device->dev); if (!iommu_group) return -EINVAL; @@ -827,31 +774,29 @@ int vfio_add_group_dev(struct device *dev, iommu_group_put(iommu_group); } - device = vfio_group_get_device(group, dev); - if (device) { - dev_WARN(dev, "Device already exists on group %d\n", + existing_device = vfio_group_get_device(group, device->dev); + if (existing_device) { + dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(iommu_group)); - vfio_device_put(device); + vfio_device_put(existing_device); 
vfio_group_put(group); return -EBUSY; } - device = vfio_group_create_device(group, dev, ops, device_data); - if (IS_ERR(device)) { - vfio_group_put(group); - return PTR_ERR(device); - } + /* Our reference on group is moved to the device */ + device->group = group; - /* - * Drop all but the vfio_device reference. The vfio_device holds - * a reference to the vfio_group, which holds a reference to the - * iommu_group. - */ - vfio_group_put(group); + /* Refcounting can't start until the driver calls register */ + refcount_set(&device->refcount, 1); + + mutex_lock(&group->device_lock); + list_add(&device->group_next, &group->device_list); + group->dev_counter++; + mutex_unlock(&group->device_lock); return 0; } -EXPORT_SYMBOL_GPL(vfio_add_group_dev); +EXPORT_SYMBOL_GPL(vfio_register_group_dev); /** * Get a reference to the vfio_device for a device. Even if the @@ -886,7 +831,7 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, int ret; if (it->ops->match) { - ret = it->ops->match(it->device_data, buf); + ret = it->ops->match(it, buf); if (ret < 0) { device = ERR_PTR(ret); break; @@ -895,9 +840,8 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, ret = !strcmp(dev_name(it->dev), buf); } - if (ret) { + if (ret && vfio_device_try_get(it)) { device = it; - vfio_device_get(device); break; } } @@ -907,32 +851,15 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, } /* - * Caller must hold a reference to the vfio_device - */ -void *vfio_device_data(struct vfio_device *device) -{ - return device->device_data; -} -EXPORT_SYMBOL_GPL(vfio_device_data); - -/* * Decrement the device reference count and wait for the device to be * removed. Open file descriptors for the device... */ -void *vfio_del_group_dev(struct device *dev) +void vfio_unregister_group_dev(struct vfio_device *device) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct vfio_device *device = dev_get_drvdata(dev); struct vfio_group *group = device->group; - void *device_data = device->device_data; struct vfio_unbound_dev *unbound; unsigned int i = 0; bool interrupted = false; - - /* - * The group exists so long as we have a device reference. Get - * a group reference and use it to scan for the device going away. - */ - vfio_group_get(group); + long rc; /* * When the device is removed from the group, the group suddenly @@ -945,7 +872,7 @@ void *vfio_del_group_dev(struct device *dev) */ unbound = kzalloc(sizeof(*unbound), GFP_KERNEL); if (unbound) { - unbound->dev = dev; + unbound->dev = device->dev; mutex_lock(&group->unbound_lock); list_add(&unbound->unbound_next, &group->unbound_list); mutex_unlock(&group->unbound_lock); @@ -953,44 +880,33 @@ void *vfio_del_group_dev(struct device *dev) WARN_ON(!unbound); vfio_device_put(device); - - /* - * If the device is still present in the group after the above - * 'put', then it is in use and we need to request it from the - * bus driver. The driver may in turn need to request the - * device from the user. We send the request on an arbitrary - * interval with counter to allow the driver to take escalating - * measures to release the device if it has the ability to do so. 
- */ - add_wait_queue(&vfio.release_q, &wait); - - do { - device = vfio_group_get_device(group, dev); - if (!device) - break; - + rc = try_wait_for_completion(&device->comp); + while (rc <= 0) { if (device->ops->request) - device->ops->request(device_data, i++); - - vfio_device_put(device); + device->ops->request(device, i++); if (interrupted) { - wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10); + rc = wait_for_completion_timeout(&device->comp, + HZ * 10); } else { - wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10); - if (signal_pending(current)) { + rc = wait_for_completion_interruptible_timeout( + &device->comp, HZ * 10); + if (rc < 0) { interrupted = true; - dev_warn(dev, + dev_warn(device->dev, "Device is currently in use, task" " \"%s\" (%d) " "blocked until device is released", current->comm, task_pid_nr(current)); } } + } - } while (1); + mutex_lock(&group->device_lock); + list_del(&device->group_next); + group->dev_counter--; + mutex_unlock(&group->device_lock); - remove_wait_queue(&vfio.release_q, &wait); /* * In order to support multiple devices per group, devices can be * plucked from the group while other devices in the group are still @@ -1008,11 +924,10 @@ void *vfio_del_group_dev(struct device *dev) if (list_empty(&group->device_list)) wait_event(group->container_q, !group->container); + /* Matches the get in vfio_register_group_dev() */ vfio_group_put(group); - - return device_data; } -EXPORT_SYMBOL_GPL(vfio_del_group_dev); +EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); /** * VFIO base fd, /dev/vfio/vfio @@ -1454,7 +1369,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) if (IS_ERR(device)) return PTR_ERR(device); - ret = device->ops->open(device->device_data); + ret = device->ops->open(device); if (ret) { vfio_device_put(device); return ret; @@ -1466,7 +1381,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) */ ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) { - device->ops->release(device->device_data); + device->ops->release(device); vfio_device_put(device); return ret; } @@ -1476,7 +1391,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) if (IS_ERR(filep)) { put_unused_fd(ret); ret = PTR_ERR(filep); - device->ops->release(device->device_data); + device->ops->release(device); vfio_device_put(device); return ret; } @@ -1633,7 +1548,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - device->ops->release(device->device_data); + device->ops->release(device); vfio_group_try_dissolve_container(device->group); @@ -1650,7 +1565,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, if (unlikely(!device->ops->ioctl)) return -EINVAL; - return device->ops->ioctl(device->device_data, cmd, arg); + return device->ops->ioctl(device, cmd, arg); } static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, @@ -1661,7 +1576,7 @@ static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, if (unlikely(!device->ops->read)) return -EINVAL; - return device->ops->read(device->device_data, buf, count, ppos); + return device->ops->read(device, buf, count, ppos); } static ssize_t vfio_device_fops_write(struct file *filep, @@ -1673,7 +1588,7 @@ static ssize_t vfio_device_fops_write(struct file *filep, if (unlikely(!device->ops->write)) return -EINVAL; - return device->ops->write(device->device_data, buf, count, ppos); + return device->ops->write(device, buf, count, ppos); } static int 
vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) @@ -1683,7 +1598,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) if (unlikely(!device->ops->mmap)) return -EINVAL; - return device->ops->mmap(device->device_data, vma); + return device->ops->mmap(device, vma); } static const struct file_operations vfio_device_fops = { @@ -2379,7 +2294,6 @@ static int __init vfio_init(void) mutex_init(&vfio.iommu_drivers_lock); INIT_LIST_HEAD(&vfio.group_list); INIT_LIST_HEAD(&vfio.iommu_drivers_list); - init_waitqueue_head(&vfio.release_q); ret = misc_register(&vfio_dev); if (ret) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b7e18bde5aa8..a2c5b30e1763 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,6 +15,17 @@ #include <linux/poll.h> #include <uapi/linux/vfio.h> +struct vfio_device { + struct device *dev; + const struct vfio_device_ops *ops; + struct vfio_group *group; + + /* Members below here are private, not for driver use */ + refcount_t refcount; + struct completion comp; + struct list_head group_next; +}; + /** * struct vfio_device_ops - VFIO bus driver device callbacks * @@ -32,30 +43,28 @@ */ struct vfio_device_ops { char *name; - int (*open)(void *device_data); - void (*release)(void *device_data); - ssize_t (*read)(void *device_data, char __user *buf, + int (*open)(struct vfio_device *vdev); + void (*release)(struct vfio_device *vdev); + ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); - ssize_t (*write)(void *device_data, const char __user *buf, + ssize_t (*write)(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *size); - long (*ioctl)(void *device_data, unsigned int cmd, + long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*mmap)(void *device_data, struct vm_area_struct *vma); - void (*request)(void *device_data, unsigned int count); - int (*match)(void *device_data, char *buf); + int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); + void (*request)(struct vfio_device *vdev, unsigned int count); + int (*match)(struct vfio_device *vdev, char *buf); }; extern struct iommu_group *vfio_iommu_group_get(struct device *dev); extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); -extern int vfio_add_group_dev(struct device *dev, - const struct vfio_device_ops *ops, - void *device_data); - -extern void *vfio_del_group_dev(struct device *dev); +void vfio_init_group_dev(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); +int vfio_register_group_dev(struct vfio_device *device); +void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); -extern void *vfio_device_data(struct vfio_device *device); /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 8ce36c1d53ca..34b1f53a3901 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -333,17 +333,10 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) /* 10de vendor PCI sub-types */ -/* - * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space. 
- */ -#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1) +/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */ /* 1014 vendor PCI sub-types */ -/* - * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU - * to do TLB invalidation on a GPU. - */ -#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) +/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */ /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) @@ -637,32 +630,9 @@ struct vfio_device_migration_info { */ #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 -/* - * Capability with compressed real address (aka SSA - small system address) - * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing - * and by the userspace to associate a NVLink bridge with a GPU. - */ -#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT 4 - -struct vfio_region_info_cap_nvlink2_ssatgt { - struct vfio_info_cap_header header; - __u64 tgt; -}; +/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */ -/* - * Capability with an NVLink link speed. The value is read by - * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed" - * property in the device tree. The value is fixed in the hardware - * and failing to provide the correct value results in the link - * not working with no indication from the driver why. - */ -#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5 - -struct vfio_region_info_cap_nvlink2_lnkspd { - struct vfio_info_cap_header header; - __u32 link_speed; - __u32 __pad; -}; +/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */ /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, |
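As a quick orientation for readers of this diff, below is a minimal sketch (not part of the patch) of how a bus driver would use the registration interface added here: the driver embeds struct vfio_device in its own state, pre-configures it with vfio_init_group_dev(), registers it with vfio_register_group_dev() once the device is fully ready, and recovers its private data in the ops callbacks via container_of(). All vfio_foo_* names, the "base" field, and the generic probe/remove prototypes are hypothetical; only the vfio_* calls and the struct vfio_device_ops layout come from the interfaces shown in the hunks above.

    #include <linux/device.h>
    #include <linux/iommu.h>
    #include <linux/slab.h>
    #include <linux/vfio.h>

    /* Hypothetical driver state; struct vfio_device is embedded, not allocated by the core. */
    struct vfio_foo_device {
            struct vfio_device vdev;
            void __iomem *base;     /* illustrative private state */
    };

    static int vfio_foo_open(struct vfio_device *core_vdev)
    {
            /* Private data is recovered with container_of(), there is no device_data. */
            struct vfio_foo_device *fdev =
                    container_of(core_vdev, struct vfio_foo_device, vdev);

            return fdev->base ? 0 : -ENXIO;
    }

    static void vfio_foo_release(struct vfio_device *core_vdev)
    {
    }

    static const struct vfio_device_ops vfio_foo_ops = {
            .name    = "vfio-foo",
            .open    = vfio_foo_open,
            .release = vfio_foo_release,
            /* .read/.write/.ioctl/.mmap omitted for brevity */
    };

    static int vfio_foo_probe(struct device *dev)
    {
            struct vfio_foo_device *fdev;
            struct iommu_group *group;
            int ret;

            group = vfio_iommu_group_get(dev);
            if (!group)
                    return -EINVAL;

            fdev = kzalloc(sizeof(*fdev), GFP_KERNEL);
            if (!fdev) {
                    ret = -ENOMEM;
                    goto out_group;
            }

            vfio_init_group_dev(&fdev->vdev, dev, &vfio_foo_ops);

            /* The device must be fully usable before registration returns. */
            ret = vfio_register_group_dev(&fdev->vdev);
            if (ret)
                    goto out_free;

            dev_set_drvdata(dev, fdev);
            return 0;

    out_free:
            kfree(fdev);
    out_group:
            vfio_iommu_group_put(group, dev);
            return ret;
    }

    static void vfio_foo_remove(struct device *dev)
    {
            struct vfio_foo_device *fdev = dev_get_drvdata(dev);

            /* Blocks on the embedded completion until the last reference is dropped. */
            vfio_unregister_group_dev(&fdev->vdev);
            vfio_iommu_group_put(dev->iommu_group, dev);
            kfree(fdev);
    }

Compared to the old flow, the core no longer allocates a vfio_device, keeps a device_data pointer, or calls dev_set_drvdata(); the driver owns its drvdata, and vfio_unregister_group_dev() waits on the per-device completion until vfio_device_put() releases the final reference, replacing the old kref plus global release_q wait queue.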