From 5ffd229c02731a91d08ca21e76b503c5bbb5c095 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 21 May 2013 13:33:10 +1000
Subject: powerpc/vfio: Implement IOMMU driver for VFIO

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling.  This implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWER
guest).

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/vfio/Kconfig                |   6 +
 drivers/vfio/Makefile               |   1 +
 drivers/vfio/vfio.c                 |   1 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 377 ++++++++++++++++++++++++++++++++++++
 4 files changed, 385 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

(limited to 'drivers')

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec0abd1..b464687f6e14 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a0e38b..72bfabc8629e 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6d78736563de..259ad282ae5d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1415,6 +1415,7 @@ static int __init vfio_init(void)
 	 * drivers.
 	 */
 	request_module_nowait("vfio_iommu_type1");
+	request_module_nowait("vfio_iommu_spapr_tce");
 
 	return 0;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 000000000000..bdae7a04af75
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,377 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2013 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+	bool enabled;
+};
+
+static int tce_iommu_enable(struct tce_container *container)
+{
+	int ret = 0;
+	unsigned long locked, lock_limit, npages;
+	struct iommu_table *tbl = container->tbl;
+
+	if (!container->tbl)
+		return -ENXIO;
+
+	if (!current->mm)
+		return -ESRCH; /* process exited */
+
+	if (container->enabled)
+		return -EBUSY;
+
+	/*
+	 * When userspace pages are mapped into the IOMMU, they are effectively
+	 * locked memory, so, theoretically, we need to update the accounting
+	 * of locked pages on each map and unmap.  For powerpc, the map unmap
+	 * paths can be very hot, though, and the accounting would kill
+	 * performance, especially since it would be difficult to impossible
+	 * to handle the accounting in real mode only.
+	 *
+	 * To address that, rather than precisely accounting every page, we
+	 * instead account for a worst case on locked memory when the iommu is
+	 * enabled and disabled.  The worst case upper bound on locked memory
+	 * is the size of the whole iommu window, which is usually relatively
+	 * small (compared to total memory sizes) on POWER hardware.
+	 *
+	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+	 * that would effectively kill the guest at random points, much better
+	 * enforcing the limit based on the max that the guest can map.
+	 */
+	down_write(&current->mm->mmap_sem);
+	npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	locked = current->mm->locked_vm + npages;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+				rlimit(RLIMIT_MEMLOCK));
+		ret = -ENOMEM;
+	} else {
+
+		current->mm->locked_vm += npages;
+		container->enabled = true;
+	}
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static void tce_iommu_disable(struct tce_container *container)
+{
+	if (!container->enabled)
+		return;
+
+	container->enabled = false;
+
+	if (!container->tbl || !current->mm)
+		return;
+
+	down_write(&current->mm->mmap_sem);
+	current->mm->locked_vm -= (container->tbl->it_size <<
+			IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	up_write(&current->mm->mmap_sem);
+}
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	tce_iommu_disable(container);
+
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		struct vfio_iommu_type1_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		unsigned long tce, i;
+
+		if (!tbl)
+			return -ENXIO;
+
+		BUG_ON(!tbl->it_group);
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE))
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		/* iova is checked by the IOMMU API */
+		tce = param.vaddr;
+		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			tce |= TCE_PCI_READ;
+		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			tce |= TCE_PCI_WRITE;
+
+		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) {
+			ret = iommu_put_tce_user_mode(tbl,
+					(param.iova >> IOMMU_PAGE_SHIFT) + i,
+					tce);
+			if (ret)
+				break;
+			tce += IOMMU_PAGE_SIZE;
+		}
+		if (ret)
+			iommu_clear_tces_and_put_pages(tbl,
+					param.iova >> IOMMU_PAGE_SHIFT,	i);
+
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		struct vfio_iommu_type1_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		if (param.size & ~IOMMU_PAGE_MASK)
+			return -EINVAL;
+
+		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
+				param.size >> IOMMU_PAGE_SHIFT);
+		if (ret)
+			return ret;
+
+		ret = iommu_clear_tces_and_put_pages(tbl,
+				param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_ENABLE:
+		mutex_lock(&container->lock);
+		ret = tce_iommu_enable(container);
+		mutex_unlock(&container->lock);
+		return ret;
+
+
+	case VFIO_IOMMU_DISABLE:
+		mutex_lock(&container->lock);
+		tce_iommu_disable(container);
+		mutex_unlock(&container->lock);
+		return 0;
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	int ret;
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+
+	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group); */
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		ret = -EBUSY;
+	} else if (container->enabled) {
+		pr_err("tce_vfio: attaching group #%u to enabled container\n",
+				iommu_group_id(iommu_group));
+		ret = -EBUSY;
+	} else {
+		ret = iommu_take_ownership(tbl);
+		if (!ret)
+			container->tbl = tbl;
+	}
+
+	mutex_unlock(&container->lock);
+
+	return ret;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+		if (container->enabled) {
+			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+					iommu_group_id(tbl->it_group));
+			tce_iommu_disable(container);
+		}
+
+		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group); */
+		container->tbl = NULL;
+		iommu_release_ownership(tbl);
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
-- 
cgit v1.2.3