128 files changed, 18449 insertions, 1982 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 46b4a8e0f859..5a89e409ad18 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -188,4 +188,10 @@ source "drivers/nvdimm/Kconfig"
 
 source "drivers/nvmem/Kconfig"
 
+source "drivers/hwtracing/stm/Kconfig"
+
+source "drivers/hwtracing/intel_th/Kconfig"
+
+source "drivers/fpga/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index b250b36b54f2..7064bf476c2a 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -165,5 +165,8 @@ obj-$(CONFIG_PERF_EVENTS)	+= perf/
 obj-$(CONFIG_RAS)		+= ras/
 obj-$(CONFIG_THUNDERBOLT)	+= thunderbolt/
 obj-$(CONFIG_CORESIGHT)		+= hwtracing/coresight/
+obj-y				+= hwtracing/intel_th/
+obj-$(CONFIG_STM)		+= hwtracing/stm/
 obj-$(CONFIG_ANDROID)		+= android/
 obj-$(CONFIG_NVMEM)		+= nvmem/
+obj-$(CONFIG_FPGA)		+= fpga/
diff --git a/drivers/char/efirtc.c b/drivers/char/efirtc.c
index e39e7402e623..dc62568b7dde 100644
--- a/drivers/char/efirtc.c
+++ b/drivers/char/efirtc.c
@@ -30,7 +30,6 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/miscdevice.h>
-#include <linux/module.h>
 #include <linux/init.h>
 #include <linux/rtc.h>
 #include <linux/proc_fs.h>
@@ -395,14 +394,8 @@ efi_rtc_init(void)
 	}
 	return 0;
 }
+device_initcall(efi_rtc_init);
 
-static void __exit
-efi_rtc_exit(void)
-{
-	/* not yet used */
-}
-
-module_init(efi_rtc_init);
-module_exit(efi_rtc_exit);
-
+/*
 MODULE_LICENSE("GPL");
+*/
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 5c0baa9ffc64..240b6cf1d97c 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -12,7 +12,6 @@
  */
 
 #include <linux/interrupt.h>
-#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/miscdevice.h>
@@ -1043,24 +1042,16 @@ static int hpet_acpi_add(struct acpi_device *device)
 	return hpet_alloc(&data);
 }
 
-static int hpet_acpi_remove(struct acpi_device *device)
-{
-	/* XXX need to unregister clocksource, dealloc mem, etc */
-	return -EINVAL;
-}
-
 static const struct acpi_device_id hpet_device_ids[] = {
 	{"PNP0103", 0},
 	{"", 0},
 };
-MODULE_DEVICE_TABLE(acpi, hpet_device_ids);
 
 static struct acpi_driver hpet_acpi_driver = {
 	.name = "hpet",
 	.ids = hpet_device_ids,
 	.ops = {
 		.add = hpet_acpi_add,
-		.remove = hpet_acpi_remove,
 		},
 };
 
@@ -1086,19 +1077,9 @@ static int __init hpet_init(void)
 
 	return 0;
 }
+device_initcall(hpet_init);
 
-static void __exit hpet_exit(void)
-{
-	acpi_bus_unregister_driver(&hpet_acpi_driver);
-
-	if (sysctl_header)
-		unregister_sysctl_table(sysctl_header);
-	misc_deregister(&hpet_misc);
-
-	return;
-}
-
-module_init(hpet_init);
-module_exit(hpet_exit);
+/*
 MODULE_AUTHOR("Bob Picco <Robert.Picco@hp.com>");
 MODULE_LICENSE("GPL");
+*/
diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c
index 8a80ead8d316..94006f9c2e43 100644
--- a/drivers/char/snsc.c
+++ b/drivers/char/snsc.c
@@ -19,7 +19,7 @@
 #include <linux/sched.h>
 #include <linux/device.h>
 #include <linux/poll.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <asm/sn/io.h>
@@ -461,5 +461,4 @@ scdrv_init(void)
 	}
 	return 0;
 }
-
-module_init(scdrv_init);
+device_initcall(scdrv_init);
diff --git a/drivers/dma/mic_x100_dma.c b/drivers/dma/mic_x100_dma.c
index 74d9db05a5ad..068e920ecb68 100644
--- a/drivers/dma/mic_x100_dma.c
+++ b/drivers/dma/mic_x100_dma.c
@@ -193,8 +193,16 @@ static void mic_dma_prog_intr(struct mic_dma_chan *ch)
 static int mic_dma_do_dma(struct mic_dma_chan *ch, int flags, dma_addr_t src,
 			  dma_addr_t dst, size_t len)
 {
-	if (-ENOMEM == mic_dma_prog_memcpy_desc(ch, src, dst, len))
+	if (len && -ENOMEM == mic_dma_prog_memcpy_desc(ch, src, dst, len)) {
 		return -ENOMEM;
+	} else {
+		/* 3 is the maximum number of status descriptors */
+		int ret = mic_dma_avail_desc_ring_space(ch, 3);
+
+		if (ret < 0)
+			return ret;
+	}
+
 	/* Above mic_dma_prog_memcpy_desc() makes sure we have enough space */
 	if (flags & DMA_PREP_FENCE) {
 		mic_dma_prep_status_desc(&ch->desc_ring[ch->head], 0,
@@ -270,6 +278,33 @@ allocate_tx(struct mic_dma_chan *ch)
 	return tx;
 }
 
+/* Program a status descriptor with dst as address and value to be written */
+static struct dma_async_tx_descriptor *
+mic_dma_prep_status_lock(struct dma_chan *ch, dma_addr_t dst, u64 src_val,
+			 unsigned long flags)
+{
+	struct mic_dma_chan *mic_ch = to_mic_dma_chan(ch);
+	int result;
+
+	spin_lock(&mic_ch->prep_lock);
+	result = mic_dma_avail_desc_ring_space(mic_ch, 4);
+	if (result < 0)
+		goto error;
+	mic_dma_prep_status_desc(&mic_ch->desc_ring[mic_ch->head], src_val, dst,
+				 false);
+	mic_dma_hw_ring_inc_head(mic_ch);
+	result = mic_dma_do_dma(mic_ch, flags, 0, 0, 0);
+	if (result < 0)
+		goto error;
+
+	return allocate_tx(mic_ch);
+error:
+	dev_err(mic_dma_ch_to_device(mic_ch),
+		"Error enqueueing dma status descriptor, error=%d\n", result);
+	spin_unlock(&mic_ch->prep_lock);
+	return NULL;
+}
+
 /*
  * Prepare a memcpy descriptor to be added to the ring.
  * Note that the temporary descriptor adds an extra overhead of copying the
@@ -587,6 +622,8 @@ static int mic_dma_register_dma_device(struct mic_dma_device *mic_dma_dev,
 		mic_dma_free_chan_resources;
 	mic_dma_dev->dma_dev.device_tx_status = mic_dma_tx_status;
 	mic_dma_dev->dma_dev.device_prep_dma_memcpy = mic_dma_prep_memcpy_lock;
+	mic_dma_dev->dma_dev.device_prep_dma_imm_data =
+		mic_dma_prep_status_lock;
 	mic_dma_dev->dma_dev.device_prep_dma_interrupt =
 		mic_dma_prep_interrupt_lock;
 	mic_dma_dev->dma_dev.device_issue_pending = mic_dma_issue_pending;
diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig
new file mode 100644
index 000000000000..dfc1f1ec093b
--- /dev/null
+++ b/drivers/fpga/Kconfig
@@ -0,0 +1,24 @@
+#
+# FPGA framework configuration
+#
+
+menu "FPGA Configuration Support"
+
+config FPGA
+	tristate "FPGA Configuration Framework"
+	help
+	  Say Y here if you want support for configuring FPGAs from the
+	  kernel.  The FPGA framework adds a FPGA manager class and FPGA
+	  manager drivers.
+
+if FPGA
+
+config FPGA_MGR_SOCFPGA
+	tristate "Altera SOCFPGA FPGA Manager"
+	depends on ARCH_SOCFPGA
+	help
+	  FPGA manager driver support for Altera SOCFPGA.
+
+endif # FPGA
+
+endmenu
diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
new file mode 100644
index 000000000000..ba6c5c597a4d
--- /dev/null
+++ b/drivers/fpga/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the fpga framework and fpga manager drivers.
+#
+
+# Core FPGA Manager Framework
+obj-$(CONFIG_FPGA)			+= fpga-mgr.o
+
+# FPGA Manager Drivers
+obj-$(CONFIG_FPGA_MGR_SOCFPGA)		+= socfpga.o
diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c
new file mode 100644
index 000000000000..25261636687c
--- /dev/null
+++ b/drivers/fpga/fpga-mgr.c
@@ -0,0 +1,382 @@
+/*
+ * FPGA Manager Core
+ *
+ *  Copyright (C) 2013-2015 Altera Corporation
+ *
+ * With code from the mailing list:
+ * Copyright (C) 2013 Xilinx, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/firmware.h>
+#include <linux/fpga/fpga-mgr.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+static DEFINE_IDA(fpga_mgr_ida);
+static struct class *fpga_mgr_class;
+
+/**
+ * fpga_mgr_buf_load - load fpga from image in buffer
+ * @mgr:	fpga manager
+ * @flags:	flags setting fpga confuration modes
+ * @buf:	buffer contain fpga image
+ * @count:	byte count of buf
+ *
+ * Step the low level fpga manager through the device-specific steps of getting
+ * an FPGA ready to be configured, writing the image to it, then doing whatever
+ * post-configuration steps necessary.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int fpga_mgr_buf_load(struct fpga_manager *mgr, u32 flags, const char *buf,
+		      size_t count)
+{
+	struct device *dev = &mgr->dev;
+	int ret;
+
+	if (!mgr)
+		return -ENODEV;
+
+	/*
+	 * Call the low level driver's write_init function.  This will do the
+	 * device-specific things to get the FPGA into the state where it is
+	 * ready to receive an FPGA image.
+	 */
+	mgr->state = FPGA_MGR_STATE_WRITE_INIT;
+	ret = mgr->mops->write_init(mgr, flags, buf, count);
+	if (ret) {
+		dev_err(dev, "Error preparing FPGA for writing\n");
+		mgr->state = FPGA_MGR_STATE_WRITE_INIT_ERR;
+		return ret;
+	}
+
+	/*
+	 * Write the FPGA image to the FPGA.
+	 */
+	mgr->state = FPGA_MGR_STATE_WRITE;
+	ret = mgr->mops->write(mgr, buf, count);
+	if (ret) {
+		dev_err(dev, "Error while writing image data to FPGA\n");
+		mgr->state = FPGA_MGR_STATE_WRITE_ERR;
+		return ret;
+	}
+
+	/*
+	 * After all the FPGA image has been written, do the device specific
+	 * steps to finish and set the FPGA into operating mode.
+	 */
+	mgr->state = FPGA_MGR_STATE_WRITE_COMPLETE;
+	ret = mgr->mops->write_complete(mgr, flags);
+	if (ret) {
+		dev_err(dev, "Error after writing image data to FPGA\n");
+		mgr->state = FPGA_MGR_STATE_WRITE_COMPLETE_ERR;
+		return ret;
+	}
+	mgr->state = FPGA_MGR_STATE_OPERATING;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fpga_mgr_buf_load);
+
+/**
+ * fpga_mgr_firmware_load - request firmware and load to fpga
+ * @mgr:	fpga manager
+ * @flags:	flags setting fpga confuration modes
+ * @image_name:	name of image file on the firmware search path
+ *
+ * Request an FPGA image using the firmware class, then write out to the FPGA.
+ * Update the state before each step to provide info on what step failed if
+ * there is a failure.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int fpga_mgr_firmware_load(struct fpga_manager *mgr, u32 flags,
+			   const char *image_name)
+{
+	struct device *dev = &mgr->dev;
+	const struct firmware *fw;
+	int ret;
+
+	if (!mgr)
+		return -ENODEV;
+
+	dev_info(dev, "writing %s to %s\n", image_name, mgr->name);
+
+	mgr->state = FPGA_MGR_STATE_FIRMWARE_REQ;
+
+	ret = request_firmware(&fw, image_name, dev);
+	if (ret) {
+		mgr->state = FPGA_MGR_STATE_FIRMWARE_REQ_ERR;
+		dev_err(dev, "Error requesting firmware %s\n", image_name);
+		return ret;
+	}
+
+	ret = fpga_mgr_buf_load(mgr, flags, fw->data, fw->size);
+	if (ret)
+		return ret;
+
+	release_firmware(fw);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fpga_mgr_firmware_load);
+
+static const char * const state_str[] = {
+	[FPGA_MGR_STATE_UNKNOWN] =		"unknown",
+	[FPGA_MGR_STATE_POWER_OFF] =		"power off",
+	[FPGA_MGR_STATE_POWER_UP] =		"power up",
+	[FPGA_MGR_STATE_RESET] =		"reset",
+
+	/* requesting FPGA image from firmware */
+	[FPGA_MGR_STATE_FIRMWARE_REQ] =		"firmware request",
+	[FPGA_MGR_STATE_FIRMWARE_REQ_ERR] =	"firmware request error",
+
+	/* Preparing FPGA to receive image */
+	[FPGA_MGR_STATE_WRITE_INIT] =		"write init",
+	[FPGA_MGR_STATE_WRITE_INIT_ERR] =	"write init error",
+
+	/* Writing image to FPGA */
+	[FPGA_MGR_STATE_WRITE] =		"write",
+	[FPGA_MGR_STATE_WRITE_ERR] =		"write error",
+
+	/* Finishing configuration after image has been written */
+	[FPGA_MGR_STATE_WRITE_COMPLETE] =	"write complete",
+	[FPGA_MGR_STATE_WRITE_COMPLETE_ERR] =	"write complete error",
+
+	/* FPGA reports to be in normal operating mode */
+	[FPGA_MGR_STATE_OPERATING] =		"operating",
+};
+
+static ssize_t name_show(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct fpga_manager *mgr = to_fpga_manager(dev);
+
+	return sprintf(buf, "%s\n", mgr->name);
+}
+
+static ssize_t state_show(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	struct fpga_manager *mgr = to_fpga_manager(dev);
+
+	return sprintf(buf, "%s\n", state_str[mgr->state]);
+}
+
+static DEVICE_ATTR_RO(name);
+static DEVICE_ATTR_RO(state);
+
+static struct attribute *fpga_mgr_attrs[] = {
+	&dev_attr_name.attr,
+	&dev_attr_state.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(fpga_mgr);
+
+static int fpga_mgr_of_node_match(struct device *dev, const void *data)
+{
+	return dev->of_node == data;
+}
+
+/**
+ * of_fpga_mgr_get - get an exclusive reference to a fpga mgr
+ * @node:	device node
+ *
+ * Given a device node, get an exclusive reference to a fpga mgr.
+ *
+ * Return: fpga manager struct or IS_ERR() condition containing error code.
+ */
+struct fpga_manager *of_fpga_mgr_get(struct device_node *node)
+{
+	struct fpga_manager *mgr;
+	struct device *dev;
+
+	if (!node)
+		return ERR_PTR(-EINVAL);
+
+	dev = class_find_device(fpga_mgr_class, NULL, node,
+				fpga_mgr_of_node_match);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	mgr = to_fpga_manager(dev);
+	put_device(dev);
+	if (!mgr)
+		return ERR_PTR(-ENODEV);
+
+	/* Get exclusive use of fpga manager */
+	if (!mutex_trylock(&mgr->ref_mutex))
+		return ERR_PTR(-EBUSY);
+
+	if (!try_module_get(THIS_MODULE)) {
+		mutex_unlock(&mgr->ref_mutex);
+		return ERR_PTR(-ENODEV);
+	}
+
+	return mgr;
+}
+EXPORT_SYMBOL_GPL(of_fpga_mgr_get);
+
+/**
+ * fpga_mgr_put - release a reference to a fpga manager
+ * @mgr:	fpga manager structure
+ */
+void fpga_mgr_put(struct fpga_manager *mgr)
+{
+	if (mgr) {
+		module_put(THIS_MODULE);
+		mutex_unlock(&mgr->ref_mutex);
+	}
+}
+EXPORT_SYMBOL_GPL(fpga_mgr_put);
+
+/**
+ * fpga_mgr_register - register a low level fpga manager driver
+ * @dev:	fpga manager device from pdev
+ * @name:	fpga manager name
+ * @mops:	pointer to structure of fpga manager ops
+ * @priv:	fpga manager private data
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int fpga_mgr_register(struct device *dev, const char *name,
+		      const struct fpga_manager_ops *mops,
+		      void *priv)
+{
+	struct fpga_manager *mgr;
+	const char *dt_label;
+	int id, ret;
+
+	if (!mops || !mops->write_init || !mops->write ||
+	    !mops->write_complete || !mops->state) {
+		dev_err(dev, "Attempt to register without fpga_manager_ops\n");
+		return -EINVAL;
+	}
+
+	if (!name || !strlen(name)) {
+		dev_err(dev, "Attempt to register with no name!\n");
+		return -EINVAL;
+	}
+
+	mgr = kzalloc(sizeof(*mgr), GFP_KERNEL);
+	if (!mgr)
+		return -ENOMEM;
+
+	id = ida_simple_get(&fpga_mgr_ida, 0, 0, GFP_KERNEL);
+	if (id < 0) {
+		ret = id;
+		goto error_kfree;
+	}
+
+	mutex_init(&mgr->ref_mutex);
+
+	mgr->name = name;
+	mgr->mops = mops;
+	mgr->priv = priv;
+
+	/*
+	 * Initialize framework state by requesting low level driver read state
+	 * from device.  FPGA may be in reset mode or may have been programmed
+	 * by bootloader or EEPROM.
+	 */
+	mgr->state = mgr->mops->state(mgr);
+
+	device_initialize(&mgr->dev);
+	mgr->dev.class = fpga_mgr_class;
+	mgr->dev.parent = dev;
+	mgr->dev.of_node = dev->of_node;
+	mgr->dev.id = id;
+	dev_set_drvdata(dev, mgr);
+
+	dt_label = of_get_property(mgr->dev.of_node, "label", NULL);
+	if (dt_label)
+		ret = dev_set_name(&mgr->dev, "%s", dt_label);
+	else
+		ret = dev_set_name(&mgr->dev, "fpga%d", id);
+
+	ret = device_add(&mgr->dev);
+	if (ret)
+		goto error_device;
+
+	dev_info(&mgr->dev, "%s registered\n", mgr->name);
+
+	return 0;
+
+error_device:
+	ida_simple_remove(&fpga_mgr_ida, id);
+error_kfree:
+	kfree(mgr);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(fpga_mgr_register);
+
+/**
+ * fpga_mgr_unregister - unregister a low level fpga manager driver
+ * @dev:	fpga manager device from pdev
+ */
+void fpga_mgr_unregister(struct device *dev)
+{
+	struct fpga_manager *mgr = dev_get_drvdata(dev);
+
+	dev_info(&mgr->dev, "%s %s\n", __func__, mgr->name);
+
+	/*
+	 * If the low level driver provides a method for putting fpga into
+	 * a desired state upon unregister, do it.
+	 */
+	if (mgr->mops->fpga_remove)
+		mgr->mops->fpga_remove(mgr);
+
+	device_unregister(&mgr->dev);
+}
+EXPORT_SYMBOL_GPL(fpga_mgr_unregister);
+
+static void fpga_mgr_dev_release(struct device *dev)
+{
+	struct fpga_manager *mgr = to_fpga_manager(dev);
+
+	ida_simple_remove(&fpga_mgr_ida, mgr->dev.id);
+	kfree(mgr);
+}
+
+static int __init fpga_mgr_class_init(void)
+{
+	pr_info("FPGA manager framework\n");
+
+	fpga_mgr_class = class_create(THIS_MODULE, "fpga_manager");
+	if (IS_ERR(fpga_mgr_class))
+		return PTR_ERR(fpga_mgr_class);
+
+	fpga_mgr_class->dev_groups = fpga_mgr_groups;
+	fpga_mgr_class->dev_release = fpga_mgr_dev_release;
+
+	return 0;
+}
+
+static void __exit fpga_mgr_class_exit(void)
+{
+	class_destroy(fpga_mgr_class);
+	ida_destroy(&fpga_mgr_ida);
+}
+
+MODULE_AUTHOR("Alan Tull <atull@opensource.altera.com>");
+MODULE_DESCRIPTION("FPGA manager framework");
+MODULE_LICENSE("GPL v2");
+
+subsys_initcall(fpga_mgr_class_init);
+module_exit(fpga_mgr_class_exit);
diff --git a/drivers/fpga/socfpga.c b/drivers/fpga/socfpga.c
new file mode 100644
index 000000000000..706b80d25266
--- /dev/null
+++ b/drivers/fpga/socfpga.c
@@ -0,0 +1,616 @@
+/*
+ * FPGA Manager Driver for Altera SOCFPGA
+ *
+ *  Copyright (C) 2013-2015 Altera Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/fpga/fpga-mgr.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/pm.h>
+
+/* Register offsets */
+#define SOCFPGA_FPGMGR_STAT_OFST				0x0
+#define SOCFPGA_FPGMGR_CTL_OFST					0x4
+#define SOCFPGA_FPGMGR_DCLKCNT_OFST				0x8
+#define SOCFPGA_FPGMGR_DCLKSTAT_OFST				0xc
+#define SOCFPGA_FPGMGR_GPIO_INTEN_OFST				0x830
+#define SOCFPGA_FPGMGR_GPIO_INTMSK_OFST				0x834
+#define SOCFPGA_FPGMGR_GPIO_INTTYPE_LEVEL_OFST			0x838
+#define SOCFPGA_FPGMGR_GPIO_INT_POL_OFST			0x83c
+#define SOCFPGA_FPGMGR_GPIO_INTSTAT_OFST			0x840
+#define SOCFPGA_FPGMGR_GPIO_RAW_INTSTAT_OFST			0x844
+#define SOCFPGA_FPGMGR_GPIO_PORTA_EOI_OFST			0x84c
+#define SOCFPGA_FPGMGR_GPIO_EXT_PORTA_OFST			0x850
+
+/* Register bit defines */
+/* SOCFPGA_FPGMGR_STAT register mode field values */
+#define SOCFPGA_FPGMGR_STAT_POWER_UP				0x0 /*ramping*/
+#define SOCFPGA_FPGMGR_STAT_RESET				0x1
+#define SOCFPGA_FPGMGR_STAT_CFG					0x2
+#define SOCFPGA_FPGMGR_STAT_INIT				0x3
+#define SOCFPGA_FPGMGR_STAT_USER_MODE				0x4
+#define SOCFPGA_FPGMGR_STAT_UNKNOWN				0x5
+#define SOCFPGA_FPGMGR_STAT_STATE_MASK				0x7
+/* This is a flag value that doesn't really happen in this register field */
+#define SOCFPGA_FPGMGR_STAT_POWER_OFF				0x0
+
+#define MSEL_PP16_FAST_NOAES_NODC				0x0
+#define MSEL_PP16_FAST_AES_NODC					0x1
+#define MSEL_PP16_FAST_AESOPT_DC				0x2
+#define MSEL_PP16_SLOW_NOAES_NODC				0x4
+#define MSEL_PP16_SLOW_AES_NODC					0x5
+#define MSEL_PP16_SLOW_AESOPT_DC				0x6
+#define MSEL_PP32_FAST_NOAES_NODC				0x8
+#define MSEL_PP32_FAST_AES_NODC					0x9
+#define MSEL_PP32_FAST_AESOPT_DC				0xa
+#define MSEL_PP32_SLOW_NOAES_NODC				0xc
+#define MSEL_PP32_SLOW_AES_NODC					0xd
+#define MSEL_PP32_SLOW_AESOPT_DC				0xe
+#define SOCFPGA_FPGMGR_STAT_MSEL_MASK				0x000000f8
+#define SOCFPGA_FPGMGR_STAT_MSEL_SHIFT				3
+
+/* SOCFPGA_FPGMGR_CTL register */
+#define SOCFPGA_FPGMGR_CTL_EN					0x00000001
+#define SOCFPGA_FPGMGR_CTL_NCE					0x00000002
+#define SOCFPGA_FPGMGR_CTL_NCFGPULL				0x00000004
+
+#define CDRATIO_X1						0x00000000
+#define CDRATIO_X2						0x00000040
+#define CDRATIO_X4						0x00000080
+#define CDRATIO_X8						0x000000c0
+#define SOCFPGA_FPGMGR_CTL_CDRATIO_MASK				0x000000c0
+
+#define SOCFPGA_FPGMGR_CTL_AXICFGEN				0x00000100
+
+#define CFGWDTH_16						0x00000000
+#define CFGWDTH_32						0x00000200
+#define SOCFPGA_FPGMGR_CTL_CFGWDTH_MASK				0x00000200
+
+/* SOCFPGA_FPGMGR_DCLKSTAT register */
+#define SOCFPGA_FPGMGR_DCLKSTAT_DCNTDONE_E_DONE			0x1
+
+/* SOCFPGA_FPGMGR_GPIO_* registers share the same bit positions */
+#define SOCFPGA_FPGMGR_MON_NSTATUS				0x0001
+#define SOCFPGA_FPGMGR_MON_CONF_DONE				0x0002
+#define SOCFPGA_FPGMGR_MON_INIT_DONE				0x0004
+#define SOCFPGA_FPGMGR_MON_CRC_ERROR				0x0008
+#define SOCFPGA_FPGMGR_MON_CVP_CONF_DONE			0x0010
+#define SOCFPGA_FPGMGR_MON_PR_READY				0x0020
+#define SOCFPGA_FPGMGR_MON_PR_ERROR				0x0040
+#define SOCFPGA_FPGMGR_MON_PR_DONE				0x0080
+#define SOCFPGA_FPGMGR_MON_NCONFIG_PIN				0x0100
+#define SOCFPGA_FPGMGR_MON_NSTATUS_PIN				0x0200
+#define SOCFPGA_FPGMGR_MON_CONF_DONE_PIN			0x0400
+#define SOCFPGA_FPGMGR_MON_FPGA_POWER_ON			0x0800
+#define SOCFPGA_FPGMGR_MON_STATUS_MASK				0x0fff
+
+#define SOCFPGA_FPGMGR_NUM_SUPPLIES 3
+#define SOCFPGA_RESUME_TIMEOUT 3
+
+/* In power-up order. Reverse for power-down. */
+static const char *supply_names[SOCFPGA_FPGMGR_NUM_SUPPLIES] __maybe_unused = {
+	"FPGA-1.5V",
+	"FPGA-1.1V",
+	"FPGA-2.5V",
+};
+
+struct socfpga_fpga_priv {
+	void __iomem *fpga_base_addr;
+	void __iomem *fpga_data_addr;
+	struct completion status_complete;
+	int irq;
+};
+
+struct cfgmgr_mode {
+	/* Values to set in the CTRL register */
+	u32 ctrl;
+
+	/* flag that this table entry is a valid mode */
+	bool valid;
+};
+
+/* For SOCFPGA_FPGMGR_STAT_MSEL field */
+static struct cfgmgr_mode cfgmgr_modes[] = {
+	[MSEL_PP16_FAST_NOAES_NODC] = { CFGWDTH_16 | CDRATIO_X1, 1 },
+	[MSEL_PP16_FAST_AES_NODC] =   { CFGWDTH_16 | CDRATIO_X2, 1 },
+	[MSEL_PP16_FAST_AESOPT_DC] =  { CFGWDTH_16 | CDRATIO_X4, 1 },
+	[MSEL_PP16_SLOW_NOAES_NODC] = { CFGWDTH_16 | CDRATIO_X1, 1 },
+	[MSEL_PP16_SLOW_AES_NODC] =   { CFGWDTH_16 | CDRATIO_X2, 1 },
+	[MSEL_PP16_SLOW_AESOPT_DC] =  { CFGWDTH_16 | CDRATIO_X4, 1 },
+	[MSEL_PP32_FAST_NOAES_NODC] = { CFGWDTH_32 | CDRATIO_X1, 1 },
+	[MSEL_PP32_FAST_AES_NODC] =   { CFGWDTH_32 | CDRATIO_X4, 1 },
+	[MSEL_PP32_FAST_AESOPT_DC] =  { CFGWDTH_32 | CDRATIO_X8, 1 },
+	[MSEL_PP32_SLOW_NOAES_NODC] = { CFGWDTH_32 | CDRATIO_X1, 1 },
+	[MSEL_PP32_SLOW_AES_NODC] =   { CFGWDTH_32 | CDRATIO_X4, 1 },
+	[MSEL_PP32_SLOW_AESOPT_DC] =  { CFGWDTH_32 | CDRATIO_X8, 1 },
+};
+
+static u32 socfpga_fpga_readl(struct socfpga_fpga_priv *priv, u32 reg_offset)
+{
+	return readl(priv->fpga_base_addr + reg_offset);
+}
+
+static void socfpga_fpga_writel(struct socfpga_fpga_priv *priv, u32 reg_offset,
+				u32 value)
+{
+	writel(value, priv->fpga_base_addr + reg_offset);
+}
+
+static u32 socfpga_fpga_raw_readl(struct socfpga_fpga_priv *priv,
+				  u32 reg_offset)
+{
+	return __raw_readl(priv->fpga_base_addr + reg_offset);
+}
+
+static void socfpga_fpga_raw_writel(struct socfpga_fpga_priv *priv,
+				    u32 reg_offset, u32 value)
+{
+	__raw_writel(value, priv->fpga_base_addr + reg_offset);
+}
+
+static void socfpga_fpga_data_writel(struct socfpga_fpga_priv *priv, u32 value)
+{
+	writel(value, priv->fpga_data_addr);
+}
+
+static inline void socfpga_fpga_set_bitsl(struct socfpga_fpga_priv *priv,
+					  u32 offset, u32 bits)
+{
+	u32 val;
+
+	val = socfpga_fpga_readl(priv, offset);
+	val |= bits;
+	socfpga_fpga_writel(priv, offset, val);
+}
+
+static inline void socfpga_fpga_clr_bitsl(struct socfpga_fpga_priv *priv,
+					  u32 offset, u32 bits)
+{
+	u32 val;
+
+	val = socfpga_fpga_readl(priv, offset);
+	val &= ~bits;
+	socfpga_fpga_writel(priv, offset, val);
+}
+
+static u32 socfpga_fpga_mon_status_get(struct socfpga_fpga_priv *priv)
+{
+	return socfpga_fpga_readl(priv, SOCFPGA_FPGMGR_GPIO_EXT_PORTA_OFST) &
+		SOCFPGA_FPGMGR_MON_STATUS_MASK;
+}
+
+static u32 socfpga_fpga_state_get(struct socfpga_fpga_priv *priv)
+{
+	u32 status = socfpga_fpga_mon_status_get(priv);
+
+	if ((status & SOCFPGA_FPGMGR_MON_FPGA_POWER_ON) == 0)
+		return SOCFPGA_FPGMGR_STAT_POWER_OFF;
+
+	return socfpga_fpga_readl(priv, SOCFPGA_FPGMGR_STAT_OFST) &
+		SOCFPGA_FPGMGR_STAT_STATE_MASK;
+}
+
+static void socfpga_fpga_clear_done_status(struct socfpga_fpga_priv *priv)
+{
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_DCLKSTAT_OFST,
+			    SOCFPGA_FPGMGR_DCLKSTAT_DCNTDONE_E_DONE);
+}
+
+/*
+ * Set the DCLKCNT, wait for DCLKSTAT to report the count completed, and clear
+ * the complete status.
+ */
+static int socfpga_fpga_dclk_set_and_wait_clear(struct socfpga_fpga_priv *priv,
+						u32 count)
+{
+	int timeout = 2;
+	u32 done;
+
+	/* Clear any existing DONE status. */
+	if (socfpga_fpga_readl(priv, SOCFPGA_FPGMGR_DCLKSTAT_OFST))
+		socfpga_fpga_clear_done_status(priv);
+
+	/* Issue the DCLK count. */
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_DCLKCNT_OFST, count);
+
+	/* Poll DCLKSTAT to see if it completed in the timeout period. */
+	do {
+		done = socfpga_fpga_readl(priv, SOCFPGA_FPGMGR_DCLKSTAT_OFST);
+		if (done == SOCFPGA_FPGMGR_DCLKSTAT_DCNTDONE_E_DONE) {
+			socfpga_fpga_clear_done_status(priv);
+			return 0;
+		}
+		udelay(1);
+	} while (timeout--);
+
+	return -ETIMEDOUT;
+}
+
+static int socfpga_fpga_wait_for_state(struct socfpga_fpga_priv *priv,
+				       u32 state)
+{
+	int timeout = 2;
+
+	/*
+	 * HW doesn't support an interrupt for changes in state, so poll to see
+	 * if it matches the requested state within the timeout period.
+	 */
+	do {
+		if ((socfpga_fpga_state_get(priv) & state) != 0)
+			return 0;
+		msleep(20);
+	} while (timeout--);
+
+	return -ETIMEDOUT;
+}
+
+static void socfpga_fpga_enable_irqs(struct socfpga_fpga_priv *priv, u32 irqs)
+{
+	/* set irqs to level sensitive */
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_GPIO_INTTYPE_LEVEL_OFST, 0);
+
+	/* set interrupt polarity */
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_GPIO_INT_POL_OFST, irqs);
+
+	/* clear irqs */
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_GPIO_PORTA_EOI_OFST, irqs);
+
+	/* unmask interrupts */
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_GPIO_INTMSK_OFST, 0);
+
+	/* enable interrupts */
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_GPIO_INTEN_OFST, irqs);
+}
+
+static void socfpga_fpga_disable_irqs(struct socfpga_fpga_priv *priv)
+{
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_GPIO_INTEN_OFST, 0);
+}
+
+static irqreturn_t socfpga_fpga_isr(int irq, void *dev_id)
+{
+	struct socfpga_fpga_priv *priv = dev_id;
+	u32 irqs, st;
+	bool conf_done, nstatus;
+
+	/* clear irqs */
+	irqs = socfpga_fpga_raw_readl(priv, SOCFPGA_FPGMGR_GPIO_INTSTAT_OFST);
+
+	socfpga_fpga_raw_writel(priv, SOCFPGA_FPGMGR_GPIO_PORTA_EOI_OFST, irqs);
+
+	st = socfpga_fpga_raw_readl(priv, SOCFPGA_FPGMGR_GPIO_EXT_PORTA_OFST);
+	conf_done = (st & SOCFPGA_FPGMGR_MON_CONF_DONE) != 0;
+	nstatus = (st & SOCFPGA_FPGMGR_MON_NSTATUS) != 0;
+
+	/* success */
+	if (conf_done && nstatus) {
+		/* disable irqs */
+		socfpga_fpga_raw_writel(priv,
+					SOCFPGA_FPGMGR_GPIO_INTEN_OFST, 0);
+		complete(&priv->status_complete);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int socfpga_fpga_wait_for_config_done(struct socfpga_fpga_priv *priv)
+{
+	int timeout, ret = 0;
+
+	socfpga_fpga_disable_irqs(priv);
+	init_completion(&priv->status_complete);
+	socfpga_fpga_enable_irqs(priv, SOCFPGA_FPGMGR_MON_CONF_DONE);
+
+	timeout = wait_for_completion_interruptible_timeout(
+						&priv->status_complete,
+						msecs_to_jiffies(10));
+	if (timeout == 0)
+		ret = -ETIMEDOUT;
+
+	socfpga_fpga_disable_irqs(priv);
+	return ret;
+}
+
+static int socfpga_fpga_cfg_mode_get(struct socfpga_fpga_priv *priv)
+{
+	u32 msel;
+
+	msel = socfpga_fpga_readl(priv, SOCFPGA_FPGMGR_STAT_OFST);
+	msel &= SOCFPGA_FPGMGR_STAT_MSEL_MASK;
+	msel >>= SOCFPGA_FPGMGR_STAT_MSEL_SHIFT;
+
+	/* Check that this MSEL setting is supported */
+	if ((msel >= ARRAY_SIZE(cfgmgr_modes)) || !cfgmgr_modes[msel].valid)
+		return -EINVAL;
+
+	return msel;
+}
+
+static int socfpga_fpga_cfg_mode_set(struct socfpga_fpga_priv *priv)
+{
+	u32 ctrl_reg;
+	int mode;
+
+	/* get value from MSEL pins */
+	mode = socfpga_fpga_cfg_mode_get(priv);
+	if (mode < 0)
+		return mode;
+
+	/* Adjust CTRL for the CDRATIO */
+	ctrl_reg = socfpga_fpga_readl(priv, SOCFPGA_FPGMGR_CTL_OFST);
+	ctrl_reg &= ~SOCFPGA_FPGMGR_CTL_CDRATIO_MASK;
+	ctrl_reg &= ~SOCFPGA_FPGMGR_CTL_CFGWDTH_MASK;
+	ctrl_reg |= cfgmgr_modes[mode].ctrl;
+
+	/* Set NCE to 0. */
+	ctrl_reg &= ~SOCFPGA_FPGMGR_CTL_NCE;
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_CTL_OFST, ctrl_reg);
+
+	return 0;
+}
+
+static int socfpga_fpga_reset(struct fpga_manager *mgr)
+{
+	struct socfpga_fpga_priv *priv = mgr->priv;
+	u32 ctrl_reg, status;
+	int ret;
+
+	/*
+	 * Step 1:
+	 *  - Set CTRL.CFGWDTH, CTRL.CDRATIO to match cfg mode
+	 *  - Set CTRL.NCE to 0
+	 */
+	ret = socfpga_fpga_cfg_mode_set(priv);
+	if (ret)
+		return ret;
+
+	/* Step 2: Set CTRL.EN to 1 */
+	socfpga_fpga_set_bitsl(priv, SOCFPGA_FPGMGR_CTL_OFST,
+			       SOCFPGA_FPGMGR_CTL_EN);
+
+	/* Step 3: Set CTRL.NCONFIGPULL to 1 to put FPGA in reset */
+	ctrl_reg = socfpga_fpga_readl(priv, SOCFPGA_FPGMGR_CTL_OFST);
+	ctrl_reg |= SOCFPGA_FPGMGR_CTL_NCFGPULL;
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_CTL_OFST, ctrl_reg);
+
+	/* Step 4: Wait for STATUS.MODE to report FPGA is in reset phase */
+	status = socfpga_fpga_wait_for_state(priv, SOCFPGA_FPGMGR_STAT_RESET);
+
+	/* Step 5: Set CONTROL.NCONFIGPULL to 0 to release FPGA from reset */
+	ctrl_reg &= ~SOCFPGA_FPGMGR_CTL_NCFGPULL;
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_CTL_OFST, ctrl_reg);
+
+	/* Timeout waiting for reset */
+	if (status)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+/*
+ * Prepare the FPGA to receive the configuration data.
+ */
+static int socfpga_fpga_ops_configure_init(struct fpga_manager *mgr, u32 flags,
+					   const char *buf, size_t count)
+{
+	struct socfpga_fpga_priv *priv = mgr->priv;
+	int ret;
+
+	if (flags & FPGA_MGR_PARTIAL_RECONFIG) {
+		dev_err(&mgr->dev, "Partial reconfiguration not supported.\n");
+		return -EINVAL;
+	}
+	/* Steps 1 - 5: Reset the FPGA */
+	ret = socfpga_fpga_reset(mgr);
+	if (ret)
+		return ret;
+
+	/* Step 6: Wait for FPGA to enter configuration phase */
+	if (socfpga_fpga_wait_for_state(priv, SOCFPGA_FPGMGR_STAT_CFG))
+		return -ETIMEDOUT;
+
+	/* Step 7: Clear nSTATUS interrupt */
+	socfpga_fpga_writel(priv, SOCFPGA_FPGMGR_GPIO_PORTA_EOI_OFST,
+			    SOCFPGA_FPGMGR_MON_NSTATUS);
+
+	/* Step 8: Set CTRL.AXICFGEN to 1 to enable transfer of config data */
+	socfpga_fpga_set_bitsl(priv, SOCFPGA_FPGMGR_CTL_OFST,
+			       SOCFPGA_FPGMGR_CTL_AXICFGEN);
+
+	return 0;
+}
+
+/*
+ * Step 9: write data to the FPGA data register
+ */
+static int socfpga_fpga_ops_configure_write(struct fpga_manager *mgr,
+					    const char *buf, size_t count)
+{
+	struct socfpga_fpga_priv *priv = mgr->priv;
+	u32 *buffer_32 = (u32 *)buf;
+	size_t i = 0;
+
+	if (count <= 0)
+		return -EINVAL;
+
+	/* Write out the complete 32-bit chunks. */
+	while (count >= sizeof(u32)) {
+		socfpga_fpga_data_writel(priv, buffer_32[i++]);
+		count -= sizeof(u32);
+	}
+
+	/* Write out remaining non 32-bit chunks. */
+	switch (count) {
+	case 3:
+		socfpga_fpga_data_writel(priv, buffer_32[i++] & 0x00ffffff);
+		break;
+	case 2:
+		socfpga_fpga_data_writel(priv, buffer_32[i++] & 0x0000ffff);
+		break;
+	case 1:
+		socfpga_fpga_data_writel(priv, buffer_32[i++] & 0x000000ff);
+		break;
+	case 0:
+		break;
+	default:
+		/* This will never happen. */
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int socfpga_fpga_ops_configure_complete(struct fpga_manager *mgr,
+					       u32 flags)
+{
+	struct socfpga_fpga_priv *priv = mgr->priv;
+	u32 status;
+
+	/*
+	 * Step 10:
+	 *  - Observe CONF_DONE and nSTATUS (active low)
+	 *  - if CONF_DONE = 1 and nSTATUS = 1, configuration was successful
+	 *  - if CONF_DONE = 0 and nSTATUS = 0, configuration failed
+	 */
+	status = socfpga_fpga_wait_for_config_done(priv);
+	if (status)
+		return status;
+
+	/* Step 11: Clear CTRL.AXICFGEN to disable transfer of config data */
+	socfpga_fpga_clr_bitsl(priv, SOCFPGA_FPGMGR_CTL_OFST,
+			       SOCFPGA_FPGMGR_CTL_AXICFGEN);
+
+	/*
+	 * Step 12:
+	 *  - Write 4 to DCLKCNT
+	 *  - Wait for STATUS.DCNTDONE = 1
+	 *  - Clear W1C bit in STATUS.DCNTDONE
+	 */
+	if (socfpga_fpga_dclk_set_and_wait_clear(priv, 4))
+		return -ETIMEDOUT;
+
+	/* Step 13: Wait for STATUS.MODE to report USER MODE */
+	if (socfpga_fpga_wait_for_state(priv, SOCFPGA_FPGMGR_STAT_USER_MODE))
+		return -ETIMEDOUT;
+
+	/* Step 14: Set CTRL.EN to 0 */
+	socfpga_fpga_clr_bitsl(priv, SOCFPGA_FPGMGR_CTL_OFST,
+			       SOCFPGA_FPGMGR_CTL_EN);
+
+	return 0;
+}
+
+/* Translate state register values to FPGA framework state */
+static const enum fpga_mgr_states socfpga_state_to_framework_state[] = {
+	[SOCFPGA_FPGMGR_STAT_POWER_OFF] = FPGA_MGR_STATE_POWER_OFF,
+	[SOCFPGA_FPGMGR_STAT_RESET] = FPGA_MGR_STATE_RESET,
+	[SOCFPGA_FPGMGR_STAT_CFG] = FPGA_MGR_STATE_WRITE_INIT,
+	[SOCFPGA_FPGMGR_STAT_INIT] = FPGA_MGR_STATE_WRITE_INIT,
+	[SOCFPGA_FPGMGR_STAT_USER_MODE] = FPGA_MGR_STATE_OPERATING,
+	[SOCFPGA_FPGMGR_STAT_UNKNOWN] = FPGA_MGR_STATE_UNKNOWN,
+};
+
+static enum fpga_mgr_states socfpga_fpga_ops_state(struct fpga_manager *mgr)
+{
+	struct socfpga_fpga_priv *priv = mgr->priv;
+	enum fpga_mgr_states ret;
+	u32 state;
+
+	state = socfpga_fpga_state_get(priv);
+
+	if (state < ARRAY_SIZE(socfpga_state_to_framework_state))
+		ret = socfpga_state_to_framework_state[state];
+	else
+		ret = FPGA_MGR_STATE_UNKNOWN;
+
+	return ret;
+}
+
+static const struct fpga_manager_ops socfpga_fpga_ops = {
+	.state = socfpga_fpga_ops_state,
+	.write_init = socfpga_fpga_ops_configure_init,
+	.write = socfpga_fpga_ops_configure_write,
+	.write_complete = socfpga_fpga_ops_configure_complete,
+};
+
+static int socfpga_fpga_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct socfpga_fpga_priv *priv;
+	struct resource *res;
+	int ret;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	priv->fpga_base_addr = devm_ioremap_resource(dev, res);
+	if (IS_ERR(priv->fpga_base_addr))
+		return PTR_ERR(priv->fpga_base_addr);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	priv->fpga_data_addr = devm_ioremap_resource(dev, res);
+	if (IS_ERR(priv->fpga_data_addr))
+		return PTR_ERR(priv->fpga_data_addr);
+
+	priv->irq = platform_get_irq(pdev, 0);
+	if (priv->irq < 0)
+		return priv->irq;
+
+	ret = devm_request_irq(dev, priv->irq, socfpga_fpga_isr, 0,
+			       dev_name(dev), priv);
+	if (IS_ERR_VALUE(ret))
+		return ret;
+
+	return fpga_mgr_register(dev, "Altera SOCFPGA FPGA Manager",
+				 &socfpga_fpga_ops, priv);
+}
+
+static int socfpga_fpga_remove(struct platform_device *pdev)
+{
+	fpga_mgr_unregister(&pdev->dev);
+
+	return 0;
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id socfpga_fpga_of_match[] = {
+	{ .compatible = "altr,socfpga-fpga-mgr", },
+	{},
+};
+
+MODULE_DEVICE_TABLE(of, socfpga_fpga_of_match);
+#endif
+
+static struct platform_driver socfpga_fpga_driver = {
+	.probe = socfpga_fpga_probe,
+	.remove = socfpga_fpga_remove,
+	.driver = {
+		.name	= "socfpga_fpga_manager",
+		.of_match_table = of_match_ptr(socfpga_fpga_of_match),
+	},
+};
+
+module_platform_driver(socfpga_fpga_driver);
+
+MODULE_AUTHOR("Alan Tull <atull@opensource.altera.com>");
+MODULE_DESCRIPTION("Altera SOCFPGA FPGA Manager");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c
index bf2476ed9356..d630b7ece735 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x.c
@@ -191,7 +191,8 @@ static void etm_set_prog(struct etm_drvdata *drvdata)
 	isb();
 	if (coresight_timeout_etm(drvdata, ETMSR, ETMSR_PROG_BIT, 1)) {
 		dev_err(drvdata->dev,
-			"timeout observed when probing at offset %#x\n", ETMSR);
+			"%s: timeout observed when probing at offset %#x\n",
+			__func__, ETMSR);
 	}
 }
 
@@ -209,7 +210,8 @@ static void etm_clr_prog(struct etm_drvdata *drvdata)
 	isb();
 	if (coresight_timeout_etm(drvdata, ETMSR, ETMSR_PROG_BIT, 0)) {
 		dev_err(drvdata->dev,
-			"timeout observed when probing at offset %#x\n", ETMSR);
+			"%s: timeout observed when probing at offset %#x\n",
+			__func__, ETMSR);
 	}
 }
 
@@ -313,14 +315,6 @@ static void etm_enable_hw(void *info)
 	dev_dbg(drvdata->dev, "cpu: %d enable smp call done\n", drvdata->cpu);
 }
 
-static int etm_trace_id_simple(struct etm_drvdata *drvdata)
-{
-	if (!drvdata->enable)
-		return drvdata->traceid;
-
-	return (etm_readl(drvdata, ETMTRACEIDR) & ETM_TRACEID_MASK);
-}
-
 static int etm_trace_id(struct coresight_device *csdev)
 {
 	struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
@@ -1506,44 +1500,17 @@ static ssize_t timestamp_event_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(timestamp_event);
 
-static ssize_t status_show(struct device *dev,
-			   struct device_attribute *attr, char *buf)
+static ssize_t cpu_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	int ret;
-	unsigned long flags;
+	int val;
 	struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
 
-	pm_runtime_get_sync(drvdata->dev);
-	spin_lock_irqsave(&drvdata->spinlock, flags);
-
-	CS_UNLOCK(drvdata->base);
-	ret = sprintf(buf,
-		      "ETMCCR: 0x%08x\n"
-		      "ETMCCER: 0x%08x\n"
-		      "ETMSCR: 0x%08x\n"
-		      "ETMIDR: 0x%08x\n"
-		      "ETMCR: 0x%08x\n"
-		      "ETMTRACEIDR: 0x%08x\n"
-		      "Enable event: 0x%08x\n"
-		      "Enable start/stop: 0x%08x\n"
-		      "Enable control: CR1 0x%08x CR2 0x%08x\n"
-		      "CPU affinity: %d\n",
-		      drvdata->etmccr, drvdata->etmccer,
-		      etm_readl(drvdata, ETMSCR), etm_readl(drvdata, ETMIDR),
-		      etm_readl(drvdata, ETMCR), etm_trace_id_simple(drvdata),
-		      etm_readl(drvdata, ETMTEEVR),
-		      etm_readl(drvdata, ETMTSSCR),
-		      etm_readl(drvdata, ETMTECR1),
-		      etm_readl(drvdata, ETMTECR2),
-		      drvdata->cpu);
-	CS_LOCK(drvdata->base);
-
-	spin_unlock_irqrestore(&drvdata->spinlock, flags);
-	pm_runtime_put(drvdata->dev);
+	val = drvdata->cpu;
+	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
 
-	return ret;
 }
-static DEVICE_ATTR_RO(status);
+static DEVICE_ATTR_RO(cpu);
 
 static ssize_t traceid_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
@@ -1619,11 +1586,61 @@ static struct attribute *coresight_etm_attrs[] = {
 	&dev_attr_ctxid_mask.attr,
 	&dev_attr_sync_freq.attr,
 	&dev_attr_timestamp_event.attr,
-	&dev_attr_status.attr,
 	&dev_attr_traceid.attr,
+	&dev_attr_cpu.attr,
+	NULL,
+};
+
+#define coresight_simple_func(name, offset)                             \
+static ssize_t name##_show(struct device *_dev,                         \
+			   struct device_attribute *attr, char *buf)    \
+{                                                                       \
+	struct etm_drvdata *drvdata = dev_get_drvdata(_dev->parent);    \
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n",                      \
+			 readl_relaxed(drvdata->base + offset));        \
+}                                                                       \
+DEVICE_ATTR_RO(name)
+
+coresight_simple_func(etmccr, ETMCCR);
+coresight_simple_func(etmccer, ETMCCER);
+coresight_simple_func(etmscr, ETMSCR);
+coresight_simple_func(etmidr, ETMIDR);
+coresight_simple_func(etmcr, ETMCR);
+coresight_simple_func(etmtraceidr, ETMTRACEIDR);
+coresight_simple_func(etmteevr, ETMTEEVR);
+coresight_simple_func(etmtssvr, ETMTSSCR);
+coresight_simple_func(etmtecr1, ETMTECR1);
+coresight_simple_func(etmtecr2, ETMTECR2);
+
+static struct attribute *coresight_etm_mgmt_attrs[] = {
+	&dev_attr_etmccr.attr,
+	&dev_attr_etmccer.attr,
+	&dev_attr_etmscr.attr,
+	&dev_attr_etmidr.attr,
+	&dev_attr_etmcr.attr,
+	&dev_attr_etmtraceidr.attr,
+	&dev_attr_etmteevr.attr,
+	&dev_attr_etmtssvr.attr,
+	&dev_attr_etmtecr1.attr,
+	&dev_attr_etmtecr2.attr,
+	NULL,
+};
+
+static const struct attribute_group coresight_etm_group = {
+	.attrs = coresight_etm_attrs,
+};
+
+
+static const struct attribute_group coresight_etm_mgmt_group = {
+	.attrs = coresight_etm_mgmt_attrs,
+	.name = "mgmt",
+};
+
+static const struct attribute_group *coresight_etm_groups[] = {
+	&coresight_etm_group,
+	&coresight_etm_mgmt_group,
 	NULL,
 };
-ATTRIBUTE_GROUPS(coresight_etm);
 
 static int etm_cpu_callback(struct notifier_block *nfb, unsigned long action,
 			    void *hcpu)
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c
index 254a81a4e6f4..a6707642bb23 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -136,7 +136,9 @@ static void etm4_enable_hw(void *info)
 		writel_relaxed(drvdata->cntr_val[i],
 			       drvdata->base + TRCCNTVRn(i));
 	}
-	for (i = 0; i < drvdata->nr_resource; i++)
+
+	/* Resource selector pair 0 is always implemented and reserved */
+	for (i = 2; i < drvdata->nr_resource * 2; i++)
 		writel_relaxed(drvdata->res_ctrl[i],
 			       drvdata->base + TRCRSCTLRn(i));
 
@@ -489,8 +491,9 @@ static ssize_t reset_store(struct device *dev,
 		drvdata->cntr_val[i] = 0x0;
 	}
 
-	drvdata->res_idx = 0x0;
-	for (i = 0; i < drvdata->nr_resource; i++)
+	/* Resource selector pair 0 is always implemented and reserved */
+	drvdata->res_idx = 0x2;
+	for (i = 2; i < drvdata->nr_resource * 2; i++)
 		drvdata->res_ctrl[i] = 0x0;
 
 	for (i = 0; i < drvdata->nr_ss_cmp; i++) {
@@ -1732,7 +1735,7 @@ static ssize_t res_idx_store(struct device *dev,
 	if (kstrtoul(buf, 16, &val))
 		return -EINVAL;
 	/* Resource selector pair 0 is always implemented and reserved */
-	if ((val == 0) || (val >= drvdata->nr_resource))
+	if (val < 2 || val >= drvdata->nr_resource * 2)
 		return -EINVAL;
 
 	/*
@@ -2416,8 +2419,13 @@ static void etm4_init_arch_data(void *info)
 	drvdata->nr_addr_cmp = BMVAL(etmidr4, 0, 3);
 	/* NUMPC, bits[15:12] number of PE comparator inputs for tracing */
 	drvdata->nr_pe_cmp = BMVAL(etmidr4, 12, 15);
-	/* NUMRSPAIR, bits[19:16] the number of resource pairs for tracing */
-	drvdata->nr_resource = BMVAL(etmidr4, 16, 19);
+	/*
+	 * NUMRSPAIR, bits[19:16]
+	 * The number of resource pairs conveyed by the HW starts at 0, i.e a
+	 * value of 0x0 indicate 1 resource pair, 0x1 indicate two and so on.
+	 * As such add 1 to the value of NUMRSPAIR for a better representation.
+	 */
+	drvdata->nr_resource = BMVAL(etmidr4, 16, 19) + 1;
 	/*
 	 * NUMSSCC, bits[23:20] the number of single-shot
 	 * comparator control for tracing
@@ -2504,6 +2512,8 @@ static void etm4_init_default_data(struct etmv4_drvdata *drvdata)
 		drvdata->cntr_val[i] = 0x0;
 	}
 
+	/* Resource selector pair 0 is always implemented and reserved */
+	drvdata->res_idx = 0x2;
 	for (i = 2; i < drvdata->nr_resource * 2; i++)
 		drvdata->res_ctrl[i] = 0x0;
 
diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c
index 894531d315b8..e25492137d8b 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -240,6 +240,11 @@ static int coresight_enable_path(struct list_head *path)
 	int ret = 0;
 	struct coresight_device *cd;
 
+	/*
+	 * At this point we have a full @path, from source to sink.  The
+	 * sink is the first entry and the source the last one.  Go through
+	 * all the components and enable them one by one.
+	 */
 	list_for_each_entry(cd, path, path_link) {
 		if (cd == list_first_entry(path, struct coresight_device,
 					   path_link)) {
diff --git a/drivers/hwtracing/intel_th/Kconfig b/drivers/hwtracing/intel_th/Kconfig
new file mode 100644
index 000000000000..b7a9073d968b
--- /dev/null
+++ b/drivers/hwtracing/intel_th/Kconfig
@@ -0,0 +1,72 @@
+config INTEL_TH
+	tristate "Intel(R) Trace Hub controller"
+	help
+	  Intel(R) Trace Hub (TH) is a set of hardware blocks (subdevices) that
+	  produce, switch and output trace data from multiple hardware and
+	  software sources over several types of trace output ports encoded
+	  in System Trace Protocol (MIPI STPv2) and is intended to perform
+	  full system debugging.
+
+	  This option enables intel_th bus and common code used by TH
+	  subdevices to interact with each other and hardware and for
+	  platform glue layers to drive Intel TH devices.
+
+	  Say Y here to enable Intel(R) Trace Hub controller support.
+
+if INTEL_TH
+
+config INTEL_TH_PCI
+	tristate "Intel(R) Trace Hub PCI controller"
+	depends on PCI
+	help
+	  Intel(R) Trace Hub may exist as a PCI device. This option enables
+	  support glue layer for PCI-based Intel TH.
+
+	  Say Y here to enable PCI Intel TH support.
+
+config INTEL_TH_GTH
+	tristate "Intel(R) Trace Hub Global Trace Hub"
+	help
+	  Global Trace Hub (GTH) is the central component of the
+	  Intel TH infrastructure and acts as a switch for source
+	  and output devices. This driver is required for other
+	  Intel TH subdevices to initialize.
+
+	  Say Y here to enable GTH subdevice of Intel(R) Trace Hub.
+
+config INTEL_TH_STH
+	tristate "Intel(R) Trace Hub Software Trace Hub support"
+	depends on STM
+	help
+	  Software Trace Hub (STH) enables trace data from software
+	  trace sources to be sent out via Intel(R) Trace Hub. It
+	  uses stm class device to interface with its sources.
+
+	  Say Y here to enable STH subdevice of Intel(R) Trace Hub.
+
+config INTEL_TH_MSU
+	tristate "Intel(R) Trace Hub Memory Storage Unit"
+	help
+	  Memory Storage Unit (MSU) trace output device enables
+	  storing STP traces to system memory. It supports single
+	  and multiblock modes of operation and provides read()
+	  and mmap() access to the collected data.
+
+	  Say Y here to enable MSU output device for Intel TH.
+
+config INTEL_TH_PTI
+	tristate "Intel(R) Trace Hub PTI output"
+	help
+	  Parallel Trace Interface unit (PTI) is a trace output device
+	  of Intel TH architecture that facilitates STP trace output via
+	  a PTI port.
+
+	  Say Y to enable PTI output of Intel TH data.
+
+config INTEL_TH_DEBUG
+	bool "Intel(R) Trace Hub debugging"
+	depends on DEBUG_FS
+	help
+	  Say Y here to enable debugging.
+
+endif
diff --git a/drivers/hwtracing/intel_th/Makefile b/drivers/hwtracing/intel_th/Makefile
new file mode 100644
index 000000000000..81d42fe918f7
--- /dev/null
+++ b/drivers/hwtracing/intel_th/Makefile
@@ -0,0 +1,18 @@
+obj-$(CONFIG_INTEL_TH)		+= intel_th.o
+intel_th-y			:= core.o
+intel_th-$(CONFIG_INTEL_TH_DEBUG) += debug.o
+
+obj-$(CONFIG_INTEL_TH_PCI)	+= intel_th_pci.o
+intel_th_pci-y			:= pci.o
+
+obj-$(CONFIG_INTEL_TH_GTH)	+= intel_th_gth.o
+intel_th_gth-y			:= gth.o
+
+obj-$(CONFIG_INTEL_TH_STH)	+= intel_th_sth.o
+intel_th_sth-y			:= sth.o
+
+obj-$(CONFIG_INTEL_TH_MSU)	+= intel_th_msu.o
+intel_th_msu-y			:= msu.o
+
+obj-$(CONFIG_INTEL_TH_PTI)	+= intel_th_pti.o
+intel_th_pti-y			:= pti.o
diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c
new file mode 100644
index 000000000000..165d3001c301
--- /dev/null
+++ b/drivers/hwtracing/intel_th/core.c
@@ -0,0 +1,692 @@
+/*
+ * Intel(R) Trace Hub driver core
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sysfs.h>
+#include <linux/kdev_t.h>
+#include <linux/debugfs.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+
+#include "intel_th.h"
+#include "debug.h"
+
+static DEFINE_IDA(intel_th_ida);
+
+static int intel_th_match(struct device *dev, struct device_driver *driver)
+{
+	struct intel_th_driver *thdrv = to_intel_th_driver(driver);
+	struct intel_th_device *thdev = to_intel_th_device(dev);
+
+	if (thdev->type == INTEL_TH_SWITCH &&
+	    (!thdrv->enable || !thdrv->disable))
+		return 0;
+
+	return !strcmp(thdev->name, driver->name);
+}
+
+static int intel_th_child_remove(struct device *dev, void *data)
+{
+	device_release_driver(dev);
+
+	return 0;
+}
+
+static int intel_th_probe(struct device *dev)
+{
+	struct intel_th_driver *thdrv = to_intel_th_driver(dev->driver);
+	struct intel_th_device *thdev = to_intel_th_device(dev);
+	struct intel_th_driver *hubdrv;
+	struct intel_th_device *hub = NULL;
+	int ret;
+
+	if (thdev->type == INTEL_TH_SWITCH)
+		hub = thdev;
+	else if (dev->parent)
+		hub = to_intel_th_device(dev->parent);
+
+	if (!hub || !hub->dev.driver)
+		return -EPROBE_DEFER;
+
+	hubdrv = to_intel_th_driver(hub->dev.driver);
+
+	ret = thdrv->probe(to_intel_th_device(dev));
+	if (ret)
+		return ret;
+
+	if (thdev->type == INTEL_TH_OUTPUT &&
+	    !intel_th_output_assigned(thdev))
+		ret = hubdrv->assign(hub, thdev);
+
+	return ret;
+}
+
+static int intel_th_remove(struct device *dev)
+{
+	struct intel_th_driver *thdrv = to_intel_th_driver(dev->driver);
+	struct intel_th_device *thdev = to_intel_th_device(dev);
+	struct intel_th_device *hub = to_intel_th_device(dev->parent);
+	int err;
+
+	if (thdev->type == INTEL_TH_SWITCH) {
+		err = device_for_each_child(dev, thdev, intel_th_child_remove);
+		if (err)
+			return err;
+	}
+
+	thdrv->remove(thdev);
+
+	if (intel_th_output_assigned(thdev)) {
+		struct intel_th_driver *hubdrv =
+			to_intel_th_driver(dev->parent->driver);
+
+		if (hub->dev.driver)
+			hubdrv->unassign(hub, thdev);
+	}
+
+	return 0;
+}
+
+static struct bus_type intel_th_bus = {
+	.name		= "intel_th",
+	.dev_attrs	= NULL,
+	.match		= intel_th_match,
+	.probe		= intel_th_probe,
+	.remove		= intel_th_remove,
+};
+
+static void intel_th_device_free(struct intel_th_device *thdev);
+
+static void intel_th_device_release(struct device *dev)
+{
+	intel_th_device_free(to_intel_th_device(dev));
+}
+
+static struct device_type intel_th_source_device_type = {
+	.name		= "intel_th_source_device",
+	.release	= intel_th_device_release,
+};
+
+static char *intel_th_output_devnode(struct device *dev, umode_t *mode,
+				     kuid_t *uid, kgid_t *gid)
+{
+	struct intel_th_device *thdev = to_intel_th_device(dev);
+	char *node;
+
+	if (thdev->id >= 0)
+		node = kasprintf(GFP_KERNEL, "intel_th%d/%s%d", 0, thdev->name,
+				 thdev->id);
+	else
+		node = kasprintf(GFP_KERNEL, "intel_th%d/%s", 0, thdev->name);
+
+	return node;
+}
+
+static ssize_t port_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct intel_th_device *thdev = to_intel_th_device(dev);
+
+	if (thdev->output.port >= 0)
+		return scnprintf(buf, PAGE_SIZE, "%u\n", thdev->output.port);
+
+	return scnprintf(buf, PAGE_SIZE, "unassigned\n");
+}
+
+static DEVICE_ATTR_RO(port);
+
+static int intel_th_output_activate(struct intel_th_device *thdev)
+{
+	struct intel_th_driver *thdrv = to_intel_th_driver(thdev->dev.driver);
+
+	if (thdrv->activate)
+		return thdrv->activate(thdev);
+
+	intel_th_trace_enable(thdev);
+
+	return 0;
+}
+
+static void intel_th_output_deactivate(struct intel_th_device *thdev)
+{
+	struct intel_th_driver *thdrv = to_intel_th_driver(thdev->dev.driver);
+
+	if (thdrv->deactivate)
+		thdrv->deactivate(thdev);
+	else
+		intel_th_trace_disable(thdev);
+}
+
+static ssize_t active_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct intel_th_device *thdev = to_intel_th_device(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", thdev->output.active);
+}
+
+static ssize_t active_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t size)
+{
+	struct intel_th_device *thdev = to_intel_th_device(dev);
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	if (!!val != thdev->output.active) {
+		if (val)
+			ret = intel_th_output_activate(thdev);
+		else
+			intel_th_output_deactivate(thdev);
+	}
+
+	return ret ? ret : size;
+}
+
+static DEVICE_ATTR_RW(active);
+
+static struct attribute *intel_th_output_attrs[] = {
+	&dev_attr_port.attr,
+	&dev_attr_active.attr,
+	NULL,
+};
+
+ATTRIBUTE_GROUPS(intel_th_output);
+
+static struct device_type intel_th_output_device_type = {
+	.name		= "intel_th_output_device",
+	.groups		= intel_th_output_groups,
+	.release	= intel_th_device_release,
+	.devnode	= intel_th_output_devnode,
+};
+
+static struct device_type intel_th_switch_device_type = {
+	.name		= "intel_th_switch_device",
+	.release	= intel_th_device_release,
+};
+
+static struct device_type *intel_th_device_type[] = {
+	[INTEL_TH_SOURCE]	= &intel_th_source_device_type,
+	[INTEL_TH_OUTPUT]	= &intel_th_output_device_type,
+	[INTEL_TH_SWITCH]	= &intel_th_switch_device_type,
+};
+
+int intel_th_driver_register(struct intel_th_driver *thdrv)
+{
+	if (!thdrv->probe || !thdrv->remove)
+		return -EINVAL;
+
+	thdrv->driver.bus = &intel_th_bus;
+
+	return driver_register(&thdrv->driver);
+}
+EXPORT_SYMBOL_GPL(intel_th_driver_register);
+
+void intel_th_driver_unregister(struct intel_th_driver *thdrv)
+{
+	driver_unregister(&thdrv->driver);
+}
+EXPORT_SYMBOL_GPL(intel_th_driver_unregister);
+
+static struct intel_th_device *
+intel_th_device_alloc(struct intel_th *th, unsigned int type, const char *name,
+		      int id)
+{
+	struct device *parent;
+	struct intel_th_device *thdev;
+
+	if (type == INTEL_TH_SWITCH)
+		parent = th->dev;
+	else
+		parent = &th->hub->dev;
+
+	thdev = kzalloc(sizeof(*thdev) + strlen(name) + 1, GFP_KERNEL);
+	if (!thdev)
+		return NULL;
+
+	thdev->id = id;
+	thdev->type = type;
+
+	strcpy(thdev->name, name);
+	device_initialize(&thdev->dev);
+	thdev->dev.bus = &intel_th_bus;
+	thdev->dev.type = intel_th_device_type[type];
+	thdev->dev.parent = parent;
+	thdev->dev.dma_mask = parent->dma_mask;
+	thdev->dev.dma_parms = parent->dma_parms;
+	dma_set_coherent_mask(&thdev->dev, parent->coherent_dma_mask);
+	if (id >= 0)
+		dev_set_name(&thdev->dev, "%d-%s%d", th->id, name, id);
+	else
+		dev_set_name(&thdev->dev, "%d-%s", th->id, name);
+
+	return thdev;
+}
+
+static int intel_th_device_add_resources(struct intel_th_device *thdev,
+					 struct resource *res, int nres)
+{
+	struct resource *r;
+
+	r = kmemdup(res, sizeof(*res) * nres, GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+
+	thdev->resource = r;
+	thdev->num_resources = nres;
+
+	return 0;
+}
+
+static void intel_th_device_remove(struct intel_th_device *thdev)
+{
+	device_del(&thdev->dev);
+	put_device(&thdev->dev);
+}
+
+static void intel_th_device_free(struct intel_th_device *thdev)
+{
+	kfree(thdev->resource);
+	kfree(thdev);
+}
+
+/*
+ * Intel(R) Trace Hub subdevices
+ */
+static struct intel_th_subdevice {
+	const char		*name;
+	struct resource		res[3];
+	unsigned		nres;
+	unsigned		type;
+	unsigned		otype;
+	int			id;
+} intel_th_subdevices[TH_SUBDEVICE_MAX] = {
+	{
+		.nres	= 1,
+		.res	= {
+			{
+				.start	= REG_GTH_OFFSET,
+				.end	= REG_GTH_OFFSET + REG_GTH_LENGTH - 1,
+				.flags	= IORESOURCE_MEM,
+			},
+		},
+		.name	= "gth",
+		.type	= INTEL_TH_SWITCH,
+		.id	= -1,
+	},
+	{
+		.nres	= 2,
+		.res	= {
+			{
+				.start	= REG_MSU_OFFSET,
+				.end	= REG_MSU_OFFSET + REG_MSU_LENGTH - 1,
+				.flags	= IORESOURCE_MEM,
+			},
+			{
+				.start	= BUF_MSU_OFFSET,
+				.end	= BUF_MSU_OFFSET + BUF_MSU_LENGTH - 1,
+				.flags	= IORESOURCE_MEM,
+			},
+		},
+		.name	= "msc",
+		.id	= 0,
+		.type	= INTEL_TH_OUTPUT,
+		.otype	= GTH_MSU,
+	},
+	{
+		.nres	= 2,
+		.res	= {
+			{
+				.start	= REG_MSU_OFFSET,
+				.end	= REG_MSU_OFFSET + REG_MSU_LENGTH - 1,
+				.flags	= IORESOURCE_MEM,
+			},
+			{
+				.start	= BUF_MSU_OFFSET,
+				.end	= BUF_MSU_OFFSET + BUF_MSU_LENGTH - 1,
+				.flags	= IORESOURCE_MEM,
+			},
+		},
+		.name	= "msc",
+		.id	= 1,
+		.type	= INTEL_TH_OUTPUT,
+		.otype	= GTH_MSU,
+	},
+	{
+		.nres	= 2,
+		.res	= {
+			{
+				.start	= REG_STH_OFFSET,
+				.end	= REG_STH_OFFSET + REG_STH_LENGTH - 1,
+				.flags	= IORESOURCE_MEM,
+			},
+			{
+				.start	= TH_MMIO_SW,
+				.end	= 0,
+				.flags	= IORESOURCE_MEM,
+			},
+		},
+		.id	= -1,
+		.name	= "sth",
+		.type	= INTEL_TH_SOURCE,
+	},
+	{
+		.nres	= 1,
+		.res	= {
+			{
+				.start	= REG_PTI_OFFSET,
+				.end	= REG_PTI_OFFSET + REG_PTI_LENGTH - 1,
+				.flags	= IORESOURCE_MEM,
+			},
+		},
+		.id	= -1,
+		.name	= "pti",
+		.type	= INTEL_TH_OUTPUT,
+		.otype	= GTH_PTI,
+	},
+	{
+		.nres	= 1,
+		.res	= {
+			{
+				.start	= REG_DCIH_OFFSET,
+				.end	= REG_DCIH_OFFSET + REG_DCIH_LENGTH - 1,
+				.flags	= IORESOURCE_MEM,
+			},
+		},
+		.id	= -1,
+		.name	= "dcih",
+		.type	= INTEL_TH_OUTPUT,
+	},
+};
+
+static int intel_th_populate(struct intel_th *th, struct resource *devres,
+			     unsigned int ndevres, int irq)
+{
+	struct resource res[3];
+	unsigned int req = 0;
+	int i, err;
+
+	/* create devices for each intel_th_subdevice */
+	for (i = 0; i < ARRAY_SIZE(intel_th_subdevices); i++) {
+		struct intel_th_subdevice *subdev = &intel_th_subdevices[i];
+		struct intel_th_device *thdev;
+		int r;
+
+		thdev = intel_th_device_alloc(th, subdev->type, subdev->name,
+					      subdev->id);
+		if (!thdev) {
+			err = -ENOMEM;
+			goto kill_subdevs;
+		}
+
+		memcpy(res, subdev->res,
+		       sizeof(struct resource) * subdev->nres);
+
+		for (r = 0; r < subdev->nres; r++) {
+			int bar = TH_MMIO_CONFIG;
+
+			/*
+			 * Take .end == 0 to mean 'take the whole bar',
+			 * .start then tells us which bar it is. Default to
+			 * TH_MMIO_CONFIG.
+			 */
+			if (!res[r].end && res[r].flags == IORESOURCE_MEM) {
+				bar = res[r].start;
+				res[r].start = 0;
+				res[r].end = resource_size(&devres[bar]) - 1;
+			}
+
+			if (res[r].flags & IORESOURCE_MEM) {
+				res[r].start	+= devres[bar].start;
+				res[r].end	+= devres[bar].start;
+
+				dev_dbg(th->dev, "%s:%d @ %pR\n",
+					subdev->name, r, &res[r]);
+			} else if (res[r].flags & IORESOURCE_IRQ) {
+				res[r].start	= irq;
+			}
+		}
+
+		err = intel_th_device_add_resources(thdev, res, subdev->nres);
+		if (err) {
+			put_device(&thdev->dev);
+			goto kill_subdevs;
+		}
+
+		if (subdev->type == INTEL_TH_OUTPUT) {
+			thdev->dev.devt = MKDEV(th->major, i);
+			thdev->output.type = subdev->otype;
+			thdev->output.port = -1;
+		}
+
+		err = device_add(&thdev->dev);
+		if (err) {
+			put_device(&thdev->dev);
+			goto kill_subdevs;
+		}
+
+		/* need switch driver to be loaded to enumerate the rest */
+		if (subdev->type == INTEL_TH_SWITCH && !req) {
+			th->hub = thdev;
+			err = request_module("intel_th_%s", subdev->name);
+			if (!err)
+				req++;
+		}
+
+		th->thdev[i] = thdev;
+	}
+
+	return 0;
+
+kill_subdevs:
+	for (i-- ; i >= 0; i--)
+		intel_th_device_remove(th->thdev[i]);
+
+	return err;
+}
+
+static int match_devt(struct device *dev, void *data)
+{
+	dev_t devt = (dev_t)(unsigned long)data;
+
+	return dev->devt == devt;
+}
+
+static int intel_th_output_open(struct inode *inode, struct file *file)
+{
+	const struct file_operations *fops;
+	struct intel_th_driver *thdrv;
+	struct device *dev;
+	int err;
+
+	dev = bus_find_device(&intel_th_bus, NULL,
+			      (void *)(unsigned long)inode->i_rdev,
+			      match_devt);
+	if (!dev || !dev->driver)
+		return -ENODEV;
+
+	thdrv = to_intel_th_driver(dev->driver);
+	fops = fops_get(thdrv->fops);
+	if (!fops)
+		return -ENODEV;
+
+	replace_fops(file, fops);
+
+	file->private_data = to_intel_th_device(dev);
+
+	if (file->f_op->open) {
+		err = file->f_op->open(inode, file);
+		return err;
+	}
+
+	return 0;
+}
+
+static const struct file_operations intel_th_output_fops = {
+	.open	= intel_th_output_open,
+	.llseek	= noop_llseek,
+};
+
+/**
+ * intel_th_alloc() - allocate a new Intel TH device and its subdevices
+ * @dev:	parent device
+ * @devres:	parent's resources
+ * @ndevres:	number of resources
+ * @irq:	irq number
+ */
+struct intel_th *
+intel_th_alloc(struct device *dev, struct resource *devres,
+	       unsigned int ndevres, int irq)
+{
+	struct intel_th *th;
+	int err;
+
+	th = kzalloc(sizeof(*th), GFP_KERNEL);
+	if (!th)
+		return ERR_PTR(-ENOMEM);
+
+	th->id = ida_simple_get(&intel_th_ida, 0, 0, GFP_KERNEL);
+	if (th->id < 0) {
+		err = th->id;
+		goto err_alloc;
+	}
+
+	th->major = __register_chrdev(0, 0, TH_POSSIBLE_OUTPUTS,
+				      "intel_th/output", &intel_th_output_fops);
+	if (th->major < 0) {
+		err = th->major;
+		goto err_ida;
+	}
+	th->dev = dev;
+
+	err = intel_th_populate(th, devres, ndevres, irq);
+	if (err)
+		goto err_chrdev;
+
+	return th;
+
+err_chrdev:
+	__unregister_chrdev(th->major, 0, TH_POSSIBLE_OUTPUTS,
+			    "intel_th/output");
+
+err_ida:
+	ida_simple_remove(&intel_th_ida, th->id);
+
+err_alloc:
+	kfree(th);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(intel_th_alloc);
+
+void intel_th_free(struct intel_th *th)
+{
+	int i;
+
+	for (i = 0; i < TH_SUBDEVICE_MAX; i++)
+		if (th->thdev[i] != th->hub)
+			intel_th_device_remove(th->thdev[i]);
+
+	intel_th_device_remove(th->hub);
+
+	__unregister_chrdev(th->major, 0, TH_POSSIBLE_OUTPUTS,
+			    "intel_th/output");
+
+	ida_simple_remove(&intel_th_ida, th->id);
+
+	kfree(th);
+}
+EXPORT_SYMBOL_GPL(intel_th_free);
+
+/**
+ * intel_th_trace_enable() - enable tracing for an output device
+ * @thdev:	output device that requests tracing be enabled
+ */
+int intel_th_trace_enable(struct intel_th_device *thdev)
+{
+	struct intel_th_device *hub = to_intel_th_device(thdev->dev.parent);
+	struct intel_th_driver *hubdrv = to_intel_th_driver(hub->dev.driver);
+
+	if (WARN_ON_ONCE(hub->type != INTEL_TH_SWITCH))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(thdev->type != INTEL_TH_OUTPUT))
+		return -EINVAL;
+
+	hubdrv->enable(hub, &thdev->output);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(intel_th_trace_enable);
+
+/**
+ * intel_th_trace_disable() - disable tracing for an output device
+ * @thdev:	output device that requests tracing be disabled
+ */
+int intel_th_trace_disable(struct intel_th_device *thdev)
+{
+	struct intel_th_device *hub = to_intel_th_device(thdev->dev.parent);
+	struct intel_th_driver *hubdrv = to_intel_th_driver(hub->dev.driver);
+
+	WARN_ON_ONCE(hub->type != INTEL_TH_SWITCH);
+	if (WARN_ON_ONCE(thdev->type != INTEL_TH_OUTPUT))
+		return -EINVAL;
+
+	hubdrv->disable(hub, &thdev->output);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(intel_th_trace_disable);
+
+int intel_th_set_output(struct intel_th_device *thdev,
+			unsigned int master)
+{
+	struct intel_th_device *hub = to_intel_th_device(thdev->dev.parent);
+	struct intel_th_driver *hubdrv = to_intel_th_driver(hub->dev.driver);
+
+	if (!hubdrv->set_output)
+		return -ENOTSUPP;
+
+	return hubdrv->set_output(hub, master);
+}
+EXPORT_SYMBOL_GPL(intel_th_set_output);
+
+static int __init intel_th_init(void)
+{
+	intel_th_debug_init();
+
+	return bus_register(&intel_th_bus);
+}
+subsys_initcall(intel_th_init);
+
+static void __exit intel_th_exit(void)
+{
+	intel_th_debug_done();
+
+	bus_unregister(&intel_th_bus);
+}
+module_exit(intel_th_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel(R) Trace Hub controller driver");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/intel_th/debug.c b/drivers/hwtracing/intel_th/debug.c
new file mode 100644
index 000000000000..788a1f0a97ad
--- /dev/null
+++ b/drivers/hwtracing/intel_th/debug.c
@@ -0,0 +1,36 @@
+/*
+ * Intel(R) Trace Hub driver debugging
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+
+#include "intel_th.h"
+#include "debug.h"
+
+struct dentry *intel_th_dbg;
+
+void intel_th_debug_init(void)
+{
+	intel_th_dbg = debugfs_create_dir("intel_th", NULL);
+	if (IS_ERR(intel_th_dbg))
+		intel_th_dbg = NULL;
+}
+
+void intel_th_debug_done(void)
+{
+	debugfs_remove(intel_th_dbg);
+	intel_th_dbg = NULL;
+}
diff --git a/drivers/hwtracing/intel_th/debug.h b/drivers/hwtracing/intel_th/debug.h
new file mode 100644
index 000000000000..88311bad3ba4
--- /dev/null
+++ b/drivers/hwtracing/intel_th/debug.h
@@ -0,0 +1,34 @@
+/*
+ * Intel(R) Trace Hub driver debugging
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __INTEL_TH_DEBUG_H__
+#define __INTEL_TH_DEBUG_H__
+
+#ifdef CONFIG_INTEL_TH_DEBUG
+extern struct dentry *intel_th_dbg;
+
+void intel_th_debug_init(void);
+void intel_th_debug_done(void);
+#else
+static inline void intel_th_debug_init(void)
+{
+}
+
+static inline void intel_th_debug_done(void)
+{
+}
+#endif
+
+#endif /* __INTEL_TH_DEBUG_H__ */
diff --git a/drivers/hwtracing/intel_th/gth.c b/drivers/hwtracing/intel_th/gth.c
new file mode 100644
index 000000000000..673d272ee400
--- /dev/null
+++ b/drivers/hwtracing/intel_th/gth.c
@@ -0,0 +1,706 @@
+/*
+ * Intel(R) Trace Hub Global Trace Hub
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "intel_th.h"
+#include "gth.h"
+
+struct gth_device;
+
+/**
+ * struct gth_output - GTH view on an output port
+ * @gth:	backlink to the GTH device
+ * @output:	link to output device's output descriptor
+ * @index:	output port number
+ * @port_type:	one of GTH_* port type values
+ * @master:	bitmap of masters configured for this output
+ */
+struct gth_output {
+	struct gth_device	*gth;
+	struct intel_th_output	*output;
+	unsigned int		index;
+	unsigned int		port_type;
+	DECLARE_BITMAP(master, TH_CONFIGURABLE_MASTERS + 1);
+};
+
+/**
+ * struct gth_device - GTH device
+ * @dev:	driver core's device
+ * @base:	register window base address
+ * @output_group:	attributes describing output ports
+ * @master_group:	attributes describing master assignments
+ * @output:		output ports
+ * @master:		master/output port assignments
+ * @gth_lock:		serializes accesses to GTH bits
+ */
+struct gth_device {
+	struct device		*dev;
+	void __iomem		*base;
+
+	struct attribute_group	output_group;
+	struct attribute_group	master_group;
+	struct gth_output	output[TH_POSSIBLE_OUTPUTS];
+	signed char		master[TH_CONFIGURABLE_MASTERS + 1];
+	spinlock_t		gth_lock;
+};
+
+static void gth_output_set(struct gth_device *gth, int port,
+			   unsigned int config)
+{
+	unsigned long reg = port & 4 ? REG_GTH_GTHOPT1 : REG_GTH_GTHOPT0;
+	u32 val;
+	int shift = (port & 3) * 8;
+
+	val = ioread32(gth->base + reg);
+	val &= ~(0xff << shift);
+	val |= config << shift;
+	iowrite32(val, gth->base + reg);
+}
+
+static unsigned int gth_output_get(struct gth_device *gth, int port)
+{
+	unsigned long reg = port & 4 ? REG_GTH_GTHOPT1 : REG_GTH_GTHOPT0;
+	u32 val;
+	int shift = (port & 3) * 8;
+
+	val = ioread32(gth->base + reg);
+	val &= 0xff << shift;
+	val >>= shift;
+
+	return val;
+}
+
+static void gth_smcfreq_set(struct gth_device *gth, int port,
+			    unsigned int freq)
+{
+	unsigned long reg = REG_GTH_SMCR0 + ((port / 2) * 4);
+	int shift = (port & 1) * 16;
+	u32 val;
+
+	val = ioread32(gth->base + reg);
+	val &= ~(0xffff << shift);
+	val |= freq << shift;
+	iowrite32(val, gth->base + reg);
+}
+
+static unsigned int gth_smcfreq_get(struct gth_device *gth, int port)
+{
+	unsigned long reg = REG_GTH_SMCR0 + ((port / 2) * 4);
+	int shift = (port & 1) * 16;
+	u32 val;
+
+	val = ioread32(gth->base + reg);
+	val &= 0xffff << shift;
+	val >>= shift;
+
+	return val;
+}
+
+/*
+ * "masters" attribute group
+ */
+
+struct master_attribute {
+	struct device_attribute	attr;
+	struct gth_device	*gth;
+	unsigned int		master;
+};
+
+static void
+gth_master_set(struct gth_device *gth, unsigned int master, int port)
+{
+	unsigned int reg = REG_GTH_SWDEST0 + ((master >> 1) & ~3u);
+	unsigned int shift = (master & 0x7) * 4;
+	u32 val;
+
+	if (master >= 256) {
+		reg = REG_GTH_GSWTDEST;
+		shift = 0;
+	}
+
+	val = ioread32(gth->base + reg);
+	val &= ~(0xf << shift);
+	if (port >= 0)
+		val |= (0x8 | port) << shift;
+	iowrite32(val, gth->base + reg);
+}
+
+/*static int gth_master_get(struct gth_device *gth, unsigned int master)
+{
+	unsigned int reg = REG_GTH_SWDEST0 + ((master >> 1) & ~3u);
+	unsigned int shift = (master & 0x7) * 4;
+	u32 val;
+
+	if (master >= 256) {
+		reg = REG_GTH_GSWTDEST;
+		shift = 0;
+	}
+
+	val = ioread32(gth->base + reg);
+	val &= (0xf << shift);
+	val >>= shift;
+
+	return val ? val & 0x7 : -1;
+	}*/
+
+static ssize_t master_attr_show(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct master_attribute *ma =
+		container_of(attr, struct master_attribute, attr);
+	struct gth_device *gth = ma->gth;
+	size_t count;
+	int port;
+
+	spin_lock(&gth->gth_lock);
+	port = gth->master[ma->master];
+	spin_unlock(&gth->gth_lock);
+
+	if (port >= 0)
+		count = snprintf(buf, PAGE_SIZE, "%x\n", port);
+	else
+		count = snprintf(buf, PAGE_SIZE, "disabled\n");
+
+	return count;
+}
+
+static ssize_t master_attr_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct master_attribute *ma =
+		container_of(attr, struct master_attribute, attr);
+	struct gth_device *gth = ma->gth;
+	int old_port, port;
+
+	if (kstrtoint(buf, 10, &port) < 0)
+		return -EINVAL;
+
+	if (port >= TH_POSSIBLE_OUTPUTS || port < -1)
+		return -EINVAL;
+
+	spin_lock(&gth->gth_lock);
+
+	/* disconnect from the previous output port, if any */
+	old_port = gth->master[ma->master];
+	if (old_port >= 0) {
+		gth->master[ma->master] = -1;
+		clear_bit(ma->master, gth->output[old_port].master);
+		if (gth->output[old_port].output->active)
+			gth_master_set(gth, ma->master, -1);
+	}
+
+	/* connect to the new output port, if any */
+	if (port >= 0) {
+		/* check if there's a driver for this port */
+		if (!gth->output[port].output) {
+			count = -ENODEV;
+			goto unlock;
+		}
+
+		set_bit(ma->master, gth->output[port].master);
+
+		/* if the port is active, program this setting */
+		if (gth->output[port].output->active)
+			gth_master_set(gth, ma->master, port);
+	}
+
+	gth->master[ma->master] = port;
+
+unlock:
+	spin_unlock(&gth->gth_lock);
+
+	return count;
+}
+
+struct output_attribute {
+	struct device_attribute attr;
+	struct gth_device	*gth;
+	unsigned int		port;
+	unsigned int		parm;
+};
+
+#define OUTPUT_PARM(_name, _mask, _r, _w, _what)			\
+	[TH_OUTPUT_PARM(_name)] = { .name = __stringify(_name),		\
+				    .get = gth_ ## _what ## _get,	\
+				    .set = gth_ ## _what ## _set,	\
+				    .mask = (_mask),			\
+				    .readable = (_r),			\
+				    .writable = (_w) }
+
+static const struct output_parm {
+	const char	*name;
+	unsigned int	(*get)(struct gth_device *gth, int port);
+	void		(*set)(struct gth_device *gth, int port,
+			       unsigned int val);
+	unsigned int	mask;
+	unsigned int	readable : 1,
+			writable : 1;
+} output_parms[] = {
+	OUTPUT_PARM(port,	0x7,	1, 0, output),
+	OUTPUT_PARM(null,	BIT(3),	1, 1, output),
+	OUTPUT_PARM(drop,	BIT(4),	1, 1, output),
+	OUTPUT_PARM(reset,	BIT(5),	1, 0, output),
+	OUTPUT_PARM(flush,	BIT(7),	0, 1, output),
+	OUTPUT_PARM(smcfreq,	0xffff,	1, 1, smcfreq),
+};
+
+static void
+gth_output_parm_set(struct gth_device *gth, int port, unsigned int parm,
+		    unsigned int val)
+{
+	unsigned int config = output_parms[parm].get(gth, port);
+	unsigned int mask = output_parms[parm].mask;
+	unsigned int shift = __ffs(mask);
+
+	config &= ~mask;
+	config |= (val << shift) & mask;
+	output_parms[parm].set(gth, port, config);
+}
+
+static unsigned int
+gth_output_parm_get(struct gth_device *gth, int port, unsigned int parm)
+{
+	unsigned int config = output_parms[parm].get(gth, port);
+	unsigned int mask = output_parms[parm].mask;
+	unsigned int shift = __ffs(mask);
+
+	config &= mask;
+	config >>= shift;
+	return config;
+}
+
+/*
+ * Reset outputs and sources
+ */
+static int intel_th_gth_reset(struct gth_device *gth)
+{
+	u32 scratchpad;
+	int port, i;
+
+	scratchpad = ioread32(gth->base + REG_GTH_SCRPD0);
+	if (scratchpad & SCRPD_DEBUGGER_IN_USE)
+		return -EBUSY;
+
+	/* output ports */
+	for (port = 0; port < 8; port++) {
+		if (gth_output_parm_get(gth, port, TH_OUTPUT_PARM(port)) ==
+		    GTH_NONE)
+			continue;
+
+		gth_output_set(gth, port, 0);
+		gth_smcfreq_set(gth, port, 16);
+	}
+	/* disable overrides */
+	iowrite32(0, gth->base + REG_GTH_DESTOVR);
+
+	/* masters swdest_0~31 and gswdest */
+	for (i = 0; i < 33; i++)
+		iowrite32(0, gth->base + REG_GTH_SWDEST0 + i * 4);
+
+	/* sources */
+	iowrite32(0, gth->base + REG_GTH_SCR);
+	iowrite32(0xfc, gth->base + REG_GTH_SCR2);
+
+	return 0;
+}
+
+/*
+ * "outputs" attribute group
+ */
+
+static ssize_t output_attr_show(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct output_attribute *oa =
+		container_of(attr, struct output_attribute, attr);
+	struct gth_device *gth = oa->gth;
+	size_t count;
+
+	spin_lock(&gth->gth_lock);
+	count = snprintf(buf, PAGE_SIZE, "%x\n",
+			 gth_output_parm_get(gth, oa->port, oa->parm));
+	spin_unlock(&gth->gth_lock);
+
+	return count;
+}
+
+static ssize_t output_attr_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct output_attribute *oa =
+		container_of(attr, struct output_attribute, attr);
+	struct gth_device *gth = oa->gth;
+	unsigned int config;
+
+	if (kstrtouint(buf, 16, &config) < 0)
+		return -EINVAL;
+
+	spin_lock(&gth->gth_lock);
+	gth_output_parm_set(gth, oa->port, oa->parm, config);
+	spin_unlock(&gth->gth_lock);
+
+	return count;
+}
+
+static int intel_th_master_attributes(struct gth_device *gth)
+{
+	struct master_attribute *master_attrs;
+	struct attribute **attrs;
+	int i, nattrs = TH_CONFIGURABLE_MASTERS + 2;
+
+	attrs = devm_kcalloc(gth->dev, nattrs, sizeof(void *), GFP_KERNEL);
+	if (!attrs)
+		return -ENOMEM;
+
+	master_attrs = devm_kcalloc(gth->dev, nattrs,
+				    sizeof(struct master_attribute),
+				    GFP_KERNEL);
+	if (!master_attrs)
+		return -ENOMEM;
+
+	for (i = 0; i < TH_CONFIGURABLE_MASTERS + 1; i++) {
+		char *name;
+
+		name = devm_kasprintf(gth->dev, GFP_KERNEL, "%d%s", i,
+				      i == TH_CONFIGURABLE_MASTERS ? "+" : "");
+		if (!name)
+			return -ENOMEM;
+
+		master_attrs[i].attr.attr.name = name;
+		master_attrs[i].attr.attr.mode = S_IRUGO | S_IWUSR;
+		master_attrs[i].attr.show = master_attr_show;
+		master_attrs[i].attr.store = master_attr_store;
+
+		sysfs_attr_init(&master_attrs[i].attr.attr);
+		attrs[i] = &master_attrs[i].attr.attr;
+
+		master_attrs[i].gth = gth;
+		master_attrs[i].master = i;
+	}
+
+	gth->master_group.name	= "masters";
+	gth->master_group.attrs = attrs;
+
+	return sysfs_create_group(&gth->dev->kobj, &gth->master_group);
+}
+
+static int intel_th_output_attributes(struct gth_device *gth)
+{
+	struct output_attribute *out_attrs;
+	struct attribute **attrs;
+	int i, j, nouts = TH_POSSIBLE_OUTPUTS;
+	int nparms = ARRAY_SIZE(output_parms);
+	int nattrs = nouts * nparms + 1;
+
+	attrs = devm_kcalloc(gth->dev, nattrs, sizeof(void *), GFP_KERNEL);
+	if (!attrs)
+		return -ENOMEM;
+
+	out_attrs = devm_kcalloc(gth->dev, nattrs,
+				 sizeof(struct output_attribute),
+				 GFP_KERNEL);
+	if (!out_attrs)
+		return -ENOMEM;
+
+	for (i = 0; i < nouts; i++) {
+		for (j = 0; j < nparms; j++) {
+			unsigned int idx = i * nparms + j;
+			char *name;
+
+			name = devm_kasprintf(gth->dev, GFP_KERNEL, "%d_%s", i,
+					      output_parms[j].name);
+			if (!name)
+				return -ENOMEM;
+
+			out_attrs[idx].attr.attr.name = name;
+
+			if (output_parms[j].readable) {
+				out_attrs[idx].attr.attr.mode |= S_IRUGO;
+				out_attrs[idx].attr.show = output_attr_show;
+			}
+
+			if (output_parms[j].writable) {
+				out_attrs[idx].attr.attr.mode |= S_IWUSR;
+				out_attrs[idx].attr.store = output_attr_store;
+			}
+
+			sysfs_attr_init(&out_attrs[idx].attr.attr);
+			attrs[idx] = &out_attrs[idx].attr.attr;
+
+			out_attrs[idx].gth = gth;
+			out_attrs[idx].port = i;
+			out_attrs[idx].parm = j;
+		}
+	}
+
+	gth->output_group.name	= "outputs";
+	gth->output_group.attrs = attrs;
+
+	return sysfs_create_group(&gth->dev->kobj, &gth->output_group);
+}
+
+/**
+ * intel_th_gth_disable() - enable tracing to an output device
+ * @thdev:	GTH device
+ * @output:	output device's descriptor
+ *
+ * This will deconfigure all masters set to output to this device,
+ * disable tracing using force storeEn off signal and wait for the
+ * "pipeline empty" bit for corresponding output port.
+ */
+static void intel_th_gth_disable(struct intel_th_device *thdev,
+				 struct intel_th_output *output)
+{
+	struct gth_device *gth = dev_get_drvdata(&thdev->dev);
+	unsigned long count;
+	int master;
+	u32 reg;
+
+	spin_lock(&gth->gth_lock);
+	output->active = false;
+
+	for_each_set_bit(master, gth->output[output->port].master,
+			 TH_CONFIGURABLE_MASTERS) {
+		gth_master_set(gth, master, -1);
+	}
+	spin_unlock(&gth->gth_lock);
+
+	iowrite32(0, gth->base + REG_GTH_SCR);
+	iowrite32(0xfd, gth->base + REG_GTH_SCR2);
+
+	/* wait on pipeline empty for the given port */
+	for (reg = 0, count = GTH_PLE_WAITLOOP_DEPTH;
+	     count && !(reg & BIT(output->port)); count--) {
+		reg = ioread32(gth->base + REG_GTH_STAT);
+		cpu_relax();
+	}
+
+	/* clear force capture done for next captures */
+	iowrite32(0xfc, gth->base + REG_GTH_SCR2);
+
+	if (!count)
+		dev_dbg(&thdev->dev, "timeout waiting for GTH[%d] PLE\n",
+			output->port);
+}
+
+/**
+ * intel_th_gth_enable() - enable tracing to an output device
+ * @thdev:	GTH device
+ * @output:	output device's descriptor
+ *
+ * This will configure all masters set to output to this device and
+ * enable tracing using force storeEn signal.
+ */
+static void intel_th_gth_enable(struct intel_th_device *thdev,
+				struct intel_th_output *output)
+{
+	struct gth_device *gth = dev_get_drvdata(&thdev->dev);
+	u32 scr = 0xfc0000;
+	int master;
+
+	spin_lock(&gth->gth_lock);
+	for_each_set_bit(master, gth->output[output->port].master,
+			 TH_CONFIGURABLE_MASTERS + 1) {
+		gth_master_set(gth, master, output->port);
+	}
+
+	if (output->multiblock)
+		scr |= 0xff;
+
+	output->active = true;
+	spin_unlock(&gth->gth_lock);
+
+	iowrite32(scr, gth->base + REG_GTH_SCR);
+	iowrite32(0, gth->base + REG_GTH_SCR2);
+}
+
+/**
+ * intel_th_gth_assign() - assign output device to a GTH output port
+ * @thdev:	GTH device
+ * @othdev:	output device
+ *
+ * This will match a given output device parameters against present
+ * output ports on the GTH and fill out relevant bits in output device's
+ * descriptor.
+ *
+ * Return:	0 on success, -errno on error.
+ */
+static int intel_th_gth_assign(struct intel_th_device *thdev,
+			       struct intel_th_device *othdev)
+{
+	struct gth_device *gth = dev_get_drvdata(&thdev->dev);
+	int i, id;
+
+	if (othdev->type != INTEL_TH_OUTPUT)
+		return -EINVAL;
+
+	for (i = 0, id = 0; i < TH_POSSIBLE_OUTPUTS; i++) {
+		if (gth->output[i].port_type != othdev->output.type)
+			continue;
+
+		if (othdev->id == -1 || othdev->id == id)
+			goto found;
+
+		id++;
+	}
+
+	return -ENOENT;
+
+found:
+	spin_lock(&gth->gth_lock);
+	othdev->output.port = i;
+	othdev->output.active = false;
+	gth->output[i].output = &othdev->output;
+	spin_unlock(&gth->gth_lock);
+
+	return 0;
+}
+
+/**
+ * intel_th_gth_unassign() - deassociate an output device from its output port
+ * @thdev:	GTH device
+ * @othdev:	output device
+ */
+static void intel_th_gth_unassign(struct intel_th_device *thdev,
+				  struct intel_th_device *othdev)
+{
+	struct gth_device *gth = dev_get_drvdata(&thdev->dev);
+	int port = othdev->output.port;
+
+	spin_lock(&gth->gth_lock);
+	othdev->output.port = -1;
+	othdev->output.active = false;
+	gth->output[port].output = NULL;
+	spin_unlock(&gth->gth_lock);
+}
+
+static int
+intel_th_gth_set_output(struct intel_th_device *thdev, unsigned int master)
+{
+	struct gth_device *gth = dev_get_drvdata(&thdev->dev);
+	int port = 0; /* FIXME: make default output configurable */
+
+	/*
+	 * everything above TH_CONFIGURABLE_MASTERS is controlled by the
+	 * same register
+	 */
+	if (master > TH_CONFIGURABLE_MASTERS)
+		master = TH_CONFIGURABLE_MASTERS;
+
+	spin_lock(&gth->gth_lock);
+	if (gth->master[master] == -1) {
+		set_bit(master, gth->output[port].master);
+		gth->master[master] = port;
+	}
+	spin_unlock(&gth->gth_lock);
+
+	return 0;
+}
+
+static int intel_th_gth_probe(struct intel_th_device *thdev)
+{
+	struct device *dev = &thdev->dev;
+	struct gth_device *gth;
+	struct resource *res;
+	void __iomem *base;
+	int i, ret;
+
+	res = intel_th_device_get_resource(thdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	base = devm_ioremap(dev, res->start, resource_size(res));
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	gth = devm_kzalloc(dev, sizeof(*gth), GFP_KERNEL);
+	if (!gth)
+		return -ENOMEM;
+
+	gth->dev = dev;
+	gth->base = base;
+	spin_lock_init(&gth->gth_lock);
+
+	ret = intel_th_gth_reset(gth);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < TH_CONFIGURABLE_MASTERS + 1; i++)
+		gth->master[i] = -1;
+
+	for (i = 0; i < TH_POSSIBLE_OUTPUTS; i++) {
+		gth->output[i].gth = gth;
+		gth->output[i].index = i;
+		gth->output[i].port_type =
+			gth_output_parm_get(gth, i, TH_OUTPUT_PARM(port));
+	}
+
+	if (intel_th_output_attributes(gth) ||
+	    intel_th_master_attributes(gth)) {
+		pr_warn("Can't initialize sysfs attributes\n");
+
+		if (gth->output_group.attrs)
+			sysfs_remove_group(&gth->dev->kobj, &gth->output_group);
+		return -ENOMEM;
+	}
+
+	dev_set_drvdata(dev, gth);
+
+	return 0;
+}
+
+static void intel_th_gth_remove(struct intel_th_device *thdev)
+{
+	struct gth_device *gth = dev_get_drvdata(&thdev->dev);
+
+	sysfs_remove_group(&gth->dev->kobj, &gth->output_group);
+	sysfs_remove_group(&gth->dev->kobj, &gth->master_group);
+}
+
+static struct intel_th_driver intel_th_gth_driver = {
+	.probe		= intel_th_gth_probe,
+	.remove		= intel_th_gth_remove,
+	.assign		= intel_th_gth_assign,
+	.unassign	= intel_th_gth_unassign,
+	.set_output	= intel_th_gth_set_output,
+	.enable		= intel_th_gth_enable,
+	.disable	= intel_th_gth_disable,
+	.driver	= {
+		.name	= "gth",
+		.owner	= THIS_MODULE,
+	},
+};
+
+module_driver(intel_th_gth_driver,
+	      intel_th_driver_register,
+	      intel_th_driver_unregister);
+
+MODULE_ALIAS("intel_th_switch");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel(R) Trace Hub Global Trace Hub driver");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/intel_th/gth.h b/drivers/hwtracing/intel_th/gth.h
new file mode 100644
index 000000000000..3b714b7a61db
--- /dev/null
+++ b/drivers/hwtracing/intel_th/gth.h
@@ -0,0 +1,66 @@
+/*
+ * Intel(R) Trace Hub Global Trace Hub (GTH) data structures
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __INTEL_TH_GTH_H__
+#define __INTEL_TH_GTH_H__
+
+/* Map output port parameter bits to symbolic names */
+#define TH_OUTPUT_PARM(name)			\
+	TH_OUTPUT_ ## name
+
+enum intel_th_output_parm {
+	/* output port type */
+	TH_OUTPUT_PARM(port),
+	/* generate NULL packet */
+	TH_OUTPUT_PARM(null),
+	/* packet drop */
+	TH_OUTPUT_PARM(drop),
+	/* port in reset state */
+	TH_OUTPUT_PARM(reset),
+	/* flush out data */
+	TH_OUTPUT_PARM(flush),
+	/* mainenance packet frequency */
+	TH_OUTPUT_PARM(smcfreq),
+};
+
+/*
+ * Register offsets
+ */
+enum {
+	REG_GTH_GTHOPT0		= 0x00, /* Output ports 0..3 config */
+	REG_GTH_GTHOPT1		= 0x04, /* Output ports 4..7 config */
+	REG_GTH_SWDEST0		= 0x08, /* Switching destination masters 0..7 */
+	REG_GTH_GSWTDEST	= 0x88, /* Global sw trace destination */
+	REG_GTH_SMCR0		= 0x9c, /* STP mainenance for ports 0/1 */
+	REG_GTH_SMCR1		= 0xa0, /* STP mainenance for ports 2/3 */
+	REG_GTH_SMCR2		= 0xa4, /* STP mainenance for ports 4/5 */
+	REG_GTH_SMCR3		= 0xa8, /* STP mainenance for ports 6/7 */
+	REG_GTH_SCR		= 0xc8, /* Source control (storeEn override) */
+	REG_GTH_STAT		= 0xd4, /* GTH status */
+	REG_GTH_SCR2		= 0xd8, /* Source control (force storeEn off) */
+	REG_GTH_DESTOVR		= 0xdc, /* Destination override */
+	REG_GTH_SCRPD0		= 0xe0, /* ScratchPad[0] */
+	REG_GTH_SCRPD1		= 0xe4, /* ScratchPad[1] */
+	REG_GTH_SCRPD2		= 0xe8, /* ScratchPad[2] */
+	REG_GTH_SCRPD3		= 0xec, /* ScratchPad[3] */
+};
+
+/* Externall debugger is using Intel TH */
+#define SCRPD_DEBUGGER_IN_USE	BIT(24)
+
+/* waiting for Pipeline Empty bit(s) to assert for GTH */
+#define GTH_PLE_WAITLOOP_DEPTH	10000
+
+#endif /* __INTEL_TH_GTH_H__ */
diff --git a/drivers/hwtracing/intel_th/intel_th.h b/drivers/hwtracing/intel_th/intel_th.h
new file mode 100644
index 000000000000..57fd72b20fae
--- /dev/null
+++ b/drivers/hwtracing/intel_th/intel_th.h
@@ -0,0 +1,244 @@
+/*
+ * Intel(R) Trace Hub data structures
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __INTEL_TH_H__
+#define __INTEL_TH_H__
+
+/* intel_th_device device types */
+enum {
+	/* Devices that generate trace data */
+	INTEL_TH_SOURCE = 0,
+	/* Output ports (MSC, PTI) */
+	INTEL_TH_OUTPUT,
+	/* Switch, the Global Trace Hub (GTH) */
+	INTEL_TH_SWITCH,
+};
+
+/**
+ * struct intel_th_output - descriptor INTEL_TH_OUTPUT type devices
+ * @port:	output port number, assigned by the switch
+ * @type:	GTH_{MSU,CTP,PTI}
+ * @multiblock:	true for multiblock output configuration
+ * @active:	true when this output is enabled
+ *
+ * Output port descriptor, used by switch driver to tell which output
+ * port this output device corresponds to. Filled in at output device's
+ * probe time by switch::assign(). Passed from output device driver to
+ * switch related code to enable/disable its port.
+ */
+struct intel_th_output {
+	int		port;
+	unsigned int	type;
+	bool		multiblock;
+	bool		active;
+};
+
+/**
+ * struct intel_th_device - device on the intel_th bus
+ * @dev:		device
+ * @resource:		array of resources available to this device
+ * @num_resources:	number of resources in @resource array
+ * @type:		INTEL_TH_{SOURCE,OUTPUT,SWITCH}
+ * @id:			device instance or -1
+ * @output:		output descriptor for INTEL_TH_OUTPUT devices
+ * @name:		device name to match the driver
+ */
+struct intel_th_device {
+	struct device	dev;
+	struct resource	*resource;
+	unsigned int	num_resources;
+	unsigned int	type;
+	int		id;
+
+	/* INTEL_TH_OUTPUT specific */
+	struct intel_th_output	output;
+
+	char		name[];
+};
+
+#define to_intel_th_device(_d)				\
+	container_of((_d), struct intel_th_device, dev)
+
+/**
+ * intel_th_device_get_resource() - obtain @num'th resource of type @type
+ * @thdev:	the device to search the resource for
+ * @type:	resource type
+ * @num:	number of the resource
+ */
+static inline struct resource *
+intel_th_device_get_resource(struct intel_th_device *thdev, unsigned int type,
+			     unsigned int num)
+{
+	int i;
+
+	for (i = 0; i < thdev->num_resources; i++)
+		if (resource_type(&thdev->resource[i]) == type && !num--)
+			return &thdev->resource[i];
+
+	return NULL;
+}
+
+/**
+ * intel_th_output_assigned() - if an output device is assigned to a switch port
+ * @thdev:	the output device
+ *
+ * Return:	true if the device is INTEL_TH_OUTPUT *and* is assigned a port
+ */
+static inline bool
+intel_th_output_assigned(struct intel_th_device *thdev)
+{
+	return thdev->type == INTEL_TH_OUTPUT &&
+		thdev->output.port >= 0;
+}
+
+/**
+ * struct intel_th_driver - driver for an intel_th_device device
+ * @driver:	generic driver
+ * @probe:	probe method
+ * @remove:	remove method
+ * @assign:	match a given output type device against available outputs
+ * @unassign:	deassociate an output type device from an output port
+ * @enable:	enable tracing for a given output device
+ * @disable:	disable tracing for a given output device
+ * @fops:	file operations for device nodes
+ *
+ * Callbacks @probe and @remove are required for all device types.
+ * Switch device driver needs to fill in @assign, @enable and @disable
+ * callbacks.
+ */
+struct intel_th_driver {
+	struct device_driver	driver;
+	int			(*probe)(struct intel_th_device *thdev);
+	void			(*remove)(struct intel_th_device *thdev);
+	/* switch (GTH) ops */
+	int			(*assign)(struct intel_th_device *thdev,
+					  struct intel_th_device *othdev);
+	void			(*unassign)(struct intel_th_device *thdev,
+					    struct intel_th_device *othdev);
+	void			(*enable)(struct intel_th_device *thdev,
+					  struct intel_th_output *output);
+	void			(*disable)(struct intel_th_device *thdev,
+					   struct intel_th_output *output);
+	/* output ops */
+	void			(*irq)(struct intel_th_device *thdev);
+	int			(*activate)(struct intel_th_device *thdev);
+	void			(*deactivate)(struct intel_th_device *thdev);
+	/* file_operations for those who want a device node */
+	const struct file_operations *fops;
+
+	/* source ops */
+	int			(*set_output)(struct intel_th_device *thdev,
+					      unsigned int master);
+};
+
+#define to_intel_th_driver(_d)					\
+	container_of((_d), struct intel_th_driver, driver)
+
+static inline struct intel_th_device *
+to_intel_th_hub(struct intel_th_device *thdev)
+{
+	struct device *parent = thdev->dev.parent;
+
+	if (!parent)
+		return NULL;
+
+	return to_intel_th_device(parent);
+}
+
+struct intel_th *
+intel_th_alloc(struct device *dev, struct resource *devres,
+	       unsigned int ndevres, int irq);
+void intel_th_free(struct intel_th *th);
+
+int intel_th_driver_register(struct intel_th_driver *thdrv);
+void intel_th_driver_unregister(struct intel_th_driver *thdrv);
+
+int intel_th_trace_enable(struct intel_th_device *thdev);
+int intel_th_trace_disable(struct intel_th_device *thdev);
+int intel_th_set_output(struct intel_th_device *thdev,
+			unsigned int master);
+
+enum {
+	TH_MMIO_CONFIG = 0,
+	TH_MMIO_SW = 2,
+	TH_MMIO_END,
+};
+
+#define TH_SUBDEVICE_MAX	6
+#define TH_POSSIBLE_OUTPUTS	8
+#define TH_CONFIGURABLE_MASTERS 256
+#define TH_MSC_MAX		2
+
+/**
+ * struct intel_th - Intel TH controller
+ * @dev:	driver core's device
+ * @thdev:	subdevices
+ * @hub:	"switch" subdevice (GTH)
+ * @id:		this Intel TH controller's device ID in the system
+ * @major:	device node major for output devices
+ */
+struct intel_th {
+	struct device		*dev;
+
+	struct intel_th_device	*thdev[TH_SUBDEVICE_MAX];
+	struct intel_th_device	*hub;
+
+	int			id;
+	int			major;
+#ifdef CONFIG_INTEL_TH_DEBUG
+	struct dentry		*dbg;
+#endif
+};
+
+/*
+ * Register windows
+ */
+enum {
+	/* Global Trace Hub (GTH) */
+	REG_GTH_OFFSET		= 0x0000,
+	REG_GTH_LENGTH		= 0x2000,
+
+	/* Software Trace Hub (STH) [0x4000..0x4fff] */
+	REG_STH_OFFSET		= 0x4000,
+	REG_STH_LENGTH		= 0x2000,
+
+	/* Memory Storage Unit (MSU) [0xa0000..0xa1fff] */
+	REG_MSU_OFFSET		= 0xa0000,
+	REG_MSU_LENGTH		= 0x02000,
+
+	/* Internal MSU trace buffer [0x80000..0x9ffff] */
+	BUF_MSU_OFFSET		= 0x80000,
+	BUF_MSU_LENGTH		= 0x20000,
+
+	/* PTI output == same window as GTH */
+	REG_PTI_OFFSET		= REG_GTH_OFFSET,
+	REG_PTI_LENGTH		= REG_GTH_LENGTH,
+
+	/* DCI Handler (DCIH) == some window as MSU */
+	REG_DCIH_OFFSET		= REG_MSU_OFFSET,
+	REG_DCIH_LENGTH		= REG_MSU_LENGTH,
+};
+
+/*
+ * GTH, output ports configuration
+ */
+enum {
+	GTH_NONE = 0,
+	GTH_MSU,	/* memory/usb */
+	GTH_CTP,	/* Common Trace Port */
+	GTH_PTI = 4,	/* MIPI-PTI */
+};
+
+#endif
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
new file mode 100644
index 000000000000..80a12384ed20
--- /dev/null
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -0,0 +1,1509 @@
+/*
+ * Intel(R) Trace Hub Memory Storage Unit
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/uaccess.h>
+#include <linux/sizes.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/cacheflush.h>
+
+#include "intel_th.h"
+#include "msu.h"
+
+#define msc_dev(x) (&(x)->thdev->dev)
+
+/**
+ * struct msc_block - multiblock mode block descriptor
+ * @bdesc:	pointer to hardware descriptor (beginning of the block)
+ * @addr:	physical address of the block
+ */
+struct msc_block {
+	struct msc_block_desc	*bdesc;
+	dma_addr_t		addr;
+};
+
+/**
+ * struct msc_window - multiblock mode window descriptor
+ * @entry:	window list linkage (msc::win_list)
+ * @pgoff:	page offset into the buffer that this window starts at
+ * @nr_blocks:	number of blocks (pages) in this window
+ * @block:	array of block descriptors
+ */
+struct msc_window {
+	struct list_head	entry;
+	unsigned long		pgoff;
+	unsigned int		nr_blocks;
+	struct msc		*msc;
+	struct msc_block	block[0];
+};
+
+/**
+ * struct msc_iter - iterator for msc buffer
+ * @entry:		msc::iter_list linkage
+ * @msc:		pointer to the MSC device
+ * @start_win:		oldest window
+ * @win:		current window
+ * @offset:		current logical offset into the buffer
+ * @start_block:	oldest block in the window
+ * @block:		block number in the window
+ * @block_off:		offset into current block
+ * @wrap_count:		block wrapping handling
+ * @eof:		end of buffer reached
+ */
+struct msc_iter {
+	struct list_head	entry;
+	struct msc		*msc;
+	struct msc_window	*start_win;
+	struct msc_window	*win;
+	unsigned long		offset;
+	int			start_block;
+	int			block;
+	unsigned int		block_off;
+	unsigned int		wrap_count;
+	unsigned int		eof;
+};
+
+/**
+ * struct msc - MSC device representation
+ * @reg_base:		register window base address
+ * @thdev:		intel_th_device pointer
+ * @win_list:		list of windows in multiblock mode
+ * @nr_pages:		total number of pages allocated for this buffer
+ * @single_sz:		amount of data in single mode
+ * @single_wrap:	single mode wrap occurred
+ * @base:		buffer's base pointer
+ * @base_addr:		buffer's base address
+ * @user_count:		number of users of the buffer
+ * @mmap_count:		number of mappings
+ * @buf_mutex:		mutex to serialize access to buffer-related bits
+
+ * @enabled:		MSC is enabled
+ * @wrap:		wrapping is enabled
+ * @mode:		MSC operating mode
+ * @burst_len:		write burst length
+ * @index:		number of this MSC in the MSU
+ */
+struct msc {
+	void __iomem		*reg_base;
+	struct intel_th_device	*thdev;
+
+	struct list_head	win_list;
+	unsigned long		nr_pages;
+	unsigned long		single_sz;
+	unsigned int		single_wrap : 1;
+	void			*base;
+	dma_addr_t		base_addr;
+
+	/* <0: no buffer, 0: no users, >0: active users */
+	atomic_t		user_count;
+
+	atomic_t		mmap_count;
+	struct mutex		buf_mutex;
+
+	struct mutex		iter_mutex;
+	struct list_head	iter_list;
+
+	/* config */
+	unsigned int		enabled : 1,
+				wrap	: 1;
+	unsigned int		mode;
+	unsigned int		burst_len;
+	unsigned int		index;
+};
+
+static inline bool msc_block_is_empty(struct msc_block_desc *bdesc)
+{
+	/* header hasn't been written */
+	if (!bdesc->valid_dw)
+		return true;
+
+	/* valid_dw includes the header */
+	if (!msc_data_sz(bdesc))
+		return true;
+
+	return false;
+}
+
+/**
+ * msc_oldest_window() - locate the window with oldest data
+ * @msc:	MSC device
+ *
+ * This should only be used in multiblock mode. Caller should hold the
+ * msc::user_count reference.
+ *
+ * Return:	the oldest window with valid data
+ */
+static struct msc_window *msc_oldest_window(struct msc *msc)
+{
+	struct msc_window *win;
+	u32 reg = ioread32(msc->reg_base + REG_MSU_MSC0NWSA);
+	unsigned long win_addr = (unsigned long)reg << PAGE_SHIFT;
+	unsigned int found = 0;
+
+	if (list_empty(&msc->win_list))
+		return NULL;
+
+	/*
+	 * we might need a radix tree for this, depending on how
+	 * many windows a typical user would allocate; ideally it's
+	 * something like 2, in which case we're good
+	 */
+	list_for_each_entry(win, &msc->win_list, entry) {
+		if (win->block[0].addr == win_addr)
+			found++;
+
+		/* skip the empty ones */
+		if (msc_block_is_empty(win->block[0].bdesc))
+			continue;
+
+		if (found)
+			return win;
+	}
+
+	return list_entry(msc->win_list.next, struct msc_window, entry);
+}
+
+/**
+ * msc_win_oldest_block() - locate the oldest block in a given window
+ * @win:	window to look at
+ *
+ * Return:	index of the block with the oldest data
+ */
+static unsigned int msc_win_oldest_block(struct msc_window *win)
+{
+	unsigned int blk;
+	struct msc_block_desc *bdesc = win->block[0].bdesc;
+
+	/* without wrapping, first block is the oldest */
+	if (!msc_block_wrapped(bdesc))
+		return 0;
+
+	/*
+	 * with wrapping, last written block contains both the newest and the
+	 * oldest data for this window.
+	 */
+	for (blk = 0; blk < win->nr_blocks; blk++) {
+		bdesc = win->block[blk].bdesc;
+
+		if (msc_block_last_written(bdesc))
+			return blk;
+	}
+
+	return 0;
+}
+
+/**
+ * msc_is_last_win() - check if a window is the last one for a given MSC
+ * @win:	window
+ * Return:	true if @win is the last window in MSC's multiblock buffer
+ */
+static inline bool msc_is_last_win(struct msc_window *win)
+{
+	return win->entry.next == &win->msc->win_list;
+}
+
+/**
+ * msc_next_window() - return next window in the multiblock buffer
+ * @win:	current window
+ *
+ * Return:	window following the current one
+ */
+static struct msc_window *msc_next_window(struct msc_window *win)
+{
+	if (msc_is_last_win(win))
+		return list_entry(win->msc->win_list.next, struct msc_window,
+				  entry);
+
+	return list_entry(win->entry.next, struct msc_window, entry);
+}
+
+static struct msc_block_desc *msc_iter_bdesc(struct msc_iter *iter)
+{
+	return iter->win->block[iter->block].bdesc;
+}
+
+static void msc_iter_init(struct msc_iter *iter)
+{
+	memset(iter, 0, sizeof(*iter));
+	iter->start_block = -1;
+	iter->block = -1;
+}
+
+static struct msc_iter *msc_iter_install(struct msc *msc)
+{
+	struct msc_iter *iter;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	msc_iter_init(iter);
+	iter->msc = msc;
+
+	mutex_lock(&msc->iter_mutex);
+	list_add_tail(&iter->entry, &msc->iter_list);
+	mutex_unlock(&msc->iter_mutex);
+
+	return iter;
+}
+
+static void msc_iter_remove(struct msc_iter *iter, struct msc *msc)
+{
+	mutex_lock(&msc->iter_mutex);
+	list_del(&iter->entry);
+	mutex_unlock(&msc->iter_mutex);
+
+	kfree(iter);
+}
+
+static void msc_iter_block_start(struct msc_iter *iter)
+{
+	if (iter->start_block != -1)
+		return;
+
+	iter->start_block = msc_win_oldest_block(iter->win);
+	iter->block = iter->start_block;
+	iter->wrap_count = 0;
+
+	/*
+	 * start with the block with oldest data; if data has wrapped
+	 * in this window, it should be in this block
+	 */
+	if (msc_block_wrapped(msc_iter_bdesc(iter)))
+		iter->wrap_count = 2;
+
+}
+
+static int msc_iter_win_start(struct msc_iter *iter, struct msc *msc)
+{
+	/* already started, nothing to do */
+	if (iter->start_win)
+		return 0;
+
+	iter->start_win = msc_oldest_window(msc);
+	if (!iter->start_win)
+		return -EINVAL;
+
+	iter->win = iter->start_win;
+	iter->start_block = -1;
+
+	msc_iter_block_start(iter);
+
+	return 0;
+}
+
+static int msc_iter_win_advance(struct msc_iter *iter)
+{
+	iter->win = msc_next_window(iter->win);
+	iter->start_block = -1;
+
+	if (iter->win == iter->start_win) {
+		iter->eof++;
+		return 1;
+	}
+
+	msc_iter_block_start(iter);
+
+	return 0;
+}
+
+static int msc_iter_block_advance(struct msc_iter *iter)
+{
+	iter->block_off = 0;
+
+	/* wrapping */
+	if (iter->wrap_count && iter->block == iter->start_block) {
+		iter->wrap_count--;
+		if (!iter->wrap_count)
+			/* copied newest data from the wrapped block */
+			return msc_iter_win_advance(iter);
+	}
+
+	/* no wrapping, check for last written block */
+	if (!iter->wrap_count && msc_block_last_written(msc_iter_bdesc(iter)))
+		/* copied newest data for the window */
+		return msc_iter_win_advance(iter);
+
+	/* block advance */
+	if (++iter->block == iter->win->nr_blocks)
+		iter->block = 0;
+
+	/* no wrapping, sanity check in case there is no last written block */
+	if (!iter->wrap_count && iter->block == iter->start_block)
+		return msc_iter_win_advance(iter);
+
+	return 0;
+}
+
+/**
+ * msc_buffer_iterate() - go through multiblock buffer's data
+ * @iter:	iterator structure
+ * @size:	amount of data to scan
+ * @data:	callback's private data
+ * @fn:		iterator callback
+ *
+ * This will start at the window which will be written to next (containing
+ * the oldest data) and work its way to the current window, calling @fn
+ * for each chunk of data as it goes.
+ *
+ * Caller should have msc::user_count reference to make sure the buffer
+ * doesn't disappear from under us.
+ *
+ * Return:	amount of data actually scanned.
+ */
+static ssize_t
+msc_buffer_iterate(struct msc_iter *iter, size_t size, void *data,
+		   unsigned long (*fn)(void *, void *, size_t))
+{
+	struct msc *msc = iter->msc;
+	size_t len = size;
+	unsigned int advance;
+
+	if (iter->eof)
+		return 0;
+
+	/* start with the oldest window */
+	if (msc_iter_win_start(iter, msc))
+		return 0;
+
+	do {
+		unsigned long data_bytes = msc_data_sz(msc_iter_bdesc(iter));
+		void *src = (void *)msc_iter_bdesc(iter) + MSC_BDESC;
+		size_t tocopy = data_bytes, copied = 0;
+		size_t remaining = 0;
+
+		advance = 1;
+
+		/*
+		 * If block wrapping happened, we need to visit the last block
+		 * twice, because it contains both the oldest and the newest
+		 * data in this window.
+		 *
+		 * First time (wrap_count==2), in the very beginning, to collect
+		 * the oldest data, which is in the range
+		 * (data_bytes..DATA_IN_PAGE).
+		 *
+		 * Second time (wrap_count==1), it's just like any other block,
+		 * containing data in the range of [MSC_BDESC..data_bytes].
+		 */
+		if (iter->block == iter->start_block && iter->wrap_count) {
+			tocopy = DATA_IN_PAGE - data_bytes;
+			src += data_bytes;
+		}
+
+		if (!tocopy)
+			goto next_block;
+
+		tocopy -= iter->block_off;
+		src += iter->block_off;
+
+		if (len < tocopy) {
+			tocopy = len;
+			advance = 0;
+		}
+
+		remaining = fn(data, src, tocopy);
+
+		if (remaining)
+			advance = 0;
+
+		copied = tocopy - remaining;
+		len -= copied;
+		iter->block_off += copied;
+		iter->offset += copied;
+
+		if (!advance)
+			break;
+
+next_block:
+		if (msc_iter_block_advance(iter))
+			break;
+
+	} while (len);
+
+	return size - len;
+}
+
+/**
+ * msc_buffer_clear_hw_header() - clear hw header for multiblock
+ * @msc:	MSC device
+ */
+static void msc_buffer_clear_hw_header(struct msc *msc)
+{
+	struct msc_window *win;
+
+	mutex_lock(&msc->buf_mutex);
+	list_for_each_entry(win, &msc->win_list, entry) {
+		unsigned int blk;
+		size_t hw_sz = sizeof(struct msc_block_desc) -
+			offsetof(struct msc_block_desc, hw_tag);
+
+		for (blk = 0; blk < win->nr_blocks; blk++) {
+			struct msc_block_desc *bdesc = win->block[blk].bdesc;
+
+			memset(&bdesc->hw_tag, 0, hw_sz);
+		}
+	}
+	mutex_unlock(&msc->buf_mutex);
+}
+
+/**
+ * msc_configure() - set up MSC hardware
+ * @msc:	the MSC device to configure
+ *
+ * Program storage mode, wrapping, burst length and trace buffer address
+ * into a given MSC. If msc::enabled is set, enable the trace, too.
+ */
+static int msc_configure(struct msc *msc)
+{
+	u32 reg;
+
+	if (msc->mode > MSC_MODE_MULTI)
+		return -ENOTSUPP;
+
+	if (msc->mode == MSC_MODE_MULTI)
+		msc_buffer_clear_hw_header(msc);
+
+	reg = msc->base_addr >> PAGE_SHIFT;
+	iowrite32(reg, msc->reg_base + REG_MSU_MSC0BAR);
+
+	if (msc->mode == MSC_MODE_SINGLE) {
+		reg = msc->nr_pages;
+		iowrite32(reg, msc->reg_base + REG_MSU_MSC0SIZE);
+	}
+
+	reg = ioread32(msc->reg_base + REG_MSU_MSC0CTL);
+	reg &= ~(MSC_MODE | MSC_WRAPEN | MSC_EN | MSC_RD_HDR_OVRD);
+
+	reg |= msc->mode << __ffs(MSC_MODE);
+	reg |= msc->burst_len << __ffs(MSC_LEN);
+	/*if (msc->mode == MSC_MODE_MULTI)
+	  reg |= MSC_RD_HDR_OVRD; */
+	if (msc->wrap)
+		reg |= MSC_WRAPEN;
+	if (msc->enabled)
+		reg |= MSC_EN;
+
+	iowrite32(reg, msc->reg_base + REG_MSU_MSC0CTL);
+
+	if (msc->enabled) {
+		msc->thdev->output.multiblock = msc->mode == MSC_MODE_MULTI;
+		intel_th_trace_enable(msc->thdev);
+	}
+
+	return 0;
+}
+
+/**
+ * msc_disable() - disable MSC hardware
+ * @msc:	MSC device to disable
+ *
+ * If @msc is enabled, disable tracing on the switch and then disable MSC
+ * storage.
+ */
+static void msc_disable(struct msc *msc)
+{
+	unsigned long count;
+	u32 reg;
+
+	if (!msc->enabled)
+		return;
+
+	intel_th_trace_disable(msc->thdev);
+
+	for (reg = 0, count = MSC_PLE_WAITLOOP_DEPTH;
+	     count && !(reg & MSCSTS_PLE); count--) {
+		reg = ioread32(msc->reg_base + REG_MSU_MSC0STS);
+		cpu_relax();
+	}
+
+	if (!count)
+		dev_dbg(msc_dev(msc), "timeout waiting for MSC0 PLE\n");
+
+	if (msc->mode == MSC_MODE_SINGLE) {
+		msc->single_wrap = !!(reg & MSCSTS_WRAPSTAT);
+
+		reg = ioread32(msc->reg_base + REG_MSU_MSC0MWP);
+		msc->single_sz = reg & ((msc->nr_pages << PAGE_SHIFT) - 1);
+		dev_dbg(msc_dev(msc), "MSCnMWP: %08x/%08lx, wrap: %d\n",
+			reg, msc->single_sz, msc->single_wrap);
+	}
+
+	reg = ioread32(msc->reg_base + REG_MSU_MSC0CTL);
+	reg &= ~MSC_EN;
+	iowrite32(reg, msc->reg_base + REG_MSU_MSC0CTL);
+	msc->enabled = 0;
+
+	iowrite32(0, msc->reg_base + REG_MSU_MSC0BAR);
+	iowrite32(0, msc->reg_base + REG_MSU_MSC0SIZE);
+
+	dev_dbg(msc_dev(msc), "MSCnNWSA: %08x\n",
+		ioread32(msc->reg_base + REG_MSU_MSC0NWSA));
+
+	reg = ioread32(msc->reg_base + REG_MSU_MSC0STS);
+	dev_dbg(msc_dev(msc), "MSCnSTS: %08x\n", reg);
+}
+
+static int intel_th_msc_activate(struct intel_th_device *thdev)
+{
+	struct msc *msc = dev_get_drvdata(&thdev->dev);
+	int ret = 0;
+
+	if (!atomic_inc_unless_negative(&msc->user_count))
+		return -ENODEV;
+
+	mutex_lock(&msc->iter_mutex);
+	if (!list_empty(&msc->iter_list))
+		ret = -EBUSY;
+	mutex_unlock(&msc->iter_mutex);
+
+	if (ret) {
+		atomic_dec(&msc->user_count);
+		return ret;
+	}
+
+	msc->enabled = 1;
+
+	return msc_configure(msc);
+}
+
+static void intel_th_msc_deactivate(struct intel_th_device *thdev)
+{
+	struct msc *msc = dev_get_drvdata(&thdev->dev);
+
+	msc_disable(msc);
+
+	atomic_dec(&msc->user_count);
+}
+
+/**
+ * msc_buffer_contig_alloc() - allocate a contiguous buffer for SINGLE mode
+ * @msc:	MSC device
+ * @size:	allocation size in bytes
+ *
+ * This modifies msc::base, which requires msc::buf_mutex to serialize, so the
+ * caller is expected to hold it.
+ *
+ * Return:	0 on success, -errno otherwise.
+ */
+static int msc_buffer_contig_alloc(struct msc *msc, unsigned long size)
+{
+	unsigned int order = get_order(size);
+	struct page *page;
+
+	if (!size)
+		return 0;
+
+	page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!page)
+		return -ENOMEM;
+
+	split_page(page, order);
+	msc->nr_pages = size >> PAGE_SHIFT;
+	msc->base = page_address(page);
+	msc->base_addr = page_to_phys(page);
+
+	return 0;
+}
+
+/**
+ * msc_buffer_contig_free() - free a contiguous buffer
+ * @msc:	MSC configured in SINGLE mode
+ */
+static void msc_buffer_contig_free(struct msc *msc)
+{
+	unsigned long off;
+
+	for (off = 0; off < msc->nr_pages << PAGE_SHIFT; off += PAGE_SIZE) {
+		struct page *page = virt_to_page(msc->base + off);
+
+		page->mapping = NULL;
+		__free_page(page);
+	}
+
+	msc->nr_pages = 0;
+}
+
+/**
+ * msc_buffer_contig_get_page() - find a page at a given offset
+ * @msc:	MSC configured in SINGLE mode
+ * @pgoff:	page offset
+ *
+ * Return:	page, if @pgoff is within the range, NULL otherwise.
+ */
+static struct page *msc_buffer_contig_get_page(struct msc *msc,
+					       unsigned long pgoff)
+{
+	if (pgoff >= msc->nr_pages)
+		return NULL;
+
+	return virt_to_page(msc->base + (pgoff << PAGE_SHIFT));
+}
+
+/**
+ * msc_buffer_win_alloc() - alloc a window for a multiblock mode
+ * @msc:	MSC device
+ * @nr_blocks:	number of pages in this window
+ *
+ * This modifies msc::win_list and msc::base, which requires msc::buf_mutex
+ * to serialize, so the caller is expected to hold it.
+ *
+ * Return:	0 on success, -errno otherwise.
+ */
+static int msc_buffer_win_alloc(struct msc *msc, unsigned int nr_blocks)
+{
+	struct msc_window *win;
+	unsigned long size = PAGE_SIZE;
+	int i, ret = -ENOMEM;
+
+	if (!nr_blocks)
+		return 0;
+
+	win = kzalloc(offsetof(struct msc_window, block[nr_blocks]),
+		      GFP_KERNEL);
+	if (!win)
+		return -ENOMEM;
+
+	if (!list_empty(&msc->win_list)) {
+		struct msc_window *prev = list_entry(msc->win_list.prev,
+						     struct msc_window, entry);
+
+		win->pgoff = prev->pgoff + prev->nr_blocks;
+	}
+
+	for (i = 0; i < nr_blocks; i++) {
+		win->block[i].bdesc = dma_alloc_coherent(msc_dev(msc), size,
+							 &win->block[i].addr,
+							 GFP_KERNEL);
+
+#ifdef CONFIG_X86
+		/* Set the page as uncached */
+		set_memory_uc((unsigned long)win->block[i].bdesc, 1);
+#endif
+
+		if (!win->block[i].bdesc)
+			goto err_nomem;
+	}
+
+	win->msc = msc;
+	win->nr_blocks = nr_blocks;
+
+	if (list_empty(&msc->win_list)) {
+		msc->base = win->block[0].bdesc;
+		msc->base_addr = win->block[0].addr;
+	}
+
+	list_add_tail(&win->entry, &msc->win_list);
+	msc->nr_pages += nr_blocks;
+
+	return 0;
+
+err_nomem:
+	for (i--; i >= 0; i--) {
+#ifdef CONFIG_X86
+		/* Reset the page to write-back before releasing */
+		set_memory_wb((unsigned long)win->block[i].bdesc, 1);
+#endif
+		dma_free_coherent(msc_dev(msc), size, win->block[i].bdesc,
+				  win->block[i].addr);
+	}
+	kfree(win);
+
+	return ret;
+}
+
+/**
+ * msc_buffer_win_free() - free a window from MSC's window list
+ * @msc:	MSC device
+ * @win:	window to free
+ *
+ * This modifies msc::win_list and msc::base, which requires msc::buf_mutex
+ * to serialize, so the caller is expected to hold it.
+ */
+static void msc_buffer_win_free(struct msc *msc, struct msc_window *win)
+{
+	int i;
+
+	msc->nr_pages -= win->nr_blocks;
+
+	list_del(&win->entry);
+	if (list_empty(&msc->win_list)) {
+		msc->base = NULL;
+		msc->base_addr = 0;
+	}
+
+	for (i = 0; i < win->nr_blocks; i++) {
+		struct page *page = virt_to_page(win->block[i].bdesc);
+
+		page->mapping = NULL;
+#ifdef CONFIG_X86
+		/* Reset the page to write-back before releasing */
+		set_memory_wb((unsigned long)win->block[i].bdesc, 1);
+#endif
+		dma_free_coherent(msc_dev(win->msc), PAGE_SIZE,
+				  win->block[i].bdesc, win->block[i].addr);
+	}
+
+	kfree(win);
+}
+
+/**
+ * msc_buffer_relink() - set up block descriptors for multiblock mode
+ * @msc:	MSC device
+ *
+ * This traverses msc::win_list, which requires msc::buf_mutex to serialize,
+ * so the caller is expected to hold it.
+ */
+static void msc_buffer_relink(struct msc *msc)
+{
+	struct msc_window *win, *next_win;
+
+	/* call with msc::mutex locked */
+	list_for_each_entry(win, &msc->win_list, entry) {
+		unsigned int blk;
+		u32 sw_tag = 0;
+
+		/*
+		 * Last window's next_win should point to the first window
+		 * and MSC_SW_TAG_LASTWIN should be set.
+		 */
+		if (msc_is_last_win(win)) {
+			sw_tag |= MSC_SW_TAG_LASTWIN;
+			next_win = list_entry(msc->win_list.next,
+					      struct msc_window, entry);
+		} else {
+			next_win = list_entry(win->entry.next,
+					      struct msc_window, entry);
+		}
+
+		for (blk = 0; blk < win->nr_blocks; blk++) {
+			struct msc_block_desc *bdesc = win->block[blk].bdesc;
+
+			memset(bdesc, 0, sizeof(*bdesc));
+
+			bdesc->next_win = next_win->block[0].addr >> PAGE_SHIFT;
+
+			/*
+			 * Similarly to last window, last block should point
+			 * to the first one.
+			 */
+			if (blk == win->nr_blocks - 1) {
+				sw_tag |= MSC_SW_TAG_LASTBLK;
+				bdesc->next_blk =
+					win->block[0].addr >> PAGE_SHIFT;
+			} else {
+				bdesc->next_blk =
+					win->block[blk + 1].addr >> PAGE_SHIFT;
+			}
+
+			bdesc->sw_tag = sw_tag;
+			bdesc->block_sz = PAGE_SIZE / 64;
+		}
+	}
+
+	/*
+	 * Make the above writes globally visible before tracing is
+	 * enabled to make sure hardware sees them coherently.
+	 */
+	wmb();
+}
+
+static void msc_buffer_multi_free(struct msc *msc)
+{
+	struct msc_window *win, *iter;
+
+	list_for_each_entry_safe(win, iter, &msc->win_list, entry)
+		msc_buffer_win_free(msc, win);
+}
+
+static int msc_buffer_multi_alloc(struct msc *msc, unsigned long *nr_pages,
+				  unsigned int nr_wins)
+{
+	int ret, i;
+
+	for (i = 0; i < nr_wins; i++) {
+		ret = msc_buffer_win_alloc(msc, nr_pages[i]);
+		if (ret) {
+			msc_buffer_multi_free(msc);
+			return ret;
+		}
+	}
+
+	msc_buffer_relink(msc);
+
+	return 0;
+}
+
+/**
+ * msc_buffer_free() - free buffers for MSC
+ * @msc:	MSC device
+ *
+ * Free MSC's storage buffers.
+ *
+ * This modifies msc::win_list and msc::base, which requires msc::buf_mutex to
+ * serialize, so the caller is expected to hold it.
+ */
+static void msc_buffer_free(struct msc *msc)
+{
+	if (msc->mode == MSC_MODE_SINGLE)
+		msc_buffer_contig_free(msc);
+	else if (msc->mode == MSC_MODE_MULTI)
+		msc_buffer_multi_free(msc);
+}
+
+/**
+ * msc_buffer_alloc() - allocate a buffer for MSC
+ * @msc:	MSC device
+ * @size:	allocation size in bytes
+ *
+ * Allocate a storage buffer for MSC, depending on the msc::mode, it will be
+ * either done via msc_buffer_contig_alloc() for SINGLE operation mode or
+ * msc_buffer_win_alloc() for multiblock operation. The latter allocates one
+ * window per invocation, so in multiblock mode this can be called multiple
+ * times for the same MSC to allocate multiple windows.
+ *
+ * This modifies msc::win_list and msc::base, which requires msc::buf_mutex
+ * to serialize, so the caller is expected to hold it.
+ *
+ * Return:	0 on success, -errno otherwise.
+ */
+static int msc_buffer_alloc(struct msc *msc, unsigned long *nr_pages,
+			    unsigned int nr_wins)
+{
+	int ret;
+
+	/* -1: buffer not allocated */
+	if (atomic_read(&msc->user_count) != -1)
+		return -EBUSY;
+
+	if (msc->mode == MSC_MODE_SINGLE) {
+		if (nr_wins != 1)
+			return -EINVAL;
+
+		ret = msc_buffer_contig_alloc(msc, nr_pages[0] << PAGE_SHIFT);
+	} else if (msc->mode == MSC_MODE_MULTI) {
+		ret = msc_buffer_multi_alloc(msc, nr_pages, nr_wins);
+	} else {
+		ret = -ENOTSUPP;
+	}
+
+	if (!ret) {
+		/* allocation should be visible before the counter goes to 0 */
+		smp_mb__before_atomic();
+
+		if (WARN_ON_ONCE(atomic_cmpxchg(&msc->user_count, -1, 0) != -1))
+			return -EINVAL;
+	}
+
+	return ret;
+}
+
+/**
+ * msc_buffer_unlocked_free_unless_used() - free a buffer unless it's in use
+ * @msc:	MSC device
+ *
+ * This will free MSC buffer unless it is in use or there is no allocated
+ * buffer.
+ * Caller needs to hold msc::buf_mutex.
+ *
+ * Return:	0 on successful deallocation or if there was no buffer to
+ *		deallocate, -EBUSY if there are active users.
+ */
+static int msc_buffer_unlocked_free_unless_used(struct msc *msc)
+{
+	int count, ret = 0;
+
+	count = atomic_cmpxchg(&msc->user_count, 0, -1);
+
+	/* > 0: buffer is allocated and has users */
+	if (count > 0)
+		ret = -EBUSY;
+	/* 0: buffer is allocated, no users */
+	else if (!count)
+		msc_buffer_free(msc);
+	/* < 0: no buffer, nothing to do */
+
+	return ret;
+}
+
+/**
+ * msc_buffer_free_unless_used() - free a buffer unless it's in use
+ * @msc:	MSC device
+ *
+ * This is a locked version of msc_buffer_unlocked_free_unless_used().
+ */
+static int msc_buffer_free_unless_used(struct msc *msc)
+{
+	int ret;
+
+	mutex_lock(&msc->buf_mutex);
+	ret = msc_buffer_unlocked_free_unless_used(msc);
+	mutex_unlock(&msc->buf_mutex);
+
+	return ret;
+}
+
+/**
+ * msc_buffer_get_page() - get MSC buffer page at a given offset
+ * @msc:	MSC device
+ * @pgoff:	page offset into the storage buffer
+ *
+ * This traverses msc::win_list, so holding msc::buf_mutex is expected from
+ * the caller.
+ *
+ * Return:	page if @pgoff corresponds to a valid buffer page or NULL.
+ */
+static struct page *msc_buffer_get_page(struct msc *msc, unsigned long pgoff)
+{
+	struct msc_window *win;
+
+	if (msc->mode == MSC_MODE_SINGLE)
+		return msc_buffer_contig_get_page(msc, pgoff);
+
+	list_for_each_entry(win, &msc->win_list, entry)
+		if (pgoff >= win->pgoff && pgoff < win->pgoff + win->nr_blocks)
+			goto found;
+
+	return NULL;
+
+found:
+	pgoff -= win->pgoff;
+	return virt_to_page(win->block[pgoff].bdesc);
+}
+
+/**
+ * struct msc_win_to_user_struct - data for copy_to_user() callback
+ * @buf:	userspace buffer to copy data to
+ * @offset:	running offset
+ */
+struct msc_win_to_user_struct {
+	char __user	*buf;
+	unsigned long	offset;
+};
+
+/**
+ * msc_win_to_user() - iterator for msc_buffer_iterate() to copy data to user
+ * @data:	callback's private data
+ * @src:	source buffer
+ * @len:	amount of data to copy from the source buffer
+ */
+static unsigned long msc_win_to_user(void *data, void *src, size_t len)
+{
+	struct msc_win_to_user_struct *u = data;
+	unsigned long ret;
+
+	ret = copy_to_user(u->buf + u->offset, src, len);
+	u->offset += len - ret;
+
+	return ret;
+}
+
+
+/*
+ * file operations' callbacks
+ */
+
+static int intel_th_msc_open(struct inode *inode, struct file *file)
+{
+	struct intel_th_device *thdev = file->private_data;
+	struct msc *msc = dev_get_drvdata(&thdev->dev);
+	struct msc_iter *iter;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	iter = msc_iter_install(msc);
+	if (!iter)
+		return -ENOMEM;
+
+	file->private_data = iter;
+
+	return nonseekable_open(inode, file);
+}
+
+static int intel_th_msc_release(struct inode *inode, struct file *file)
+{
+	struct msc_iter *iter = file->private_data;
+	struct msc *msc = iter->msc;
+
+	msc_iter_remove(iter, msc);
+
+	return 0;
+}
+
+static ssize_t
+msc_single_to_user(struct msc *msc, char __user *buf, loff_t off, size_t len)
+{
+	unsigned long size = msc->nr_pages << PAGE_SHIFT, rem = len;
+	unsigned long start = off, tocopy = 0;
+
+	if (msc->single_wrap) {
+		start += msc->single_sz;
+		if (start < size) {
+			tocopy = min(rem, size - start);
+			if (copy_to_user(buf, msc->base + start, tocopy))
+				return -EFAULT;
+
+			buf += tocopy;
+			rem -= tocopy;
+			start += tocopy;
+		}
+
+		start &= size - 1;
+		if (rem) {
+			tocopy = min(rem, msc->single_sz - start);
+			if (copy_to_user(buf, msc->base + start, tocopy))
+				return -EFAULT;
+
+			rem -= tocopy;
+		}
+
+		return len - rem;
+	}
+
+	if (copy_to_user(buf, msc->base + start, rem))
+		return -EFAULT;
+
+	return len;
+}
+
+static ssize_t intel_th_msc_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	struct msc_iter *iter = file->private_data;
+	struct msc *msc = iter->msc;
+	size_t size;
+	loff_t off = *ppos;
+	ssize_t ret = 0;
+
+	if (!atomic_inc_unless_negative(&msc->user_count))
+		return 0;
+
+	if (msc->enabled) {
+		ret = -EBUSY;
+		goto put_count;
+	}
+
+	if (msc->mode == MSC_MODE_SINGLE && !msc->single_wrap)
+		size = msc->single_sz;
+	else
+		size = msc->nr_pages << PAGE_SHIFT;
+
+	if (!size)
+		return 0;
+
+	if (off >= size) {
+		len = 0;
+		goto put_count;
+	}
+	if (off + len >= size)
+		len = size - off;
+
+	if (msc->mode == MSC_MODE_SINGLE) {
+		ret = msc_single_to_user(msc, buf, off, len);
+		if (ret >= 0)
+			*ppos += ret;
+	} else if (msc->mode == MSC_MODE_MULTI) {
+		struct msc_win_to_user_struct u = {
+			.buf	= buf,
+			.offset	= 0,
+		};
+
+		ret = msc_buffer_iterate(iter, len, &u, msc_win_to_user);
+		if (ret >= 0)
+			*ppos = iter->offset;
+	} else {
+		ret = -ENOTSUPP;
+	}
+
+put_count:
+	atomic_dec(&msc->user_count);
+
+	return ret;
+}
+
+/*
+ * vm operations callbacks (vm_ops)
+ */
+
+static void msc_mmap_open(struct vm_area_struct *vma)
+{
+	struct msc_iter *iter = vma->vm_file->private_data;
+	struct msc *msc = iter->msc;
+
+	atomic_inc(&msc->mmap_count);
+}
+
+static void msc_mmap_close(struct vm_area_struct *vma)
+{
+	struct msc_iter *iter = vma->vm_file->private_data;
+	struct msc *msc = iter->msc;
+	unsigned long pg;
+
+	if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex))
+		return;
+
+	/* drop page _counts */
+	for (pg = 0; pg < msc->nr_pages; pg++) {
+		struct page *page = msc_buffer_get_page(msc, pg);
+
+		if (WARN_ON_ONCE(!page))
+			continue;
+
+		if (page->mapping)
+			page->mapping = NULL;
+	}
+
+	/* last mapping -- drop user_count */
+	atomic_dec(&msc->user_count);
+	mutex_unlock(&msc->buf_mutex);
+}
+
+static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct msc_iter *iter = vma->vm_file->private_data;
+	struct msc *msc = iter->msc;
+
+	vmf->page = msc_buffer_get_page(msc, vmf->pgoff);
+	if (!vmf->page)
+		return VM_FAULT_SIGBUS;
+
+	get_page(vmf->page);
+	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->index = vmf->pgoff;
+
+	return 0;
+}
+
+static const struct vm_operations_struct msc_mmap_ops = {
+	.open	= msc_mmap_open,
+	.close	= msc_mmap_close,
+	.fault	= msc_mmap_fault,
+};
+
+static int intel_th_msc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct msc_iter *iter = vma->vm_file->private_data;
+	struct msc *msc = iter->msc;
+	int ret = -EINVAL;
+
+	if (!size || offset_in_page(size))
+		return -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	/* grab user_count once per mmap; drop in msc_mmap_close() */
+	if (!atomic_inc_unless_negative(&msc->user_count))
+		return -EINVAL;
+
+	if (msc->mode != MSC_MODE_SINGLE &&
+	    msc->mode != MSC_MODE_MULTI)
+		goto out;
+
+	if (size >> PAGE_SHIFT != msc->nr_pages)
+		goto out;
+
+	atomic_set(&msc->mmap_count, 1);
+	ret = 0;
+
+out:
+	if (ret)
+		atomic_dec(&msc->user_count);
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTCOPY;
+	vma->vm_ops = &msc_mmap_ops;
+	return ret;
+}
+
+static const struct file_operations intel_th_msc_fops = {
+	.open		= intel_th_msc_open,
+	.release	= intel_th_msc_release,
+	.read		= intel_th_msc_read,
+	.mmap		= intel_th_msc_mmap,
+	.llseek		= no_llseek,
+};
+
+static int intel_th_msc_init(struct msc *msc)
+{
+	atomic_set(&msc->user_count, -1);
+
+	msc->mode = MSC_MODE_MULTI;
+	mutex_init(&msc->buf_mutex);
+	INIT_LIST_HEAD(&msc->win_list);
+
+	mutex_init(&msc->iter_mutex);
+	INIT_LIST_HEAD(&msc->iter_list);
+
+	msc->burst_len =
+		(ioread32(msc->reg_base + REG_MSU_MSC0CTL) & MSC_LEN) >>
+		__ffs(MSC_LEN);
+
+	return 0;
+}
+
+static const char * const msc_mode[] = {
+	[MSC_MODE_SINGLE]	= "single",
+	[MSC_MODE_MULTI]	= "multi",
+	[MSC_MODE_EXI]		= "ExI",
+	[MSC_MODE_DEBUG]	= "debug",
+};
+
+static ssize_t
+wrap_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct msc *msc = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", msc->wrap);
+}
+
+static ssize_t
+wrap_store(struct device *dev, struct device_attribute *attr, const char *buf,
+	   size_t size)
+{
+	struct msc *msc = dev_get_drvdata(dev);
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	msc->wrap = !!val;
+
+	return size;
+}
+
+static DEVICE_ATTR_RW(wrap);
+
+static ssize_t
+mode_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct msc *msc = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", msc_mode[msc->mode]);
+}
+
+static ssize_t
+mode_store(struct device *dev, struct device_attribute *attr, const char *buf,
+	   size_t size)
+{
+	struct msc *msc = dev_get_drvdata(dev);
+	size_t len = size;
+	char *cp;
+	int i, ret;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	cp = memchr(buf, '\n', len);
+	if (cp)
+		len = cp - buf;
+
+	for (i = 0; i < ARRAY_SIZE(msc_mode); i++)
+		if (!strncmp(msc_mode[i], buf, len))
+			goto found;
+
+	return -EINVAL;
+
+found:
+	mutex_lock(&msc->buf_mutex);
+	ret = msc_buffer_unlocked_free_unless_used(msc);
+	if (!ret)
+		msc->mode = i;
+	mutex_unlock(&msc->buf_mutex);
+
+	return ret ? ret : size;
+}
+
+static DEVICE_ATTR_RW(mode);
+
+static ssize_t
+nr_pages_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct msc *msc = dev_get_drvdata(dev);
+	struct msc_window *win;
+	size_t count = 0;
+
+	mutex_lock(&msc->buf_mutex);
+
+	if (msc->mode == MSC_MODE_SINGLE)
+		count = scnprintf(buf, PAGE_SIZE, "%ld\n", msc->nr_pages);
+	else if (msc->mode == MSC_MODE_MULTI) {
+		list_for_each_entry(win, &msc->win_list, entry) {
+			count += scnprintf(buf + count, PAGE_SIZE - count,
+					   "%d%c", win->nr_blocks,
+					   msc_is_last_win(win) ? '\n' : ',');
+		}
+	} else {
+		count = scnprintf(buf, PAGE_SIZE, "unsupported\n");
+	}
+
+	mutex_unlock(&msc->buf_mutex);
+
+	return count;
+}
+
+static ssize_t
+nr_pages_store(struct device *dev, struct device_attribute *attr,
+	       const char *buf, size_t size)
+{
+	struct msc *msc = dev_get_drvdata(dev);
+	unsigned long val, *win = NULL, *rewin;
+	size_t len = size;
+	const char *p = buf;
+	char *end, *s;
+	int ret, nr_wins = 0;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	ret = msc_buffer_free_unless_used(msc);
+	if (ret)
+		return ret;
+
+	/* scan the comma-separated list of allocation sizes */
+	end = memchr(buf, '\n', len);
+	if (end)
+		len = end - buf;
+
+	do {
+		end = memchr(p, ',', len);
+		s = kstrndup(p, end ? end - p : len, GFP_KERNEL);
+		ret = kstrtoul(s, 10, &val);
+		kfree(s);
+
+		if (ret || !val)
+			goto free_win;
+
+		if (nr_wins && msc->mode == MSC_MODE_SINGLE) {
+			ret = -EINVAL;
+			goto free_win;
+		}
+
+		nr_wins++;
+		rewin = krealloc(win, sizeof(*win) * nr_wins, GFP_KERNEL);
+		if (!rewin) {
+			kfree(win);
+			return -ENOMEM;
+		}
+
+		win = rewin;
+		win[nr_wins - 1] = val;
+
+		if (!end)
+			break;
+
+		len -= end - p;
+		p = end + 1;
+	} while (len);
+
+	mutex_lock(&msc->buf_mutex);
+	ret = msc_buffer_alloc(msc, win, nr_wins);
+	mutex_unlock(&msc->buf_mutex);
+
+free_win:
+	kfree(win);
+
+	return ret ? ret : size;
+}
+
+static DEVICE_ATTR_RW(nr_pages);
+
+static struct attribute *msc_output_attrs[] = {
+	&dev_attr_wrap.attr,
+	&dev_attr_mode.attr,
+	&dev_attr_nr_pages.attr,
+	NULL,
+};
+
+static struct attribute_group msc_output_group = {
+	.attrs	= msc_output_attrs,
+};
+
+static int intel_th_msc_probe(struct intel_th_device *thdev)
+{
+	struct device *dev = &thdev->dev;
+	struct resource *res;
+	struct msc *msc;
+	void __iomem *base;
+	int err;
+
+	res = intel_th_device_get_resource(thdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	base = devm_ioremap(dev, res->start, resource_size(res));
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	msc = devm_kzalloc(dev, sizeof(*msc), GFP_KERNEL);
+	if (!msc)
+		return -ENOMEM;
+
+	msc->index = thdev->id;
+
+	msc->thdev = thdev;
+	msc->reg_base = base + msc->index * 0x100;
+
+	err = intel_th_msc_init(msc);
+	if (err)
+		return err;
+
+	err = sysfs_create_group(&dev->kobj, &msc_output_group);
+	if (err)
+		return err;
+
+	dev_set_drvdata(dev, msc);
+
+	return 0;
+}
+
+static void intel_th_msc_remove(struct intel_th_device *thdev)
+{
+	sysfs_remove_group(&thdev->dev.kobj, &msc_output_group);
+}
+
+static struct intel_th_driver intel_th_msc_driver = {
+	.probe	= intel_th_msc_probe,
+	.remove	= intel_th_msc_remove,
+	.activate	= intel_th_msc_activate,
+	.deactivate	= intel_th_msc_deactivate,
+	.fops	= &intel_th_msc_fops,
+	.driver	= {
+		.name	= "msc",
+		.owner	= THIS_MODULE,
+	},
+};
+
+module_driver(intel_th_msc_driver,
+	      intel_th_driver_register,
+	      intel_th_driver_unregister);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel(R) Trace Hub Memory Storage Unit driver");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/intel_th/msu.h b/drivers/hwtracing/intel_th/msu.h
new file mode 100644
index 000000000000..9b710e4aa98a
--- /dev/null
+++ b/drivers/hwtracing/intel_th/msu.h
@@ -0,0 +1,116 @@
+/*
+ * Intel(R) Trace Hub Memory Storage Unit (MSU) data structures
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __INTEL_TH_MSU_H__
+#define __INTEL_TH_MSU_H__
+
+enum {
+	REG_MSU_MSUPARAMS	= 0x0000,
+	REG_MSU_MSUSTS		= 0x0008,
+	REG_MSU_MSC0CTL		= 0x0100, /* MSC0 control */
+	REG_MSU_MSC0STS		= 0x0104, /* MSC0 status */
+	REG_MSU_MSC0BAR		= 0x0108, /* MSC0 output base address */
+	REG_MSU_MSC0SIZE	= 0x010c, /* MSC0 output size */
+	REG_MSU_MSC0MWP		= 0x0110, /* MSC0 write pointer */
+	REG_MSU_MSC0NWSA	= 0x011c, /* MSC0 next window start address */
+
+	REG_MSU_MSC1CTL		= 0x0200, /* MSC1 control */
+	REG_MSU_MSC1STS		= 0x0204, /* MSC1 status */
+	REG_MSU_MSC1BAR		= 0x0208, /* MSC1 output base address */
+	REG_MSU_MSC1SIZE	= 0x020c, /* MSC1 output size */
+	REG_MSU_MSC1MWP		= 0x0210, /* MSC1 write pointer */
+	REG_MSU_MSC1NWSA	= 0x021c, /* MSC1 next window start address */
+};
+
+/* MSUSTS bits */
+#define MSUSTS_MSU_INT	BIT(0)
+
+/* MSCnCTL bits */
+#define MSC_EN		BIT(0)
+#define MSC_WRAPEN	BIT(1)
+#define MSC_RD_HDR_OVRD	BIT(2)
+#define MSC_MODE	(BIT(4) | BIT(5))
+#define MSC_LEN		(BIT(8) | BIT(9) | BIT(10))
+
+/* MSC operating modes (MSC_MODE) */
+enum {
+	MSC_MODE_SINGLE	= 0,
+	MSC_MODE_MULTI,
+	MSC_MODE_EXI,
+	MSC_MODE_DEBUG,
+};
+
+/* MSCnSTS bits */
+#define MSCSTS_WRAPSTAT	BIT(1)	/* Wrap occurred */
+#define MSCSTS_PLE	BIT(2)	/* Pipeline Empty */
+
+/*
+ * Multiblock/multiwindow block descriptor
+ */
+struct msc_block_desc {
+	u32	sw_tag;
+	u32	block_sz;
+	u32	next_blk;
+	u32	next_win;
+	u32	res0[4];
+	u32	hw_tag;
+	u32	valid_dw;
+	u32	ts_low;
+	u32	ts_high;
+	u32	res1[4];
+} __packed;
+
+#define MSC_BDESC	sizeof(struct msc_block_desc)
+#define DATA_IN_PAGE	(PAGE_SIZE - MSC_BDESC)
+
+/* MSC multiblock sw tag bits */
+#define MSC_SW_TAG_LASTBLK	BIT(0)
+#define MSC_SW_TAG_LASTWIN	BIT(1)
+
+/* MSC multiblock hw tag bits */
+#define MSC_HW_TAG_TRIGGER	BIT(0)
+#define MSC_HW_TAG_BLOCKWRAP	BIT(1)
+#define MSC_HW_TAG_WINWRAP	BIT(2)
+#define MSC_HW_TAG_ENDBIT	BIT(3)
+
+static inline unsigned long msc_data_sz(struct msc_block_desc *bdesc)
+{
+	if (!bdesc->valid_dw)
+		return 0;
+
+	return bdesc->valid_dw * 4 - MSC_BDESC;
+}
+
+static inline bool msc_block_wrapped(struct msc_block_desc *bdesc)
+{
+	if (bdesc->hw_tag & MSC_HW_TAG_BLOCKWRAP)
+		return true;
+
+	return false;
+}
+
+static inline bool msc_block_last_written(struct msc_block_desc *bdesc)
+{
+	if ((bdesc->hw_tag & MSC_HW_TAG_ENDBIT) ||
+	    (msc_data_sz(bdesc) != DATA_IN_PAGE))
+		return true;
+
+	return false;
+}
+
+/* waiting for Pipeline Empty bit(s) to assert for MSC */
+#define MSC_PLE_WAITLOOP_DEPTH	10000
+
+#endif /* __INTEL_TH_MSU_H__ */
diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c
new file mode 100644
index 000000000000..641e87936064
--- /dev/null
+++ b/drivers/hwtracing/intel_th/pci.c
@@ -0,0 +1,86 @@
+/*
+ * Intel(R) Trace Hub pci driver
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sysfs.h>
+#include <linux/pci.h>
+
+#include "intel_th.h"
+
+#define DRIVER_NAME "intel_th_pci"
+
+#define BAR_MASK (BIT(TH_MMIO_CONFIG) | BIT(TH_MMIO_SW))
+
+static int intel_th_pci_probe(struct pci_dev *pdev,
+			      const struct pci_device_id *id)
+{
+	struct intel_th *th;
+	int err;
+
+	err = pcim_enable_device(pdev);
+	if (err)
+		return err;
+
+	err = pcim_iomap_regions_request_all(pdev, BAR_MASK, DRIVER_NAME);
+	if (err)
+		return err;
+
+	th = intel_th_alloc(&pdev->dev, pdev->resource,
+			    DEVICE_COUNT_RESOURCE, pdev->irq);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	pci_set_drvdata(pdev, th);
+
+	return 0;
+}
+
+static void intel_th_pci_remove(struct pci_dev *pdev)
+{
+	struct intel_th *th = pci_get_drvdata(pdev);
+
+	intel_th_free(th);
+}
+
+static const struct pci_device_id intel_th_pci_id_table[] = {
+	{
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x9d26),
+		.driver_data = (kernel_ulong_t)0,
+	},
+	{
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa126),
+		.driver_data = (kernel_ulong_t)0,
+	},
+	{ 0 },
+};
+
+MODULE_DEVICE_TABLE(pci, intel_th_pci_id_table);
+
+static struct pci_driver intel_th_pci_driver = {
+	.name		= DRIVER_NAME,
+	.id_table	= intel_th_pci_id_table,
+	.probe		= intel_th_pci_probe,
+	.remove		= intel_th_pci_remove,
+};
+
+module_pci_driver(intel_th_pci_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel(R) Trace Hub PCI controller driver");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@intel.com>");
diff --git a/drivers/hwtracing/intel_th/pti.c b/drivers/hwtracing/intel_th/pti.c
new file mode 100644
index 000000000000..1e3bbc89825c
--- /dev/null
+++ b/drivers/hwtracing/intel_th/pti.c
@@ -0,0 +1,252 @@
+/*
+ * Intel(R) Trace Hub PTI output driver
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+
+#include "intel_th.h"
+#include "pti.h"
+
+struct pti_device {
+	void __iomem		*base;
+	struct intel_th_device	*thdev;
+	unsigned int		mode;
+	unsigned int		freeclk;
+	unsigned int		clkdiv;
+	unsigned int		patgen;
+};
+
+/* map PTI widths to MODE settings of PTI_CTL register */
+static const unsigned int pti_mode[] = {
+	0, 4, 8, 0, 12, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static int pti_width_mode(unsigned int width)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pti_mode); i++)
+		if (pti_mode[i] == width)
+			return i;
+
+	return -EINVAL;
+}
+
+static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct pti_device *pti = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", pti_mode[pti->mode]);
+}
+
+static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t size)
+{
+	struct pti_device *pti = dev_get_drvdata(dev);
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	ret = pti_width_mode(val);
+	if (ret < 0)
+		return ret;
+
+	pti->mode = ret;
+
+	return size;
+}
+
+static DEVICE_ATTR_RW(mode);
+
+static ssize_t
+freerunning_clock_show(struct device *dev, struct device_attribute *attr,
+		       char *buf)
+{
+	struct pti_device *pti = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", pti->freeclk);
+}
+
+static ssize_t
+freerunning_clock_store(struct device *dev, struct device_attribute *attr,
+			const char *buf, size_t size)
+{
+	struct pti_device *pti = dev_get_drvdata(dev);
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	pti->freeclk = !!val;
+
+	return size;
+}
+
+static DEVICE_ATTR_RW(freerunning_clock);
+
+static ssize_t
+clock_divider_show(struct device *dev, struct device_attribute *attr,
+		   char *buf)
+{
+	struct pti_device *pti = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", 1u << pti->clkdiv);
+}
+
+static ssize_t
+clock_divider_store(struct device *dev, struct device_attribute *attr,
+		    const char *buf, size_t size)
+{
+	struct pti_device *pti = dev_get_drvdata(dev);
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	if (!is_power_of_2(val) || val > 8 || !val)
+		return -EINVAL;
+
+	pti->clkdiv = val;
+
+	return size;
+}
+
+static DEVICE_ATTR_RW(clock_divider);
+
+static struct attribute *pti_output_attrs[] = {
+	&dev_attr_mode.attr,
+	&dev_attr_freerunning_clock.attr,
+	&dev_attr_clock_divider.attr,
+	NULL,
+};
+
+static struct attribute_group pti_output_group = {
+	.attrs	= pti_output_attrs,
+};
+
+static int intel_th_pti_activate(struct intel_th_device *thdev)
+{
+	struct pti_device *pti = dev_get_drvdata(&thdev->dev);
+	u32 ctl = PTI_EN;
+
+	if (pti->patgen)
+		ctl |= pti->patgen << __ffs(PTI_PATGENMODE);
+	if (pti->freeclk)
+		ctl |= PTI_FCEN;
+	ctl |= pti->mode << __ffs(PTI_MODE);
+	ctl |= pti->clkdiv << __ffs(PTI_CLKDIV);
+
+	iowrite32(ctl, pti->base + REG_PTI_CTL);
+
+	intel_th_trace_enable(thdev);
+
+	return 0;
+}
+
+static void intel_th_pti_deactivate(struct intel_th_device *thdev)
+{
+	struct pti_device *pti = dev_get_drvdata(&thdev->dev);
+
+	intel_th_trace_disable(thdev);
+
+	iowrite32(0, pti->base + REG_PTI_CTL);
+}
+
+static void read_hw_config(struct pti_device *pti)
+{
+	u32 ctl = ioread32(pti->base + REG_PTI_CTL);
+
+	pti->mode	= (ctl & PTI_MODE) >> __ffs(PTI_MODE);
+	pti->clkdiv	= (ctl & PTI_CLKDIV) >> __ffs(PTI_CLKDIV);
+	pti->freeclk	= !!(ctl & PTI_FCEN);
+
+	if (!pti_mode[pti->mode])
+		pti->mode = pti_width_mode(4);
+	if (!pti->clkdiv)
+		pti->clkdiv = 1;
+}
+
+static int intel_th_pti_probe(struct intel_th_device *thdev)
+{
+	struct device *dev = &thdev->dev;
+	struct resource *res;
+	struct pti_device *pti;
+	void __iomem *base;
+	int ret;
+
+	res = intel_th_device_get_resource(thdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	base = devm_ioremap(dev, res->start, resource_size(res));
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	pti = devm_kzalloc(dev, sizeof(*pti), GFP_KERNEL);
+	if (!pti)
+		return -ENOMEM;
+
+	pti->thdev = thdev;
+	pti->base = base;
+
+	read_hw_config(pti);
+
+	ret = sysfs_create_group(&dev->kobj, &pti_output_group);
+	if (ret)
+		return ret;
+
+	dev_set_drvdata(dev, pti);
+
+	return 0;
+}
+
+static void intel_th_pti_remove(struct intel_th_device *thdev)
+{
+}
+
+static struct intel_th_driver intel_th_pti_driver = {
+	.probe	= intel_th_pti_probe,
+	.remove	= intel_th_pti_remove,
+	.activate	= intel_th_pti_activate,
+	.deactivate	= intel_th_pti_deactivate,
+	.driver	= {
+		.name	= "pti",
+		.owner	= THIS_MODULE,
+	},
+};
+
+module_driver(intel_th_pti_driver,
+	      intel_th_driver_register,
+	      intel_th_driver_unregister);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel(R) Trace Hub PTI output driver");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/intel_th/pti.h b/drivers/hwtracing/intel_th/pti.h
new file mode 100644
index 000000000000..20883f5628cf
--- /dev/null
+++ b/drivers/hwtracing/intel_th/pti.h
@@ -0,0 +1,29 @@
+/*
+ * Intel(R) Trace Hub PTI output data structures
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __INTEL_TH_STH_H__
+#define __INTEL_TH_STH_H__
+
+enum {
+	REG_PTI_CTL	= 0x1c00,
+};
+
+#define PTI_EN		BIT(0)
+#define PTI_FCEN	BIT(1)
+#define PTI_MODE	0xf0
+#define PTI_CLKDIV	0x000f0000
+#define PTI_PATGENMODE	0x00f00000
+
+#endif /* __INTEL_TH_STH_H__ */
diff --git a/drivers/hwtracing/intel_th/sth.c b/drivers/hwtracing/intel_th/sth.c
new file mode 100644
index 000000000000..e488fccbfdec
--- /dev/null
+++ b/drivers/hwtracing/intel_th/sth.c
@@ -0,0 +1,259 @@
+/*
+ * Intel(R) Trace Hub Software Trace Hub support
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/stm.h>
+
+#include "intel_th.h"
+#include "sth.h"
+
+struct sth_device {
+	void __iomem	*base;
+	void __iomem	*channels;
+	phys_addr_t	channels_phys;
+	struct device	*dev;
+	struct stm_data	stm;
+	unsigned int	sw_nmasters;
+};
+
+static struct intel_th_channel __iomem *
+sth_channel(struct sth_device *sth, unsigned int master, unsigned int channel)
+{
+	struct intel_th_channel __iomem *sw_map = sth->channels;
+
+	return &sw_map[(master - sth->stm.sw_start) * sth->stm.sw_nchannels +
+		       channel];
+}
+
+static void sth_iowrite(void __iomem *dest, const unsigned char *payload,
+			unsigned int size)
+{
+	switch (size) {
+#ifdef CONFIG_64BIT
+	case 8:
+		writeq_relaxed(*(u64 *)payload, dest);
+		break;
+#endif
+	case 4:
+		writel_relaxed(*(u32 *)payload, dest);
+		break;
+	case 2:
+		writew_relaxed(*(u16 *)payload, dest);
+		break;
+	case 1:
+		writeb_relaxed(*(u8 *)payload, dest);
+		break;
+	default:
+		break;
+	}
+}
+
+static ssize_t sth_stm_packet(struct stm_data *stm_data, unsigned int master,
+			      unsigned int channel, unsigned int packet,
+			      unsigned int flags, unsigned int size,
+			      const unsigned char *payload)
+{
+	struct sth_device *sth = container_of(stm_data, struct sth_device, stm);
+	struct intel_th_channel __iomem *out =
+		sth_channel(sth, master, channel);
+	u64 __iomem *outp = &out->Dn;
+	unsigned long reg = REG_STH_TRIG;
+
+#ifndef CONFIG_64BIT
+	if (size > 4)
+		size = 4;
+#endif
+
+	size = rounddown_pow_of_two(size);
+
+	switch (packet) {
+	/* Global packets (GERR, XSYNC, TRIG) are sent with register writes */
+	case STP_PACKET_GERR:
+		reg += 4;
+	case STP_PACKET_XSYNC:
+		reg += 8;
+	case STP_PACKET_TRIG:
+		if (flags & STP_PACKET_TIMESTAMPED)
+			reg += 4;
+		iowrite8(*payload, sth->base + reg);
+		break;
+
+	case STP_PACKET_MERR:
+		sth_iowrite(&out->MERR, payload, size);
+		break;
+
+	case STP_PACKET_FLAG:
+		if (flags & STP_PACKET_TIMESTAMPED)
+			outp = (u64 __iomem *)&out->FLAG_TS;
+		else
+			outp = (u64 __iomem *)&out->FLAG;
+
+		size = 1;
+		sth_iowrite(outp, payload, size);
+		break;
+
+	case STP_PACKET_USER:
+		if (flags & STP_PACKET_TIMESTAMPED)
+			outp = &out->USER_TS;
+		else
+			outp = &out->USER;
+		sth_iowrite(outp, payload, size);
+		break;
+
+	case STP_PACKET_DATA:
+		outp = &out->Dn;
+
+		if (flags & STP_PACKET_TIMESTAMPED)
+			outp += 2;
+		if (flags & STP_PACKET_MARKED)
+			outp++;
+
+		sth_iowrite(outp, payload, size);
+		break;
+	}
+
+	return size;
+}
+
+static phys_addr_t
+sth_stm_mmio_addr(struct stm_data *stm_data, unsigned int master,
+		  unsigned int channel, unsigned int nr_chans)
+{
+	struct sth_device *sth = container_of(stm_data, struct sth_device, stm);
+	phys_addr_t addr;
+
+	master -= sth->stm.sw_start;
+	addr = sth->channels_phys + (master * sth->stm.sw_nchannels + channel) *
+		sizeof(struct intel_th_channel);
+
+	if (offset_in_page(addr) ||
+	    offset_in_page(nr_chans * sizeof(struct intel_th_channel)))
+		return 0;
+
+	return addr;
+}
+
+static int sth_stm_link(struct stm_data *stm_data, unsigned int master,
+			 unsigned int channel)
+{
+	struct sth_device *sth = container_of(stm_data, struct sth_device, stm);
+
+	intel_th_set_output(to_intel_th_device(sth->dev), master);
+
+	return 0;
+}
+
+static int intel_th_sw_init(struct sth_device *sth)
+{
+	u32 reg;
+
+	reg = ioread32(sth->base + REG_STH_STHCAP1);
+	sth->stm.sw_nchannels = reg & 0xff;
+
+	reg = ioread32(sth->base + REG_STH_STHCAP0);
+	sth->stm.sw_start = reg & 0xffff;
+	sth->stm.sw_end = reg >> 16;
+
+	sth->sw_nmasters = sth->stm.sw_end - sth->stm.sw_start;
+	dev_dbg(sth->dev, "sw_start: %x sw_end: %x masters: %x nchannels: %x\n",
+		sth->stm.sw_start, sth->stm.sw_end, sth->sw_nmasters,
+		sth->stm.sw_nchannels);
+
+	return 0;
+}
+
+static int intel_th_sth_probe(struct intel_th_device *thdev)
+{
+	struct device *dev = &thdev->dev;
+	struct sth_device *sth;
+	struct resource *res;
+	void __iomem *base, *channels;
+	int err;
+
+	res = intel_th_device_get_resource(thdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	base = devm_ioremap(dev, res->start, resource_size(res));
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	res = intel_th_device_get_resource(thdev, IORESOURCE_MEM, 1);
+	if (!res)
+		return -ENODEV;
+
+	channels = devm_ioremap(dev, res->start, resource_size(res));
+	if (IS_ERR(channels))
+		return PTR_ERR(channels);
+
+	sth = devm_kzalloc(dev, sizeof(*sth), GFP_KERNEL);
+	if (!sth)
+		return -ENOMEM;
+
+	sth->dev = dev;
+	sth->base = base;
+	sth->channels = channels;
+	sth->channels_phys = res->start;
+	sth->stm.name = dev_name(dev);
+	sth->stm.packet = sth_stm_packet;
+	sth->stm.mmio_addr = sth_stm_mmio_addr;
+	sth->stm.sw_mmiosz = sizeof(struct intel_th_channel);
+	sth->stm.link = sth_stm_link;
+
+	err = intel_th_sw_init(sth);
+	if (err)
+		return err;
+
+	err = stm_register_device(dev, &sth->stm, THIS_MODULE);
+	if (err) {
+		dev_err(dev, "stm_register_device failed\n");
+		return err;
+	}
+
+	dev_set_drvdata(dev, sth);
+
+	return 0;
+}
+
+static void intel_th_sth_remove(struct intel_th_device *thdev)
+{
+	struct sth_device *sth = dev_get_drvdata(&thdev->dev);
+
+	stm_unregister_device(&sth->stm);
+}
+
+static struct intel_th_driver intel_th_sth_driver = {
+	.probe	= intel_th_sth_probe,
+	.remove	= intel_th_sth_remove,
+	.driver	= {
+		.name	= "sth",
+		.owner	= THIS_MODULE,
+	},
+};
+
+module_driver(intel_th_sth_driver,
+	      intel_th_driver_register,
+	      intel_th_driver_unregister);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel(R) Trace Hub Software Trace Hub driver");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@intel.com>");
diff --git a/drivers/hwtracing/intel_th/sth.h b/drivers/hwtracing/intel_th/sth.h
new file mode 100644
index 000000000000..f1390cd4f2ed
--- /dev/null
+++ b/drivers/hwtracing/intel_th/sth.h
@@ -0,0 +1,42 @@
+/*
+ * Intel(R) Trace Hub Software Trace Hub (STH) data structures
+ *
+ * Copyright (C) 2014-2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __INTEL_TH_STH_H__
+#define __INTEL_TH_STH_H__
+
+enum {
+	REG_STH_STHCAP0		= 0x0000, /* capabilities pt1 */
+	REG_STH_STHCAP1		= 0x0004, /* capabilities pt2 */
+	REG_STH_TRIG		= 0x0008, /* TRIG packet payload */
+	REG_STH_TRIG_TS		= 0x000c, /* TRIG_TS packet payload */
+	REG_STH_XSYNC		= 0x0010, /* XSYNC packet payload */
+	REG_STH_XSYNC_TS	= 0x0014, /* XSYNC_TS packet payload */
+	REG_STH_GERR		= 0x0018, /* GERR packet payload */
+};
+
+struct intel_th_channel {
+	u64	Dn;
+	u64	DnM;
+	u64	DnTS;
+	u64	DnMTS;
+	u64	USER;
+	u64	USER_TS;
+	u32	FLAG;
+	u32	FLAG_TS;
+	u32	MERR;
+	u32	__unused;
+} __packed;
+
+#endif /* __INTEL_TH_STH_H__ */
diff --git a/drivers/hwtracing/stm/Kconfig b/drivers/hwtracing/stm/Kconfig
new file mode 100644
index 000000000000..5a59a28cc3da
--- /dev/null
+++ b/drivers/hwtracing/stm/Kconfig
@@ -0,0 +1,25 @@
+config STM
+	tristate "System Trace Module devices"
+	help
+	  A System Trace Module (STM) is a device exporting data in System
+	  Trace Protocol (STP) format as defined by MIPI STP standards.
+	  Examples of such devices are Intel(R) Trace Hub and Coresight STM.
+
+	  Say Y here to enable System Trace Module device support.
+
+config STM_DUMMY
+	tristate "Dummy STM driver"
+	help
+	  This is a simple dummy device that pretends to be an stm device
+	  and discards your data. Use for stm class testing.
+
+	  If you don't know what this is, say N.
+
+config STM_SOURCE_CONSOLE
+	tristate "Kernel console over STM devices"
+	help
+	  This is a kernel space trace source that sends kernel log
+	  messages to trace hosts over STM devices.
+
+	  If you want to send kernel console messages over STM devices,
+	  say Y.
diff --git a/drivers/hwtracing/stm/Makefile b/drivers/hwtracing/stm/Makefile
new file mode 100644
index 000000000000..f9312c38dd7a
--- /dev/null
+++ b/drivers/hwtracing/stm/Makefile
@@ -0,0 +1,9 @@
+obj-$(CONFIG_STM)	+= stm_core.o
+
+stm_core-y		:= core.o policy.o
+
+obj-$(CONFIG_STM_DUMMY)	+= dummy_stm.o
+
+obj-$(CONFIG_STM_SOURCE_CONSOLE)	+= stm_console.o
+
+stm_console-y		:= console.o
diff --git a/drivers/hwtracing/stm/console.c b/drivers/hwtracing/stm/console.c
new file mode 100644
index 000000000000..c9d9a8d2ff52
--- /dev/null
+++ b/drivers/hwtracing/stm/console.c
@@ -0,0 +1,80 @@
+/*
+ * Simple kernel console driver for STM devices
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * STM console will send kernel messages over STM devices to a trace host.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/console.h>
+#include <linux/slab.h>
+#include <linux/stm.h>
+
+static int stm_console_link(struct stm_source_data *data);
+static void stm_console_unlink(struct stm_source_data *data);
+
+static struct stm_console {
+	struct stm_source_data	data;
+	struct console		console;
+} stm_console = {
+	.data	= {
+		.name		= "console",
+		.nr_chans	= 1,
+		.link		= stm_console_link,
+		.unlink		= stm_console_unlink,
+	},
+};
+
+static void
+stm_console_write(struct console *con, const char *buf, unsigned len)
+{
+	struct stm_console *sc = container_of(con, struct stm_console, console);
+
+	stm_source_write(&sc->data, 0, buf, len);
+}
+
+static int stm_console_link(struct stm_source_data *data)
+{
+	struct stm_console *sc = container_of(data, struct stm_console, data);
+
+	strcpy(sc->console.name, "stm_console");
+	sc->console.write = stm_console_write;
+	sc->console.flags = CON_ENABLED | CON_PRINTBUFFER;
+	register_console(&sc->console);
+
+	return 0;
+}
+
+static void stm_console_unlink(struct stm_source_data *data)
+{
+	struct stm_console *sc = container_of(data, struct stm_console, data);
+
+	unregister_console(&sc->console);
+}
+
+static int stm_console_init(void)
+{
+	return stm_source_register_device(NULL, &stm_console.data);
+}
+
+static void stm_console_exit(void)
+{
+	stm_source_unregister_device(&stm_console.data);
+}
+
+module_init(stm_console_init);
+module_exit(stm_console_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("stm_console driver");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c
new file mode 100644
index 000000000000..b6445d9e5453
--- /dev/null
+++ b/drivers/hwtracing/stm/core.c
@@ -0,0 +1,1032 @@
+/*
+ * System Trace Module (STM) infrastructure
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * STM class implements generic infrastructure for  System Trace Module devices
+ * as defined in MIPI STPv2 specification.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/compat.h>
+#include <linux/kdev_t.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/stm.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "stm.h"
+
+#include <uapi/linux/stm.h>
+
+static unsigned int stm_core_up;
+
+/*
+ * The SRCU here makes sure that STM device doesn't disappear from under a
+ * stm_source_write() caller, which may want to have as little overhead as
+ * possible.
+ */
+static struct srcu_struct stm_source_srcu;
+
+static ssize_t masters_show(struct device *dev,
+			    struct device_attribute *attr,
+			    char *buf)
+{
+	struct stm_device *stm = to_stm_device(dev);
+	int ret;
+
+	ret = sprintf(buf, "%u %u\n", stm->data->sw_start, stm->data->sw_end);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RO(masters);
+
+static ssize_t channels_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct stm_device *stm = to_stm_device(dev);
+	int ret;
+
+	ret = sprintf(buf, "%u\n", stm->data->sw_nchannels);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RO(channels);
+
+static struct attribute *stm_attrs[] = {
+	&dev_attr_masters.attr,
+	&dev_attr_channels.attr,
+	NULL,
+};
+
+ATTRIBUTE_GROUPS(stm);
+
+static struct class stm_class = {
+	.name		= "stm",
+	.dev_groups	= stm_groups,
+};
+
+static int stm_dev_match(struct device *dev, const void *data)
+{
+	const char *name = data;
+
+	return sysfs_streq(name, dev_name(dev));
+}
+
+/**
+ * stm_find_device() - find stm device by name
+ * @buf:	character buffer containing the name
+ *
+ * This is called when either policy gets assigned to an stm device or an
+ * stm_source device gets linked to an stm device.
+ *
+ * This grabs device's reference (get_device()) and module reference, both
+ * of which the calling path needs to make sure to drop with stm_put_device().
+ *
+ * Return:	stm device pointer or null if lookup failed.
+ */
+struct stm_device *stm_find_device(const char *buf)
+{
+	struct stm_device *stm;
+	struct device *dev;
+
+	if (!stm_core_up)
+		return NULL;
+
+	dev = class_find_device(&stm_class, NULL, buf, stm_dev_match);
+	if (!dev)
+		return NULL;
+
+	stm = to_stm_device(dev);
+	if (!try_module_get(stm->owner)) {
+		put_device(dev);
+		return NULL;
+	}
+
+	return stm;
+}
+
+/**
+ * stm_put_device() - drop references on the stm device
+ * @stm:	stm device, previously acquired by stm_find_device()
+ *
+ * This drops the module reference and device reference taken by
+ * stm_find_device().
+ */
+void stm_put_device(struct stm_device *stm)
+{
+	module_put(stm->owner);
+	put_device(&stm->dev);
+}
+
+/*
+ * Internally we only care about software-writable masters here, that is the
+ * ones in the range [stm_data->sw_start..stm_data..sw_end], however we need
+ * original master numbers to be visible externally, since they are the ones
+ * that will appear in the STP stream. Thus, the internal bookkeeping uses
+ * $master - stm_data->sw_start to reference master descriptors and such.
+ */
+
+#define __stm_master(_s, _m)				\
+	((_s)->masters[(_m) - (_s)->data->sw_start])
+
+static inline struct stp_master *
+stm_master(struct stm_device *stm, unsigned int idx)
+{
+	if (idx < stm->data->sw_start || idx > stm->data->sw_end)
+		return NULL;
+
+	return __stm_master(stm, idx);
+}
+
+static int stp_master_alloc(struct stm_device *stm, unsigned int idx)
+{
+	struct stp_master *master;
+	size_t size;
+
+	size = ALIGN(stm->data->sw_nchannels, 8) / 8;
+	size += sizeof(struct stp_master);
+	master = kzalloc(size, GFP_ATOMIC);
+	if (!master)
+		return -ENOMEM;
+
+	master->nr_free = stm->data->sw_nchannels;
+	__stm_master(stm, idx) = master;
+
+	return 0;
+}
+
+static void stp_master_free(struct stm_device *stm, unsigned int idx)
+{
+	struct stp_master *master = stm_master(stm, idx);
+
+	if (!master)
+		return;
+
+	__stm_master(stm, idx) = NULL;
+	kfree(master);
+}
+
+static void stm_output_claim(struct stm_device *stm, struct stm_output *output)
+{
+	struct stp_master *master = stm_master(stm, output->master);
+
+	if (WARN_ON_ONCE(master->nr_free < output->nr_chans))
+		return;
+
+	bitmap_allocate_region(&master->chan_map[0], output->channel,
+			       ilog2(output->nr_chans));
+
+	master->nr_free -= output->nr_chans;
+}
+
+static void
+stm_output_disclaim(struct stm_device *stm, struct stm_output *output)
+{
+	struct stp_master *master = stm_master(stm, output->master);
+
+	bitmap_release_region(&master->chan_map[0], output->channel,
+			      ilog2(output->nr_chans));
+
+	output->nr_chans = 0;
+	master->nr_free += output->nr_chans;
+}
+
+/*
+ * This is like bitmap_find_free_region(), except it can ignore @start bits
+ * at the beginning.
+ */
+static int find_free_channels(unsigned long *bitmap, unsigned int start,
+			      unsigned int end, unsigned int width)
+{
+	unsigned int pos;
+	int i;
+
+	for (pos = start; pos < end + 1; pos = ALIGN(pos, width)) {
+		pos = find_next_zero_bit(bitmap, end + 1, pos);
+		if (pos + width > end + 1)
+			break;
+
+		if (pos & (width - 1))
+			continue;
+
+		for (i = 1; i < width && !test_bit(pos + i, bitmap); i++)
+			;
+		if (i == width)
+			return pos;
+	}
+
+	return -1;
+}
+
+static unsigned int
+stm_find_master_chan(struct stm_device *stm, unsigned int width,
+		     unsigned int *mstart, unsigned int mend,
+		     unsigned int *cstart, unsigned int cend)
+{
+	struct stp_master *master;
+	unsigned int midx;
+	int pos, err;
+
+	for (midx = *mstart; midx <= mend; midx++) {
+		if (!stm_master(stm, midx)) {
+			err = stp_master_alloc(stm, midx);
+			if (err)
+				return err;
+		}
+
+		master = stm_master(stm, midx);
+
+		if (!master->nr_free)
+			continue;
+
+		pos = find_free_channels(master->chan_map, *cstart, cend,
+					 width);
+		if (pos < 0)
+			continue;
+
+		*mstart = midx;
+		*cstart = pos;
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+static int stm_output_assign(struct stm_device *stm, unsigned int width,
+			     struct stp_policy_node *policy_node,
+			     struct stm_output *output)
+{
+	unsigned int midx, cidx, mend, cend;
+	int ret = -EINVAL;
+
+	if (width > stm->data->sw_nchannels)
+		return -EINVAL;
+
+	if (policy_node) {
+		stp_policy_node_get_ranges(policy_node,
+					   &midx, &mend, &cidx, &cend);
+	} else {
+		midx = stm->data->sw_start;
+		cidx = 0;
+		mend = stm->data->sw_end;
+		cend = stm->data->sw_nchannels - 1;
+	}
+
+	spin_lock(&stm->mc_lock);
+	/* output is already assigned -- shouldn't happen */
+	if (WARN_ON_ONCE(output->nr_chans))
+		goto unlock;
+
+	ret = stm_find_master_chan(stm, width, &midx, mend, &cidx, cend);
+	if (ret)
+		goto unlock;
+
+	output->master = midx;
+	output->channel = cidx;
+	output->nr_chans = width;
+	stm_output_claim(stm, output);
+	dev_dbg(&stm->dev, "assigned %u:%u (+%u)\n", midx, cidx, width);
+
+	ret = 0;
+unlock:
+	spin_unlock(&stm->mc_lock);
+
+	return ret;
+}
+
+static void stm_output_free(struct stm_device *stm, struct stm_output *output)
+{
+	spin_lock(&stm->mc_lock);
+	if (output->nr_chans)
+		stm_output_disclaim(stm, output);
+	spin_unlock(&stm->mc_lock);
+}
+
+static int major_match(struct device *dev, const void *data)
+{
+	unsigned int major = *(unsigned int *)data;
+
+	return MAJOR(dev->devt) == major;
+}
+
+static int stm_char_open(struct inode *inode, struct file *file)
+{
+	struct stm_file *stmf;
+	struct device *dev;
+	unsigned int major = imajor(inode);
+	int err = -ENODEV;
+
+	dev = class_find_device(&stm_class, NULL, &major, major_match);
+	if (!dev)
+		return -ENODEV;
+
+	stmf = kzalloc(sizeof(*stmf), GFP_KERNEL);
+	if (!stmf)
+		return -ENOMEM;
+
+	stmf->stm = to_stm_device(dev);
+
+	if (!try_module_get(stmf->stm->owner))
+		goto err_free;
+
+	file->private_data = stmf;
+
+	return nonseekable_open(inode, file);
+
+err_free:
+	kfree(stmf);
+
+	return err;
+}
+
+static int stm_char_release(struct inode *inode, struct file *file)
+{
+	struct stm_file *stmf = file->private_data;
+
+	stm_output_free(stmf->stm, &stmf->output);
+	stm_put_device(stmf->stm);
+	kfree(stmf);
+
+	return 0;
+}
+
+static int stm_file_assign(struct stm_file *stmf, char *id, unsigned int width)
+{
+	struct stm_device *stm = stmf->stm;
+	int ret;
+
+	stmf->policy_node = stp_policy_node_lookup(stm, id);
+
+	ret = stm_output_assign(stm, width, stmf->policy_node, &stmf->output);
+
+	if (stmf->policy_node)
+		stp_policy_node_put(stmf->policy_node);
+
+	return ret;
+}
+
+static void stm_write(struct stm_data *data, unsigned int master,
+		      unsigned int channel, const char *buf, size_t count)
+{
+	unsigned int flags = STP_PACKET_TIMESTAMPED;
+	const unsigned char *p = buf, nil = 0;
+	size_t pos;
+	ssize_t sz;
+
+	for (pos = 0, p = buf; count > pos; pos += sz, p += sz) {
+		sz = min_t(unsigned int, count - pos, 8);
+		sz = data->packet(data, master, channel, STP_PACKET_DATA, flags,
+				  sz, p);
+		flags = 0;
+	}
+
+	data->packet(data, master, channel, STP_PACKET_FLAG, 0, 0, &nil);
+}
+
+static ssize_t stm_char_write(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct stm_file *stmf = file->private_data;
+	struct stm_device *stm = stmf->stm;
+	char *kbuf;
+	int err;
+
+	/*
+	 * if no m/c have been assigned to this writer up to this
+	 * point, use "default" policy entry
+	 */
+	if (!stmf->output.nr_chans) {
+		err = stm_file_assign(stmf, "default", 1);
+		/*
+		 * EBUSY means that somebody else just assigned this
+		 * output, which is just fine for write()
+		 */
+		if (err && err != -EBUSY)
+			return err;
+	}
+
+	kbuf = kmalloc(count + 1, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	err = copy_from_user(kbuf, buf, count);
+	if (err) {
+		kfree(kbuf);
+		return -EFAULT;
+	}
+
+	stm_write(stm->data, stmf->output.master, stmf->output.channel, kbuf,
+		  count);
+
+	kfree(kbuf);
+
+	return count;
+}
+
+static int stm_char_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct stm_file *stmf = file->private_data;
+	struct stm_device *stm = stmf->stm;
+	unsigned long size, phys;
+
+	if (!stm->data->mmio_addr)
+		return -EOPNOTSUPP;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	size = vma->vm_end - vma->vm_start;
+
+	if (stmf->output.nr_chans * stm->data->sw_mmiosz != size)
+		return -EINVAL;
+
+	phys = stm->data->mmio_addr(stm->data, stmf->output.master,
+				    stmf->output.channel,
+				    stmf->output.nr_chans);
+
+	if (!phys)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
+	vm_iomap_memory(vma, phys, size);
+
+	return 0;
+}
+
+static int stm_char_policy_set_ioctl(struct stm_file *stmf, void __user *arg)
+{
+	struct stm_device *stm = stmf->stm;
+	struct stp_policy_id *id;
+	int ret = -EINVAL;
+	u32 size;
+
+	if (stmf->output.nr_chans)
+		return -EBUSY;
+
+	if (copy_from_user(&size, arg, sizeof(size)))
+		return -EFAULT;
+
+	if (size >= PATH_MAX + sizeof(*id))
+		return -EINVAL;
+
+	/*
+	 * size + 1 to make sure the .id string at the bottom is terminated,
+	 * which is also why memdup_user() is not useful here
+	 */
+	id = kzalloc(size + 1, GFP_KERNEL);
+	if (!id)
+		return -ENOMEM;
+
+	if (copy_from_user(id, arg, size)) {
+		ret = -EFAULT;
+		goto err_free;
+	}
+
+	if (id->__reserved_0 || id->__reserved_1)
+		goto err_free;
+
+	if (id->width < 1 ||
+	    id->width > PAGE_SIZE / stm->data->sw_mmiosz)
+		goto err_free;
+
+	ret = stm_file_assign(stmf, id->id, id->width);
+	if (ret)
+		goto err_free;
+
+	ret = 0;
+
+	if (stm->data->link)
+		ret = stm->data->link(stm->data, stmf->output.master,
+				      stmf->output.channel);
+
+	if (ret) {
+		stm_output_free(stmf->stm, &stmf->output);
+		stm_put_device(stmf->stm);
+	}
+
+err_free:
+	kfree(id);
+
+	return ret;
+}
+
+static int stm_char_policy_get_ioctl(struct stm_file *stmf, void __user *arg)
+{
+	struct stp_policy_id id = {
+		.size		= sizeof(id),
+		.master		= stmf->output.master,
+		.channel	= stmf->output.channel,
+		.width		= stmf->output.nr_chans,
+		.__reserved_0	= 0,
+		.__reserved_1	= 0,
+	};
+
+	return copy_to_user(arg, &id, id.size) ? -EFAULT : 0;
+}
+
+static long
+stm_char_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct stm_file *stmf = file->private_data;
+	struct stm_data *stm_data = stmf->stm->data;
+	int err = -ENOTTY;
+	u64 options;
+
+	switch (cmd) {
+	case STP_POLICY_ID_SET:
+		err = stm_char_policy_set_ioctl(stmf, (void __user *)arg);
+		if (err)
+			return err;
+
+		return stm_char_policy_get_ioctl(stmf, (void __user *)arg);
+
+	case STP_POLICY_ID_GET:
+		return stm_char_policy_get_ioctl(stmf, (void __user *)arg);
+
+	case STP_SET_OPTIONS:
+		if (copy_from_user(&options, (u64 __user *)arg, sizeof(u64)))
+			return -EFAULT;
+
+		if (stm_data->set_options)
+			err = stm_data->set_options(stm_data,
+						    stmf->output.master,
+						    stmf->output.channel,
+						    stmf->output.nr_chans,
+						    options);
+
+		break;
+	default:
+		break;
+	}
+
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long
+stm_char_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	return stm_char_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#else
+#define stm_char_compat_ioctl	NULL
+#endif
+
+static const struct file_operations stm_fops = {
+	.open		= stm_char_open,
+	.release	= stm_char_release,
+	.write		= stm_char_write,
+	.mmap		= stm_char_mmap,
+	.unlocked_ioctl	= stm_char_ioctl,
+	.compat_ioctl	= stm_char_compat_ioctl,
+	.llseek		= no_llseek,
+};
+
+static void stm_device_release(struct device *dev)
+{
+	struct stm_device *stm = to_stm_device(dev);
+
+	kfree(stm);
+}
+
+int stm_register_device(struct device *parent, struct stm_data *stm_data,
+			struct module *owner)
+{
+	struct stm_device *stm;
+	unsigned int nmasters;
+	int err = -ENOMEM;
+
+	if (!stm_core_up)
+		return -EPROBE_DEFER;
+
+	if (!stm_data->packet || !stm_data->sw_nchannels)
+		return -EINVAL;
+
+	nmasters = stm_data->sw_end - stm_data->sw_start;
+	stm = kzalloc(sizeof(*stm) + nmasters * sizeof(void *), GFP_KERNEL);
+	if (!stm)
+		return -ENOMEM;
+
+	stm->major = register_chrdev(0, stm_data->name, &stm_fops);
+	if (stm->major < 0)
+		goto err_free;
+
+	device_initialize(&stm->dev);
+	stm->dev.devt = MKDEV(stm->major, 0);
+	stm->dev.class = &stm_class;
+	stm->dev.parent = parent;
+	stm->dev.release = stm_device_release;
+
+	err = kobject_set_name(&stm->dev.kobj, "%s", stm_data->name);
+	if (err)
+		goto err_device;
+
+	err = device_add(&stm->dev);
+	if (err)
+		goto err_device;
+
+	spin_lock_init(&stm->link_lock);
+	INIT_LIST_HEAD(&stm->link_list);
+
+	spin_lock_init(&stm->mc_lock);
+	mutex_init(&stm->policy_mutex);
+	stm->sw_nmasters = nmasters;
+	stm->owner = owner;
+	stm->data = stm_data;
+	stm_data->stm = stm;
+
+	return 0;
+
+err_device:
+	put_device(&stm->dev);
+err_free:
+	kfree(stm);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(stm_register_device);
+
+static void __stm_source_link_drop(struct stm_source_device *src,
+				   struct stm_device *stm);
+
+void stm_unregister_device(struct stm_data *stm_data)
+{
+	struct stm_device *stm = stm_data->stm;
+	struct stm_source_device *src, *iter;
+	int i;
+
+	spin_lock(&stm->link_lock);
+	list_for_each_entry_safe(src, iter, &stm->link_list, link_entry) {
+		__stm_source_link_drop(src, stm);
+	}
+	spin_unlock(&stm->link_lock);
+
+	synchronize_srcu(&stm_source_srcu);
+
+	unregister_chrdev(stm->major, stm_data->name);
+
+	mutex_lock(&stm->policy_mutex);
+	if (stm->policy)
+		stp_policy_unbind(stm->policy);
+	mutex_unlock(&stm->policy_mutex);
+
+	for (i = 0; i < stm->sw_nmasters; i++)
+		stp_master_free(stm, i);
+
+	device_unregister(&stm->dev);
+	stm_data->stm = NULL;
+}
+EXPORT_SYMBOL_GPL(stm_unregister_device);
+
+/**
+ * stm_source_link_add() - connect an stm_source device to an stm device
+ * @src:	stm_source device
+ * @stm:	stm device
+ *
+ * This function establishes a link from stm_source to an stm device so that
+ * the former can send out trace data to the latter.
+ *
+ * Return:	0 on success, -errno otherwise.
+ */
+static int stm_source_link_add(struct stm_source_device *src,
+			       struct stm_device *stm)
+{
+	char *id;
+	int err;
+
+	spin_lock(&stm->link_lock);
+	spin_lock(&src->link_lock);
+
+	/* src->link is dereferenced under stm_source_srcu but not the list */
+	rcu_assign_pointer(src->link, stm);
+	list_add_tail(&src->link_entry, &stm->link_list);
+
+	spin_unlock(&src->link_lock);
+	spin_unlock(&stm->link_lock);
+
+	id = kstrdup(src->data->name, GFP_KERNEL);
+	if (id) {
+		src->policy_node =
+			stp_policy_node_lookup(stm, id);
+
+		kfree(id);
+	}
+
+	err = stm_output_assign(stm, src->data->nr_chans,
+				src->policy_node, &src->output);
+
+	if (src->policy_node)
+		stp_policy_node_put(src->policy_node);
+
+	if (err)
+		goto fail_detach;
+
+	/* this is to notify the STM device that a new link has been made */
+	if (stm->data->link)
+		err = stm->data->link(stm->data, src->output.master,
+				      src->output.channel);
+
+	if (err)
+		goto fail_free_output;
+
+	/* this is to let the source carry out all necessary preparations */
+	if (src->data->link)
+		src->data->link(src->data);
+
+	return 0;
+
+fail_free_output:
+	stm_output_free(stm, &src->output);
+	stm_put_device(stm);
+
+fail_detach:
+	spin_lock(&stm->link_lock);
+	spin_lock(&src->link_lock);
+
+	rcu_assign_pointer(src->link, NULL);
+	list_del_init(&src->link_entry);
+
+	spin_unlock(&src->link_lock);
+	spin_unlock(&stm->link_lock);
+
+	return err;
+}
+
+/**
+ * __stm_source_link_drop() - detach stm_source from an stm device
+ * @src:	stm_source device
+ * @stm:	stm device
+ *
+ * If @stm is @src::link, disconnect them from one another and put the
+ * reference on the @stm device.
+ *
+ * Caller must hold stm::link_lock.
+ */
+static void __stm_source_link_drop(struct stm_source_device *src,
+				   struct stm_device *stm)
+{
+	struct stm_device *link;
+
+	spin_lock(&src->link_lock);
+	link = srcu_dereference_check(src->link, &stm_source_srcu, 1);
+	if (WARN_ON_ONCE(link != stm)) {
+		spin_unlock(&src->link_lock);
+		return;
+	}
+
+	stm_output_free(link, &src->output);
+	/* caller must hold stm::link_lock */
+	list_del_init(&src->link_entry);
+	/* matches stm_find_device() from stm_source_link_store() */
+	stm_put_device(link);
+	rcu_assign_pointer(src->link, NULL);
+
+	spin_unlock(&src->link_lock);
+}
+
+/**
+ * stm_source_link_drop() - detach stm_source from its stm device
+ * @src:	stm_source device
+ *
+ * Unlinking means disconnecting from source's STM device; after this
+ * writes will be unsuccessful until it is linked to a new STM device.
+ *
+ * This will happen on "stm_source_link" sysfs attribute write to undo
+ * the existing link (if any), or on linked STM device's de-registration.
+ */
+static void stm_source_link_drop(struct stm_source_device *src)
+{
+	struct stm_device *stm;
+	int idx;
+
+	idx = srcu_read_lock(&stm_source_srcu);
+	stm = srcu_dereference(src->link, &stm_source_srcu);
+
+	if (stm) {
+		if (src->data->unlink)
+			src->data->unlink(src->data);
+
+		spin_lock(&stm->link_lock);
+		__stm_source_link_drop(src, stm);
+		spin_unlock(&stm->link_lock);
+	}
+
+	srcu_read_unlock(&stm_source_srcu, idx);
+}
+
+static ssize_t stm_source_link_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct stm_source_device *src = to_stm_source_device(dev);
+	struct stm_device *stm;
+	int idx, ret;
+
+	idx = srcu_read_lock(&stm_source_srcu);
+	stm = srcu_dereference(src->link, &stm_source_srcu);
+	ret = sprintf(buf, "%s\n",
+		      stm ? dev_name(&stm->dev) : "<none>");
+	srcu_read_unlock(&stm_source_srcu, idx);
+
+	return ret;
+}
+
+static ssize_t stm_source_link_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct stm_source_device *src = to_stm_source_device(dev);
+	struct stm_device *link;
+	int err;
+
+	stm_source_link_drop(src);
+
+	link = stm_find_device(buf);
+	if (!link)
+		return -EINVAL;
+
+	err = stm_source_link_add(src, link);
+	if (err)
+		stm_put_device(link);
+
+	return err ? : count;
+}
+
+static DEVICE_ATTR_RW(stm_source_link);
+
+static struct attribute *stm_source_attrs[] = {
+	&dev_attr_stm_source_link.attr,
+	NULL,
+};
+
+ATTRIBUTE_GROUPS(stm_source);
+
+static struct class stm_source_class = {
+	.name		= "stm_source",
+	.dev_groups	= stm_source_groups,
+};
+
+static void stm_source_device_release(struct device *dev)
+{
+	struct stm_source_device *src = to_stm_source_device(dev);
+
+	kfree(src);
+}
+
+/**
+ * stm_source_register_device() - register an stm_source device
+ * @parent:	parent device
+ * @data:	device description structure
+ *
+ * This will create a device of stm_source class that can write
+ * data to an stm device once linked.
+ *
+ * Return:	0 on success, -errno otherwise.
+ */
+int stm_source_register_device(struct device *parent,
+			       struct stm_source_data *data)
+{
+	struct stm_source_device *src;
+	int err;
+
+	if (!stm_core_up)
+		return -EPROBE_DEFER;
+
+	src = kzalloc(sizeof(*src), GFP_KERNEL);
+	if (!src)
+		return -ENOMEM;
+
+	device_initialize(&src->dev);
+	src->dev.class = &stm_source_class;
+	src->dev.parent = parent;
+	src->dev.release = stm_source_device_release;
+
+	err = kobject_set_name(&src->dev.kobj, "%s", data->name);
+	if (err)
+		goto err;
+
+	err = device_add(&src->dev);
+	if (err)
+		goto err;
+
+	spin_lock_init(&src->link_lock);
+	INIT_LIST_HEAD(&src->link_entry);
+	src->data = data;
+	data->src = src;
+
+	return 0;
+
+err:
+	put_device(&src->dev);
+	kfree(src);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(stm_source_register_device);
+
+/**
+ * stm_source_unregister_device() - unregister an stm_source device
+ * @data:	device description that was used to register the device
+ *
+ * This will remove a previously created stm_source device from the system.
+ */
+void stm_source_unregister_device(struct stm_source_data *data)
+{
+	struct stm_source_device *src = data->src;
+
+	stm_source_link_drop(src);
+
+	device_destroy(&stm_source_class, src->dev.devt);
+}
+EXPORT_SYMBOL_GPL(stm_source_unregister_device);
+
+int stm_source_write(struct stm_source_data *data, unsigned int chan,
+		     const char *buf, size_t count)
+{
+	struct stm_source_device *src = data->src;
+	struct stm_device *stm;
+	int idx;
+
+	if (!src->output.nr_chans)
+		return -ENODEV;
+
+	if (chan >= src->output.nr_chans)
+		return -EINVAL;
+
+	idx = srcu_read_lock(&stm_source_srcu);
+
+	stm = srcu_dereference(src->link, &stm_source_srcu);
+	if (stm)
+		stm_write(stm->data, src->output.master,
+			  src->output.channel + chan,
+			  buf, count);
+	else
+		count = -ENODEV;
+
+	srcu_read_unlock(&stm_source_srcu, idx);
+
+	return count;
+}
+EXPORT_SYMBOL_GPL(stm_source_write);
+
+static int __init stm_core_init(void)
+{
+	int err;
+
+	err = class_register(&stm_class);
+	if (err)
+		return err;
+
+	err = class_register(&stm_source_class);
+	if (err)
+		goto err_stm;
+
+	err = stp_configfs_init();
+	if (err)
+		goto err_src;
+
+	init_srcu_struct(&stm_source_srcu);
+
+	stm_core_up++;
+
+	return 0;
+
+err_src:
+	class_unregister(&stm_source_class);
+err_stm:
+	class_unregister(&stm_class);
+
+	return err;
+}
+
+module_init(stm_core_init);
+
+static void __exit stm_core_exit(void)
+{
+	cleanup_srcu_struct(&stm_source_srcu);
+	class_unregister(&stm_source_class);
+	class_unregister(&stm_class);
+	stp_configfs_exit();
+}
+
+module_exit(stm_core_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("System Trace Module device class");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/stm/dummy_stm.c b/drivers/hwtracing/stm/dummy_stm.c
new file mode 100644
index 000000000000..3709bef0b21f
--- /dev/null
+++ b/drivers/hwtracing/stm/dummy_stm.c
@@ -0,0 +1,66 @@
+/*
+ * A dummy STM device for stm/stm_source class testing.
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * STM class implements generic infrastructure for  System Trace Module devices
+ * as defined in MIPI STPv2 specification.
+ */
+
+#undef DEBUG
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/stm.h>
+
+static ssize_t
+dummy_stm_packet(struct stm_data *stm_data, unsigned int master,
+		 unsigned int channel, unsigned int packet, unsigned int flags,
+		 unsigned int size, const unsigned char *payload)
+{
+#ifdef DEBUG
+	u64 pl = 0;
+
+	if (payload)
+		pl = *(u64 *)payload;
+
+	if (size < 8)
+		pl &= (1ull << (size * 8)) - 1;
+	trace_printk("[%u:%u] [pkt: %x/%x] (%llx)\n", master, channel,
+		     packet, size, pl);
+#endif
+	return size;
+}
+
+static struct stm_data dummy_stm = {
+	.name		= "dummy_stm",
+	.sw_start	= 0x0000,
+	.sw_end		= 0xffff,
+	.sw_nchannels	= 0xffff,
+	.packet		= dummy_stm_packet,
+};
+
+static int dummy_stm_init(void)
+{
+	return stm_register_device(NULL, &dummy_stm, THIS_MODULE);
+}
+
+static void dummy_stm_exit(void)
+{
+	stm_unregister_device(&dummy_stm);
+}
+
+module_init(dummy_stm_init);
+module_exit(dummy_stm_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("dummy_stm device");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/stm/policy.c b/drivers/hwtracing/stm/policy.c
new file mode 100644
index 000000000000..6498a9dbb7bd
--- /dev/null
+++ b/drivers/hwtracing/stm/policy.c
@@ -0,0 +1,529 @@
+/*
+ * System Trace Module (STM) master/channel allocation policy management
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * A master/channel allocation policy allows mapping string identifiers to
+ * master and channel ranges, where allocation can be done.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/configfs.h>
+#include <linux/slab.h>
+#include <linux/stm.h>
+#include "stm.h"
+
+/*
+ * STP Master/Channel allocation policy configfs layout.
+ */
+
+struct stp_policy {
+	struct config_group	group;
+	struct stm_device	*stm;
+};
+
+struct stp_policy_node {
+	struct config_group	group;
+	struct stp_policy	*policy;
+	unsigned int		first_master;
+	unsigned int		last_master;
+	unsigned int		first_channel;
+	unsigned int		last_channel;
+};
+
+static struct configfs_subsystem stp_policy_subsys;
+
+void stp_policy_node_get_ranges(struct stp_policy_node *policy_node,
+				unsigned int *mstart, unsigned int *mend,
+				unsigned int *cstart, unsigned int *cend)
+{
+	*mstart	= policy_node->first_master;
+	*mend	= policy_node->last_master;
+	*cstart	= policy_node->first_channel;
+	*cend	= policy_node->last_channel;
+}
+
+static inline char *stp_policy_node_name(struct stp_policy_node *policy_node)
+{
+	return policy_node->group.cg_item.ci_name ? : "<none>";
+}
+
+static inline struct stp_policy *to_stp_policy(struct config_item *item)
+{
+	return item ?
+		container_of(to_config_group(item), struct stp_policy, group) :
+		NULL;
+}
+
+static inline struct stp_policy_node *
+to_stp_policy_node(struct config_item *item)
+{
+	return item ?
+		container_of(to_config_group(item), struct stp_policy_node,
+			     group) :
+		NULL;
+}
+
+static ssize_t stp_policy_node_masters_show(struct stp_policy_node *policy_node,
+					    char *page)
+{
+	ssize_t count;
+
+	count = sprintf(page, "%u %u\n", policy_node->first_master,
+			policy_node->last_master);
+
+	return count;
+}
+
+static ssize_t
+stp_policy_node_masters_store(struct stp_policy_node *policy_node,
+			      const char *page, size_t count)
+{
+	unsigned int first, last;
+	struct stm_device *stm;
+	char *p = (char *)page;
+	ssize_t ret = -ENODEV;
+
+	if (sscanf(p, "%u %u", &first, &last) != 2)
+		return -EINVAL;
+
+	mutex_lock(&stp_policy_subsys.su_mutex);
+	stm = policy_node->policy->stm;
+	if (!stm)
+		goto unlock;
+
+	/* must be within [sw_start..sw_end], which is an inclusive range */
+	if (first > INT_MAX || last > INT_MAX || first > last ||
+	    first < stm->data->sw_start ||
+	    last > stm->data->sw_end) {
+		ret = -ERANGE;
+		goto unlock;
+	}
+
+	ret = count;
+	policy_node->first_master = first;
+	policy_node->last_master = last;
+
+unlock:
+	mutex_unlock(&stp_policy_subsys.su_mutex);
+
+	return ret;
+}
+
+static ssize_t
+stp_policy_node_channels_show(struct stp_policy_node *policy_node, char *page)
+{
+	ssize_t count;
+
+	count = sprintf(page, "%u %u\n", policy_node->first_channel,
+			policy_node->last_channel);
+
+	return count;
+}
+
+static ssize_t
+stp_policy_node_channels_store(struct stp_policy_node *policy_node,
+			       const char *page, size_t count)
+{
+	unsigned int first, last;
+	struct stm_device *stm;
+	char *p = (char *)page;
+	ssize_t ret = -ENODEV;
+
+	if (sscanf(p, "%u %u", &first, &last) != 2)
+		return -EINVAL;
+
+	mutex_lock(&stp_policy_subsys.su_mutex);
+	stm = policy_node->policy->stm;
+	if (!stm)
+		goto unlock;
+
+	if (first > INT_MAX || last > INT_MAX || first > last ||
+	    last >= stm->data->sw_nchannels) {
+		ret = -ERANGE;
+		goto unlock;
+	}
+
+	ret = count;
+	policy_node->first_channel = first;
+	policy_node->last_channel = last;
+
+unlock:
+	mutex_unlock(&stp_policy_subsys.su_mutex);
+
+	return ret;
+}
+
+static void stp_policy_node_release(struct config_item *item)
+{
+	kfree(to_stp_policy_node(item));
+}
+
+struct stp_policy_node_attribute {
+	struct configfs_attribute	attr;
+	ssize_t (*show)(struct stp_policy_node *, char *);
+	ssize_t (*store)(struct stp_policy_node *, const char *, size_t);
+};
+
+static ssize_t stp_policy_node_attr_show(struct config_item *item,
+					 struct configfs_attribute *attr,
+					 char *page)
+{
+	struct stp_policy_node *policy_node = to_stp_policy_node(item);
+	struct stp_policy_node_attribute *pn_attr =
+		container_of(attr, struct stp_policy_node_attribute, attr);
+	ssize_t count = 0;
+
+	if (pn_attr->show)
+		count = pn_attr->show(policy_node, page);
+
+	return count;
+}
+
+static ssize_t stp_policy_node_attr_store(struct config_item *item,
+					  struct configfs_attribute *attr,
+					  const char *page, size_t len)
+{
+	struct stp_policy_node *policy_node = to_stp_policy_node(item);
+	struct stp_policy_node_attribute *pn_attr =
+		container_of(attr, struct stp_policy_node_attribute, attr);
+	ssize_t count = -EINVAL;
+
+	if (pn_attr->store)
+		count = pn_attr->store(policy_node, page, len);
+
+	return count;
+}
+
+static struct configfs_item_operations stp_policy_node_item_ops = {
+	.release		= stp_policy_node_release,
+	.show_attribute		= stp_policy_node_attr_show,
+	.store_attribute	= stp_policy_node_attr_store,
+};
+
+static struct stp_policy_node_attribute stp_policy_node_attr_range = {
+	.attr	= {
+		.ca_owner = THIS_MODULE,
+		.ca_name = "masters",
+		.ca_mode = S_IRUGO | S_IWUSR,
+	},
+	.show	= stp_policy_node_masters_show,
+	.store	= stp_policy_node_masters_store,
+};
+
+static struct stp_policy_node_attribute stp_policy_node_attr_channels = {
+	.attr	= {
+		.ca_owner = THIS_MODULE,
+		.ca_name = "channels",
+		.ca_mode = S_IRUGO | S_IWUSR,
+	},
+	.show	= stp_policy_node_channels_show,
+	.store	= stp_policy_node_channels_store,
+};
+
+static struct configfs_attribute *stp_policy_node_attrs[] = {
+	&stp_policy_node_attr_range.attr,
+	&stp_policy_node_attr_channels.attr,
+	NULL,
+};
+
+static struct config_item_type stp_policy_type;
+static struct config_item_type stp_policy_node_type;
+
+static struct config_group *
+stp_policy_node_make(struct config_group *group, const char *name)
+{
+	struct stp_policy_node *policy_node, *parent_node;
+	struct stp_policy *policy;
+
+	if (group->cg_item.ci_type == &stp_policy_type) {
+		policy = container_of(group, struct stp_policy, group);
+	} else {
+		parent_node = container_of(group, struct stp_policy_node,
+					   group);
+		policy = parent_node->policy;
+	}
+
+	if (!policy->stm)
+		return ERR_PTR(-ENODEV);
+
+	policy_node = kzalloc(sizeof(struct stp_policy_node), GFP_KERNEL);
+	if (!policy_node)
+		return ERR_PTR(-ENOMEM);
+
+	config_group_init_type_name(&policy_node->group, name,
+				    &stp_policy_node_type);
+
+	policy_node->policy = policy;
+
+	/* default values for the attributes */
+	policy_node->first_master = policy->stm->data->sw_start;
+	policy_node->last_master = policy->stm->data->sw_end;
+	policy_node->first_channel = 0;
+	policy_node->last_channel = policy->stm->data->sw_nchannels - 1;
+
+	return &policy_node->group;
+}
+
+static void
+stp_policy_node_drop(struct config_group *group, struct config_item *item)
+{
+	config_item_put(item);
+}
+
+static struct configfs_group_operations stp_policy_node_group_ops = {
+	.make_group	= stp_policy_node_make,
+	.drop_item	= stp_policy_node_drop,
+};
+
+static struct config_item_type stp_policy_node_type = {
+	.ct_item_ops	= &stp_policy_node_item_ops,
+	.ct_group_ops	= &stp_policy_node_group_ops,
+	.ct_attrs	= stp_policy_node_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/*
+ * Root group: policies.
+ */
+static struct configfs_attribute stp_policy_attr_device = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "device",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *stp_policy_attrs[] = {
+	&stp_policy_attr_device,
+	NULL,
+};
+
+static ssize_t stp_policy_attr_show(struct config_item *item,
+				    struct configfs_attribute *attr,
+				    char *page)
+{
+	struct stp_policy *policy = to_stp_policy(item);
+	ssize_t count;
+
+	count = sprintf(page, "%s\n",
+			(policy && policy->stm) ?
+			policy->stm->data->name :
+			"<none>");
+
+	return count;
+}
+
+void stp_policy_unbind(struct stp_policy *policy)
+{
+	struct stm_device *stm = policy->stm;
+
+	if (WARN_ON_ONCE(!policy->stm))
+		return;
+
+	mutex_lock(&stm->policy_mutex);
+	stm->policy = NULL;
+	mutex_unlock(&stm->policy_mutex);
+
+	policy->stm = NULL;
+
+	stm_put_device(stm);
+}
+
+static void stp_policy_release(struct config_item *item)
+{
+	struct stp_policy *policy = to_stp_policy(item);
+
+	stp_policy_unbind(policy);
+	kfree(policy);
+}
+
+static struct configfs_item_operations stp_policy_item_ops = {
+	.release		= stp_policy_release,
+	.show_attribute		= stp_policy_attr_show,
+};
+
+static struct configfs_group_operations stp_policy_group_ops = {
+	.make_group	= stp_policy_node_make,
+};
+
+static struct config_item_type stp_policy_type = {
+	.ct_item_ops	= &stp_policy_item_ops,
+	.ct_group_ops	= &stp_policy_group_ops,
+	.ct_attrs	= stp_policy_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct config_group *
+stp_policies_make(struct config_group *group, const char *name)
+{
+	struct config_group *ret;
+	struct stm_device *stm;
+	char *devname, *p;
+
+	devname = kasprintf(GFP_KERNEL, "%s", name);
+	if (!devname)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * node must look like <device_name>.<policy_name>, where
+	 * <device_name> is the name of an existing stm device and
+	 * <policy_name> is an arbitrary string
+	 */
+	p = strchr(devname, '.');
+	if (!p) {
+		kfree(devname);
+		return ERR_PTR(-EINVAL);
+	}
+
+	*p++ = '\0';
+
+	stm = stm_find_device(devname);
+	kfree(devname);
+
+	if (!stm)
+		return ERR_PTR(-ENODEV);
+
+	mutex_lock(&stm->policy_mutex);
+	if (stm->policy) {
+		ret = ERR_PTR(-EBUSY);
+		goto unlock_policy;
+	}
+
+	stm->policy = kzalloc(sizeof(*stm->policy), GFP_KERNEL);
+	if (!stm->policy) {
+		ret = ERR_PTR(-ENOMEM);
+		goto unlock_policy;
+	}
+
+	config_group_init_type_name(&stm->policy->group, name,
+				    &stp_policy_type);
+	stm->policy->stm = stm;
+
+	ret = &stm->policy->group;
+
+unlock_policy:
+	mutex_unlock(&stm->policy_mutex);
+
+	if (IS_ERR(ret))
+		stm_put_device(stm);
+
+	return ret;
+}
+
+static struct configfs_group_operations stp_policies_group_ops = {
+	.make_group	= stp_policies_make,
+};
+
+static struct config_item_type stp_policies_type = {
+	.ct_group_ops	= &stp_policies_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct configfs_subsystem stp_policy_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf	= "stp-policy",
+			.ci_type	= &stp_policies_type,
+		},
+	},
+};
+
+/*
+ * Lock the policy mutex from the outside
+ */
+static struct stp_policy_node *
+__stp_policy_node_lookup(struct stp_policy *policy, char *s)
+{
+	struct stp_policy_node *policy_node, *ret;
+	struct list_head *head = &policy->group.cg_children;
+	struct config_item *item;
+	char *start, *end = s;
+
+	if (list_empty(head))
+		return NULL;
+
+	/* return the first entry if everything else fails */
+	item = list_entry(head->next, struct config_item, ci_entry);
+	ret = to_stp_policy_node(item);
+
+next:
+	for (;;) {
+		start = strsep(&end, "/");
+		if (!start)
+			break;
+
+		if (!*start)
+			continue;
+
+		list_for_each_entry(item, head, ci_entry) {
+			policy_node = to_stp_policy_node(item);
+
+			if (!strcmp(start,
+				    policy_node->group.cg_item.ci_name)) {
+				ret = policy_node;
+
+				if (!end)
+					goto out;
+
+				head = &policy_node->group.cg_children;
+				goto next;
+			}
+		}
+		break;
+	}
+
+out:
+	return ret;
+}
+
+
+struct stp_policy_node *
+stp_policy_node_lookup(struct stm_device *stm, char *s)
+{
+	struct stp_policy_node *policy_node = NULL;
+
+	mutex_lock(&stp_policy_subsys.su_mutex);
+
+	mutex_lock(&stm->policy_mutex);
+	if (stm->policy)
+		policy_node = __stp_policy_node_lookup(stm->policy, s);
+	mutex_unlock(&stm->policy_mutex);
+
+	if (policy_node)
+		config_item_get(&policy_node->group.cg_item);
+	mutex_unlock(&stp_policy_subsys.su_mutex);
+
+	return policy_node;
+}
+
+void stp_policy_node_put(struct stp_policy_node *policy_node)
+{
+	config_item_put(&policy_node->group.cg_item);
+}
+
+int __init stp_configfs_init(void)
+{
+	int err;
+
+	config_group_init(&stp_policy_subsys.su_group);
+	mutex_init(&stp_policy_subsys.su_mutex);
+	err = configfs_register_subsystem(&stp_policy_subsys);
+
+	return err;
+}
+
+void __exit stp_configfs_exit(void)
+{
+	configfs_unregister_subsystem(&stp_policy_subsys);
+}
diff --git a/drivers/hwtracing/stm/stm.h b/drivers/hwtracing/stm/stm.h
new file mode 100644
index 000000000000..95ece0292c99
--- /dev/null
+++ b/drivers/hwtracing/stm/stm.h
@@ -0,0 +1,87 @@
+/*
+ * System Trace Module (STM) infrastructure
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * STM class implements generic infrastructure for  System Trace Module devices
+ * as defined in MIPI STPv2 specification.
+ */
+
+#ifndef _STM_STM_H_
+#define _STM_STM_H_
+
+struct stp_policy;
+struct stp_policy_node;
+
+struct stp_policy_node *
+stp_policy_node_lookup(struct stm_device *stm, char *s);
+void stp_policy_node_put(struct stp_policy_node *policy_node);
+void stp_policy_unbind(struct stp_policy *policy);
+
+void stp_policy_node_get_ranges(struct stp_policy_node *policy_node,
+				unsigned int *mstart, unsigned int *mend,
+				unsigned int *cstart, unsigned int *cend);
+int stp_configfs_init(void);
+void stp_configfs_exit(void);
+
+struct stp_master {
+	unsigned int	nr_free;
+	unsigned long	chan_map[0];
+};
+
+struct stm_device {
+	struct device		dev;
+	struct module		*owner;
+	struct stp_policy	*policy;
+	struct mutex		policy_mutex;
+	int			major;
+	unsigned int		sw_nmasters;
+	struct stm_data		*data;
+	spinlock_t		link_lock;
+	struct list_head	link_list;
+	/* master allocation */
+	spinlock_t		mc_lock;
+	struct stp_master	*masters[0];
+};
+
+#define to_stm_device(_d)				\
+	container_of((_d), struct stm_device, dev)
+
+struct stm_output {
+	unsigned int		master;
+	unsigned int		channel;
+	unsigned int		nr_chans;
+};
+
+struct stm_file {
+	struct stm_device	*stm;
+	struct stp_policy_node	*policy_node;
+	struct stm_output	output;
+};
+
+struct stm_device *stm_find_device(const char *name);
+void stm_put_device(struct stm_device *stm);
+
+struct stm_source_device {
+	struct device		dev;
+	struct stm_source_data	*data;
+	spinlock_t		link_lock;
+	struct stm_device __rcu	*link;
+	struct list_head	link_entry;
+	/* one output per stm_source device */
+	struct stp_policy_node	*policy_node;
+	struct stm_output	output;
+};
+
+#define to_stm_source_device(_d)				\
+	container_of((_d), struct stm_source_device, dev)
+
+#endif /* _STM_STM_H_ */
diff --git a/drivers/memory/fsl-corenet-cf.c b/drivers/memory/fsl-corenet-cf.c
index d708ded5457b..662d050243be 100644
--- a/drivers/memory/fsl-corenet-cf.c
+++ b/drivers/memory/fsl-corenet-cf.c
@@ -61,6 +61,7 @@ static const struct of_device_id ccf_matches[] = {
 	},
 	{}
 };
+MODULE_DEVICE_TABLE(of, ccf_matches);
 
 struct ccf_err_regs {
 	u32 errdet;		/* 0x00 Error Detect Register */
diff --git a/drivers/memory/ti-aemif.c b/drivers/memory/ti-aemif.c
index ca7d97a9a9ba..a579a0f25840 100644
--- a/drivers/memory/ti-aemif.c
+++ b/drivers/memory/ti-aemif.c
@@ -324,6 +324,7 @@ static const struct of_device_id aemif_of_match[] = {
 	{ .compatible = "ti,da850-aemif", },
 	{},
 };
+MODULE_DEVICE_TABLE(of, aemif_of_match);
 
 static int aemif_probe(struct platform_device *pdev)
 {
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index ccccc2943f2f..22892c701c63 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -414,7 +414,7 @@ config TI_DAC7512
 
 config VMWARE_BALLOON
 	tristate "VMware Balloon Driver"
-	depends on X86 && HYPERVISOR_GUEST
+	depends on VMWARE_VMCI && X86 && HYPERVISOR_GUEST
 	help
 	  This is VMware physical memory management driver which acts
 	  like a "balloon" that can be inflated to reclaim physical pages
diff --git a/drivers/misc/ad525x_dpot-i2c.c b/drivers/misc/ad525x_dpot-i2c.c
index d11187d36ddd..4f832002d116 100644
--- a/drivers/misc/ad525x_dpot-i2c.c
+++ b/drivers/misc/ad525x_dpot-i2c.c
@@ -117,4 +117,3 @@ module_i2c_driver(ad_dpot_i2c_driver);
 MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
 MODULE_DESCRIPTION("digital potentiometer I2C bus driver");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("i2c:ad_dpot");
diff --git a/drivers/misc/genwqe/card_base.h b/drivers/misc/genwqe/card_base.h
index e7353449874b..cb851c14ca4b 100644
--- a/drivers/misc/genwqe/card_base.h
+++ b/drivers/misc/genwqe/card_base.h
@@ -514,7 +514,7 @@ int  __genwqe_execute_ddcb(struct genwqe_dev *cd,
 /**
  * __genwqe_execute_raw_ddcb() - Execute DDCB request without addr translation
  *
- * This version will not do address translation or any modifcation of
+ * This version will not do address translation or any modification of
  * the DDCB data. It is used e.g. for the MoveFlash DDCB which is
  * entirely prepared by the driver itself. That means the appropriate
  * DMA addresses are already in the DDCB and do not need any
diff --git a/drivers/misc/genwqe/card_ddcb.c b/drivers/misc/genwqe/card_ddcb.c
index 6d51e5f08664..353ee0cc733d 100644
--- a/drivers/misc/genwqe/card_ddcb.c
+++ b/drivers/misc/genwqe/card_ddcb.c
@@ -203,7 +203,7 @@ struct genwqe_ddcb_cmd *ddcb_requ_alloc(void)
 {
 	struct ddcb_requ *req;
 
-	req = kzalloc(sizeof(*req), GFP_ATOMIC);
+	req = kzalloc(sizeof(*req), GFP_KERNEL);
 	if (!req)
 		return NULL;
 
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index 70e62d6a3231..7f1b282d7d96 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -449,7 +449,7 @@ static int genwqe_mmap(struct file *filp, struct vm_area_struct *vma)
 	if (get_order(vsize) > MAX_ORDER)
 		return -ENOMEM;
 
-	dma_map = kzalloc(sizeof(struct dma_mapping), GFP_ATOMIC);
+	dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL);
 	if (dma_map == NULL)
 		return -ENOMEM;
 
@@ -785,7 +785,7 @@ static int genwqe_pin_mem(struct genwqe_file *cfile, struct genwqe_mem *m)
 	map_addr = (m->addr & PAGE_MASK);
 	map_size = round_up(m->size + (m->addr & ~PAGE_MASK), PAGE_SIZE);
 
-	dma_map = kzalloc(sizeof(struct dma_mapping), GFP_ATOMIC);
+	dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL);
 	if (dma_map == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index 1ca94e6fa8fb..222367cc8c81 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -220,7 +220,8 @@ void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size,
 	if (get_order(size) > MAX_ORDER)
 		return NULL;
 
-	return pci_alloc_consistent(cd->pci_dev, size, dma_handle);
+	return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle,
+				  GFP_KERNEL);
 }
 
 void __genwqe_free_consistent(struct genwqe_dev *cd, size_t size,
@@ -229,7 +230,7 @@ void __genwqe_free_consistent(struct genwqe_dev *cd, size_t size,
 	if (vaddr == NULL)
 		return;
 
-	pci_free_consistent(cd->pci_dev, size, vaddr, dma_handle);
+	dma_free_coherent(&cd->pci_dev->dev, size, vaddr, dma_handle);
 }
 
 static void genwqe_unmap_pages(struct genwqe_dev *cd, dma_addr_t *dma_list,
diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c
index b83e3ca12a41..d6a901cd4222 100644
--- a/drivers/misc/hpilo.c
+++ b/drivers/misc/hpilo.c
@@ -2,7 +2,7 @@
  * Driver for the HP iLO management processor.
  *
  * Copyright (C) 2008 Hewlett-Packard Development Company, L.P.
- *	David Altobelli <david.altobelli@hp.com>
+ *	David Altobelli <david.altobelli@hpe.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -902,11 +902,11 @@ static void __exit ilo_exit(void)
 MODULE_VERSION("1.4.1");
 MODULE_ALIAS(ILO_NAME);
 MODULE_DESCRIPTION(ILO_NAME);
-MODULE_AUTHOR("David Altobelli <david.altobelli@hp.com>");
+MODULE_AUTHOR("David Altobelli <david.altobelli@hpe.com>");
 MODULE_LICENSE("GPL v2");
 
 module_param(max_ccb, uint, 0444);
-MODULE_PARM_DESC(max_ccb, "Maximum number of HP iLO channels to attach (16)");
+MODULE_PARM_DESC(max_ccb, "Maximum number of HP iLO channels to attach (8-24)(default=16)");
 
 module_init(ilo_init);
 module_exit(ilo_exit);
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index 9a60bd4d3c49..99635dd9dbac 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -1112,6 +1112,7 @@ static int __init init_kgdbts(void)
 
 	return configure_kgdbts();
 }
+device_initcall(init_kgdbts);
 
 static int kgdbts_get_char(void)
 {
@@ -1180,10 +1181,9 @@ static struct kgdb_io kgdbts_io_ops = {
 	.post_exception		= kgdbts_post_exp_handler,
 };
 
-module_init(init_kgdbts);
+/*
+ * not really modular, but the easiest way to keep compat with existing
+ * bootargs behaviour is to continue using module_param here.
+ */
 module_param_call(kgdbts, param_set_kgdbts_var, param_get_string, &kps, 0644);
 MODULE_PARM_DESC(kgdbts, "<A|V1|V2>[F#|S#][N#]");
-MODULE_DESCRIPTION("KGDB Test Suite");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Wind River Systems, Inc.");
-
diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c
index 3e536ca85f7d..020de5919c21 100644
--- a/drivers/misc/mei/bus-fixup.c
+++ b/drivers/misc/mei/bus-fixup.c
@@ -285,11 +285,11 @@ static struct mei_fixup {
 };
 
 /**
- * mei_cl_dev_fixup - run fixup handlers
+ * mei_cldev_fixup - run fixup handlers
  *
  * @cldev: me client device
  */
-void mei_cl_dev_fixup(struct mei_cl_device *cldev)
+void mei_cl_bus_dev_fixup(struct mei_cl_device *cldev)
 {
 	struct mei_fixup *f;
 	const uuid_le *uuid = mei_me_cl_uuid(cldev->me_cl);
diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index eef1c6b46ad8..832085207a7f 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -165,7 +165,7 @@ out:
 }
 
 /**
- * mei_cl_send - me device send  (write)
+ * mei_cldev_send - me device send  (write)
  *
  * @cldev: me client device
  * @buf: buffer to send
@@ -173,7 +173,7 @@ out:
  *
  * Return: written size in bytes or < 0 on error
  */
-ssize_t mei_cl_send(struct mei_cl_device *cldev, u8 *buf, size_t length)
+ssize_t mei_cldev_send(struct mei_cl_device *cldev, u8 *buf, size_t length)
 {
 	struct mei_cl *cl = cldev->cl;
 
@@ -182,10 +182,10 @@ ssize_t mei_cl_send(struct mei_cl_device *cldev, u8 *buf, size_t length)
 
 	return __mei_cl_send(cl, buf, length, 1);
 }
-EXPORT_SYMBOL_GPL(mei_cl_send);
+EXPORT_SYMBOL_GPL(mei_cldev_send);
 
 /**
- * mei_cl_recv - client receive (read)
+ * mei_cldev_recv - client receive (read)
  *
  * @cldev: me client device
  * @buf: buffer to send
@@ -193,7 +193,7 @@ EXPORT_SYMBOL_GPL(mei_cl_send);
  *
  * Return: read size in bytes of < 0 on error
  */
-ssize_t mei_cl_recv(struct mei_cl_device *cldev, u8 *buf, size_t length)
+ssize_t mei_cldev_recv(struct mei_cl_device *cldev, u8 *buf, size_t length)
 {
 	struct mei_cl *cl = cldev->cl;
 
@@ -202,15 +202,15 @@ ssize_t mei_cl_recv(struct mei_cl_device *cldev, u8 *buf, size_t length)
 
 	return __mei_cl_recv(cl, buf, length);
 }
-EXPORT_SYMBOL_GPL(mei_cl_recv);
+EXPORT_SYMBOL_GPL(mei_cldev_recv);
 
 /**
- * mei_bus_event_work  - dispatch rx event for a bus device
+ * mei_cl_bus_event_work  - dispatch rx event for a bus device
  *    and schedule new work
  *
  * @work: work
  */
-static void mei_bus_event_work(struct work_struct *work)
+static void mei_cl_bus_event_work(struct work_struct *work)
 {
 	struct mei_cl_device *cldev;
 
@@ -272,7 +272,7 @@ void mei_cl_bus_rx_event(struct mei_cl *cl)
 }
 
 /**
- * mei_cl_register_event_cb - register event callback
+ * mei_cldev_register_event_cb - register event callback
  *
  * @cldev: me client devices
  * @event_cb: callback function
@@ -283,9 +283,9 @@ void mei_cl_bus_rx_event(struct mei_cl *cl)
  *         -EALREADY if an callback is already registered
  *         <0 on other errors
  */
-int mei_cl_register_event_cb(struct mei_cl_device *cldev,
-			  unsigned long events_mask,
-			  mei_cl_event_cb_t event_cb, void *context)
+int mei_cldev_register_event_cb(struct mei_cl_device *cldev,
+				unsigned long events_mask,
+				mei_cldev_event_cb_t event_cb, void *context)
 {
 	int ret;
 
@@ -296,7 +296,7 @@ int mei_cl_register_event_cb(struct mei_cl_device *cldev,
 	cldev->events_mask = events_mask;
 	cldev->event_cb = event_cb;
 	cldev->event_context = context;
-	INIT_WORK(&cldev->event_work, mei_bus_event_work);
+	INIT_WORK(&cldev->event_work, mei_cl_bus_event_work);
 
 	if (cldev->events_mask & BIT(MEI_CL_EVENT_RX)) {
 		ret = mei_cl_read_start(cldev->cl, 0, NULL);
@@ -314,42 +314,81 @@ int mei_cl_register_event_cb(struct mei_cl_device *cldev,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(mei_cl_register_event_cb);
+EXPORT_SYMBOL_GPL(mei_cldev_register_event_cb);
 
 /**
- * mei_cl_get_drvdata - driver data getter
+ * mei_cldev_get_drvdata - driver data getter
  *
  * @cldev: mei client device
  *
  * Return: driver private data
  */
-void *mei_cl_get_drvdata(const struct mei_cl_device *cldev)
+void *mei_cldev_get_drvdata(const struct mei_cl_device *cldev)
 {
 	return dev_get_drvdata(&cldev->dev);
 }
-EXPORT_SYMBOL_GPL(mei_cl_get_drvdata);
+EXPORT_SYMBOL_GPL(mei_cldev_get_drvdata);
 
 /**
- * mei_cl_set_drvdata - driver data setter
+ * mei_cldev_set_drvdata - driver data setter
  *
  * @cldev: mei client device
  * @data: data to store
  */
-void mei_cl_set_drvdata(struct mei_cl_device *cldev, void *data)
+void mei_cldev_set_drvdata(struct mei_cl_device *cldev, void *data)
 {
 	dev_set_drvdata(&cldev->dev, data);
 }
-EXPORT_SYMBOL_GPL(mei_cl_set_drvdata);
+EXPORT_SYMBOL_GPL(mei_cldev_set_drvdata);
 
 /**
- * mei_cl_enable_device - enable me client device
+ * mei_cldev_uuid - return uuid of the underlying me client
+ *
+ * @cldev: mei client device
+ *
+ * Return: me client uuid
+ */
+const uuid_le *mei_cldev_uuid(const struct mei_cl_device *cldev)
+{
+	return mei_me_cl_uuid(cldev->me_cl);
+}
+EXPORT_SYMBOL_GPL(mei_cldev_uuid);
+
+/**
+ * mei_cldev_ver - return protocol version of the underlying me client
+ *
+ * @cldev: mei client device
+ *
+ * Return: me client protocol version
+ */
+u8 mei_cldev_ver(const struct mei_cl_device *cldev)
+{
+	return mei_me_cl_ver(cldev->me_cl);
+}
+EXPORT_SYMBOL_GPL(mei_cldev_ver);
+
+/**
+ * mei_cldev_enabled - check whether the device is enabled
+ *
+ * @cldev: mei client device
+ *
+ * Return: true if me client is initialized and connected
+ */
+bool mei_cldev_enabled(struct mei_cl_device *cldev)
+{
+	return cldev->cl && mei_cl_is_connected(cldev->cl);
+}
+EXPORT_SYMBOL_GPL(mei_cldev_enabled);
+
+/**
+ * mei_cldev_enable_device - enable me client device
  *     create connection with me client
  *
  * @cldev: me client device
  *
  * Return: 0 on success and < 0 on error
  */
-int mei_cl_enable_device(struct mei_cl_device *cldev)
+int mei_cldev_enable(struct mei_cl_device *cldev)
 {
 	struct mei_device *bus = cldev->bus;
 	struct mei_cl *cl;
@@ -389,17 +428,17 @@ out:
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(mei_cl_enable_device);
+EXPORT_SYMBOL_GPL(mei_cldev_enable);
 
 /**
- * mei_cl_disable_device - disable me client device
+ * mei_cldev_disable - disable me client device
  *     disconnect form the me client
  *
  * @cldev: me client device
  *
  * Return: 0 on success and < 0 on error
  */
-int mei_cl_disable_device(struct mei_cl_device *cldev)
+int mei_cldev_disable(struct mei_cl_device *cldev)
 {
 	struct mei_device *bus;
 	struct mei_cl *cl;
@@ -437,7 +476,7 @@ out:
 	mutex_unlock(&bus->device_lock);
 	return err;
 }
-EXPORT_SYMBOL_GPL(mei_cl_disable_device);
+EXPORT_SYMBOL_GPL(mei_cldev_disable);
 
 /**
  * mei_cl_device_find - find matching entry in the driver id table
@@ -453,17 +492,26 @@ struct mei_cl_device_id *mei_cl_device_find(struct mei_cl_device *cldev,
 {
 	const struct mei_cl_device_id *id;
 	const uuid_le *uuid;
+	u8 version;
+	bool match;
 
 	uuid = mei_me_cl_uuid(cldev->me_cl);
+	version = mei_me_cl_ver(cldev->me_cl);
 
 	id = cldrv->id_table;
 	while (uuid_le_cmp(NULL_UUID_LE, id->uuid)) {
 		if (!uuid_le_cmp(*uuid, id->uuid)) {
+			match = true;
 
-			if (!cldev->name[0])
-				return id;
+			if (cldev->name[0])
+				if (strncmp(cldev->name, id->name,
+					    sizeof(id->name)))
+					match = false;
 
-			if (!strncmp(cldev->name, id->name, sizeof(id->name)))
+			if (id->version != MEI_CL_VERSION_ANY)
+				if (id->version != version)
+					match = false;
+			if (match)
 				return id;
 		}
 
@@ -590,6 +638,19 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *a,
 }
 static DEVICE_ATTR_RO(uuid);
 
+static ssize_t version_show(struct device *dev, struct device_attribute *a,
+			     char *buf)
+{
+	struct mei_cl_device *cldev = to_mei_cl_device(dev);
+	u8 version = mei_me_cl_ver(cldev->me_cl);
+	size_t len;
+
+	len = snprintf(buf, PAGE_SIZE, "%02X", version);
+
+	return (len >= PAGE_SIZE) ? (PAGE_SIZE - 1) : len;
+}
+static DEVICE_ATTR_RO(version);
+
 static ssize_t modalias_show(struct device *dev, struct device_attribute *a,
 			     char *buf)
 {
@@ -597,20 +658,19 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *a,
 	const uuid_le *uuid = mei_me_cl_uuid(cldev->me_cl);
 	size_t len;
 
-	len = snprintf(buf, PAGE_SIZE, "mei:%s:" MEI_CL_UUID_FMT ":",
-		cldev->name, MEI_CL_UUID_ARGS(uuid->b));
-
+	len = snprintf(buf, PAGE_SIZE, "mei:%s:%pUl:", cldev->name, uuid);
 	return (len >= PAGE_SIZE) ? (PAGE_SIZE - 1) : len;
 }
 static DEVICE_ATTR_RO(modalias);
 
-static struct attribute *mei_cl_dev_attrs[] = {
+static struct attribute *mei_cldev_attrs[] = {
 	&dev_attr_name.attr,
 	&dev_attr_uuid.attr,
+	&dev_attr_version.attr,
 	&dev_attr_modalias.attr,
 	NULL,
 };
-ATTRIBUTE_GROUPS(mei_cl_dev);
+ATTRIBUTE_GROUPS(mei_cldev);
 
 /**
  * mei_cl_device_uevent - me client bus uevent handler
@@ -624,6 +684,10 @@ static int mei_cl_device_uevent(struct device *dev, struct kobj_uevent_env *env)
 {
 	struct mei_cl_device *cldev = to_mei_cl_device(dev);
 	const uuid_le *uuid = mei_me_cl_uuid(cldev->me_cl);
+	u8 version = mei_me_cl_ver(cldev->me_cl);
+
+	if (add_uevent_var(env, "MEI_CL_VERSION=%d", version))
+		return -ENOMEM;
 
 	if (add_uevent_var(env, "MEI_CL_UUID=%pUl", uuid))
 		return -ENOMEM;
@@ -631,8 +695,8 @@ static int mei_cl_device_uevent(struct device *dev, struct kobj_uevent_env *env)
 	if (add_uevent_var(env, "MEI_CL_NAME=%s", cldev->name))
 		return -ENOMEM;
 
-	if (add_uevent_var(env, "MODALIAS=mei:%s:" MEI_CL_UUID_FMT ":",
-		cldev->name, MEI_CL_UUID_ARGS(uuid->b)))
+	if (add_uevent_var(env, "MODALIAS=mei:%s:%pUl:%02X:",
+			   cldev->name, uuid, version))
 		return -ENOMEM;
 
 	return 0;
@@ -640,7 +704,7 @@ static int mei_cl_device_uevent(struct device *dev, struct kobj_uevent_env *env)
 
 static struct bus_type mei_cl_bus_type = {
 	.name		= "mei",
-	.dev_groups	= mei_cl_dev_groups,
+	.dev_groups	= mei_cldev_groups,
 	.match		= mei_cl_device_match,
 	.probe		= mei_cl_device_probe,
 	.remove		= mei_cl_device_remove,
@@ -661,7 +725,7 @@ static void mei_dev_bus_put(struct mei_device *bus)
 		put_device(bus->dev);
 }
 
-static void mei_cl_dev_release(struct device *dev)
+static void mei_cl_bus_dev_release(struct device *dev)
 {
 	struct mei_cl_device *cldev = to_mei_cl_device(dev);
 
@@ -674,19 +738,19 @@ static void mei_cl_dev_release(struct device *dev)
 }
 
 static struct device_type mei_cl_device_type = {
-	.release	= mei_cl_dev_release,
+	.release	= mei_cl_bus_dev_release,
 };
 
 /**
- * mei_cl_dev_alloc - initialize and allocate mei client device
+ * mei_cl_bus_dev_alloc - initialize and allocate mei client device
  *
  * @bus: mei device
  * @me_cl: me client
  *
  * Return: allocated device structur or NULL on allocation failure
  */
-static struct mei_cl_device *mei_cl_dev_alloc(struct mei_device *bus,
-					      struct mei_me_client *me_cl)
+static struct mei_cl_device *mei_cl_bus_dev_alloc(struct mei_device *bus,
+						  struct mei_me_client *me_cl)
 {
 	struct mei_cl_device *cldev;
 
@@ -715,15 +779,17 @@ static struct mei_cl_device *mei_cl_dev_alloc(struct mei_device *bus,
  *
  * Return: true if the device is eligible for enumeration
  */
-static bool mei_cl_dev_setup(struct mei_device *bus,
-			     struct mei_cl_device *cldev)
+static bool mei_cl_bus_dev_setup(struct mei_device *bus,
+				 struct mei_cl_device *cldev)
 {
 	cldev->do_match = 1;
-	mei_cl_dev_fixup(cldev);
+	mei_cl_bus_dev_fixup(cldev);
 
 	if (cldev->do_match)
-		dev_set_name(&cldev->dev, "mei:%s:%pUl",
-			     cldev->name, mei_me_cl_uuid(cldev->me_cl));
+		dev_set_name(&cldev->dev, "mei:%s:%pUl:%02X",
+			     cldev->name,
+			     mei_me_cl_uuid(cldev->me_cl),
+			     mei_me_cl_ver(cldev->me_cl));
 
 	return cldev->do_match == 1;
 }
@@ -739,7 +805,9 @@ static int mei_cl_bus_dev_add(struct mei_cl_device *cldev)
 {
 	int ret;
 
-	dev_dbg(cldev->bus->dev, "adding %pUL\n", mei_me_cl_uuid(cldev->me_cl));
+	dev_dbg(cldev->bus->dev, "adding %pUL:%02X\n",
+		mei_me_cl_uuid(cldev->me_cl),
+		mei_me_cl_ver(cldev->me_cl));
 	ret = device_add(&cldev->dev);
 	if (!ret)
 		cldev->is_added = 1;
@@ -804,13 +872,14 @@ void mei_cl_bus_remove_devices(struct mei_device *bus)
 
 
 /**
- * mei_cl_dev_init - allocate and initializes an mei client devices
+ * mei_cl_bus_dev_init - allocate and initializes an mei client devices
  *     based on me client
  *
  * @bus: mei device
  * @me_cl: me client
  */
-static void mei_cl_dev_init(struct mei_device *bus, struct mei_me_client *me_cl)
+static void mei_cl_bus_dev_init(struct mei_device *bus,
+				struct mei_me_client *me_cl)
 {
 	struct mei_cl_device *cldev;
 
@@ -819,7 +888,7 @@ static void mei_cl_dev_init(struct mei_device *bus, struct mei_me_client *me_cl)
 	if (me_cl->bus_added)
 		return;
 
-	cldev = mei_cl_dev_alloc(bus, me_cl);
+	cldev = mei_cl_bus_dev_alloc(bus, me_cl);
 	if (!cldev)
 		return;
 
@@ -843,7 +912,7 @@ void mei_cl_bus_rescan(struct mei_device *bus)
 
 	down_read(&bus->me_clients_rwsem);
 	list_for_each_entry(me_cl, &bus->me_clients, list)
-		mei_cl_dev_init(bus, me_cl);
+		mei_cl_bus_dev_init(bus, me_cl);
 	up_read(&bus->me_clients_rwsem);
 
 	mutex_lock(&bus->cl_bus_lock);
@@ -857,7 +926,7 @@ void mei_cl_bus_rescan(struct mei_device *bus)
 		if (cldev->is_added)
 			continue;
 
-		if (mei_cl_dev_setup(bus, cldev))
+		if (mei_cl_bus_dev_setup(bus, cldev))
 			mei_cl_bus_dev_add(cldev);
 		else {
 			list_del_init(&cldev->bus_list);
@@ -869,7 +938,8 @@ void mei_cl_bus_rescan(struct mei_device *bus)
 	dev_dbg(bus->dev, "rescan end");
 }
 
-int __mei_cl_driver_register(struct mei_cl_driver *cldrv, struct module *owner)
+int __mei_cldev_driver_register(struct mei_cl_driver *cldrv,
+				struct module *owner)
 {
 	int err;
 
@@ -885,15 +955,15 @@ int __mei_cl_driver_register(struct mei_cl_driver *cldrv, struct module *owner)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__mei_cl_driver_register);
+EXPORT_SYMBOL_GPL(__mei_cldev_driver_register);
 
-void mei_cl_driver_unregister(struct mei_cl_driver *cldrv)
+void mei_cldev_driver_unregister(struct mei_cl_driver *cldrv)
 {
 	driver_unregister(&cldrv->driver);
 
 	pr_debug("mei: driver [%s] unregistered\n", cldrv->driver.name);
 }
-EXPORT_SYMBOL_GPL(mei_cl_driver_unregister);
+EXPORT_SYMBOL_GPL(mei_cldev_driver_unregister);
 
 
 int __init mei_cl_bus_init(void)
diff --git a/drivers/misc/mei/client.h b/drivers/misc/mei/client.h
index 1c7cad07d731..04e1aa39243f 100644
--- a/drivers/misc/mei/client.h
+++ b/drivers/misc/mei/client.h
@@ -68,6 +68,18 @@ static inline const uuid_le *mei_me_cl_uuid(const struct mei_me_client *me_cl)
 	return &me_cl->props.protocol_name;
 }
 
+/**
+ * mei_me_cl_ver - return me client protocol version
+ *
+ * @me_cl: me client
+ *
+ * Return: me client protocol version
+ */
+static inline u8 mei_me_cl_ver(const struct mei_me_client *me_cl)
+{
+	return me_cl->props.protocol_version;
+}
+
 /*
  * MEI IO Functions
  */
diff --git a/drivers/misc/mei/hw-me.c b/drivers/misc/mei/hw-me.c
index 65511d39d89b..25b1997a62cb 100644
--- a/drivers/misc/mei/hw-me.c
+++ b/drivers/misc/mei/hw-me.c
@@ -150,7 +150,7 @@ static inline u32 mei_me_d0i3c_read(const struct mei_device *dev)
 	u32 reg;
 
 	reg = mei_me_reg_read(to_me_hw(dev), H_D0I3C);
-	trace_mei_reg_read(dev->dev, "H_D0I3C", H_CSR, reg);
+	trace_mei_reg_read(dev->dev, "H_D0I3C", H_D0I3C, reg);
 
 	return reg;
 }
@@ -163,7 +163,7 @@ static inline u32 mei_me_d0i3c_read(const struct mei_device *dev)
  */
 static inline void mei_me_d0i3c_write(struct mei_device *dev, u32 reg)
 {
-	trace_mei_reg_write(dev->dev, "H_D0I3C", H_CSR, reg);
+	trace_mei_reg_write(dev->dev, "H_D0I3C", H_D0I3C, reg);
 	mei_me_reg_write(to_me_hw(dev), H_D0I3C, reg);
 }
 
diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index e25ee16c658e..0f87dffa6be1 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h
@@ -340,7 +340,7 @@ struct mei_hw_ops {
 
 /* MEI bus API*/
 void mei_cl_bus_rescan(struct mei_device *bus);
-void mei_cl_dev_fixup(struct mei_cl_device *dev);
+void mei_cl_bus_dev_fixup(struct mei_cl_device *dev);
 ssize_t __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length,
 			bool blocking);
 ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length);
diff --git a/drivers/misc/mic/Kconfig b/drivers/misc/mic/Kconfig
index e9f2f56c370d..40677df7f996 100644
--- a/drivers/misc/mic/Kconfig
+++ b/drivers/misc/mic/Kconfig
@@ -36,7 +36,7 @@ comment "Intel MIC Host Driver"
 
 config INTEL_MIC_HOST
 	tristate "Intel MIC Host Driver"
-	depends on 64BIT && PCI && X86 && INTEL_MIC_BUS && SCIF_BUS
+	depends on 64BIT && PCI && X86 && INTEL_MIC_BUS && SCIF_BUS && MIC_COSM
 	select VHOST_RING
 	help
 	  This enables Host Driver support for the Intel Many Integrated
@@ -56,7 +56,7 @@ comment "Intel MIC Card Driver"
 
 config INTEL_MIC_CARD
 	tristate "Intel MIC Card Driver"
-	depends on 64BIT && X86 && INTEL_MIC_BUS && SCIF_BUS
+	depends on 64BIT && X86 && INTEL_MIC_BUS && SCIF_BUS && MIC_COSM
 	select VIRTIO
 	help
 	  This enables card driver support for the Intel Many Integrated
@@ -74,7 +74,8 @@ comment "SCIF Driver"
 
 config SCIF
 	tristate "SCIF Driver"
-	depends on 64BIT && PCI && X86 && SCIF_BUS
+	depends on 64BIT && PCI && X86 && SCIF_BUS && IOMMU_SUPPORT
+	select IOMMU_IOVA
 	help
 	  This enables SCIF Driver support for the Intel Many Integrated
 	  Core (MIC) family of PCIe form factor coprocessor devices that
@@ -88,3 +89,21 @@ config SCIF
 	  More information about the Intel MIC family as well as the Linux
 	  OS and tools for MIC to use with this driver are available from
 	  <http://software.intel.com/en-us/mic-developer>.
+
+comment "Intel MIC Coprocessor State Management (COSM) Drivers"
+
+config MIC_COSM
+	tristate "Intel MIC Coprocessor State Management (COSM) Drivers"
+	depends on 64BIT && PCI && X86 && SCIF
+	help
+	  This enables COSM driver support for the Intel Many
+	  Integrated Core (MIC) family of PCIe form factor coprocessor
+	  devices. COSM drivers implement functions such as boot,
+	  shutdown, reset and reboot of MIC devices.
+
+	  If you are building a host kernel with an Intel MIC device then
+	  say M (recommended) or Y, else say N. If unsure say N.
+
+	  More information about the Intel MIC family as well as the Linux
+	  OS and tools for MIC to use with this driver are available from
+	  <http://software.intel.com/en-us/mic-developer>.
diff --git a/drivers/misc/mic/Makefile b/drivers/misc/mic/Makefile
index a74042c58649..e288a1106738 100644
--- a/drivers/misc/mic/Makefile
+++ b/drivers/misc/mic/Makefile
@@ -6,3 +6,5 @@ obj-$(CONFIG_INTEL_MIC_HOST) += host/
 obj-$(CONFIG_INTEL_MIC_CARD) += card/
 obj-y += bus/
 obj-$(CONFIG_SCIF) += scif/
+obj-$(CONFIG_MIC_COSM) += cosm/
+obj-$(CONFIG_MIC_COSM) += cosm_client/
diff --git a/drivers/misc/mic/bus/Makefile b/drivers/misc/mic/bus/Makefile
index 1ed37e234c96..761842b0d0bb 100644
--- a/drivers/misc/mic/bus/Makefile
+++ b/drivers/misc/mic/bus/Makefile
@@ -4,3 +4,4 @@
 #
 obj-$(CONFIG_INTEL_MIC_BUS) += mic_bus.o
 obj-$(CONFIG_SCIF_BUS) += scif_bus.o
+obj-$(CONFIG_MIC_COSM) += cosm_bus.o
diff --git a/drivers/misc/mic/bus/cosm_bus.c b/drivers/misc/mic/bus/cosm_bus.c
new file mode 100644
index 000000000000..1e36830f1a4e
--- /dev/null
+++ b/drivers/misc/mic/bus/cosm_bus.c
@@ -0,0 +1,141 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC COSM Bus Driver
+ */
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/idr.h>
+#include "cosm_bus.h"
+
+/* Unique numbering for cosm devices. */
+static DEFINE_IDA(cosm_index_ida);
+
+static int cosm_dev_probe(struct device *d)
+{
+	struct cosm_device *dev = dev_to_cosm(d);
+	struct cosm_driver *drv = drv_to_cosm(dev->dev.driver);
+
+	return drv->probe(dev);
+}
+
+static int cosm_dev_remove(struct device *d)
+{
+	struct cosm_device *dev = dev_to_cosm(d);
+	struct cosm_driver *drv = drv_to_cosm(dev->dev.driver);
+
+	drv->remove(dev);
+	return 0;
+}
+
+static struct bus_type cosm_bus = {
+	.name  = "cosm_bus",
+	.probe = cosm_dev_probe,
+	.remove = cosm_dev_remove,
+};
+
+int cosm_register_driver(struct cosm_driver *driver)
+{
+	driver->driver.bus = &cosm_bus;
+	return driver_register(&driver->driver);
+}
+EXPORT_SYMBOL_GPL(cosm_register_driver);
+
+void cosm_unregister_driver(struct cosm_driver *driver)
+{
+	driver_unregister(&driver->driver);
+}
+EXPORT_SYMBOL_GPL(cosm_unregister_driver);
+
+static inline void cosm_release_dev(struct device *d)
+{
+	struct cosm_device *cdev = dev_to_cosm(d);
+
+	kfree(cdev);
+}
+
+struct cosm_device *
+cosm_register_device(struct device *pdev, struct cosm_hw_ops *hw_ops)
+{
+	struct cosm_device *cdev;
+	int ret;
+
+	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+	if (!cdev)
+		return ERR_PTR(-ENOMEM);
+
+	cdev->dev.parent = pdev;
+	cdev->dev.release = cosm_release_dev;
+	cdev->hw_ops = hw_ops;
+	dev_set_drvdata(&cdev->dev, cdev);
+	cdev->dev.bus = &cosm_bus;
+
+	/* Assign a unique device index and hence name */
+	ret = ida_simple_get(&cosm_index_ida, 0, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto free_cdev;
+
+	cdev->index = ret;
+	cdev->dev.id = ret;
+	dev_set_name(&cdev->dev, "cosm-dev%u", cdev->index);
+
+	ret = device_register(&cdev->dev);
+	if (ret)
+		goto ida_remove;
+	return cdev;
+ida_remove:
+	ida_simple_remove(&cosm_index_ida, cdev->index);
+free_cdev:
+	kfree(cdev);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(cosm_register_device);
+
+void cosm_unregister_device(struct cosm_device *dev)
+{
+	int index = dev->index; /* save for after device release */
+
+	device_unregister(&dev->dev);
+	ida_simple_remove(&cosm_index_ida, index);
+}
+EXPORT_SYMBOL_GPL(cosm_unregister_device);
+
+struct cosm_device *cosm_find_cdev_by_id(int id)
+{
+	struct device *dev = subsys_find_device_by_id(&cosm_bus, id, NULL);
+
+	return dev ? container_of(dev, struct cosm_device, dev) : NULL;
+}
+EXPORT_SYMBOL_GPL(cosm_find_cdev_by_id);
+
+static int __init cosm_init(void)
+{
+	return bus_register(&cosm_bus);
+}
+
+static void __exit cosm_exit(void)
+{
+	bus_unregister(&cosm_bus);
+	ida_destroy(&cosm_index_ida);
+}
+
+core_initcall(cosm_init);
+module_exit(cosm_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Intel(R) MIC card OS state management bus driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/misc/mic/bus/cosm_bus.h b/drivers/misc/mic/bus/cosm_bus.h
new file mode 100644
index 000000000000..f7c57f266916
--- /dev/null
+++ b/drivers/misc/mic/bus/cosm_bus.h
@@ -0,0 +1,134 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC COSM Bus Driver
+ */
+#ifndef _COSM_BUS_H_
+#define _COSM_BUS_H_
+
+#include <linux/scif.h>
+#include <linux/mic_common.h>
+#include "../common/mic_dev.h"
+
+/**
+ * cosm_device - representation of a cosm device
+ *
+ * @attr_group: Pointer to list of sysfs attribute groups.
+ * @sdev: Device for sysfs entries.
+ * @state: MIC state.
+ * @shutdown_status: MIC status reported by card for shutdown/crashes.
+ * @shutdown_status_int: Internal shutdown status maintained by the driver
+ * @cosm_mutex: Mutex for synchronizing access to data structures.
+ * @reset_trigger_work: Work for triggering reset requests.
+ * @scif_work: Work for handling per device SCIF connections
+ * @cmdline: Kernel command line.
+ * @firmware: Firmware file name.
+ * @ramdisk: Ramdisk file name.
+ * @bootmode: Boot mode i.e. "linux" or "elf" for flash updates.
+ * @log_buf_addr: Log buffer address for MIC.
+ * @log_buf_len: Log buffer length address for MIC.
+ * @state_sysfs: Sysfs dirent for notifying ring 3 about MIC state changes.
+ * @hw_ops: the hardware bus ops for this device.
+ * @dev: underlying device.
+ * @index: unique position on the cosm bus
+ * @dbg_dir: debug fs directory
+ * @newepd: new endpoint from scif accept to be assigned to this cdev
+ * @epd: SCIF endpoint for this cdev
+ * @heartbeat_watchdog_enable: if heartbeat watchdog is enabled for this cdev
+ * @sysfs_heartbeat_enable: sysfs setting for disabling heartbeat notification
+ */
+struct cosm_device {
+	const struct attribute_group **attr_group;
+	struct device *sdev;
+	u8 state;
+	u8 shutdown_status;
+	u8 shutdown_status_int;
+	struct mutex cosm_mutex;
+	struct work_struct reset_trigger_work;
+	struct work_struct scif_work;
+	char *cmdline;
+	char *firmware;
+	char *ramdisk;
+	char *bootmode;
+	void *log_buf_addr;
+	int *log_buf_len;
+	struct kernfs_node *state_sysfs;
+	struct cosm_hw_ops *hw_ops;
+	struct device dev;
+	int index;
+	struct dentry *dbg_dir;
+	scif_epd_t newepd;
+	scif_epd_t epd;
+	bool heartbeat_watchdog_enable;
+	bool sysfs_heartbeat_enable;
+};
+
+/**
+ * cosm_driver - operations for a cosm driver
+ *
+ * @driver: underlying device driver (populate name and owner).
+ * @probe: the function to call when a device is found.  Returns 0 or -errno.
+ * @remove: the function to call when a device is removed.
+ */
+struct cosm_driver {
+	struct device_driver driver;
+	int (*probe)(struct cosm_device *dev);
+	void (*remove)(struct cosm_device *dev);
+};
+
+/**
+ * cosm_hw_ops - cosm bus ops
+ *
+ * @reset: trigger MIC reset
+ * @force_reset: force MIC reset
+ * @post_reset: inform MIC reset is complete
+ * @ready: is MIC ready for OS download
+ * @start: boot MIC
+ * @stop: prepare MIC for reset
+ * @family: return MIC HW family string
+ * @stepping: return MIC HW stepping string
+ * @aper: return MIC PCIe aperture
+ */
+struct cosm_hw_ops {
+	void (*reset)(struct cosm_device *cdev);
+	void (*force_reset)(struct cosm_device *cdev);
+	void (*post_reset)(struct cosm_device *cdev, enum mic_states state);
+	bool (*ready)(struct cosm_device *cdev);
+	int (*start)(struct cosm_device *cdev, int id);
+	void (*stop)(struct cosm_device *cdev, bool force);
+	ssize_t (*family)(struct cosm_device *cdev, char *buf);
+	ssize_t (*stepping)(struct cosm_device *cdev, char *buf);
+	struct mic_mw *(*aper)(struct cosm_device *cdev);
+};
+
+struct cosm_device *
+cosm_register_device(struct device *pdev, struct cosm_hw_ops *hw_ops);
+void cosm_unregister_device(struct cosm_device *dev);
+int cosm_register_driver(struct cosm_driver *drv);
+void cosm_unregister_driver(struct cosm_driver *drv);
+struct cosm_device *cosm_find_cdev_by_id(int id);
+
+static inline struct cosm_device *dev_to_cosm(struct device *dev)
+{
+	return container_of(dev, struct cosm_device, dev);
+}
+
+static inline struct cosm_driver *drv_to_cosm(struct device_driver *drv)
+{
+	return container_of(drv, struct cosm_driver, driver);
+}
+#endif /* _COSM_BUS_H */
diff --git a/drivers/misc/mic/bus/mic_bus.c b/drivers/misc/mic/bus/mic_bus.c
index 961ae90aae47..c64955d8cbc1 100644
--- a/drivers/misc/mic/bus/mic_bus.c
+++ b/drivers/misc/mic/bus/mic_bus.c
@@ -25,9 +25,6 @@
 #include <linux/idr.h>
 #include <linux/mic_bus.h>
 
-/* Unique numbering for mbus devices. */
-static DEFINE_IDA(mbus_index_ida);
-
 static ssize_t device_show(struct device *d,
 			   struct device_attribute *attr, char *buf)
 {
@@ -147,7 +144,8 @@ static void mbus_release_dev(struct device *d)
 
 struct mbus_device *
 mbus_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops,
-		     struct mbus_hw_ops *hw_ops, void __iomem *mmio_va)
+		     struct mbus_hw_ops *hw_ops, int index,
+		     void __iomem *mmio_va)
 {
 	int ret;
 	struct mbus_device *mbdev;
@@ -166,13 +164,7 @@ mbus_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops,
 	mbdev->dev.release = mbus_release_dev;
 	mbdev->hw_ops = hw_ops;
 	mbdev->dev.bus = &mic_bus;
-
-	/* Assign a unique device index and hence name. */
-	ret = ida_simple_get(&mbus_index_ida, 0, 0, GFP_KERNEL);
-	if (ret < 0)
-		goto free_mbdev;
-
-	mbdev->index = ret;
+	mbdev->index = index;
 	dev_set_name(&mbdev->dev, "mbus-dev%u", mbdev->index);
 	/*
 	 * device_register() causes the bus infrastructure to look for a
@@ -180,10 +172,8 @@ mbus_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops,
 	 */
 	ret = device_register(&mbdev->dev);
 	if (ret)
-		goto ida_remove;
+		goto free_mbdev;
 	return mbdev;
-ida_remove:
-	ida_simple_remove(&mbus_index_ida, mbdev->index);
 free_mbdev:
 	kfree(mbdev);
 	return ERR_PTR(ret);
@@ -192,10 +182,7 @@ EXPORT_SYMBOL_GPL(mbus_register_device);
 
 void mbus_unregister_device(struct mbus_device *mbdev)
 {
-	int index = mbdev->index; /* save for after device release */
-
 	device_unregister(&mbdev->dev);
-	ida_simple_remove(&mbus_index_ida, index);
 }
 EXPORT_SYMBOL_GPL(mbus_unregister_device);
 
@@ -207,7 +194,6 @@ static int __init mbus_init(void)
 static void __exit mbus_exit(void)
 {
 	bus_unregister(&mic_bus);
-	ida_destroy(&mbus_index_ida);
 }
 
 core_initcall(mbus_init);
diff --git a/drivers/misc/mic/bus/scif_bus.c b/drivers/misc/mic/bus/scif_bus.c
index 2da7ceed015d..fd2702143022 100644
--- a/drivers/misc/mic/bus/scif_bus.c
+++ b/drivers/misc/mic/bus/scif_bus.c
@@ -28,7 +28,6 @@ static ssize_t device_show(struct device *d,
 
 	return sprintf(buf, "0x%04x\n", dev->id.device);
 }
-
 static DEVICE_ATTR_RO(device);
 
 static ssize_t vendor_show(struct device *d,
@@ -38,7 +37,6 @@ static ssize_t vendor_show(struct device *d,
 
 	return sprintf(buf, "0x%04x\n", dev->id.vendor);
 }
-
 static DEVICE_ATTR_RO(vendor);
 
 static ssize_t modalias_show(struct device *d,
@@ -49,7 +47,6 @@ static ssize_t modalias_show(struct device *d,
 	return sprintf(buf, "scif:d%08Xv%08X\n",
 		       dev->id.device, dev->id.vendor);
 }
-
 static DEVICE_ATTR_RO(modalias);
 
 static struct attribute *scif_dev_attrs[] = {
@@ -144,7 +141,8 @@ struct scif_hw_dev *
 scif_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops,
 		     struct scif_hw_ops *hw_ops, u8 dnode, u8 snode,
 		     struct mic_mw *mmio, struct mic_mw *aper, void *dp,
-		     void __iomem *rdp, struct dma_chan **chan, int num_chan)
+		     void __iomem *rdp, struct dma_chan **chan, int num_chan,
+		     bool card_rel_da)
 {
 	int ret;
 	struct scif_hw_dev *sdev;
@@ -171,6 +169,7 @@ scif_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops,
 	dma_set_mask(&sdev->dev, DMA_BIT_MASK(64));
 	sdev->dma_ch = chan;
 	sdev->num_dma_ch = num_chan;
+	sdev->card_rel_da = card_rel_da;
 	dev_set_name(&sdev->dev, "scif-dev%u", sdev->dnode);
 	/*
 	 * device_register() causes the bus infrastructure to look for a
diff --git a/drivers/misc/mic/bus/scif_bus.h b/drivers/misc/mic/bus/scif_bus.h
index 335a228a8236..94f29ac608b6 100644
--- a/drivers/misc/mic/bus/scif_bus.h
+++ b/drivers/misc/mic/bus/scif_bus.h
@@ -46,6 +46,8 @@ struct scif_hw_dev_id {
  * @rdp - Remote device page
  * @dma_ch - Array of DMA channels
  * @num_dma_ch - Number of DMA channels available
+ * @card_rel_da - Set to true if DMA addresses programmed in the DMA engine
+ *		are relative to the card point of view
  */
 struct scif_hw_dev {
 	struct scif_hw_ops *hw_ops;
@@ -59,6 +61,7 @@ struct scif_hw_dev {
 	void __iomem *rdp;
 	struct dma_chan **dma_ch;
 	int num_dma_ch;
+	bool card_rel_da;
 };
 
 /**
@@ -114,7 +117,8 @@ scif_register_device(struct device *pdev, int id,
 		     struct scif_hw_ops *hw_ops, u8 dnode, u8 snode,
 		     struct mic_mw *mmio, struct mic_mw *aper,
 		     void *dp, void __iomem *rdp,
-		     struct dma_chan **chan, int num_chan);
+		     struct dma_chan **chan, int num_chan,
+		     bool card_rel_da);
 void scif_unregister_device(struct scif_hw_dev *sdev);
 
 static inline struct scif_hw_dev *dev_to_scif(struct device *dev)
diff --git a/drivers/misc/mic/card/mic_device.c b/drivers/misc/mic/card/mic_device.c
index 6338908b2252..d0edaf7e0cd5 100644
--- a/drivers/misc/mic/card/mic_device.c
+++ b/drivers/misc/mic/card/mic_device.c
@@ -37,71 +37,6 @@
 #include "mic_virtio.h"
 
 static struct mic_driver *g_drv;
-static struct mic_irq *shutdown_cookie;
-
-static void mic_notify_host(u8 state)
-{
-	struct mic_driver *mdrv = g_drv;
-	struct mic_bootparam __iomem *bootparam = mdrv->dp;
-
-	iowrite8(state, &bootparam->shutdown_status);
-	dev_dbg(mdrv->dev, "%s %d system_state %d\n",
-		__func__, __LINE__, state);
-	mic_send_intr(&mdrv->mdev, ioread8(&bootparam->c2h_shutdown_db));
-}
-
-static int mic_panic_event(struct notifier_block *this, unsigned long event,
-		void *ptr)
-{
-	struct mic_driver *mdrv = g_drv;
-	struct mic_bootparam __iomem *bootparam = mdrv->dp;
-
-	iowrite8(-1, &bootparam->h2c_config_db);
-	iowrite8(-1, &bootparam->h2c_shutdown_db);
-	mic_notify_host(MIC_CRASHED);
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block mic_panic = {
-	.notifier_call  = mic_panic_event,
-};
-
-static irqreturn_t mic_shutdown_isr(int irq, void *data)
-{
-	struct mic_driver *mdrv = g_drv;
-	struct mic_bootparam __iomem *bootparam = mdrv->dp;
-
-	mic_ack_interrupt(&g_drv->mdev);
-	if (ioread8(&bootparam->shutdown_card))
-		orderly_poweroff(true);
-	return IRQ_HANDLED;
-}
-
-static int mic_shutdown_init(void)
-{
-	int rc = 0;
-	struct mic_driver *mdrv = g_drv;
-	struct mic_bootparam __iomem *bootparam = mdrv->dp;
-	int shutdown_db;
-
-	shutdown_db = mic_next_card_db();
-	shutdown_cookie = mic_request_card_irq(mic_shutdown_isr, NULL,
-					       "Shutdown", mdrv, shutdown_db);
-	if (IS_ERR(shutdown_cookie))
-		rc = PTR_ERR(shutdown_cookie);
-	else
-		iowrite8(shutdown_db, &bootparam->h2c_shutdown_db);
-	return rc;
-}
-
-static void mic_shutdown_uninit(void)
-{
-	struct mic_driver *mdrv = g_drv;
-	struct mic_bootparam __iomem *bootparam = mdrv->dp;
-
-	iowrite8(-1, &bootparam->h2c_shutdown_db);
-	mic_free_card_irq(shutdown_cookie, mdrv);
-}
 
 static int __init mic_dp_init(void)
 {
@@ -359,11 +294,7 @@ int __init mic_driver_init(struct mic_driver *mdrv)
 	u8 node_id;
 
 	g_drv = mdrv;
-	/*
-	 * Unloading the card module is not supported. The MIC card module
-	 * handles fundamental operations like host/card initiated shutdowns
-	 * and informing the host about card crashes and cannot be unloaded.
-	 */
+	/* Unloading the card module is not supported. */
 	if (!try_module_get(mdrv->dev->driver->owner)) {
 		rc = -ENODEV;
 		goto done;
@@ -374,12 +305,9 @@ int __init mic_driver_init(struct mic_driver *mdrv)
 	rc = mic_init_irq();
 	if (rc)
 		goto dp_uninit;
-	rc = mic_shutdown_init();
-	if (rc)
-		goto irq_uninit;
 	if (!mic_request_dma_chans(mdrv)) {
 		rc = -ENODEV;
-		goto shutdown_uninit;
+		goto irq_uninit;
 	}
 	rc = mic_devices_init(mdrv);
 	if (rc)
@@ -390,21 +318,18 @@ int __init mic_driver_init(struct mic_driver *mdrv)
 					   NULL, &scif_hw_ops,
 					   0, node_id, &mdrv->mdev.mmio, NULL,
 					   NULL, mdrv->dp, mdrv->dma_ch,
-					   mdrv->num_dma_ch);
+					   mdrv->num_dma_ch, true);
 	if (IS_ERR(mdrv->scdev)) {
 		rc = PTR_ERR(mdrv->scdev);
 		goto device_uninit;
 	}
 	mic_create_card_debug_dir(mdrv);
-	atomic_notifier_chain_register(&panic_notifier_list, &mic_panic);
 done:
 	return rc;
 device_uninit:
 	mic_devices_uninit(mdrv);
 dma_free:
 	mic_free_dma_chans(mdrv);
-shutdown_uninit:
-	mic_shutdown_uninit();
 irq_uninit:
 	mic_uninit_irq();
 dp_uninit:
@@ -425,13 +350,6 @@ void mic_driver_uninit(struct mic_driver *mdrv)
 	scif_unregister_device(mdrv->scdev);
 	mic_devices_uninit(mdrv);
 	mic_free_dma_chans(mdrv);
-	/*
-	 * Inform the host about the shutdown status i.e. poweroff/restart etc.
-	 * The module cannot be unloaded so the only code path to call
-	 * mic_devices_uninit(..) is the shutdown callback.
-	 */
-	mic_notify_host(system_state);
-	mic_shutdown_uninit();
 	mic_uninit_irq();
 	mic_dp_uninit();
 	module_put(mdrv->dev->driver->owner);
diff --git a/drivers/misc/mic/card/mic_x100.c b/drivers/misc/mic/card/mic_x100.c
index 77fd41781c2e..b2958ce2368c 100644
--- a/drivers/misc/mic/card/mic_x100.c
+++ b/drivers/misc/mic/card/mic_x100.c
@@ -261,7 +261,7 @@ static int __init mic_probe(struct platform_device *pdev)
 	mic_hw_intr_init(mdrv);
 	platform_set_drvdata(pdev, mdrv);
 	mdrv->dma_mbdev = mbus_register_device(mdrv->dev, MBUS_DEV_DMA_MIC,
-					       NULL, &mbus_hw_ops,
+					       NULL, &mbus_hw_ops, 0,
 					       mdrv->mdev.mmio.va);
 	if (IS_ERR(mdrv->dma_mbdev)) {
 		rc = PTR_ERR(mdrv->dma_mbdev);
diff --git a/drivers/misc/mic/common/mic_dev.h b/drivers/misc/mic/common/mic_dev.h
index 0b58c46045dc..50776772ebdf 100644
--- a/drivers/misc/mic/common/mic_dev.h
+++ b/drivers/misc/mic/common/mic_dev.h
@@ -21,6 +21,19 @@
 #ifndef __MIC_DEV_H__
 #define __MIC_DEV_H__
 
+/* The maximum number of MIC devices supported in a single host system. */
+#define MIC_MAX_NUM_DEVS 128
+
+/**
+ * enum mic_hw_family - The hardware family to which a device belongs.
+ */
+enum mic_hw_family {
+	MIC_FAMILY_X100 = 0,
+	MIC_FAMILY_X200,
+	MIC_FAMILY_UNKNOWN,
+	MIC_FAMILY_LAST
+};
+
 /**
  * struct mic_mw - MIC memory window
  *
diff --git a/drivers/misc/mic/cosm/Makefile b/drivers/misc/mic/cosm/Makefile
new file mode 100644
index 000000000000..b85d4d49df46
--- /dev/null
+++ b/drivers/misc/mic/cosm/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile - Intel MIC Coprocessor State Management (COSM) Driver
+# Copyright(c) 2015, Intel Corporation.
+#
+obj-$(CONFIG_MIC_COSM) += mic_cosm.o
+
+mic_cosm-objs := cosm_main.o
+mic_cosm-objs += cosm_debugfs.o
+mic_cosm-objs += cosm_sysfs.o
+mic_cosm-objs += cosm_scif_server.o
diff --git a/drivers/misc/mic/cosm/cosm_debugfs.c b/drivers/misc/mic/cosm/cosm_debugfs.c
new file mode 100644
index 000000000000..30d953dff4fa
--- /dev/null
+++ b/drivers/misc/mic/cosm/cosm_debugfs.c
@@ -0,0 +1,155 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Coprocessor State Management (COSM) Driver
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include "cosm_main.h"
+
+/* Debugfs parent dir */
+static struct dentry *cosm_dbg;
+
+/**
+ * cosm_log_buf_show - Display MIC kernel log buffer
+ *
+ * log_buf addr/len is read from System.map by user space
+ * and populated in sysfs entries.
+ */
+static int cosm_log_buf_show(struct seq_file *s, void *unused)
+{
+	void __iomem *log_buf_va;
+	int __iomem *log_buf_len_va;
+	struct cosm_device *cdev = s->private;
+	void *kva;
+	int size;
+	u64 aper_offset;
+
+	if (!cdev || !cdev->log_buf_addr || !cdev->log_buf_len)
+		goto done;
+
+	mutex_lock(&cdev->cosm_mutex);
+	switch (cdev->state) {
+	case MIC_BOOTING:
+	case MIC_ONLINE:
+	case MIC_SHUTTING_DOWN:
+		break;
+	default:
+		goto unlock;
+	}
+
+	/*
+	 * Card kernel will never be relocated and any kernel text/data mapping
+	 * can be translated to phys address by subtracting __START_KERNEL_map.
+	 */
+	aper_offset = (u64)cdev->log_buf_len - __START_KERNEL_map;
+	log_buf_len_va = cdev->hw_ops->aper(cdev)->va + aper_offset;
+	aper_offset = (u64)cdev->log_buf_addr - __START_KERNEL_map;
+	log_buf_va = cdev->hw_ops->aper(cdev)->va + aper_offset;
+
+	size = ioread32(log_buf_len_va);
+	kva = kmalloc(size, GFP_KERNEL);
+	if (!kva)
+		goto unlock;
+
+	memcpy_fromio(kva, log_buf_va, size);
+	seq_write(s, kva, size);
+	kfree(kva);
+unlock:
+	mutex_unlock(&cdev->cosm_mutex);
+done:
+	return 0;
+}
+
+static int cosm_log_buf_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cosm_log_buf_show, inode->i_private);
+}
+
+static const struct file_operations log_buf_ops = {
+	.owner   = THIS_MODULE,
+	.open    = cosm_log_buf_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release
+};
+
+/**
+ * cosm_force_reset_show - Force MIC reset
+ *
+ * Invokes the force_reset COSM bus op instead of the standard reset
+ * op in case a force reset of the MIC device is required
+ */
+static int cosm_force_reset_show(struct seq_file *s, void *pos)
+{
+	struct cosm_device *cdev = s->private;
+
+	cosm_stop(cdev, true);
+	return 0;
+}
+
+static int cosm_force_reset_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cosm_force_reset_show, inode->i_private);
+}
+
+static const struct file_operations force_reset_ops = {
+	.owner   = THIS_MODULE,
+	.open    = cosm_force_reset_debug_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release
+};
+
+void cosm_create_debug_dir(struct cosm_device *cdev)
+{
+	char name[16];
+
+	if (!cosm_dbg)
+		return;
+
+	scnprintf(name, sizeof(name), "mic%d", cdev->index);
+	cdev->dbg_dir = debugfs_create_dir(name, cosm_dbg);
+	if (!cdev->dbg_dir)
+		return;
+
+	debugfs_create_file("log_buf", 0444, cdev->dbg_dir, cdev, &log_buf_ops);
+	debugfs_create_file("force_reset", 0444, cdev->dbg_dir, cdev,
+			    &force_reset_ops);
+}
+
+void cosm_delete_debug_dir(struct cosm_device *cdev)
+{
+	if (!cdev->dbg_dir)
+		return;
+
+	debugfs_remove_recursive(cdev->dbg_dir);
+}
+
+void cosm_init_debugfs(void)
+{
+	cosm_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL);
+	if (!cosm_dbg)
+		pr_err("can't create debugfs dir\n");
+}
+
+void cosm_exit_debugfs(void)
+{
+	debugfs_remove(cosm_dbg);
+}
diff --git a/drivers/misc/mic/cosm/cosm_main.c b/drivers/misc/mic/cosm/cosm_main.c
new file mode 100644
index 000000000000..4b4b356c797d
--- /dev/null
+++ b/drivers/misc/mic/cosm/cosm_main.c
@@ -0,0 +1,388 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Coprocessor State Management (COSM) Driver
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include "cosm_main.h"
+
+static const char cosm_driver_name[] = "mic";
+
+/* COSM ID allocator */
+static struct ida g_cosm_ida;
+/* Class of MIC devices for sysfs accessibility. */
+static struct class *g_cosm_class;
+/* Number of MIC devices */
+static atomic_t g_num_dev;
+
+/**
+ * cosm_hw_reset - Issue a HW reset for the MIC device
+ * @cdev: pointer to cosm_device instance
+ */
+static void cosm_hw_reset(struct cosm_device *cdev, bool force)
+{
+	int i;
+
+#define MIC_RESET_TO (45)
+	if (force && cdev->hw_ops->force_reset)
+		cdev->hw_ops->force_reset(cdev);
+	else
+		cdev->hw_ops->reset(cdev);
+
+	for (i = 0; i < MIC_RESET_TO; i++) {
+		if (cdev->hw_ops->ready(cdev)) {
+			cosm_set_state(cdev, MIC_READY);
+			return;
+		}
+		/*
+		 * Resets typically take 10s of seconds to complete.
+		 * Since an MMIO read is required to check if the
+		 * firmware is ready or not, a 1 second delay works nicely.
+		 */
+		msleep(1000);
+	}
+	cosm_set_state(cdev, MIC_RESET_FAILED);
+}
+
+/**
+ * cosm_start - Start the MIC
+ * @cdev: pointer to cosm_device instance
+ *
+ * This function prepares an MIC for boot and initiates boot.
+ * RETURNS: An appropriate -ERRNO error value on error, or 0 for success.
+ */
+int cosm_start(struct cosm_device *cdev)
+{
+	const struct cred *orig_cred;
+	struct cred *override_cred;
+	int rc;
+
+	mutex_lock(&cdev->cosm_mutex);
+	if (!cdev->bootmode) {
+		dev_err(&cdev->dev, "%s %d bootmode not set\n",
+			__func__, __LINE__);
+		rc = -EINVAL;
+		goto unlock_ret;
+	}
+retry:
+	if (cdev->state != MIC_READY) {
+		dev_err(&cdev->dev, "%s %d MIC state not READY\n",
+			__func__, __LINE__);
+		rc = -EINVAL;
+		goto unlock_ret;
+	}
+	if (!cdev->hw_ops->ready(cdev)) {
+		cosm_hw_reset(cdev, false);
+		/*
+		 * The state will either be MIC_READY if the reset succeeded
+		 * or MIC_RESET_FAILED if the firmware reset failed.
+		 */
+		goto retry;
+	}
+
+	/*
+	 * Set credentials to root to allow non-root user to download initramsfs
+	 * with 600 permissions
+	 */
+	override_cred = prepare_creds();
+	if (!override_cred) {
+		dev_err(&cdev->dev, "%s %d prepare_creds failed\n",
+			__func__, __LINE__);
+		rc = -ENOMEM;
+		goto unlock_ret;
+	}
+	override_cred->fsuid = GLOBAL_ROOT_UID;
+	orig_cred = override_creds(override_cred);
+
+	rc = cdev->hw_ops->start(cdev, cdev->index);
+
+	revert_creds(orig_cred);
+	put_cred(override_cred);
+	if (rc)
+		goto unlock_ret;
+
+	/*
+	 * If linux is being booted, card is treated 'online' only
+	 * when the scif interface in the card is up. If anything else
+	 * is booted, we set card to 'online' immediately.
+	 */
+	if (!strcmp(cdev->bootmode, "linux"))
+		cosm_set_state(cdev, MIC_BOOTING);
+	else
+		cosm_set_state(cdev, MIC_ONLINE);
+unlock_ret:
+	mutex_unlock(&cdev->cosm_mutex);
+	if (rc)
+		dev_err(&cdev->dev, "cosm_start failed rc %d\n", rc);
+	return rc;
+}
+
+/**
+ * cosm_stop - Prepare the MIC for reset and trigger reset
+ * @cdev: pointer to cosm_device instance
+ * @force: force a MIC to reset even if it is already reset and ready.
+ *
+ * RETURNS: None
+ */
+void cosm_stop(struct cosm_device *cdev, bool force)
+{
+	mutex_lock(&cdev->cosm_mutex);
+	if (cdev->state != MIC_READY || force) {
+		/*
+		 * Don't call hw_ops if they have been called previously.
+		 * stop(..) calls device_unregister and will crash the system if
+		 * called multiple times.
+		 */
+		bool call_hw_ops = cdev->state != MIC_RESET_FAILED &&
+					cdev->state != MIC_READY;
+
+		if (cdev->state != MIC_RESETTING)
+			cosm_set_state(cdev, MIC_RESETTING);
+		cdev->heartbeat_watchdog_enable = false;
+		if (call_hw_ops)
+			cdev->hw_ops->stop(cdev, force);
+		cosm_hw_reset(cdev, force);
+		cosm_set_shutdown_status(cdev, MIC_NOP);
+		if (call_hw_ops && cdev->hw_ops->post_reset)
+			cdev->hw_ops->post_reset(cdev, cdev->state);
+	}
+	mutex_unlock(&cdev->cosm_mutex);
+	flush_work(&cdev->scif_work);
+}
+
+/**
+ * cosm_reset_trigger_work - Trigger MIC reset
+ * @work: The work structure
+ *
+ * This work is scheduled whenever the host wants to reset the MIC.
+ */
+static void cosm_reset_trigger_work(struct work_struct *work)
+{
+	struct cosm_device *cdev = container_of(work, struct cosm_device,
+						reset_trigger_work);
+	cosm_stop(cdev, false);
+}
+
+/**
+ * cosm_reset - Schedule MIC reset
+ * @cdev: pointer to cosm_device instance
+ *
+ * RETURNS: An -EINVAL if the card is already READY or 0 for success.
+ */
+int cosm_reset(struct cosm_device *cdev)
+{
+	int rc = 0;
+
+	mutex_lock(&cdev->cosm_mutex);
+	if (cdev->state != MIC_READY) {
+		cosm_set_state(cdev, MIC_RESETTING);
+		schedule_work(&cdev->reset_trigger_work);
+	} else {
+		dev_err(&cdev->dev, "%s %d MIC is READY\n", __func__, __LINE__);
+		rc = -EINVAL;
+	}
+	mutex_unlock(&cdev->cosm_mutex);
+	return rc;
+}
+
+/**
+ * cosm_shutdown - Initiate MIC shutdown.
+ * @cdev: pointer to cosm_device instance
+ *
+ * RETURNS: None
+ */
+int cosm_shutdown(struct cosm_device *cdev)
+{
+	struct cosm_msg msg = { .id = COSM_MSG_SHUTDOWN };
+	int rc = 0;
+
+	mutex_lock(&cdev->cosm_mutex);
+	if (cdev->state != MIC_ONLINE) {
+		rc = -EINVAL;
+		dev_err(&cdev->dev, "%s %d skipping shutdown in state: %s\n",
+			__func__, __LINE__, cosm_state_string[cdev->state]);
+		goto err;
+	}
+
+	if (!cdev->epd) {
+		rc = -ENOTCONN;
+		dev_err(&cdev->dev, "%s %d scif endpoint not connected rc %d\n",
+			__func__, __LINE__, rc);
+		goto err;
+	}
+
+	rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
+	if (rc < 0) {
+		dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
+			__func__, __LINE__, rc);
+		goto err;
+	}
+	cdev->heartbeat_watchdog_enable = false;
+	cosm_set_state(cdev, MIC_SHUTTING_DOWN);
+	rc = 0;
+err:
+	mutex_unlock(&cdev->cosm_mutex);
+	return rc;
+}
+
+static int cosm_driver_probe(struct cosm_device *cdev)
+{
+	int rc;
+
+	/* Initialize SCIF server at first probe */
+	if (atomic_add_return(1, &g_num_dev) == 1) {
+		rc = cosm_scif_init();
+		if (rc)
+			goto scif_exit;
+	}
+	mutex_init(&cdev->cosm_mutex);
+	INIT_WORK(&cdev->reset_trigger_work, cosm_reset_trigger_work);
+	INIT_WORK(&cdev->scif_work, cosm_scif_work);
+	cdev->sysfs_heartbeat_enable = true;
+	cosm_sysfs_init(cdev);
+	cdev->sdev = device_create_with_groups(g_cosm_class, cdev->dev.parent,
+			       MKDEV(0, cdev->index), cdev, cdev->attr_group,
+			       "mic%d", cdev->index);
+	if (IS_ERR(cdev->sdev)) {
+		rc = PTR_ERR(cdev->sdev);
+		dev_err(&cdev->dev, "device_create_with_groups failed rc %d\n",
+			rc);
+		goto scif_exit;
+	}
+
+	cdev->state_sysfs = sysfs_get_dirent(cdev->sdev->kobj.sd,
+		"state");
+	if (!cdev->state_sysfs) {
+		rc = -ENODEV;
+		dev_err(&cdev->dev, "sysfs_get_dirent failed rc %d\n", rc);
+		goto destroy_device;
+	}
+	cosm_create_debug_dir(cdev);
+	return 0;
+destroy_device:
+	device_destroy(g_cosm_class, MKDEV(0, cdev->index));
+scif_exit:
+	if (atomic_dec_and_test(&g_num_dev))
+		cosm_scif_exit();
+	return rc;
+}
+
+static void cosm_driver_remove(struct cosm_device *cdev)
+{
+	cosm_delete_debug_dir(cdev);
+	sysfs_put(cdev->state_sysfs);
+	device_destroy(g_cosm_class, MKDEV(0, cdev->index));
+	flush_work(&cdev->reset_trigger_work);
+	cosm_stop(cdev, false);
+	if (atomic_dec_and_test(&g_num_dev))
+		cosm_scif_exit();
+
+	/* These sysfs entries might have allocated */
+	kfree(cdev->cmdline);
+	kfree(cdev->firmware);
+	kfree(cdev->ramdisk);
+	kfree(cdev->bootmode);
+}
+
+static int cosm_suspend(struct device *dev)
+{
+	struct cosm_device *cdev = dev_to_cosm(dev);
+
+	mutex_lock(&cdev->cosm_mutex);
+	switch (cdev->state) {
+	/**
+	 * Suspend/freeze hooks in userspace have already shutdown the card.
+	 * Card should be 'ready' in most cases. It is however possible that
+	 * some userspace application initiated a boot. In those cases, we
+	 * simply reset the card.
+	 */
+	case MIC_ONLINE:
+	case MIC_BOOTING:
+	case MIC_SHUTTING_DOWN:
+		mutex_unlock(&cdev->cosm_mutex);
+		cosm_stop(cdev, false);
+		break;
+	default:
+		mutex_unlock(&cdev->cosm_mutex);
+		break;
+	}
+	return 0;
+}
+
+static const struct dev_pm_ops cosm_pm_ops = {
+	.suspend = cosm_suspend,
+	.freeze = cosm_suspend
+};
+
+static struct cosm_driver cosm_driver = {
+	.driver = {
+		.name =  KBUILD_MODNAME,
+		.owner = THIS_MODULE,
+		.pm = &cosm_pm_ops,
+	},
+	.probe = cosm_driver_probe,
+	.remove = cosm_driver_remove
+};
+
+static int __init cosm_init(void)
+{
+	int ret;
+
+	cosm_init_debugfs();
+
+	g_cosm_class = class_create(THIS_MODULE, cosm_driver_name);
+	if (IS_ERR(g_cosm_class)) {
+		ret = PTR_ERR(g_cosm_class);
+		pr_err("class_create failed ret %d\n", ret);
+		goto cleanup_debugfs;
+	}
+
+	ida_init(&g_cosm_ida);
+	ret = cosm_register_driver(&cosm_driver);
+	if (ret) {
+		pr_err("cosm_register_driver failed ret %d\n", ret);
+		goto ida_destroy;
+	}
+	return 0;
+ida_destroy:
+	ida_destroy(&g_cosm_ida);
+	class_destroy(g_cosm_class);
+cleanup_debugfs:
+	cosm_exit_debugfs();
+	return ret;
+}
+
+static void __exit cosm_exit(void)
+{
+	cosm_unregister_driver(&cosm_driver);
+	ida_destroy(&g_cosm_ida);
+	class_destroy(g_cosm_class);
+	cosm_exit_debugfs();
+}
+
+module_init(cosm_init);
+module_exit(cosm_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Intel(R) MIC Coprocessor State Management (COSM) Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/misc/mic/cosm/cosm_main.h b/drivers/misc/mic/cosm/cosm_main.h
new file mode 100644
index 000000000000..f01156fca881
--- /dev/null
+++ b/drivers/misc/mic/cosm/cosm_main.h
@@ -0,0 +1,70 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Coprocessor State Management (COSM) Driver
+ *
+ */
+#ifndef _COSM_COSM_H_
+#define _COSM_COSM_H_
+
+#include <linux/scif.h>
+#include "../bus/cosm_bus.h"
+
+#define COSM_HEARTBEAT_SEND_SEC 30
+#define SCIF_COSM_LISTEN_PORT  201
+
+/**
+ * enum COSM msg id's
+ * @COSM_MSG_SHUTDOWN: host->card trigger shutdown
+ * @COSM_MSG_SYNC_TIME: host->card send host time to card to sync time
+ * @COSM_MSG_HEARTBEAT: card->host heartbeat
+ * @COSM_MSG_SHUTDOWN_STATUS: card->host with shutdown status as payload
+ */
+enum cosm_msg_id {
+	COSM_MSG_SHUTDOWN,
+	COSM_MSG_SYNC_TIME,
+	COSM_MSG_HEARTBEAT,
+	COSM_MSG_SHUTDOWN_STATUS,
+};
+
+struct cosm_msg {
+	u64 id;
+	union {
+		u64 shutdown_status;
+		struct timespec64 timespec;
+	};
+};
+
+extern const char * const cosm_state_string[];
+extern const char * const cosm_shutdown_status_string[];
+
+void cosm_sysfs_init(struct cosm_device *cdev);
+int cosm_start(struct cosm_device *cdev);
+void cosm_stop(struct cosm_device *cdev, bool force);
+int cosm_reset(struct cosm_device *cdev);
+int cosm_shutdown(struct cosm_device *cdev);
+void cosm_set_state(struct cosm_device *cdev, u8 state);
+void cosm_set_shutdown_status(struct cosm_device *cdev, u8 status);
+void cosm_init_debugfs(void);
+void cosm_exit_debugfs(void);
+void cosm_create_debug_dir(struct cosm_device *cdev);
+void cosm_delete_debug_dir(struct cosm_device *cdev);
+int cosm_scif_init(void);
+void cosm_scif_exit(void);
+void cosm_scif_work(struct work_struct *work);
+
+#endif
diff --git a/drivers/misc/mic/cosm/cosm_scif_server.c b/drivers/misc/mic/cosm/cosm_scif_server.c
new file mode 100644
index 000000000000..5696df4326b5
--- /dev/null
+++ b/drivers/misc/mic/cosm/cosm_scif_server.c
@@ -0,0 +1,405 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Coprocessor State Management (COSM) Driver
+ *
+ */
+#include <linux/kthread.h>
+#include "cosm_main.h"
+
+/*
+ * The COSM driver uses SCIF to communicate between the management node and the
+ * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b)
+ * receive a shutdown status back from the card upon completion of shutdown and
+ * (c) receive periodic heartbeat messages from the card used to deduce if the
+ * card has crashed.
+ *
+ * A COSM server consisting of a SCIF listening endpoint waits for incoming
+ * connections from the card. Upon acceptance of the connection, a separate
+ * work-item is scheduled to handle SCIF message processing for that card. The
+ * life-time of this work-item is therefore the time from which the connection
+ * from a card is accepted to the time at which the connection is closed. A new
+ * work-item starts each time the card boots and is alive till the card (a)
+ * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is
+ * unloaded.
+ *
+ * From the point of view of COSM interactions with SCIF during card
+ * shutdown, reset and crash are as follows:
+ *
+ * Card shutdown
+ * -------------
+ * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN
+ *    message from the host.
+ * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting
+ *    in scif_remove(..) getting called on the card
+ * 3. scif_remove -> scif_stop -> scif_handle_remove_node ->
+ *    scif_peer_unregister_device -> device_unregister for the host peer device
+ * 4. During device_unregister remove(..) method of cosm_client is invoked which
+ *    closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT
+ *    message being sent to host SCIF. SCIF_DISCNCT message processing on the
+ *    host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
+ *    up the host COSM thread blocked in scif_poll(..) resulting in
+ *    scif_poll(..)  returning POLLHUP.
+ * 5. On the card, scif_peer_release_dev is next called which results in an
+ *    SCIF_EXIT message being sent to the host and after receiving the
+ *    SCIF_EXIT_ACK from the host the peer device teardown on the card is
+ *    complete.
+ * 6. As part of the SCIF_EXIT message processing on the host, host sends a
+ *    SCIF_REMOVE_NODE to itself corresponding to the card being removed. This
+ *    starts a similar SCIF peer device teardown sequence on the host
+ *    corresponding to the card being shut down.
+ *
+ * Card reset
+ * ----------
+ * The case of interest here is when the card has not been previously shut down
+ * since most of the steps below are skipped in that case:
+
+ * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver
+ *    which unregisters the SCIF HW device resulting in scif_remove(..) being
+ *    called on the host.
+ * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a
+ *    SCIF_EXIT message being sent to the card.
+ * 3. The card executes scif_stop() as part of SCIF_EXIT message
+ *    processing. This results in the COSM endpoint on the card being closed and
+ *    the SCIF host peer device on the card getting unregistered similar to
+ *    steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
+ *    host returns POLLHUP as a result.
+ * 4. On the host, card peer device unregister and SCIF HW remove(..) also
+ *    subsequently complete.
+ *
+ * Card crash
+ * ----------
+ * If a reset is issued after the card has crashed, there is no SCIF_DISCNT
+ * message from the card which would result in scif_poll(..) returning
+ * POLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
+ * message to itself resulting in the card SCIF peer device being unregistered,
+ * this results in a scif_peer_release_dev -> scif_cleanup_scifdev->
+ * scif_invalidate_ep call sequence which sets the endpoint state to
+ * DISCONNECTED and results in scif_poll(..) returning POLLHUP.
+ */
+
+#define COSM_SCIF_BACKLOG 16
+#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10
+#define COSM_HEARTBEAT_TIMEOUT_SEC \
+		(COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC)
+#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC)
+
+static struct task_struct *server_thread;
+static scif_epd_t listen_epd;
+
+/* Publish MIC card's shutdown status to user space MIC daemon */
+static void cosm_update_mic_status(struct cosm_device *cdev)
+{
+	if (cdev->shutdown_status_int != MIC_NOP) {
+		cosm_set_shutdown_status(cdev, cdev->shutdown_status_int);
+		cdev->shutdown_status_int = MIC_NOP;
+	}
+}
+
+/* Store MIC card's shutdown status internally when it is received */
+static void cosm_shutdown_status_int(struct cosm_device *cdev,
+				     enum mic_status shutdown_status)
+{
+	switch (shutdown_status) {
+	case MIC_HALTED:
+	case MIC_POWER_OFF:
+	case MIC_RESTART:
+	case MIC_CRASHED:
+		break;
+	default:
+		dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n",
+			__func__, __LINE__, shutdown_status);
+		return;
+	};
+	cdev->shutdown_status_int = shutdown_status;
+	cdev->heartbeat_watchdog_enable = false;
+
+	if (cdev->state != MIC_SHUTTING_DOWN)
+		cosm_set_state(cdev, MIC_SHUTTING_DOWN);
+}
+
+/* Non-blocking recv. Read and process all available messages */
+static void cosm_scif_recv(struct cosm_device *cdev)
+{
+	struct cosm_msg msg;
+	int rc;
+
+	while (1) {
+		rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0);
+		if (!rc) {
+			break;
+		} else if (rc < 0) {
+			dev_dbg(&cdev->dev, "%s: %d rc %d\n",
+				__func__, __LINE__, rc);
+			break;
+		}
+		dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n",
+			__func__, __LINE__, rc, msg.id);
+
+		switch (msg.id) {
+		case COSM_MSG_SHUTDOWN_STATUS:
+			cosm_shutdown_status_int(cdev, msg.shutdown_status);
+			break;
+		case COSM_MSG_HEARTBEAT:
+			/* Nothing to do, heartbeat only unblocks scif_poll */
+			break;
+		default:
+			dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n",
+				__func__, __LINE__, msg.id);
+			break;
+		}
+	}
+}
+
+/* Publish crashed status for this MIC card */
+static void cosm_set_crashed(struct cosm_device *cdev)
+{
+	dev_err(&cdev->dev, "node alive timeout\n");
+	cosm_shutdown_status_int(cdev, MIC_CRASHED);
+	cosm_update_mic_status(cdev);
+}
+
+/* Send host time to the MIC card to sync system time between host and MIC */
+static void cosm_send_time(struct cosm_device *cdev)
+{
+	struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME };
+	int rc;
+
+	getnstimeofday64(&msg.timespec);
+	rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
+	if (rc < 0)
+		dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
+			__func__, __LINE__, rc);
+}
+
+/*
+ * Close this cosm_device's endpoint after its peer endpoint on the card has
+ * been closed. In all cases except MIC card crash POLLHUP on the host is
+ * triggered by the client's endpoint being closed.
+ */
+static void cosm_scif_close(struct cosm_device *cdev)
+{
+	/*
+	 * Because SHUTDOWN_STATUS message is sent by the MIC cards in the
+	 * reboot notifier when shutdown is still not complete, we notify mpssd
+	 * to reset the card when SCIF endpoint is closed.
+	 */
+	cosm_update_mic_status(cdev);
+	scif_close(cdev->epd);
+	cdev->epd = NULL;
+	dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
+}
+
+/*
+ * Set card state to ONLINE when a new SCIF connection from a MIC card is
+ * received. Normally the state is BOOTING when the connection comes in, but can
+ * be ONLINE if cosm_client driver on the card was unloaded and then reloaded.
+ */
+static int cosm_set_online(struct cosm_device *cdev)
+{
+	int rc = 0;
+
+	if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) {
+		cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable;
+		cdev->epd = cdev->newepd;
+		if (cdev->state == MIC_BOOTING)
+			cosm_set_state(cdev, MIC_ONLINE);
+		cosm_send_time(cdev);
+		dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
+	} else {
+		dev_warn(&cdev->dev, "%s %d not going online in state: %s\n",
+			 __func__, __LINE__, cosm_state_string[cdev->state]);
+		rc = -EINVAL;
+	}
+	/* Drop reference acquired by bus_find_device in the server thread */
+	put_device(&cdev->dev);
+	return rc;
+}
+
+/*
+ * Work function for handling work for a SCIF connection from a particular MIC
+ * card. It first sets the card state to ONLINE and then calls scif_poll to
+ * block on activity such as incoming messages on the SCIF endpoint. When the
+ * endpoint is closed, the work function exits, completing its life cycle, from
+ * MIC card boot to card shutdown/reset/crash.
+ */
+void cosm_scif_work(struct work_struct *work)
+{
+	struct cosm_device *cdev = container_of(work, struct cosm_device,
+						scif_work);
+	struct scif_pollepd pollepd;
+	int rc;
+
+	mutex_lock(&cdev->cosm_mutex);
+	if (cosm_set_online(cdev))
+		goto exit;
+
+	while (1) {
+		pollepd.epd = cdev->epd;
+		pollepd.events = POLLIN;
+
+		/* Drop the mutex before blocking in scif_poll(..) */
+		mutex_unlock(&cdev->cosm_mutex);
+		/* poll(..) with timeout on our endpoint */
+		rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC);
+		mutex_lock(&cdev->cosm_mutex);
+		if (rc < 0) {
+			dev_err(&cdev->dev, "%s %d scif_poll rc %d\n",
+				__func__, __LINE__, rc);
+			continue;
+		}
+
+		/* There is a message from the card */
+		if (pollepd.revents & POLLIN)
+			cosm_scif_recv(cdev);
+
+		/* The peer endpoint is closed or this endpoint disconnected */
+		if (pollepd.revents & POLLHUP) {
+			cosm_scif_close(cdev);
+			break;
+		}
+
+		/* Did we timeout from poll? */
+		if (!rc && cdev->heartbeat_watchdog_enable)
+			cosm_set_crashed(cdev);
+	}
+exit:
+	dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__);
+	mutex_unlock(&cdev->cosm_mutex);
+}
+
+/*
+ * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC
+ * cards, finds the correct cosm_device to associate that connection with and
+ * schedules individual work items for each MIC card.
+ */
+static int cosm_scif_server(void *unused)
+{
+	struct cosm_device *cdev;
+	scif_epd_t newepd;
+	struct scif_port_id port_id;
+	int rc;
+
+	allow_signal(SIGKILL);
+
+	while (!kthread_should_stop()) {
+		rc = scif_accept(listen_epd, &port_id, &newepd,
+				 SCIF_ACCEPT_SYNC);
+		if (rc < 0) {
+			if (-ERESTARTSYS != rc)
+				pr_err("%s %d rc %d\n", __func__, __LINE__, rc);
+			continue;
+		}
+
+		/*
+		 * Associate the incoming connection with a particular
+		 * cosm_device, COSM device ID == SCIF node ID - 1
+		 */
+		cdev = cosm_find_cdev_by_id(port_id.node - 1);
+		if (!cdev)
+			continue;
+		cdev->newepd = newepd;
+		schedule_work(&cdev->scif_work);
+	}
+
+	pr_debug("%s %d Server thread stopped\n", __func__, __LINE__);
+	return 0;
+}
+
+static int cosm_scif_listen(void)
+{
+	int rc;
+
+	listen_epd = scif_open();
+	if (!listen_epd) {
+		pr_err("%s %d scif_open failed\n", __func__, __LINE__);
+		return -ENOMEM;
+	}
+
+	rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT);
+	if (rc < 0) {
+		pr_err("%s %d scif_bind failed rc %d\n",
+		       __func__, __LINE__, rc);
+		goto err;
+	}
+
+	rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG);
+	if (rc < 0) {
+		pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc);
+		goto err;
+	}
+	pr_debug("%s %d listen_epd set up\n", __func__, __LINE__);
+	return 0;
+err:
+	scif_close(listen_epd);
+	listen_epd = NULL;
+	return rc;
+}
+
+static void cosm_scif_listen_exit(void)
+{
+	pr_debug("%s %d closing listen_epd\n", __func__, __LINE__);
+	if (listen_epd) {
+		scif_close(listen_epd);
+		listen_epd = NULL;
+	}
+}
+
+/*
+ * Create a listening SCIF endpoint and a server kthread which accepts incoming
+ * SCIF connections from MIC cards
+ */
+int cosm_scif_init(void)
+{
+	int rc = cosm_scif_listen();
+
+	if (rc) {
+		pr_err("%s %d cosm_scif_listen rc %d\n",
+		       __func__, __LINE__, rc);
+		goto err;
+	}
+
+	server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server");
+	if (IS_ERR(server_thread)) {
+		rc = PTR_ERR(server_thread);
+		pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc);
+		goto listen_exit;
+	}
+	return 0;
+listen_exit:
+	cosm_scif_listen_exit();
+err:
+	return rc;
+}
+
+/* Stop the running server thread and close the listening SCIF endpoint */
+void cosm_scif_exit(void)
+{
+	int rc;
+
+	if (!IS_ERR_OR_NULL(server_thread)) {
+		rc = send_sig(SIGKILL, server_thread, 0);
+		if (rc) {
+			pr_err("%s %d send_sig rc %d\n",
+			       __func__, __LINE__, rc);
+			return;
+		}
+		kthread_stop(server_thread);
+	}
+
+	cosm_scif_listen_exit();
+}
diff --git a/drivers/misc/mic/cosm/cosm_sysfs.c b/drivers/misc/mic/cosm/cosm_sysfs.c
new file mode 100644
index 000000000000..29d6863b6e59
--- /dev/null
+++ b/drivers/misc/mic/cosm/cosm_sysfs.c
@@ -0,0 +1,461 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC Coprocessor State Management (COSM) Driver
+ *
+ */
+#include <linux/slab.h>
+#include "cosm_main.h"
+
+/*
+ * A state-to-string lookup table, for exposing a human readable state
+ * via sysfs. Always keep in sync with enum cosm_states
+ */
+const char * const cosm_state_string[] = {
+	[MIC_READY] = "ready",
+	[MIC_BOOTING] = "booting",
+	[MIC_ONLINE] = "online",
+	[MIC_SHUTTING_DOWN] = "shutting_down",
+	[MIC_RESETTING] = "resetting",
+	[MIC_RESET_FAILED] = "reset_failed",
+};
+
+/*
+ * A shutdown-status-to-string lookup table, for exposing a human
+ * readable state via sysfs. Always keep in sync with enum cosm_shutdown_status
+ */
+const char * const cosm_shutdown_status_string[] = {
+	[MIC_NOP] = "nop",
+	[MIC_CRASHED] = "crashed",
+	[MIC_HALTED] = "halted",
+	[MIC_POWER_OFF] = "poweroff",
+	[MIC_RESTART] = "restart",
+};
+
+void cosm_set_shutdown_status(struct cosm_device *cdev, u8 shutdown_status)
+{
+	dev_dbg(&cdev->dev, "Shutdown Status %s -> %s\n",
+		cosm_shutdown_status_string[cdev->shutdown_status],
+		cosm_shutdown_status_string[shutdown_status]);
+	cdev->shutdown_status = shutdown_status;
+}
+
+void cosm_set_state(struct cosm_device *cdev, u8 state)
+{
+	dev_dbg(&cdev->dev, "State %s -> %s\n",
+		cosm_state_string[cdev->state],
+		cosm_state_string[state]);
+	cdev->state = state;
+	sysfs_notify_dirent(cdev->state_sysfs);
+}
+
+static ssize_t
+family_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	return cdev->hw_ops->family(cdev, buf);
+}
+static DEVICE_ATTR_RO(family);
+
+static ssize_t
+stepping_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	return cdev->hw_ops->stepping(cdev, buf);
+}
+static DEVICE_ATTR_RO(stepping);
+
+static ssize_t
+state_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev || cdev->state >= MIC_LAST)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n",
+		cosm_state_string[cdev->state]);
+}
+
+static ssize_t
+state_store(struct device *dev, struct device_attribute *attr,
+	    const char *buf, size_t count)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+	int rc;
+
+	if (!cdev)
+		return -EINVAL;
+
+	if (sysfs_streq(buf, "boot")) {
+		rc = cosm_start(cdev);
+		goto done;
+	}
+	if (sysfs_streq(buf, "reset")) {
+		rc = cosm_reset(cdev);
+		goto done;
+	}
+
+	if (sysfs_streq(buf, "shutdown")) {
+		rc = cosm_shutdown(cdev);
+		goto done;
+	}
+	rc = -EINVAL;
+done:
+	if (rc)
+		count = rc;
+	return count;
+}
+static DEVICE_ATTR_RW(state);
+
+static ssize_t shutdown_status_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev || cdev->shutdown_status >= MIC_STATUS_LAST)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n",
+		cosm_shutdown_status_string[cdev->shutdown_status]);
+}
+static DEVICE_ATTR_RO(shutdown_status);
+
+static ssize_t
+heartbeat_enable_show(struct device *dev,
+		      struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", cdev->sysfs_heartbeat_enable);
+}
+
+static ssize_t
+heartbeat_enable_store(struct device *dev,
+		       struct device_attribute *attr,
+		       const char *buf, size_t count)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+	int enable;
+	int ret;
+
+	if (!cdev)
+		return -EINVAL;
+
+	mutex_lock(&cdev->cosm_mutex);
+	ret = kstrtoint(buf, 10, &enable);
+	if (ret)
+		goto unlock;
+
+	cdev->sysfs_heartbeat_enable = enable;
+	/* if state is not online, cdev->heartbeat_watchdog_enable is 0 */
+	if (cdev->state == MIC_ONLINE)
+		cdev->heartbeat_watchdog_enable = enable;
+	ret = count;
+unlock:
+	mutex_unlock(&cdev->cosm_mutex);
+	return ret;
+}
+static DEVICE_ATTR_RW(heartbeat_enable);
+
+static ssize_t
+cmdline_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+	char *cmdline;
+
+	if (!cdev)
+		return -EINVAL;
+
+	cmdline = cdev->cmdline;
+
+	if (cmdline)
+		return scnprintf(buf, PAGE_SIZE, "%s\n", cmdline);
+	return 0;
+}
+
+static ssize_t
+cmdline_store(struct device *dev, struct device_attribute *attr,
+	      const char *buf, size_t count)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	mutex_lock(&cdev->cosm_mutex);
+	kfree(cdev->cmdline);
+
+	cdev->cmdline = kmalloc(count + 1, GFP_KERNEL);
+	if (!cdev->cmdline) {
+		count = -ENOMEM;
+		goto unlock;
+	}
+
+	strncpy(cdev->cmdline, buf, count);
+
+	if (cdev->cmdline[count - 1] == '\n')
+		cdev->cmdline[count - 1] = '\0';
+	else
+		cdev->cmdline[count] = '\0';
+unlock:
+	mutex_unlock(&cdev->cosm_mutex);
+	return count;
+}
+static DEVICE_ATTR_RW(cmdline);
+
+static ssize_t
+firmware_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+	char *firmware;
+
+	if (!cdev)
+		return -EINVAL;
+
+	firmware = cdev->firmware;
+
+	if (firmware)
+		return scnprintf(buf, PAGE_SIZE, "%s\n", firmware);
+	return 0;
+}
+
+static ssize_t
+firmware_store(struct device *dev, struct device_attribute *attr,
+	       const char *buf, size_t count)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	mutex_lock(&cdev->cosm_mutex);
+	kfree(cdev->firmware);
+
+	cdev->firmware = kmalloc(count + 1, GFP_KERNEL);
+	if (!cdev->firmware) {
+		count = -ENOMEM;
+		goto unlock;
+	}
+	strncpy(cdev->firmware, buf, count);
+
+	if (cdev->firmware[count - 1] == '\n')
+		cdev->firmware[count - 1] = '\0';
+	else
+		cdev->firmware[count] = '\0';
+unlock:
+	mutex_unlock(&cdev->cosm_mutex);
+	return count;
+}
+static DEVICE_ATTR_RW(firmware);
+
+static ssize_t
+ramdisk_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+	char *ramdisk;
+
+	if (!cdev)
+		return -EINVAL;
+
+	ramdisk = cdev->ramdisk;
+
+	if (ramdisk)
+		return scnprintf(buf, PAGE_SIZE, "%s\n", ramdisk);
+	return 0;
+}
+
+static ssize_t
+ramdisk_store(struct device *dev, struct device_attribute *attr,
+	      const char *buf, size_t count)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	mutex_lock(&cdev->cosm_mutex);
+	kfree(cdev->ramdisk);
+
+	cdev->ramdisk = kmalloc(count + 1, GFP_KERNEL);
+	if (!cdev->ramdisk) {
+		count = -ENOMEM;
+		goto unlock;
+	}
+
+	strncpy(cdev->ramdisk, buf, count);
+
+	if (cdev->ramdisk[count - 1] == '\n')
+		cdev->ramdisk[count - 1] = '\0';
+	else
+		cdev->ramdisk[count] = '\0';
+unlock:
+	mutex_unlock(&cdev->cosm_mutex);
+	return count;
+}
+static DEVICE_ATTR_RW(ramdisk);
+
+static ssize_t
+bootmode_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+	char *bootmode;
+
+	if (!cdev)
+		return -EINVAL;
+
+	bootmode = cdev->bootmode;
+
+	if (bootmode)
+		return scnprintf(buf, PAGE_SIZE, "%s\n", bootmode);
+	return 0;
+}
+
+static ssize_t
+bootmode_store(struct device *dev, struct device_attribute *attr,
+	       const char *buf, size_t count)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	if (!sysfs_streq(buf, "linux") && !sysfs_streq(buf, "flash"))
+		return -EINVAL;
+
+	mutex_lock(&cdev->cosm_mutex);
+	kfree(cdev->bootmode);
+
+	cdev->bootmode = kmalloc(count + 1, GFP_KERNEL);
+	if (!cdev->bootmode) {
+		count = -ENOMEM;
+		goto unlock;
+	}
+
+	strncpy(cdev->bootmode, buf, count);
+
+	if (cdev->bootmode[count - 1] == '\n')
+		cdev->bootmode[count - 1] = '\0';
+	else
+		cdev->bootmode[count] = '\0';
+unlock:
+	mutex_unlock(&cdev->cosm_mutex);
+	return count;
+}
+static DEVICE_ATTR_RW(bootmode);
+
+static ssize_t
+log_buf_addr_show(struct device *dev, struct device_attribute *attr,
+		  char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%p\n", cdev->log_buf_addr);
+}
+
+static ssize_t
+log_buf_addr_store(struct device *dev, struct device_attribute *attr,
+		   const char *buf, size_t count)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+	int ret;
+	unsigned long addr;
+
+	if (!cdev)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 16, &addr);
+	if (ret)
+		goto exit;
+
+	cdev->log_buf_addr = (void *)addr;
+	ret = count;
+exit:
+	return ret;
+}
+static DEVICE_ATTR_RW(log_buf_addr);
+
+static ssize_t
+log_buf_len_show(struct device *dev, struct device_attribute *attr,
+		 char *buf)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+
+	if (!cdev)
+		return -EINVAL;
+
+	return scnprintf(buf, PAGE_SIZE, "%p\n", cdev->log_buf_len);
+}
+
+static ssize_t
+log_buf_len_store(struct device *dev, struct device_attribute *attr,
+		  const char *buf, size_t count)
+{
+	struct cosm_device *cdev = dev_get_drvdata(dev);
+	int ret;
+	unsigned long addr;
+
+	if (!cdev)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 16, &addr);
+	if (ret)
+		goto exit;
+
+	cdev->log_buf_len = (int *)addr;
+	ret = count;
+exit:
+	return ret;
+}
+static DEVICE_ATTR_RW(log_buf_len);
+
+static struct attribute *cosm_default_attrs[] = {
+	&dev_attr_family.attr,
+	&dev_attr_stepping.attr,
+	&dev_attr_state.attr,
+	&dev_attr_shutdown_status.attr,
+	&dev_attr_heartbeat_enable.attr,
+	&dev_attr_cmdline.attr,
+	&dev_attr_firmware.attr,
+	&dev_attr_ramdisk.attr,
+	&dev_attr_bootmode.attr,
+	&dev_attr_log_buf_addr.attr,
+	&dev_attr_log_buf_len.attr,
+
+	NULL
+};
+
+ATTRIBUTE_GROUPS(cosm_default);
+
+void cosm_sysfs_init(struct cosm_device *cdev)
+{
+	cdev->attr_group = cosm_default_groups;
+}
diff --git a/drivers/misc/mic/cosm_client/Makefile b/drivers/misc/mic/cosm_client/Makefile
new file mode 100644
index 000000000000..6f751a519a09
--- /dev/null
+++ b/drivers/misc/mic/cosm_client/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile - Intel MIC COSM Client Driver
+# Copyright(c) 2015, Intel Corporation.
+#
+obj-$(CONFIG_MIC_COSM) += cosm_client.o
+
+cosm_client-objs += cosm_scif_client.o
diff --git a/drivers/misc/mic/cosm_client/cosm_scif_client.c b/drivers/misc/mic/cosm_client/cosm_scif_client.c
new file mode 100644
index 000000000000..03e98bf1ac15
--- /dev/null
+++ b/drivers/misc/mic/cosm_client/cosm_scif_client.c
@@ -0,0 +1,275 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC COSM Client Driver
+ *
+ */
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/kthread.h>
+#include "../cosm/cosm_main.h"
+
+#define COSM_SCIF_MAX_RETRIES 10
+#define COSM_HEARTBEAT_SEND_MSEC (COSM_HEARTBEAT_SEND_SEC * MSEC_PER_SEC)
+
+static struct task_struct *client_thread;
+static scif_epd_t client_epd;
+static struct scif_peer_dev *client_spdev;
+
+/*
+ * Reboot notifier: receives shutdown status from the OS and communicates it
+ * back to the COSM process on the host
+ */
+static int cosm_reboot_event(struct notifier_block *this, unsigned long event,
+			     void *ptr)
+{
+	struct cosm_msg msg = { .id = COSM_MSG_SHUTDOWN_STATUS };
+	int rc;
+
+	event = (event == SYS_RESTART) ? SYSTEM_RESTART : event;
+	dev_info(&client_spdev->dev, "%s %d received event %ld\n",
+		 __func__, __LINE__, event);
+
+	msg.shutdown_status = event;
+	rc = scif_send(client_epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
+	if (rc < 0)
+		dev_err(&client_spdev->dev, "%s %d scif_send rc %d\n",
+			__func__, __LINE__, rc);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cosm_reboot = {
+	.notifier_call  = cosm_reboot_event,
+};
+
+/* Set system time from timespec value received from the host */
+static void cosm_set_time(struct cosm_msg *msg)
+{
+	int rc = do_settimeofday64(&msg->timespec);
+
+	if (rc)
+		dev_err(&client_spdev->dev, "%s: %d settimeofday rc %d\n",
+			__func__, __LINE__, rc);
+}
+
+/* COSM client receive message processing */
+static void cosm_client_recv(void)
+{
+	struct cosm_msg msg;
+	int rc;
+
+	while (1) {
+		rc = scif_recv(client_epd, &msg, sizeof(msg), 0);
+		if (!rc) {
+			return;
+		} else if (rc < 0) {
+			dev_err(&client_spdev->dev, "%s: %d rc %d\n",
+				__func__, __LINE__, rc);
+			return;
+		}
+
+		dev_dbg(&client_spdev->dev, "%s: %d rc %d id 0x%llx\n",
+			__func__, __LINE__, rc, msg.id);
+
+		switch (msg.id) {
+		case COSM_MSG_SYNC_TIME:
+			cosm_set_time(&msg);
+			break;
+		case COSM_MSG_SHUTDOWN:
+			orderly_poweroff(true);
+			break;
+		default:
+			dev_err(&client_spdev->dev, "%s: %d unknown id %lld\n",
+				__func__, __LINE__, msg.id);
+			break;
+		}
+	}
+}
+
+/* Initiate connection to the COSM server on the host */
+static int cosm_scif_connect(void)
+{
+	struct scif_port_id port_id;
+	int i, rc;
+
+	client_epd = scif_open();
+	if (!client_epd) {
+		dev_err(&client_spdev->dev, "%s %d scif_open failed\n",
+			__func__, __LINE__);
+		return -ENOMEM;
+	}
+
+	port_id.node = 0;
+	port_id.port = SCIF_COSM_LISTEN_PORT;
+
+	for (i = 0; i < COSM_SCIF_MAX_RETRIES; i++) {
+		rc = scif_connect(client_epd, &port_id);
+		if (rc < 0)
+			msleep(1000);
+		else
+			break;
+	}
+
+	if (rc < 0) {
+		dev_err(&client_spdev->dev, "%s %d scif_connect rc %d\n",
+			__func__, __LINE__, rc);
+		scif_close(client_epd);
+		client_epd = NULL;
+	}
+	return rc < 0 ? rc : 0;
+}
+
+/* Close host SCIF connection */
+static void cosm_scif_connect_exit(void)
+{
+	if (client_epd) {
+		scif_close(client_epd);
+		client_epd = NULL;
+	}
+}
+
+/*
+ * COSM SCIF client thread function: waits for messages from the host and sends
+ * a heartbeat to the host
+ */
+static int cosm_scif_client(void *unused)
+{
+	struct cosm_msg msg = { .id = COSM_MSG_HEARTBEAT };
+	struct scif_pollepd pollepd;
+	int rc;
+
+	allow_signal(SIGKILL);
+
+	while (!kthread_should_stop()) {
+		pollepd.epd = client_epd;
+		pollepd.events = POLLIN;
+
+		rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_SEND_MSEC);
+		if (rc < 0) {
+			if (-EINTR != rc)
+				dev_err(&client_spdev->dev,
+					"%s %d scif_poll rc %d\n",
+					__func__, __LINE__, rc);
+			continue;
+		}
+
+		if (pollepd.revents & POLLIN)
+			cosm_client_recv();
+
+		msg.id = COSM_MSG_HEARTBEAT;
+		rc = scif_send(client_epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
+		if (rc < 0)
+			dev_err(&client_spdev->dev, "%s %d scif_send rc %d\n",
+				__func__, __LINE__, rc);
+	}
+
+	dev_dbg(&client_spdev->dev, "%s %d Client thread stopped\n",
+		__func__, __LINE__);
+	return 0;
+}
+
+static void cosm_scif_probe(struct scif_peer_dev *spdev)
+{
+	int rc;
+
+	dev_dbg(&spdev->dev, "%s %d: dnode %d\n",
+		__func__, __LINE__, spdev->dnode);
+
+	/* We are only interested in the host with spdev->dnode == 0 */
+	if (spdev->dnode)
+		return;
+
+	client_spdev = spdev;
+	rc = cosm_scif_connect();
+	if (rc)
+		goto exit;
+
+	rc = register_reboot_notifier(&cosm_reboot);
+	if (rc) {
+		dev_err(&spdev->dev,
+			"reboot notifier registration failed rc %d\n", rc);
+		goto connect_exit;
+	}
+
+	client_thread = kthread_run(cosm_scif_client, NULL, "cosm_client");
+	if (IS_ERR(client_thread)) {
+		rc = PTR_ERR(client_thread);
+		dev_err(&spdev->dev, "%s %d kthread_run rc %d\n",
+			__func__, __LINE__, rc);
+		goto unreg_reboot;
+	}
+	return;
+unreg_reboot:
+	unregister_reboot_notifier(&cosm_reboot);
+connect_exit:
+	cosm_scif_connect_exit();
+exit:
+	client_spdev = NULL;
+}
+
+static void cosm_scif_remove(struct scif_peer_dev *spdev)
+{
+	int rc;
+
+	dev_dbg(&spdev->dev, "%s %d: dnode %d\n",
+		__func__, __LINE__, spdev->dnode);
+
+	if (spdev->dnode)
+		return;
+
+	if (!IS_ERR_OR_NULL(client_thread)) {
+		rc = send_sig(SIGKILL, client_thread, 0);
+		if (rc) {
+			pr_err("%s %d send_sig rc %d\n",
+			       __func__, __LINE__, rc);
+			return;
+		}
+		kthread_stop(client_thread);
+	}
+	unregister_reboot_notifier(&cosm_reboot);
+	cosm_scif_connect_exit();
+	client_spdev = NULL;
+}
+
+static struct scif_client scif_client_cosm = {
+	.name = KBUILD_MODNAME,
+	.probe = cosm_scif_probe,
+	.remove = cosm_scif_remove,
+};
+
+static int __init cosm_client_init(void)
+{
+	int rc = scif_client_register(&scif_client_cosm);
+
+	if (rc)
+		pr_err("scif_client_register failed rc %d\n", rc);
+	return rc;
+}
+
+static void __exit cosm_client_exit(void)
+{
+	scif_client_unregister(&scif_client_cosm);
+}
+
+module_init(cosm_client_init);
+module_exit(cosm_client_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Intel(R) MIC card OS state management client driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/misc/mic/host/Makefile b/drivers/misc/mic/host/Makefile
index c2197f999394..004d3db0f990 100644
--- a/drivers/misc/mic/host/Makefile
+++ b/drivers/misc/mic/host/Makefile
@@ -5,7 +5,6 @@
 obj-$(CONFIG_INTEL_MIC_HOST) += mic_host.o
 mic_host-objs := mic_main.o
 mic_host-objs += mic_x100.o
-mic_host-objs += mic_sysfs.o
 mic_host-objs += mic_smpt.o
 mic_host-objs += mic_intr.o
 mic_host-objs += mic_boot.o
diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c
index e5f6a5e7bca1..7845564dff64 100644
--- a/drivers/misc/mic/host/mic_boot.c
+++ b/drivers/misc/mic/host/mic_boot.c
@@ -22,9 +22,9 @@
 #include <linux/firmware.h>
 #include <linux/pci.h>
 #include <linux/kmod.h>
-
 #include <linux/mic_common.h>
 #include <linux/mic_bus.h>
+#include "../bus/scif_bus.h"
 #include "../common/mic_dev.h"
 #include "mic_device.h"
 #include "mic_smpt.h"
@@ -99,7 +99,7 @@ static int __mic_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	int i, j, ret;
 	dma_addr_t da;
 
-	ret = dma_map_sg(mdev->sdev->parent, sg, nents, dir);
+	ret = dma_map_sg(&mdev->pdev->dev, sg, nents, dir);
 	if (ret <= 0)
 		return 0;
 
@@ -115,7 +115,7 @@ err:
 		mic_unmap(mdev, sg_dma_address(s), s->length);
 		sg_dma_address(s) = mic_to_dma_addr(mdev, sg_dma_address(s));
 	}
-	dma_unmap_sg(mdev->sdev->parent, sg, nents, dir);
+	dma_unmap_sg(&mdev->pdev->dev, sg, nents, dir);
 	return 0;
 }
 
@@ -135,7 +135,7 @@ static void __mic_dma_unmap_sg(struct device *dev,
 		mic_unmap(mdev, sg_dma_address(s), s->length);
 		sg_dma_address(s) = da;
 	}
-	dma_unmap_sg(mdev->sdev->parent, sg, nents, dir);
+	dma_unmap_sg(&mdev->pdev->dev, sg, nents, dir);
 }
 
 static struct dma_map_ops __mic_dma_ops = {
@@ -270,48 +270,13 @@ static struct mbus_hw_ops mbus_hw_ops = {
 	.ack_interrupt = _mic_ack_interrupt,
 };
 
-/**
- * mic_reset - Reset the MIC device.
- * @mdev: pointer to mic_device instance
- */
-static void mic_reset(struct mic_device *mdev)
-{
-	int i;
-
-#define MIC_RESET_TO (45)
-
-	reinit_completion(&mdev->reset_wait);
-	mdev->ops->reset_fw_ready(mdev);
-	mdev->ops->reset(mdev);
-
-	for (i = 0; i < MIC_RESET_TO; i++) {
-		if (mdev->ops->is_fw_ready(mdev))
-			goto done;
-		/*
-		 * Resets typically take 10s of seconds to complete.
-		 * Since an MMIO read is required to check if the
-		 * firmware is ready or not, a 1 second delay works nicely.
-		 */
-		msleep(1000);
-	}
-	mic_set_state(mdev, MIC_RESET_FAILED);
-done:
-	complete_all(&mdev->reset_wait);
-}
-
 /* Initialize the MIC bootparams */
 void mic_bootparam_init(struct mic_device *mdev)
 {
 	struct mic_bootparam *bootparam = mdev->dp;
 
 	bootparam->magic = cpu_to_le32(MIC_MAGIC);
-	bootparam->c2h_shutdown_db = mdev->shutdown_db;
-	bootparam->h2c_shutdown_db = -1;
 	bootparam->h2c_config_db = -1;
-	bootparam->shutdown_status = 0;
-	bootparam->shutdown_card = 0;
-	/* Total nodes = number of MICs + 1 for self node */
-	bootparam->tot_nodes = atomic_read(&g_num_mics) + 1;
 	bootparam->node_id = mdev->id + 1;
 	bootparam->scif_host_dma_addr = 0x0;
 	bootparam->scif_card_dma_addr = 0x0;
@@ -319,6 +284,26 @@ void mic_bootparam_init(struct mic_device *mdev)
 	bootparam->h2c_scif_db = -1;
 }
 
+static inline struct mic_device *cosmdev_to_mdev(struct cosm_device *cdev)
+{
+	return dev_get_drvdata(cdev->dev.parent);
+}
+
+static void _mic_reset(struct cosm_device *cdev)
+{
+	struct mic_device *mdev = cosmdev_to_mdev(cdev);
+
+	mdev->ops->reset_fw_ready(mdev);
+	mdev->ops->reset(mdev);
+}
+
+static bool _mic_ready(struct cosm_device *cdev)
+{
+	struct mic_device *mdev = cosmdev_to_mdev(cdev);
+
+	return mdev->ops->is_fw_ready(mdev);
+}
+
 /**
  * mic_request_dma_chans - Request DMA channels
  * @mdev: pointer to mic_device instance
@@ -336,14 +321,14 @@ static int mic_request_dma_chans(struct mic_device *mdev)
 
 	do {
 		chan = dma_request_channel(mask, mdev->ops->dma_filter,
-					   mdev->sdev->parent);
+					   &mdev->pdev->dev);
 		if (chan) {
 			mdev->dma_ch[mdev->num_dma_ch++] = chan;
 			if (mdev->num_dma_ch >= MIC_MAX_DMA_CHAN)
 				break;
 		}
 	} while (chan);
-	dev_info(mdev->sdev->parent, "DMA channels # %d\n", mdev->num_dma_ch);
+	dev_info(&mdev->pdev->dev, "DMA channels # %d\n", mdev->num_dma_ch);
 	return mdev->num_dma_ch;
 }
 
@@ -365,34 +350,24 @@ static void mic_free_dma_chans(struct mic_device *mdev)
 }
 
 /**
- * mic_start - Start the MIC.
- * @mdev: pointer to mic_device instance
- * @buf: buffer containing boot string including firmware/ramdisk path.
+ * _mic_start - Start the MIC.
+ * @cdev: pointer to cosm_device instance
+ * @id: MIC device id/index provided by COSM used in other drivers like SCIF
  *
  * This function prepares an MIC for boot and initiates boot.
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ *
+ * For all cosm_hw_ops the caller holds a mutex to ensure serialization.
  */
-int mic_start(struct mic_device *mdev, const char *buf)
+static int _mic_start(struct cosm_device *cdev, int id)
 {
+	struct mic_device *mdev = cosmdev_to_mdev(cdev);
 	int rc;
-	mutex_lock(&mdev->mic_mutex);
+
 	mic_bootparam_init(mdev);
-retry:
-	if (MIC_OFFLINE != mdev->state) {
-		rc = -EINVAL;
-		goto unlock_ret;
-	}
-	if (!mdev->ops->is_fw_ready(mdev)) {
-		mic_reset(mdev);
-		/*
-		 * The state will either be MIC_OFFLINE if the reset succeeded
-		 * or MIC_RESET_FAILED if the firmware reset failed.
-		 */
-		goto retry;
-	}
-	mdev->dma_mbdev = mbus_register_device(mdev->sdev->parent,
+	mdev->dma_mbdev = mbus_register_device(&mdev->pdev->dev,
 					       MBUS_DEV_DMA_HOST, &mic_dma_ops,
-					       &mbus_hw_ops, mdev->mmio.va);
+					       &mbus_hw_ops, id, mdev->mmio.va);
 	if (IS_ERR(mdev->dma_mbdev)) {
 		rc = PTR_ERR(mdev->dma_mbdev);
 		goto unlock_ret;
@@ -401,16 +376,18 @@ retry:
 		rc = -ENODEV;
 		goto dma_remove;
 	}
-	mdev->scdev = scif_register_device(mdev->sdev->parent, MIC_SCIF_DEV,
+	mdev->scdev = scif_register_device(&mdev->pdev->dev, MIC_SCIF_DEV,
 					   &__mic_dma_ops, &scif_hw_ops,
-					   mdev->id + 1, 0, &mdev->mmio,
+					   id + 1, 0, &mdev->mmio,
 					   &mdev->aper, mdev->dp, NULL,
-					   mdev->dma_ch, mdev->num_dma_ch);
+					   mdev->dma_ch, mdev->num_dma_ch,
+					   true);
 	if (IS_ERR(mdev->scdev)) {
 		rc = PTR_ERR(mdev->scdev);
 		goto dma_free;
 	}
-	rc = mdev->ops->load_mic_fw(mdev, buf);
+
+	rc = mdev->ops->load_mic_fw(mdev, NULL);
 	if (rc)
 		goto scif_remove;
 	mic_smpt_restore(mdev);
@@ -419,7 +396,6 @@ retry:
 	mdev->ops->write_spad(mdev, MIC_DPLO_SPAD, mdev->dp_dma_addr);
 	mdev->ops->write_spad(mdev, MIC_DPHI_SPAD, mdev->dp_dma_addr >> 32);
 	mdev->ops->send_firmware_intr(mdev);
-	mic_set_state(mdev, MIC_ONLINE);
 	goto unlock_ret;
 scif_remove:
 	scif_unregister_device(mdev->scdev);
@@ -428,198 +404,79 @@ dma_free:
 dma_remove:
 	mbus_unregister_device(mdev->dma_mbdev);
 unlock_ret:
-	mutex_unlock(&mdev->mic_mutex);
 	return rc;
 }
 
 /**
- * mic_stop - Prepare the MIC for reset and trigger reset.
- * @mdev: pointer to mic_device instance
+ * _mic_stop - Prepare the MIC for reset and trigger reset.
+ * @cdev: pointer to cosm_device instance
  * @force: force a MIC to reset even if it is already offline.
  *
  * RETURNS: None.
  */
-void mic_stop(struct mic_device *mdev, bool force)
-{
-	mutex_lock(&mdev->mic_mutex);
-	if (MIC_OFFLINE != mdev->state || force) {
-		scif_unregister_device(mdev->scdev);
-		mic_virtio_reset_devices(mdev);
-		mic_free_dma_chans(mdev);
-		mbus_unregister_device(mdev->dma_mbdev);
-		mic_bootparam_init(mdev);
-		mic_reset(mdev);
-		if (MIC_RESET_FAILED == mdev->state)
-			goto unlock;
-		mic_set_shutdown_status(mdev, MIC_NOP);
-		if (MIC_SUSPENDED != mdev->state)
-			mic_set_state(mdev, MIC_OFFLINE);
-	}
-unlock:
-	mutex_unlock(&mdev->mic_mutex);
-}
-
-/**
- * mic_shutdown - Initiate MIC shutdown.
- * @mdev: pointer to mic_device instance
- *
- * RETURNS: None.
- */
-void mic_shutdown(struct mic_device *mdev)
+static void _mic_stop(struct cosm_device *cdev, bool force)
 {
-	struct mic_bootparam *bootparam = mdev->dp;
-	s8 db = bootparam->h2c_shutdown_db;
-
-	mutex_lock(&mdev->mic_mutex);
-	if (MIC_ONLINE == mdev->state && db != -1) {
-		bootparam->shutdown_card = 1;
-		mdev->ops->send_intr(mdev, db);
-		mic_set_state(mdev, MIC_SHUTTING_DOWN);
-	}
-	mutex_unlock(&mdev->mic_mutex);
-}
-
-/**
- * mic_shutdown_work - Handle shutdown interrupt from MIC.
- * @work: The work structure.
- *
- * This work is scheduled whenever the host has received a shutdown
- * interrupt from the MIC.
- */
-void mic_shutdown_work(struct work_struct *work)
-{
-	struct mic_device *mdev = container_of(work, struct mic_device,
-			shutdown_work);
-	struct mic_bootparam *bootparam = mdev->dp;
-
-	mutex_lock(&mdev->mic_mutex);
-	mic_set_shutdown_status(mdev, bootparam->shutdown_status);
-	bootparam->shutdown_status = 0;
+	struct mic_device *mdev = cosmdev_to_mdev(cdev);
 
 	/*
-	 * if state is MIC_SUSPENDED, OSPM suspend is in progress. We do not
-	 * change the state here so as to prevent users from booting the card
-	 * during and after the suspend operation.
+	 * Since SCIF handles card shutdown and reset (using COSM), it will
+	 * will be the first to be registered and the last to be
+	 * unregistered.
 	 */
-	if (MIC_SHUTTING_DOWN != mdev->state &&
-	    MIC_SUSPENDED != mdev->state)
-		mic_set_state(mdev, MIC_SHUTTING_DOWN);
-	mutex_unlock(&mdev->mic_mutex);
+	mic_virtio_reset_devices(mdev);
+	scif_unregister_device(mdev->scdev);
+	mic_free_dma_chans(mdev);
+	mbus_unregister_device(mdev->dma_mbdev);
+	mic_bootparam_init(mdev);
 }
 
-/**
- * mic_reset_trigger_work - Trigger MIC reset.
- * @work: The work structure.
- *
- * This work is scheduled whenever the host wants to reset the MIC.
- */
-void mic_reset_trigger_work(struct work_struct *work)
+static ssize_t _mic_family(struct cosm_device *cdev, char *buf)
 {
-	struct mic_device *mdev = container_of(work, struct mic_device,
-			reset_trigger_work);
+	struct mic_device *mdev = cosmdev_to_mdev(cdev);
+	static const char *family[MIC_FAMILY_LAST] = { "x100", "Unknown" };
 
-	mic_stop(mdev, false);
+	return scnprintf(buf, PAGE_SIZE, "%s\n", family[mdev->family]);
 }
 
-/**
- * mic_complete_resume - Complete MIC Resume after an OSPM suspend/hibernate
- * event.
- * @mdev: pointer to mic_device instance
- *
- * RETURNS: None.
- */
-void mic_complete_resume(struct mic_device *mdev)
+static ssize_t _mic_stepping(struct cosm_device *cdev, char *buf)
 {
-	if (mdev->state != MIC_SUSPENDED) {
-		dev_warn(mdev->sdev->parent, "state %d should be %d\n",
-			 mdev->state, MIC_SUSPENDED);
-		return;
-	}
-
-	/* Make sure firmware is ready */
-	if (!mdev->ops->is_fw_ready(mdev))
-		mic_stop(mdev, true);
+	struct mic_device *mdev = cosmdev_to_mdev(cdev);
+	const char *string = "??";
 
-	mutex_lock(&mdev->mic_mutex);
-	mic_set_state(mdev, MIC_OFFLINE);
-	mutex_unlock(&mdev->mic_mutex);
-}
-
-/**
- * mic_prepare_suspend - Handle suspend notification for the MIC device.
- * @mdev: pointer to mic_device instance
- *
- * RETURNS: None.
- */
-void mic_prepare_suspend(struct mic_device *mdev)
-{
-	unsigned long timeout;
-
-#define MIC_SUSPEND_TIMEOUT (60 * HZ)
-
-	mutex_lock(&mdev->mic_mutex);
-	switch (mdev->state) {
-	case MIC_OFFLINE:
-		/*
-		 * Card is already offline. Set state to MIC_SUSPENDED
-		 * to prevent users from booting the card.
-		 */
-		mic_set_state(mdev, MIC_SUSPENDED);
-		mutex_unlock(&mdev->mic_mutex);
+	switch (mdev->stepping) {
+	case MIC_A0_STEP:
+		string = "A0";
 		break;
-	case MIC_ONLINE:
-		/*
-		 * Card is online. Set state to MIC_SUSPENDING and notify
-		 * MIC user space daemon which will issue card
-		 * shutdown and reset.
-		 */
-		mic_set_state(mdev, MIC_SUSPENDING);
-		mutex_unlock(&mdev->mic_mutex);
-		timeout = wait_for_completion_timeout(&mdev->reset_wait,
-						      MIC_SUSPEND_TIMEOUT);
-		/* Force reset the card if the shutdown completion timed out */
-		if (!timeout) {
-			mutex_lock(&mdev->mic_mutex);
-			mic_set_state(mdev, MIC_SUSPENDED);
-			mutex_unlock(&mdev->mic_mutex);
-			mic_stop(mdev, true);
-		}
+	case MIC_B0_STEP:
+		string = "B0";
+		break;
+	case MIC_B1_STEP:
+		string = "B1";
 		break;
-	case MIC_SHUTTING_DOWN:
-		/*
-		 * Card is shutting down. Set state to MIC_SUSPENDED
-		 * to prevent further boot of the card.
-		 */
-		mic_set_state(mdev, MIC_SUSPENDED);
-		mutex_unlock(&mdev->mic_mutex);
-		timeout = wait_for_completion_timeout(&mdev->reset_wait,
-						      MIC_SUSPEND_TIMEOUT);
-		/* Force reset the card if the shutdown completion timed out */
-		if (!timeout)
-			mic_stop(mdev, true);
+	case MIC_C0_STEP:
+		string = "C0";
 		break;
 	default:
-		mutex_unlock(&mdev->mic_mutex);
 		break;
 	}
+	return scnprintf(buf, PAGE_SIZE, "%s\n", string);
 }
 
-/**
- * mic_suspend - Initiate MIC suspend. Suspend merely issues card shutdown.
- * @mdev: pointer to mic_device instance
- *
- * RETURNS: None.
- */
-void mic_suspend(struct mic_device *mdev)
+static struct mic_mw *_mic_aper(struct cosm_device *cdev)
 {
-	struct mic_bootparam *bootparam = mdev->dp;
-	s8 db = bootparam->h2c_shutdown_db;
+	struct mic_device *mdev = cosmdev_to_mdev(cdev);
 
-	mutex_lock(&mdev->mic_mutex);
-	if (MIC_SUSPENDING == mdev->state && db != -1) {
-		bootparam->shutdown_card = 1;
-		mdev->ops->send_intr(mdev, db);
-		mic_set_state(mdev, MIC_SUSPENDED);
-	}
-	mutex_unlock(&mdev->mic_mutex);
+	return &mdev->aper;
 }
+
+struct cosm_hw_ops cosm_hw_ops = {
+	.reset = _mic_reset,
+	.force_reset = _mic_reset,
+	.post_reset = NULL,
+	.ready = _mic_ready,
+	.start = _mic_start,
+	.stop = _mic_stop,
+	.family = _mic_family,
+	.stepping = _mic_stepping,
+	.aper = _mic_aper,
+};
diff --git a/drivers/misc/mic/host/mic_debugfs.c b/drivers/misc/mic/host/mic_debugfs.c
index 3c9ea4896f3c..10581600777a 100644
--- a/drivers/misc/mic/host/mic_debugfs.c
+++ b/drivers/misc/mic/host/mic_debugfs.c
@@ -31,71 +31,6 @@
 /* Debugfs parent dir */
 static struct dentry *mic_dbg;
 
-/**
- * mic_log_buf_show - Display MIC kernel log buffer.
- *
- * log_buf addr/len is read from System.map by user space
- * and populated in sysfs entries.
- */
-static int mic_log_buf_show(struct seq_file *s, void *unused)
-{
-	void __iomem *log_buf_va;
-	int __iomem *log_buf_len_va;
-	struct mic_device *mdev = s->private;
-	void *kva;
-	int size;
-	unsigned long aper_offset;
-
-	if (!mdev || !mdev->log_buf_addr || !mdev->log_buf_len)
-		goto done;
-	/*
-	 * Card kernel will never be relocated and any kernel text/data mapping
-	 * can be translated to phys address by subtracting __START_KERNEL_map.
-	 */
-	aper_offset = (unsigned long)mdev->log_buf_len - __START_KERNEL_map;
-	log_buf_len_va = mdev->aper.va + aper_offset;
-	aper_offset = (unsigned long)mdev->log_buf_addr - __START_KERNEL_map;
-	log_buf_va = mdev->aper.va + aper_offset;
-	size = ioread32(log_buf_len_va);
-
-	kva = kmalloc(size, GFP_KERNEL);
-	if (!kva)
-		goto done;
-	mutex_lock(&mdev->mic_mutex);
-	memcpy_fromio(kva, log_buf_va, size);
-	switch (mdev->state) {
-	case MIC_ONLINE:
-		/* Fall through */
-	case MIC_SHUTTING_DOWN:
-		seq_write(s, kva, size);
-		break;
-	default:
-		break;
-	}
-	mutex_unlock(&mdev->mic_mutex);
-	kfree(kva);
-done:
-	return 0;
-}
-
-static int mic_log_buf_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mic_log_buf_show, inode->i_private);
-}
-
-static int mic_log_buf_release(struct inode *inode, struct file *file)
-{
-	return single_release(inode, file);
-}
-
-static const struct file_operations log_buf_ops = {
-	.owner   = THIS_MODULE,
-	.open    = mic_log_buf_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = mic_log_buf_release
-};
-
 static int mic_smpt_show(struct seq_file *s, void *pos)
 {
 	int i;
@@ -138,32 +73,6 @@ static const struct file_operations smpt_file_ops = {
 	.release = mic_smpt_debug_release
 };
 
-static int mic_soft_reset_show(struct seq_file *s, void *pos)
-{
-	struct mic_device *mdev = s->private;
-
-	mic_stop(mdev, true);
-	return 0;
-}
-
-static int mic_soft_reset_debug_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, mic_soft_reset_show, inode->i_private);
-}
-
-static int mic_soft_reset_debug_release(struct inode *inode, struct file *file)
-{
-	return single_release(inode, file);
-}
-
-static const struct file_operations soft_reset_ops = {
-	.owner   = THIS_MODULE,
-	.open    = mic_soft_reset_debug_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = mic_soft_reset_debug_release
-};
-
 static int mic_post_code_show(struct seq_file *s, void *pos)
 {
 	struct mic_device *mdev = s->private;
@@ -204,18 +113,8 @@ static int mic_dp_show(struct seq_file *s, void *pos)
 
 	seq_printf(s, "Bootparam: magic 0x%x\n",
 		   bootparam->magic);
-	seq_printf(s, "Bootparam: h2c_shutdown_db %d\n",
-		   bootparam->h2c_shutdown_db);
 	seq_printf(s, "Bootparam: h2c_config_db %d\n",
 		   bootparam->h2c_config_db);
-	seq_printf(s, "Bootparam: c2h_shutdown_db %d\n",
-		   bootparam->c2h_shutdown_db);
-	seq_printf(s, "Bootparam: shutdown_status %d\n",
-		   bootparam->shutdown_status);
-	seq_printf(s, "Bootparam: shutdown_card %d\n",
-		   bootparam->shutdown_card);
-	seq_printf(s, "Bootparam: tot_nodes %d\n",
-		   bootparam->tot_nodes);
 	seq_printf(s, "Bootparam: node_id %d\n",
 		   bootparam->node_id);
 	seq_printf(s, "Bootparam: c2h_scif_db %d\n",
@@ -392,8 +291,7 @@ static int mic_msi_irq_info_show(struct seq_file *s, void *pos)
 	int i, j;
 	u16 entry;
 	u16 vector;
-	struct pci_dev *pdev = container_of(mdev->sdev->parent,
-		struct pci_dev, dev);
+	struct pci_dev *pdev = mdev->pdev;
 
 	if (pci_dev_msi_enabled(pdev)) {
 		for (i = 0; i < mdev->irq_info.num_vectors; i++) {
@@ -454,20 +352,18 @@ static const struct file_operations msi_irq_info_ops = {
  */
 void mic_create_debug_dir(struct mic_device *mdev)
 {
+	char name[16];
+
 	if (!mic_dbg)
 		return;
 
-	mdev->dbg_dir = debugfs_create_dir(dev_name(mdev->sdev), mic_dbg);
+	scnprintf(name, sizeof(name), "mic%d", mdev->id);
+	mdev->dbg_dir = debugfs_create_dir(name, mic_dbg);
 	if (!mdev->dbg_dir)
 		return;
 
-	debugfs_create_file("log_buf", 0444, mdev->dbg_dir, mdev, &log_buf_ops);
-
 	debugfs_create_file("smpt", 0444, mdev->dbg_dir, mdev, &smpt_file_ops);
 
-	debugfs_create_file("soft_reset", 0444, mdev->dbg_dir, mdev,
-			    &soft_reset_ops);
-
 	debugfs_create_file("post_code", 0444, mdev->dbg_dir, mdev,
 			    &post_code_ops);
 
diff --git a/drivers/misc/mic/host/mic_device.h b/drivers/misc/mic/host/mic_device.h
index 01a7555aa648..461184a12fbb 100644
--- a/drivers/misc/mic/host/mic_device.h
+++ b/drivers/misc/mic/host/mic_device.h
@@ -26,21 +26,12 @@
 #include <linux/notifier.h>
 #include <linux/irqreturn.h>
 #include <linux/dmaengine.h>
+#include <linux/miscdevice.h>
 #include <linux/mic_bus.h>
 #include "../bus/scif_bus.h"
+#include "../bus/cosm_bus.h"
 #include "mic_intr.h"
 
-/* The maximum number of MIC devices supported in a single host system. */
-#define MIC_MAX_NUM_DEVS 256
-
-/**
- * enum mic_hw_family - The hardware family to which a device belongs.
- */
-enum mic_hw_family {
-	MIC_FAMILY_X100 = 0,
-	MIC_FAMILY_UNKNOWN
-};
-
 /**
  * enum mic_stepping - MIC stepping ids.
  */
@@ -51,6 +42,8 @@ enum mic_stepping {
 	MIC_C0_STEP = 0x20,
 };
 
+extern struct cosm_hw_ops cosm_hw_ops;
+
 /**
  * struct mic_device -  MIC device information for each card.
  *
@@ -60,8 +53,7 @@ enum mic_stepping {
  * @ops: MIC HW specific operations.
  * @id: The unique device id for this MIC device.
  * @stepping: Stepping ID.
- * @attr_group: Pointer to list of sysfs attribute groups.
- * @sdev: Device for sysfs entries.
+ * @pdev: Underlying PCI device.
  * @mic_mutex: Mutex for synchronizing access to mic_device.
  * @intr_ops: HW specific interrupt operations.
  * @smpt_ops: Hardware specific SMPT operations.
@@ -69,30 +61,17 @@ enum mic_stepping {
  * @intr_info: H/W specific interrupt information.
  * @irq_info: The OS specific irq information
  * @dbg_dir: debugfs directory of this MIC device.
- * @cmdline: Kernel command line.
- * @firmware: Firmware file name.
- * @ramdisk: Ramdisk file name.
- * @bootmode: Boot mode i.e. "linux" or "elf" for flash updates.
  * @bootaddr: MIC boot address.
- * @reset_trigger_work: Work for triggering reset requests.
- * @shutdown_work: Work for handling shutdown interrupts.
- * @state: MIC state.
- * @shutdown_status: MIC status reported by card for shutdown/crashes.
- * @state_sysfs: Sysfs dirent for notifying ring 3 about MIC state changes.
- * @reset_wait: Waitqueue for sleeping while reset completes.
- * @log_buf_addr: Log buffer address for MIC.
- * @log_buf_len: Log buffer length address for MIC.
  * @dp: virtio device page
  * @dp_dma_addr: virtio device page DMA address.
- * @shutdown_db: shutdown doorbell.
- * @shutdown_cookie: shutdown cookie.
- * @cdev: Character device for MIC.
+ * @name: name for the misc char device
+ * @miscdev: registered misc char device
  * @vdev_list: list of virtio devices.
- * @pm_notifier: Handles PM notifications from the OS.
  * @dma_mbdev: MIC BUS DMA device.
  * @dma_ch - Array of DMA channels
  * @num_dma_ch - Number of DMA channels available
  * @scdev: SCIF device on the SCIF virtual bus.
+ * @cosm_dev: COSM device
  */
 struct mic_device {
 	struct mic_mw mmio;
@@ -101,8 +80,7 @@ struct mic_device {
 	struct mic_hw_ops *ops;
 	int id;
 	enum mic_stepping stepping;
-	const struct attribute_group **attr_group;
-	struct device *sdev;
+	struct pci_dev *pdev;
 	struct mutex mic_mutex;
 	struct mic_hw_intr_ops *intr_ops;
 	struct mic_smpt_ops *smpt_ops;
@@ -110,30 +88,17 @@ struct mic_device {
 	struct mic_intr_info *intr_info;
 	struct mic_irq_info irq_info;
 	struct dentry *dbg_dir;
-	char *cmdline;
-	char *firmware;
-	char *ramdisk;
-	char *bootmode;
 	u32 bootaddr;
-	struct work_struct reset_trigger_work;
-	struct work_struct shutdown_work;
-	u8 state;
-	u8 shutdown_status;
-	struct kernfs_node *state_sysfs;
-	struct completion reset_wait;
-	void *log_buf_addr;
-	int *log_buf_len;
 	void *dp;
 	dma_addr_t dp_dma_addr;
-	int shutdown_db;
-	struct mic_irq *shutdown_cookie;
-	struct cdev cdev;
+	char name[16];
+	struct miscdevice miscdev;
 	struct list_head vdev_list;
-	struct notifier_block pm_notifier;
 	struct mbus_device *dma_mbdev;
 	struct dma_chan *dma_ch[MIC_MAX_DMA_CHAN];
 	int num_dma_ch;
 	struct scif_hw_dev *scdev;
+	struct cosm_device *cosm_dev;
 };
 
 /**
@@ -199,38 +164,9 @@ mic_mmio_write(struct mic_mw *mw, u32 val, u32 offset)
 	iowrite32(val, mw->va + offset);
 }
 
-static inline struct dma_chan *mic_request_dma_chan(struct mic_device *mdev)
-{
-	dma_cap_mask_t mask;
-	struct dma_chan *chan;
-
-	dma_cap_zero(mask);
-	dma_cap_set(DMA_MEMCPY, mask);
-	chan = dma_request_channel(mask, mdev->ops->dma_filter,
-				   mdev->sdev->parent);
-	if (chan)
-		return chan;
-	dev_err(mdev->sdev->parent, "%s %d unable to acquire channel\n",
-		__func__, __LINE__);
-	return NULL;
-}
-
-void mic_sysfs_init(struct mic_device *mdev);
-int mic_start(struct mic_device *mdev, const char *buf);
-void mic_stop(struct mic_device *mdev, bool force);
-void mic_shutdown(struct mic_device *mdev);
-void mic_reset_delayed_work(struct work_struct *work);
-void mic_reset_trigger_work(struct work_struct *work);
-void mic_shutdown_work(struct work_struct *work);
 void mic_bootparam_init(struct mic_device *mdev);
-void mic_set_state(struct mic_device *mdev, u8 state);
-void mic_set_shutdown_status(struct mic_device *mdev, u8 status);
 void mic_create_debug_dir(struct mic_device *dev);
 void mic_delete_debug_dir(struct mic_device *dev);
 void __init mic_init_debugfs(void);
 void mic_exit_debugfs(void);
-void mic_prepare_suspend(struct mic_device *mdev);
-void mic_complete_resume(struct mic_device *mdev);
-void mic_suspend(struct mic_device *mdev);
-extern atomic_t g_num_mics;
 #endif
diff --git a/drivers/misc/mic/host/mic_fops.c b/drivers/misc/mic/host/mic_fops.c
index 85776d7327f3..8cc1d90cd949 100644
--- a/drivers/misc/mic/host/mic_fops.c
+++ b/drivers/misc/mic/host/mic_fops.c
@@ -30,8 +30,8 @@
 int mic_open(struct inode *inode, struct file *f)
 {
 	struct mic_vdev *mvdev;
-	struct mic_device *mdev = container_of(inode->i_cdev,
-		struct mic_device, cdev);
+	struct mic_device *mdev = container_of(f->private_data,
+		struct mic_device, miscdev);
 
 	mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL);
 	if (!mvdev)
diff --git a/drivers/misc/mic/host/mic_intr.c b/drivers/misc/mic/host/mic_intr.c
index b4ca6c884d19..08ca3e372fa4 100644
--- a/drivers/misc/mic/host/mic_intr.c
+++ b/drivers/misc/mic/host/mic_intr.c
@@ -30,8 +30,7 @@ static irqreturn_t mic_thread_fn(int irq, void *dev)
 	struct mic_intr_info *intr_info = mdev->intr_info;
 	struct mic_irq_info *irq_info = &mdev->irq_info;
 	struct mic_intr_cb *intr_cb;
-	struct pci_dev *pdev = container_of(mdev->sdev->parent,
-					    struct pci_dev, dev);
+	struct pci_dev *pdev = mdev->pdev;
 	int i;
 
 	spin_lock(&irq_info->mic_thread_lock);
@@ -57,8 +56,7 @@ static irqreturn_t mic_interrupt(int irq, void *dev)
 	struct mic_intr_info *intr_info = mdev->intr_info;
 	struct mic_irq_info *irq_info = &mdev->irq_info;
 	struct mic_intr_cb *intr_cb;
-	struct pci_dev *pdev = container_of(mdev->sdev->parent,
-					    struct pci_dev, dev);
+	struct pci_dev *pdev = mdev->pdev;
 	u32 mask;
 	int i;
 
@@ -83,7 +81,7 @@ static irqreturn_t mic_interrupt(int irq, void *dev)
 
 /* Return the interrupt offset from the index. Index is 0 based. */
 static u16 mic_map_src_to_offset(struct mic_device *mdev,
-		int intr_src, enum mic_intr_type type)
+				 int intr_src, enum mic_intr_type type)
 {
 	if (type >= MIC_NUM_INTR_TYPES)
 		return MIC_NUM_OFFSETS;
@@ -214,7 +212,7 @@ static int mic_setup_msix(struct mic_device *mdev, struct pci_dev *pdev)
 		mdev->irq_info.msix_entries[i].entry = i;
 
 	rc = pci_enable_msix_exact(pdev, mdev->irq_info.msix_entries,
-		MIC_MIN_MSIX);
+				   MIC_MIN_MSIX);
 	if (rc) {
 		dev_dbg(&pdev->dev, "Error enabling MSIx. rc = %d\n", rc);
 		goto err_enable_msix;
@@ -229,7 +227,7 @@ static int mic_setup_msix(struct mic_device *mdev, struct pci_dev *pdev)
 		goto err_nomem2;
 	}
 
-	dev_dbg(mdev->sdev->parent,
+	dev_dbg(&mdev->pdev->dev,
 		"%d MSIx irqs setup\n", mdev->irq_info.num_vectors);
 	return 0;
 err_nomem2:
@@ -281,7 +279,6 @@ static void mic_release_callbacks(struct mic_device *mdev)
 	spin_lock(&mdev->irq_info.mic_thread_lock);
 	spin_lock_irqsave(&mdev->irq_info.mic_intr_lock, flags);
 	for (i = 0; i < MIC_NUM_OFFSETS; i++) {
-
 		if (list_empty(&mdev->irq_info.cb_list[i]))
 			break;
 
@@ -443,12 +440,11 @@ mic_request_threaded_irq(struct mic_device *mdev,
 	unsigned long cookie = 0;
 	u16 entry;
 	struct mic_intr_cb *intr_cb;
-	struct pci_dev *pdev = container_of(mdev->sdev->parent,
-		struct pci_dev, dev);
+	struct pci_dev *pdev = mdev->pdev;
 
 	offset = mic_map_src_to_offset(mdev, intr_src, type);
 	if (offset >= MIC_NUM_OFFSETS) {
-		dev_err(mdev->sdev->parent,
+		dev_err(&mdev->pdev->dev,
 			"Error mapping index %d to a valid source id.\n",
 			intr_src);
 		rc = -EINVAL;
@@ -458,7 +454,7 @@ mic_request_threaded_irq(struct mic_device *mdev,
 	if (mdev->irq_info.num_vectors > 1) {
 		msix = mic_get_available_vector(mdev);
 		if (!msix) {
-			dev_err(mdev->sdev->parent,
+			dev_err(&mdev->pdev->dev,
 				"No MSIx vectors available for use.\n");
 			rc = -ENOSPC;
 			goto err;
@@ -467,7 +463,7 @@ mic_request_threaded_irq(struct mic_device *mdev,
 		rc = request_threaded_irq(msix->vector, handler, thread_fn,
 					  0, name, data);
 		if (rc) {
-			dev_dbg(mdev->sdev->parent,
+			dev_dbg(&mdev->pdev->dev,
 				"request irq failed rc = %d\n", rc);
 			goto err;
 		}
@@ -476,13 +472,13 @@ mic_request_threaded_irq(struct mic_device *mdev,
 		mdev->intr_ops->program_msi_to_src_map(mdev,
 				entry, offset, true);
 		cookie = MK_COOKIE(entry, offset);
-		dev_dbg(mdev->sdev->parent, "irq: %d assigned for src: %d\n",
+		dev_dbg(&mdev->pdev->dev, "irq: %d assigned for src: %d\n",
 			msix->vector, intr_src);
 	} else {
 		intr_cb = mic_register_intr_callback(mdev, offset, handler,
 						     thread_fn, data);
 		if (IS_ERR(intr_cb)) {
-			dev_err(mdev->sdev->parent,
+			dev_err(&mdev->pdev->dev,
 				"No available callback entries for use\n");
 			rc = PTR_ERR(intr_cb);
 			goto err;
@@ -495,7 +491,7 @@ mic_request_threaded_irq(struct mic_device *mdev,
 				entry, offset, true);
 		}
 		cookie = MK_COOKIE(entry, intr_cb->cb_id);
-		dev_dbg(mdev->sdev->parent, "callback %d registered for src: %d\n",
+		dev_dbg(&mdev->pdev->dev, "callback %d registered for src: %d\n",
 			intr_cb->cb_id, intr_src);
 	}
 	return (struct mic_irq *)cookie;
@@ -515,20 +511,19 @@ err:
  * returns: none.
  */
 void mic_free_irq(struct mic_device *mdev,
-	struct mic_irq *cookie, void *data)
+		  struct mic_irq *cookie, void *data)
 {
 	u32 offset;
 	u32 entry;
 	u8 src_id;
 	unsigned int irq;
-	struct pci_dev *pdev = container_of(mdev->sdev->parent,
-		struct pci_dev, dev);
+	struct pci_dev *pdev = mdev->pdev;
 
 	entry = GET_ENTRY((unsigned long)cookie);
 	offset = GET_OFFSET((unsigned long)cookie);
 	if (mdev->irq_info.num_vectors > 1) {
 		if (entry >= mdev->irq_info.num_vectors) {
-			dev_warn(mdev->sdev->parent,
+			dev_warn(&mdev->pdev->dev,
 				 "entry %d should be < num_irq %d\n",
 				entry, mdev->irq_info.num_vectors);
 			return;
@@ -539,12 +534,12 @@ void mic_free_irq(struct mic_device *mdev,
 		mdev->intr_ops->program_msi_to_src_map(mdev,
 			entry, offset, false);
 
-		dev_dbg(mdev->sdev->parent, "irq: %d freed\n", irq);
+		dev_dbg(&mdev->pdev->dev, "irq: %d freed\n", irq);
 	} else {
 		irq = pdev->irq;
 		src_id = mic_unregister_intr_callback(mdev, offset);
 		if (src_id >= MIC_NUM_OFFSETS) {
-			dev_warn(mdev->sdev->parent, "Error unregistering callback\n");
+			dev_warn(&mdev->pdev->dev, "Error unregistering callback\n");
 			return;
 		}
 		if (pci_dev_msi_enabled(pdev)) {
@@ -552,7 +547,7 @@ void mic_free_irq(struct mic_device *mdev,
 			mdev->intr_ops->program_msi_to_src_map(mdev,
 				entry, src_id, false);
 		}
-		dev_dbg(mdev->sdev->parent, "callback %d unregistered for src: %d\n",
+		dev_dbg(&mdev->pdev->dev, "callback %d unregistered for src: %d\n",
 			offset, src_id);
 	}
 }
@@ -579,7 +574,7 @@ int mic_setup_interrupts(struct mic_device *mdev, struct pci_dev *pdev)
 
 	rc = mic_setup_intx(mdev, pdev);
 	if (rc) {
-		dev_err(mdev->sdev->parent, "no usable interrupts\n");
+		dev_err(&mdev->pdev->dev, "no usable interrupts\n");
 		return rc;
 	}
 done:
@@ -635,8 +630,7 @@ void mic_free_interrupts(struct mic_device *mdev, struct pci_dev *pdev)
 void mic_intr_restore(struct mic_device *mdev)
 {
 	int entry, offset;
-	struct pci_dev *pdev = container_of(mdev->sdev->parent,
-		struct pci_dev, dev);
+	struct pci_dev *pdev = mdev->pdev;
 
 	if (!pci_dev_msi_enabled(pdev))
 		return;
diff --git a/drivers/misc/mic/host/mic_main.c b/drivers/misc/mic/host/mic_main.c
index 456462932151..153894e7ed5b 100644
--- a/drivers/misc/mic/host/mic_main.c
+++ b/drivers/misc/mic/host/mic_main.c
@@ -16,17 +16,11 @@
  * the file called "COPYING".
  *
  * Intel MIC Host driver.
- *
- * Global TODO's across the driver to be added after initial base
- * patches are accepted upstream:
- * 1) Enable DMA support.
- * 2) Enable per vring interrupt support.
  */
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/poll.h>
-#include <linux/suspend.h>
 
 #include <linux/mic_common.h>
 #include "../common/mic_dev.h"
@@ -63,12 +57,8 @@ MODULE_DEVICE_TABLE(pci, mic_pci_tbl);
 
 /* ID allocator for MIC devices */
 static struct ida g_mic_ida;
-/* Class of MIC devices for sysfs accessibility. */
-static struct class *g_mic_class;
 /* Base device node number for MIC devices */
 static dev_t g_mic_devno;
-/* Track the total number of MIC devices */
-atomic_t g_num_mics;
 
 static const struct file_operations mic_fops = {
 	.open = mic_open,
@@ -83,17 +73,14 @@ static const struct file_operations mic_fops = {
 static int mic_dp_init(struct mic_device *mdev)
 {
 	mdev->dp = kzalloc(MIC_DP_SIZE, GFP_KERNEL);
-	if (!mdev->dp) {
-		dev_err(mdev->sdev->parent, "%s %d err %d\n",
-			__func__, __LINE__, -ENOMEM);
+	if (!mdev->dp)
 		return -ENOMEM;
-	}
 
 	mdev->dp_dma_addr = mic_map_single(mdev,
 		mdev->dp, MIC_DP_SIZE);
 	if (mic_map_error(mdev->dp_dma_addr)) {
 		kfree(mdev->dp);
-		dev_err(mdev->sdev->parent, "%s %d err %d\n",
+		dev_err(&mdev->pdev->dev, "%s %d err %d\n",
 			__func__, __LINE__, -ENOMEM);
 		return -ENOMEM;
 	}
@@ -110,30 +97,6 @@ static void mic_dp_uninit(struct mic_device *mdev)
 }
 
 /**
- * mic_shutdown_db - Shutdown doorbell interrupt handler.
- */
-static irqreturn_t mic_shutdown_db(int irq, void *data)
-{
-	struct mic_device *mdev = data;
-	struct mic_bootparam *bootparam = mdev->dp;
-
-	mdev->ops->intr_workarounds(mdev);
-
-	switch (bootparam->shutdown_status) {
-	case MIC_HALTED:
-	case MIC_POWER_OFF:
-	case MIC_RESTART:
-		/* Fall through */
-	case MIC_CRASHED:
-		schedule_work(&mdev->shutdown_work);
-		break;
-	default:
-		break;
-	};
-	return IRQ_HANDLED;
-}
-
-/**
  * mic_ops_init: Initialize HW specific operation tables.
  *
  * @mdev: pointer to mic_device instance
@@ -190,43 +153,6 @@ static enum mic_hw_family mic_get_family(struct pci_dev *pdev)
 }
 
 /**
-* mic_pm_notifier: Notifier callback function that handles
-* PM notifications.
-*
-* @notifier_block: The notifier structure.
-* @pm_event: The event for which the driver was notified.
-* @unused: Meaningless. Always NULL.
-*
-* returns NOTIFY_DONE
-*/
-static int mic_pm_notifier(struct notifier_block *notifier,
-		unsigned long pm_event, void *unused)
-{
-	struct mic_device *mdev = container_of(notifier,
-		struct mic_device, pm_notifier);
-
-	switch (pm_event) {
-	case PM_HIBERNATION_PREPARE:
-		/* Fall through */
-	case PM_SUSPEND_PREPARE:
-		mic_prepare_suspend(mdev);
-		break;
-	case PM_POST_HIBERNATION:
-		/* Fall through */
-	case PM_POST_SUSPEND:
-		/* Fall through */
-	case PM_POST_RESTORE:
-		mic_complete_resume(mdev);
-		break;
-	case PM_RESTORE_PREPARE:
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-/**
  * mic_device_init - Allocates and initializes the MIC device structure
  *
  * @mdev: pointer to mic_device instance
@@ -234,52 +160,16 @@ static int mic_pm_notifier(struct notifier_block *notifier,
  *
  * returns none.
  */
-static int
+static void
 mic_device_init(struct mic_device *mdev, struct pci_dev *pdev)
 {
-	int rc;
-
+	mdev->pdev = pdev;
 	mdev->family = mic_get_family(pdev);
 	mdev->stepping = pdev->revision;
 	mic_ops_init(mdev);
-	mic_sysfs_init(mdev);
 	mutex_init(&mdev->mic_mutex);
 	mdev->irq_info.next_avail_src = 0;
-	INIT_WORK(&mdev->reset_trigger_work, mic_reset_trigger_work);
-	INIT_WORK(&mdev->shutdown_work, mic_shutdown_work);
-	init_completion(&mdev->reset_wait);
 	INIT_LIST_HEAD(&mdev->vdev_list);
-	mdev->pm_notifier.notifier_call = mic_pm_notifier;
-	rc = register_pm_notifier(&mdev->pm_notifier);
-	if (rc) {
-		dev_err(&pdev->dev, "register_pm_notifier failed rc %d\n",
-			rc);
-		goto register_pm_notifier_fail;
-	}
-	return 0;
-register_pm_notifier_fail:
-	flush_work(&mdev->shutdown_work);
-	flush_work(&mdev->reset_trigger_work);
-	return rc;
-}
-
-/**
- * mic_device_uninit - Frees resources allocated during mic_device_init(..)
- *
- * @mdev: pointer to mic_device instance
- *
- * returns none
- */
-static void mic_device_uninit(struct mic_device *mdev)
-{
-	/* The cmdline sysfs entry might have allocated cmdline */
-	kfree(mdev->cmdline);
-	kfree(mdev->firmware);
-	kfree(mdev->ramdisk);
-	kfree(mdev->bootmode);
-	flush_work(&mdev->reset_trigger_work);
-	flush_work(&mdev->shutdown_work);
-	unregister_pm_notifier(&mdev->pm_notifier);
 }
 
 /**
@@ -291,7 +181,7 @@ static void mic_device_uninit(struct mic_device *mdev)
  * returns 0 on success, < 0 on failure.
  */
 static int mic_probe(struct pci_dev *pdev,
-		const struct pci_device_id *ent)
+		     const struct pci_device_id *ent)
 {
 	int rc;
 	struct mic_device *mdev;
@@ -309,16 +199,12 @@ static int mic_probe(struct pci_dev *pdev,
 		goto ida_fail;
 	}
 
-	rc = mic_device_init(mdev, pdev);
-	if (rc) {
-		dev_err(&pdev->dev, "mic_device_init failed rc %d\n", rc);
-		goto device_init_fail;
-	}
+	mic_device_init(mdev, pdev);
 
 	rc = pci_enable_device(pdev);
 	if (rc) {
 		dev_err(&pdev->dev, "failed to enable pci device.\n");
-		goto uninit_device;
+		goto ida_remove;
 	}
 
 	pci_set_master(pdev);
@@ -367,62 +253,39 @@ static int mic_probe(struct pci_dev *pdev,
 
 	pci_set_drvdata(pdev, mdev);
 
-	mdev->sdev = device_create_with_groups(g_mic_class, &pdev->dev,
-		MKDEV(MAJOR(g_mic_devno), mdev->id), NULL,
-		mdev->attr_group, "mic%d", mdev->id);
-	if (IS_ERR(mdev->sdev)) {
-		rc = PTR_ERR(mdev->sdev);
-		dev_err(&pdev->dev,
-			"device_create_with_groups failed rc %d\n", rc);
-		goto smpt_uninit;
-	}
-	mdev->state_sysfs = sysfs_get_dirent(mdev->sdev->kobj.sd, "state");
-	if (!mdev->state_sysfs) {
-		rc = -ENODEV;
-		dev_err(&pdev->dev, "sysfs_get_dirent failed rc %d\n", rc);
-		goto destroy_device;
-	}
-
 	rc = mic_dp_init(mdev);
 	if (rc) {
 		dev_err(&pdev->dev, "mic_dp_init failed rc %d\n", rc);
-		goto sysfs_put;
-	}
-	mutex_lock(&mdev->mic_mutex);
-
-	mdev->shutdown_db = mic_next_db(mdev);
-	mdev->shutdown_cookie = mic_request_threaded_irq(mdev, mic_shutdown_db,
-					NULL, "shutdown-interrupt", mdev,
-					mdev->shutdown_db, MIC_INTR_DB);
-	if (IS_ERR(mdev->shutdown_cookie)) {
-		rc = PTR_ERR(mdev->shutdown_cookie);
-		mutex_unlock(&mdev->mic_mutex);
-		goto dp_uninit;
+		goto smpt_uninit;
 	}
-	mutex_unlock(&mdev->mic_mutex);
 	mic_bootparam_init(mdev);
 
 	mic_create_debug_dir(mdev);
-	cdev_init(&mdev->cdev, &mic_fops);
-	mdev->cdev.owner = THIS_MODULE;
-	rc = cdev_add(&mdev->cdev, MKDEV(MAJOR(g_mic_devno), mdev->id), 1);
+
+	mdev->miscdev.minor = MISC_DYNAMIC_MINOR;
+	snprintf(mdev->name, sizeof(mdev->name), "mic%d", mdev->id);
+	mdev->miscdev.name = mdev->name;
+	mdev->miscdev.fops = &mic_fops;
+	mdev->miscdev.parent = &mdev->pdev->dev;
+	rc = misc_register(&mdev->miscdev);
 	if (rc) {
-		dev_err(&pdev->dev, "cdev_add err id %d rc %d\n", mdev->id, rc);
+		dev_err(&pdev->dev, "misc_register err id %d rc %d\n",
+			mdev->id, rc);
 		goto cleanup_debug_dir;
 	}
-	atomic_inc(&g_num_mics);
+
+	mdev->cosm_dev = cosm_register_device(&mdev->pdev->dev, &cosm_hw_ops);
+	if (IS_ERR(mdev->cosm_dev)) {
+		rc = PTR_ERR(mdev->cosm_dev);
+		dev_err(&pdev->dev, "cosm_add_device failed rc %d\n", rc);
+		goto misc_dereg;
+	}
 	return 0;
+misc_dereg:
+	misc_deregister(&mdev->miscdev);
 cleanup_debug_dir:
 	mic_delete_debug_dir(mdev);
-	mutex_lock(&mdev->mic_mutex);
-	mic_free_irq(mdev, mdev->shutdown_cookie, mdev);
-	mutex_unlock(&mdev->mic_mutex);
-dp_uninit:
 	mic_dp_uninit(mdev);
-sysfs_put:
-	sysfs_put(mdev->state_sysfs);
-destroy_device:
-	device_destroy(g_mic_class, MKDEV(MAJOR(g_mic_devno), mdev->id));
 smpt_uninit:
 	mic_smpt_uninit(mdev);
 free_interrupts:
@@ -435,9 +298,7 @@ release_regions:
 	pci_release_regions(pdev);
 disable_device:
 	pci_disable_device(pdev);
-uninit_device:
-	mic_device_uninit(mdev);
-device_init_fail:
+ida_remove:
 	ida_simple_remove(&g_mic_ida, mdev->id);
 ida_fail:
 	kfree(mdev);
@@ -461,22 +322,14 @@ static void mic_remove(struct pci_dev *pdev)
 	if (!mdev)
 		return;
 
-	mic_stop(mdev, false);
-	atomic_dec(&g_num_mics);
-	cdev_del(&mdev->cdev);
+	cosm_unregister_device(mdev->cosm_dev);
+	misc_deregister(&mdev->miscdev);
 	mic_delete_debug_dir(mdev);
-	mutex_lock(&mdev->mic_mutex);
-	mic_free_irq(mdev, mdev->shutdown_cookie, mdev);
-	mutex_unlock(&mdev->mic_mutex);
-	flush_work(&mdev->shutdown_work);
 	mic_dp_uninit(mdev);
-	sysfs_put(mdev->state_sysfs);
-	device_destroy(g_mic_class, MKDEV(MAJOR(g_mic_devno), mdev->id));
 	mic_smpt_uninit(mdev);
 	mic_free_interrupts(mdev, pdev);
-	iounmap(mdev->mmio.va);
 	iounmap(mdev->aper.va);
-	mic_device_uninit(mdev);
+	iounmap(mdev->mmio.va);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 	ida_simple_remove(&g_mic_ida, mdev->id);
@@ -495,32 +348,23 @@ static int __init mic_init(void)
 	int ret;
 
 	ret = alloc_chrdev_region(&g_mic_devno, 0,
-		MIC_MAX_NUM_DEVS, mic_driver_name);
+				  MIC_MAX_NUM_DEVS, mic_driver_name);
 	if (ret) {
 		pr_err("alloc_chrdev_region failed ret %d\n", ret);
 		goto error;
 	}
 
-	g_mic_class = class_create(THIS_MODULE, mic_driver_name);
-	if (IS_ERR(g_mic_class)) {
-		ret = PTR_ERR(g_mic_class);
-		pr_err("class_create failed ret %d\n", ret);
-		goto cleanup_chrdev;
-	}
-
 	mic_init_debugfs();
 	ida_init(&g_mic_ida);
 	ret = pci_register_driver(&mic_driver);
 	if (ret) {
 		pr_err("pci_register_driver failed ret %d\n", ret);
-		goto cleanup_debugfs;
+		goto cleanup_chrdev;
 	}
 	return ret;
-cleanup_debugfs:
+cleanup_chrdev:
 	ida_destroy(&g_mic_ida);
 	mic_exit_debugfs();
-	class_destroy(g_mic_class);
-cleanup_chrdev:
 	unregister_chrdev_region(g_mic_devno, MIC_MAX_NUM_DEVS);
 error:
 	return ret;
@@ -531,7 +375,6 @@ static void __exit mic_exit(void)
 	pci_unregister_driver(&mic_driver);
 	ida_destroy(&g_mic_ida);
 	mic_exit_debugfs();
-	class_destroy(g_mic_class);
 	unregister_chrdev_region(g_mic_devno, MIC_MAX_NUM_DEVS);
 }
 
diff --git a/drivers/misc/mic/host/mic_smpt.c b/drivers/misc/mic/host/mic_smpt.c
index cec82034875f..c3f958580fb0 100644
--- a/drivers/misc/mic/host/mic_smpt.c
+++ b/drivers/misc/mic/host/mic_smpt.c
@@ -76,7 +76,7 @@ mic_is_system_addr(struct mic_device *mdev, dma_addr_t pa)
 
 /* Populate an SMPT entry and update the reference counts. */
 static void mic_add_smpt_entry(int spt, s64 *ref, u64 addr,
-		int entries, struct mic_device *mdev)
+			       int entries, struct mic_device *mdev)
 {
 	struct mic_smpt_info *smpt_info = mdev->smpt;
 	int i;
@@ -97,7 +97,7 @@ static void mic_add_smpt_entry(int spt, s64 *ref, u64 addr,
  * for a given DMA address and size.
  */
 static dma_addr_t mic_smpt_op(struct mic_device *mdev, u64 dma_addr,
-				int entries, s64 *ref, size_t size)
+			      int entries, s64 *ref, size_t size)
 {
 	int spt;
 	int ae = 0;
@@ -148,7 +148,7 @@ found:
  * and the starting smpt address
  */
 static int mic_get_smpt_ref_count(struct mic_device *mdev, dma_addr_t dma_addr,
-				size_t size, s64 *ref,  u64 *smpt_start)
+				  size_t size, s64 *ref,  u64 *smpt_start)
 {
 	u64 start =  dma_addr;
 	u64 end = dma_addr + size;
@@ -181,7 +181,7 @@ dma_addr_t mic_to_dma_addr(struct mic_device *mdev, dma_addr_t mic_addr)
 	dma_addr_t dma_addr;
 
 	if (!mic_is_system_addr(mdev, mic_addr)) {
-		dev_err(mdev->sdev->parent,
+		dev_err(&mdev->pdev->dev,
 			"mic_addr is invalid. mic_addr = 0x%llx\n", mic_addr);
 		return -EINVAL;
 	}
@@ -218,7 +218,7 @@ dma_addr_t mic_map(struct mic_device *mdev, dma_addr_t dma_addr, size_t size)
 		return mic_addr;
 
 	num_entries = mic_get_smpt_ref_count(mdev, dma_addr, size,
-		ref, &smpt_start);
+					     ref, &smpt_start);
 
 	/* Set the smpt table appropriately and get 16G aligned mic address */
 	mic_addr = mic_smpt_op(mdev, smpt_start, num_entries, ref, size);
@@ -231,7 +231,7 @@ dma_addr_t mic_map(struct mic_device *mdev, dma_addr_t dma_addr, size_t size)
 	 * else generate mic_addr by adding the 16G offset in dma_addr
 	 */
 	if (!mic_addr && MIC_FAMILY_X100 == mdev->family) {
-		dev_err(mdev->sdev->parent,
+		dev_err(&mdev->pdev->dev,
 			"mic_map failed dma_addr 0x%llx size 0x%lx\n",
 			dma_addr, size);
 		return mic_addr;
@@ -264,7 +264,7 @@ void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size)
 		return;
 
 	if (!mic_is_system_addr(mdev, mic_addr)) {
-		dev_err(mdev->sdev->parent,
+		dev_err(&mdev->pdev->dev,
 			"invalid address: 0x%llx\n", mic_addr);
 		return;
 	}
@@ -284,7 +284,7 @@ void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size)
 	for (i = spt; i < spt + num_smpt; i++) {
 		smpt_info->entry[i].ref_count -= ref[i - spt];
 		if (smpt_info->entry[i].ref_count < 0)
-			dev_warn(mdev->sdev->parent,
+			dev_warn(&mdev->pdev->dev,
 				 "ref count for entry %d is negative\n", i);
 	}
 	spin_unlock_irqrestore(&smpt_info->smpt_lock, flags);
@@ -307,15 +307,14 @@ void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size)
 dma_addr_t mic_map_single(struct mic_device *mdev, void *va, size_t size)
 {
 	dma_addr_t mic_addr = 0;
-	struct pci_dev *pdev = container_of(mdev->sdev->parent,
-		struct pci_dev, dev);
+	struct pci_dev *pdev = mdev->pdev;
 	dma_addr_t dma_addr =
 		pci_map_single(pdev, va, size, PCI_DMA_BIDIRECTIONAL);
 
 	if (!pci_dma_mapping_error(pdev, dma_addr)) {
 		mic_addr = mic_map(mdev, dma_addr, size);
 		if (!mic_addr) {
-			dev_err(mdev->sdev->parent,
+			dev_err(&mdev->pdev->dev,
 				"mic_map failed dma_addr 0x%llx size 0x%lx\n",
 				dma_addr, size);
 			pci_unmap_single(pdev, dma_addr,
@@ -339,8 +338,7 @@ dma_addr_t mic_map_single(struct mic_device *mdev, void *va, size_t size)
 void
 mic_unmap_single(struct mic_device *mdev, dma_addr_t mic_addr, size_t size)
 {
-	struct pci_dev *pdev = container_of(mdev->sdev->parent,
-		struct pci_dev, dev);
+	struct pci_dev *pdev = mdev->pdev;
 	dma_addr_t dma_addr = mic_to_dma_addr(mdev, mic_addr);
 	mic_unmap(mdev, mic_addr, size);
 	pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
@@ -399,18 +397,18 @@ void mic_smpt_uninit(struct mic_device *mdev)
 	struct mic_smpt_info *smpt_info = mdev->smpt;
 	int i;
 
-	dev_dbg(mdev->sdev->parent,
+	dev_dbg(&mdev->pdev->dev,
 		"nodeid %d SMPT ref count %lld map %lld unmap %lld\n",
 		mdev->id, smpt_info->ref_count,
 		smpt_info->map_count, smpt_info->unmap_count);
 
 	for (i = 0; i < smpt_info->info.num_reg; i++) {
-		dev_dbg(mdev->sdev->parent,
+		dev_dbg(&mdev->pdev->dev,
 			"SMPT entry[%d] dma_addr = 0x%llx ref_count = %lld\n",
 			i, smpt_info->entry[i].dma_addr,
 			smpt_info->entry[i].ref_count);
 		if (smpt_info->entry[i].ref_count)
-			dev_warn(mdev->sdev->parent,
+			dev_warn(&mdev->pdev->dev,
 				 "ref count for entry %d is not zero\n", i);
 	}
 	kfree(smpt_info->entry);
diff --git a/drivers/misc/mic/host/mic_sysfs.c b/drivers/misc/mic/host/mic_sysfs.c
deleted file mode 100644
index 6dd864e4a617..000000000000
--- a/drivers/misc/mic/host/mic_sysfs.c
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * Intel MIC Platform Software Stack (MPSS)
- *
- * Copyright(c) 2013 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Intel MIC Host driver.
- *
- */
-#include <linux/pci.h>
-
-#include <linux/mic_common.h>
-#include "../common/mic_dev.h"
-#include "mic_device.h"
-
-/*
- * A state-to-string lookup table, for exposing a human readable state
- * via sysfs. Always keep in sync with enum mic_states
- */
-static const char * const mic_state_string[] = {
-	[MIC_OFFLINE] = "offline",
-	[MIC_ONLINE] = "online",
-	[MIC_SHUTTING_DOWN] = "shutting_down",
-	[MIC_RESET_FAILED] = "reset_failed",
-	[MIC_SUSPENDING] = "suspending",
-	[MIC_SUSPENDED] = "suspended",
-};
-
-/*
- * A shutdown-status-to-string lookup table, for exposing a human
- * readable state via sysfs. Always keep in sync with enum mic_shutdown_status
- */
-static const char * const mic_shutdown_status_string[] = {
-	[MIC_NOP] = "nop",
-	[MIC_CRASHED] = "crashed",
-	[MIC_HALTED] = "halted",
-	[MIC_POWER_OFF] = "poweroff",
-	[MIC_RESTART] = "restart",
-};
-
-void mic_set_shutdown_status(struct mic_device *mdev, u8 shutdown_status)
-{
-	dev_dbg(mdev->sdev->parent, "Shutdown Status %s -> %s\n",
-		mic_shutdown_status_string[mdev->shutdown_status],
-		mic_shutdown_status_string[shutdown_status]);
-	mdev->shutdown_status = shutdown_status;
-}
-
-void mic_set_state(struct mic_device *mdev, u8 state)
-{
-	dev_dbg(mdev->sdev->parent, "State %s -> %s\n",
-		mic_state_string[mdev->state],
-		mic_state_string[state]);
-	mdev->state = state;
-	sysfs_notify_dirent(mdev->state_sysfs);
-}
-
-static ssize_t
-family_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	static const char x100[] = "x100";
-	static const char unknown[] = "Unknown";
-	const char *card = NULL;
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev)
-		return -EINVAL;
-
-	switch (mdev->family) {
-	case MIC_FAMILY_X100:
-		card = x100;
-		break;
-	default:
-		card = unknown;
-		break;
-	}
-	return scnprintf(buf, PAGE_SIZE, "%s\n", card);
-}
-static DEVICE_ATTR_RO(family);
-
-static ssize_t
-stepping_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-	char *string = "??";
-
-	if (!mdev)
-		return -EINVAL;
-
-	switch (mdev->stepping) {
-	case MIC_A0_STEP:
-		string = "A0";
-		break;
-	case MIC_B0_STEP:
-		string = "B0";
-		break;
-	case MIC_B1_STEP:
-		string = "B1";
-		break;
-	case MIC_C0_STEP:
-		string = "C0";
-		break;
-	default:
-		break;
-	}
-	return scnprintf(buf, PAGE_SIZE, "%s\n", string);
-}
-static DEVICE_ATTR_RO(stepping);
-
-static ssize_t
-state_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev || mdev->state >= MIC_LAST)
-		return -EINVAL;
-
-	return scnprintf(buf, PAGE_SIZE, "%s\n",
-		mic_state_string[mdev->state]);
-}
-
-static ssize_t
-state_store(struct device *dev, struct device_attribute *attr,
-	    const char *buf, size_t count)
-{
-	int rc = 0;
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-	if (!mdev)
-		return -EINVAL;
-	if (sysfs_streq(buf, "boot")) {
-		rc = mic_start(mdev, buf);
-		if (rc) {
-			dev_err(mdev->sdev->parent,
-				"mic_boot failed rc %d\n", rc);
-			count = rc;
-		}
-		goto done;
-	}
-
-	if (sysfs_streq(buf, "reset")) {
-		schedule_work(&mdev->reset_trigger_work);
-		goto done;
-	}
-
-	if (sysfs_streq(buf, "shutdown")) {
-		mic_shutdown(mdev);
-		goto done;
-	}
-
-	if (sysfs_streq(buf, "suspend")) {
-		mic_suspend(mdev);
-		goto done;
-	}
-
-	count = -EINVAL;
-done:
-	return count;
-}
-static DEVICE_ATTR_RW(state);
-
-static ssize_t shutdown_status_show(struct device *dev,
-				    struct device_attribute *attr, char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev || mdev->shutdown_status >= MIC_STATUS_LAST)
-		return -EINVAL;
-
-	return scnprintf(buf, PAGE_SIZE, "%s\n",
-		mic_shutdown_status_string[mdev->shutdown_status]);
-}
-static DEVICE_ATTR_RO(shutdown_status);
-
-static ssize_t
-cmdline_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-	char *cmdline;
-
-	if (!mdev)
-		return -EINVAL;
-
-	cmdline = mdev->cmdline;
-
-	if (cmdline)
-		return scnprintf(buf, PAGE_SIZE, "%s\n", cmdline);
-	return 0;
-}
-
-static ssize_t
-cmdline_store(struct device *dev, struct device_attribute *attr,
-	      const char *buf, size_t count)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev)
-		return -EINVAL;
-
-	mutex_lock(&mdev->mic_mutex);
-	kfree(mdev->cmdline);
-
-	mdev->cmdline = kmalloc(count + 1, GFP_KERNEL);
-	if (!mdev->cmdline) {
-		count = -ENOMEM;
-		goto unlock;
-	}
-
-	strncpy(mdev->cmdline, buf, count);
-
-	if (mdev->cmdline[count - 1] == '\n')
-		mdev->cmdline[count - 1] = '\0';
-	else
-		mdev->cmdline[count] = '\0';
-unlock:
-	mutex_unlock(&mdev->mic_mutex);
-	return count;
-}
-static DEVICE_ATTR_RW(cmdline);
-
-static ssize_t
-firmware_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-	char *firmware;
-
-	if (!mdev)
-		return -EINVAL;
-
-	firmware = mdev->firmware;
-
-	if (firmware)
-		return scnprintf(buf, PAGE_SIZE, "%s\n", firmware);
-	return 0;
-}
-
-static ssize_t
-firmware_store(struct device *dev, struct device_attribute *attr,
-	       const char *buf, size_t count)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev)
-		return -EINVAL;
-
-	mutex_lock(&mdev->mic_mutex);
-	kfree(mdev->firmware);
-
-	mdev->firmware = kmalloc(count + 1, GFP_KERNEL);
-	if (!mdev->firmware) {
-		count = -ENOMEM;
-		goto unlock;
-	}
-	strncpy(mdev->firmware, buf, count);
-
-	if (mdev->firmware[count - 1] == '\n')
-		mdev->firmware[count - 1] = '\0';
-	else
-		mdev->firmware[count] = '\0';
-unlock:
-	mutex_unlock(&mdev->mic_mutex);
-	return count;
-}
-static DEVICE_ATTR_RW(firmware);
-
-static ssize_t
-ramdisk_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-	char *ramdisk;
-
-	if (!mdev)
-		return -EINVAL;
-
-	ramdisk = mdev->ramdisk;
-
-	if (ramdisk)
-		return scnprintf(buf, PAGE_SIZE, "%s\n", ramdisk);
-	return 0;
-}
-
-static ssize_t
-ramdisk_store(struct device *dev, struct device_attribute *attr,
-	      const char *buf, size_t count)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev)
-		return -EINVAL;
-
-	mutex_lock(&mdev->mic_mutex);
-	kfree(mdev->ramdisk);
-
-	mdev->ramdisk = kmalloc(count + 1, GFP_KERNEL);
-	if (!mdev->ramdisk) {
-		count = -ENOMEM;
-		goto unlock;
-	}
-
-	strncpy(mdev->ramdisk, buf, count);
-
-	if (mdev->ramdisk[count - 1] == '\n')
-		mdev->ramdisk[count - 1] = '\0';
-	else
-		mdev->ramdisk[count] = '\0';
-unlock:
-	mutex_unlock(&mdev->mic_mutex);
-	return count;
-}
-static DEVICE_ATTR_RW(ramdisk);
-
-static ssize_t
-bootmode_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-	char *bootmode;
-
-	if (!mdev)
-		return -EINVAL;
-
-	bootmode = mdev->bootmode;
-
-	if (bootmode)
-		return scnprintf(buf, PAGE_SIZE, "%s\n", bootmode);
-	return 0;
-}
-
-static ssize_t
-bootmode_store(struct device *dev, struct device_attribute *attr,
-	       const char *buf, size_t count)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev)
-		return -EINVAL;
-
-	if (!sysfs_streq(buf, "linux") && !sysfs_streq(buf, "elf"))
-		return -EINVAL;
-
-	mutex_lock(&mdev->mic_mutex);
-	kfree(mdev->bootmode);
-
-	mdev->bootmode = kmalloc(count + 1, GFP_KERNEL);
-	if (!mdev->bootmode) {
-		count = -ENOMEM;
-		goto unlock;
-	}
-
-	strncpy(mdev->bootmode, buf, count);
-
-	if (mdev->bootmode[count - 1] == '\n')
-		mdev->bootmode[count - 1] = '\0';
-	else
-		mdev->bootmode[count] = '\0';
-unlock:
-	mutex_unlock(&mdev->mic_mutex);
-	return count;
-}
-static DEVICE_ATTR_RW(bootmode);
-
-static ssize_t
-log_buf_addr_show(struct device *dev, struct device_attribute *attr,
-		  char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev)
-		return -EINVAL;
-
-	return scnprintf(buf, PAGE_SIZE, "%p\n", mdev->log_buf_addr);
-}
-
-static ssize_t
-log_buf_addr_store(struct device *dev, struct device_attribute *attr,
-		   const char *buf, size_t count)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-	int ret;
-	unsigned long addr;
-
-	if (!mdev)
-		return -EINVAL;
-
-	ret = kstrtoul(buf, 16, &addr);
-	if (ret)
-		goto exit;
-
-	mdev->log_buf_addr = (void *)addr;
-	ret = count;
-exit:
-	return ret;
-}
-static DEVICE_ATTR_RW(log_buf_addr);
-
-static ssize_t
-log_buf_len_show(struct device *dev, struct device_attribute *attr,
-		 char *buf)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-
-	if (!mdev)
-		return -EINVAL;
-
-	return scnprintf(buf, PAGE_SIZE, "%p\n", mdev->log_buf_len);
-}
-
-static ssize_t
-log_buf_len_store(struct device *dev, struct device_attribute *attr,
-		  const char *buf, size_t count)
-{
-	struct mic_device *mdev = dev_get_drvdata(dev->parent);
-	int ret;
-	unsigned long addr;
-
-	if (!mdev)
-		return -EINVAL;
-
-	ret = kstrtoul(buf, 16, &addr);
-	if (ret)
-		goto exit;
-
-	mdev->log_buf_len = (int *)addr;
-	ret = count;
-exit:
-	return ret;
-}
-static DEVICE_ATTR_RW(log_buf_len);
-
-static struct attribute *mic_default_attrs[] = {
-	&dev_attr_family.attr,
-	&dev_attr_stepping.attr,
-	&dev_attr_state.attr,
-	&dev_attr_shutdown_status.attr,
-	&dev_attr_cmdline.attr,
-	&dev_attr_firmware.attr,
-	&dev_attr_ramdisk.attr,
-	&dev_attr_bootmode.attr,
-	&dev_attr_log_buf_addr.attr,
-	&dev_attr_log_buf_len.attr,
-
-	NULL
-};
-
-ATTRIBUTE_GROUPS(mic_default);
-
-void mic_sysfs_init(struct mic_device *mdev)
-{
-	mdev->attr_group = mic_default_groups;
-}
diff --git a/drivers/misc/mic/host/mic_virtio.c b/drivers/misc/mic/host/mic_virtio.c
index cc08e9f733c9..58b107a24a8b 100644
--- a/drivers/misc/mic/host/mic_virtio.c
+++ b/drivers/misc/mic/host/mic_virtio.c
@@ -23,7 +23,6 @@
 #include <linux/uaccess.h>
 #include <linux/dmaengine.h>
 #include <linux/mic_common.h>
-
 #include "../common/mic_dev.h"
 #include "mic_device.h"
 #include "mic_smpt.h"
@@ -62,7 +61,7 @@ static int mic_sync_dma(struct mic_device *mdev, dma_addr_t dst,
 	}
 error:
 	if (err)
-		dev_err(mdev->sdev->parent, "%s %d err %d\n",
+		dev_err(&mdev->pdev->dev, "%s %d err %d\n",
 			__func__, __LINE__, err);
 	return err;
 }
@@ -440,7 +439,7 @@ void mic_virtio_reset_devices(struct mic_device *mdev)
 	struct list_head *pos, *tmp;
 	struct mic_vdev *mvdev;
 
-	dev_dbg(mdev->sdev->parent, "%s\n",  __func__);
+	dev_dbg(&mdev->pdev->dev, "%s\n",  __func__);
 
 	list_for_each_safe(pos, tmp, &mdev->vdev_list) {
 		mvdev = list_entry(pos, struct mic_vdev, list);
@@ -686,7 +685,7 @@ int mic_virtio_add_device(struct mic_vdev *mvdev,
 		mvr->head = USHRT_MAX;
 		mvr->mvdev = mvdev;
 		mvr->vrh.notify = mic_notify;
-		dev_dbg(mdev->sdev->parent,
+		dev_dbg(&mdev->pdev->dev,
 			"%s %d index %d va %p info %p vr_size 0x%x\n",
 			__func__, __LINE__, i, vr->va, vr->info, vr_size);
 		mvr->buf = (void *)__get_free_pages(GFP_KERNEL,
@@ -704,7 +703,7 @@ int mic_virtio_add_device(struct mic_vdev *mvdev,
 					       mvdev->virtio_db, MIC_INTR_DB);
 	if (IS_ERR(mvdev->virtio_cookie)) {
 		ret = PTR_ERR(mvdev->virtio_cookie);
-		dev_dbg(mdev->sdev->parent, "request irq failed\n");
+		dev_dbg(&mdev->pdev->dev, "request irq failed\n");
 		goto err;
 	}
 
@@ -720,7 +719,7 @@ int mic_virtio_add_device(struct mic_vdev *mvdev,
 	smp_wmb();
 	dd->type = type;
 
-	dev_dbg(mdev->sdev->parent, "Added virtio device id %d\n", dd->type);
+	dev_dbg(&mdev->pdev->dev, "Added virtio device id %d\n", dd->type);
 
 	db = bootparam->h2c_config_db;
 	if (db != -1)
@@ -755,7 +754,7 @@ void mic_virtio_del_device(struct mic_vdev *mvdev)
 	db = bootparam->h2c_config_db;
 	if (db == -1)
 		goto skip_hot_remove;
-	dev_dbg(mdev->sdev->parent,
+	dev_dbg(&mdev->pdev->dev,
 		"Requesting hot remove id %d\n", mvdev->virtio_id);
 	mvdev->dc->config_change = MIC_VIRTIO_PARAM_DEV_REMOVE;
 	mdev->ops->send_intr(mdev, db);
@@ -765,7 +764,7 @@ void mic_virtio_del_device(struct mic_vdev *mvdev)
 		if (ret)
 			break;
 	}
-	dev_dbg(mdev->sdev->parent,
+	dev_dbg(&mdev->pdev->dev,
 		"Device id %d config_change %d guest_ack %d retry %d\n",
 		mvdev->virtio_id, mvdev->dc->config_change,
 		mvdev->dc->guest_ack, retry);
@@ -794,7 +793,7 @@ skip_hot_remove:
 		tmp_mvdev = list_entry(pos, struct mic_vdev, list);
 		if (tmp_mvdev == mvdev) {
 			list_del(pos);
-			dev_dbg(mdev->sdev->parent,
+			dev_dbg(&mdev->pdev->dev,
 				"Removing virtio device id %d\n",
 				mvdev->virtio_id);
 			break;
diff --git a/drivers/misc/mic/host/mic_virtio.h b/drivers/misc/mic/host/mic_virtio.h
index d574efb853d9..a80631f2790d 100644
--- a/drivers/misc/mic/host/mic_virtio.h
+++ b/drivers/misc/mic/host/mic_virtio.h
@@ -124,7 +124,7 @@ void mic_bh_handler(struct work_struct *work);
 /* Helper API to obtain the MIC PCIe device */
 static inline struct device *mic_dev(struct mic_vdev *mvdev)
 {
-	return mvdev->mdev->sdev->parent;
+	return &mvdev->mdev->pdev->dev;
 }
 
 /* Helper API to check if a virtio device is initialized */
diff --git a/drivers/misc/mic/host/mic_x100.c b/drivers/misc/mic/host/mic_x100.c
index 3341e90dede4..8118ac48c764 100644
--- a/drivers/misc/mic/host/mic_x100.c
+++ b/drivers/misc/mic/host/mic_x100.c
@@ -43,7 +43,7 @@
 static void
 mic_x100_write_spad(struct mic_device *mdev, unsigned int idx, u32 val)
 {
-	dev_dbg(mdev->sdev->parent, "Writing 0x%x to scratch pad index %d\n",
+	dev_dbg(&mdev->pdev->dev, "Writing 0x%x to scratch pad index %d\n",
 		val, idx);
 	mic_mmio_write(&mdev->mmio, val,
 		       MIC_X100_SBOX_BASE_ADDRESS +
@@ -66,7 +66,7 @@ mic_x100_read_spad(struct mic_device *mdev, unsigned int idx)
 		MIC_X100_SBOX_BASE_ADDRESS +
 		MIC_X100_SBOX_SPAD0 + idx * 4);
 
-	dev_dbg(mdev->sdev->parent,
+	dev_dbg(&mdev->pdev->dev,
 		"Reading 0x%x from scratch pad index %d\n", val, idx);
 	return val;
 }
@@ -126,7 +126,7 @@ static void mic_x100_disable_interrupts(struct mic_device *mdev)
  * @mdev: pointer to mic_device instance
  */
 static void mic_x100_send_sbox_intr(struct mic_device *mdev,
-			int doorbell)
+				    int doorbell)
 {
 	struct mic_mw *mw = &mdev->mmio;
 	u64 apic_icr_offset = MIC_X100_SBOX_APICICR0 + doorbell * 8;
@@ -147,7 +147,7 @@ static void mic_x100_send_sbox_intr(struct mic_device *mdev,
  * @mdev: pointer to mic_device instance
  */
 static void mic_x100_send_rdmasr_intr(struct mic_device *mdev,
-			int doorbell)
+				      int doorbell)
 {
 	int rdmasr_offset = MIC_X100_SBOX_RDMASR0 + (doorbell << 2);
 	/* Ensure that the interrupt is ordered w.r.t. previous stores. */
@@ -359,15 +359,14 @@ mic_x100_load_command_line(struct mic_device *mdev, const struct firmware *fw)
 
 	boot_mem = mdev->aper.len >> 20;
 	buf = kzalloc(CMDLINE_SIZE, GFP_KERNEL);
-	if (!buf) {
-		dev_err(mdev->sdev->parent,
-			"%s %d allocation failed\n", __func__, __LINE__);
+	if (!buf)
 		return -ENOMEM;
-	}
+
 	len += snprintf(buf, CMDLINE_SIZE - len,
 		" mem=%dM", boot_mem);
-	if (mdev->cmdline)
-		snprintf(buf + len, CMDLINE_SIZE - len, " %s", mdev->cmdline);
+	if (mdev->cosm_dev->cmdline)
+		snprintf(buf + len, CMDLINE_SIZE - len, " %s",
+			 mdev->cosm_dev->cmdline);
 	memcpy_toio(cmd_line_va, buf, strlen(buf) + 1);
 	kfree(buf);
 	return 0;
@@ -386,12 +385,11 @@ mic_x100_load_ramdisk(struct mic_device *mdev)
 	int rc;
 	struct boot_params __iomem *bp = mdev->aper.va + mdev->bootaddr;
 
-	rc = request_firmware(&fw,
-			mdev->ramdisk, mdev->sdev->parent);
+	rc = request_firmware(&fw, mdev->cosm_dev->ramdisk, &mdev->pdev->dev);
 	if (rc < 0) {
-		dev_err(mdev->sdev->parent,
+		dev_err(&mdev->pdev->dev,
 			"ramdisk request_firmware failed: %d %s\n",
-			rc, mdev->ramdisk);
+			rc, mdev->cosm_dev->ramdisk);
 		goto error;
 	}
 	/*
@@ -423,10 +421,10 @@ mic_x100_get_boot_addr(struct mic_device *mdev)
 
 	scratch2 = mdev->ops->read_spad(mdev, MIC_X100_DOWNLOAD_INFO);
 	boot_addr = MIC_X100_SPAD2_DOWNLOAD_ADDR(scratch2);
-	dev_dbg(mdev->sdev->parent, "%s %d boot_addr 0x%x\n",
+	dev_dbg(&mdev->pdev->dev, "%s %d boot_addr 0x%x\n",
 		__func__, __LINE__, boot_addr);
 	if (boot_addr > (1 << 31)) {
-		dev_err(mdev->sdev->parent,
+		dev_err(&mdev->pdev->dev,
 			"incorrect bootaddr 0x%x\n",
 			boot_addr);
 		rc = -EINVAL;
@@ -454,37 +452,37 @@ mic_x100_load_firmware(struct mic_device *mdev, const char *buf)
 	if (rc)
 		goto error;
 	/* load OS */
-	rc = request_firmware(&fw, mdev->firmware, mdev->sdev->parent);
+	rc = request_firmware(&fw, mdev->cosm_dev->firmware, &mdev->pdev->dev);
 	if (rc < 0) {
-		dev_err(mdev->sdev->parent,
+		dev_err(&mdev->pdev->dev,
 			"ramdisk request_firmware failed: %d %s\n",
-			rc, mdev->firmware);
+			rc, mdev->cosm_dev->firmware);
 		goto error;
 	}
 	if (mdev->bootaddr > mdev->aper.len - fw->size) {
 		rc = -EINVAL;
-		dev_err(mdev->sdev->parent, "%s %d rc %d bootaddr 0x%x\n",
+		dev_err(&mdev->pdev->dev, "%s %d rc %d bootaddr 0x%x\n",
 			__func__, __LINE__, rc, mdev->bootaddr);
 		release_firmware(fw);
 		goto error;
 	}
 	memcpy_toio(mdev->aper.va + mdev->bootaddr, fw->data, fw->size);
 	mdev->ops->write_spad(mdev, MIC_X100_FW_SIZE, fw->size);
-	if (!strcmp(mdev->bootmode, "elf"))
+	if (!strcmp(mdev->cosm_dev->bootmode, "flash"))
 		goto done;
 	/* load command line */
 	rc = mic_x100_load_command_line(mdev, fw);
 	if (rc) {
-		dev_err(mdev->sdev->parent, "%s %d rc %d\n",
+		dev_err(&mdev->pdev->dev, "%s %d rc %d\n",
 			__func__, __LINE__, rc);
 		goto error;
 	}
 	release_firmware(fw);
 	/* load ramdisk */
-	if (mdev->ramdisk)
+	if (mdev->cosm_dev->ramdisk)
 		rc = mic_x100_load_ramdisk(mdev);
 error:
-	dev_dbg(mdev->sdev->parent, "%s %d rc %d\n", __func__, __LINE__, rc);
+	dev_dbg(&mdev->pdev->dev, "%s %d rc %d\n", __func__, __LINE__, rc);
 done:
 	return rc;
 }
diff --git a/drivers/misc/mic/scif/Makefile b/drivers/misc/mic/scif/Makefile
index bf10bb7e2b91..29cfc3e51ac9 100644
--- a/drivers/misc/mic/scif/Makefile
+++ b/drivers/misc/mic/scif/Makefile
@@ -13,3 +13,8 @@ scif-objs += scif_epd.o
 scif-objs += scif_rb.o
 scif-objs += scif_nodeqp.o
 scif-objs += scif_nm.o
+scif-objs += scif_dma.o
+scif-objs += scif_fence.o
+scif-objs += scif_mmap.o
+scif-objs += scif_rma.o
+scif-objs += scif_rma_list.o
diff --git a/drivers/misc/mic/scif/scif_api.c b/drivers/misc/mic/scif/scif_api.c
index f39d3135a9ef..ddc9e4b08b5c 100644
--- a/drivers/misc/mic/scif/scif_api.c
+++ b/drivers/misc/mic/scif/scif_api.c
@@ -37,9 +37,21 @@ enum conn_async_state {
 	ASYNC_CONN_FLUSH_WORK	/* async work flush in progress  */
 };
 
+/*
+ * File operations for anonymous inode file associated with a SCIF endpoint,
+ * used in kernel mode SCIF poll. Kernel mode SCIF poll calls portions of the
+ * poll API in the kernel and these take in a struct file *. Since a struct
+ * file is not available to kernel mode SCIF, it uses an anonymous file for
+ * this purpose.
+ */
+const struct file_operations scif_anon_fops = {
+	.owner = THIS_MODULE,
+};
+
 scif_epd_t scif_open(void)
 {
 	struct scif_endpt *ep;
+	int err;
 
 	might_sleep();
 	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
@@ -50,15 +62,22 @@ scif_epd_t scif_open(void)
 	if (!ep->qp_info.qp)
 		goto err_qp_alloc;
 
+	err = scif_anon_inode_getfile(ep);
+	if (err)
+		goto err_anon_inode;
+
 	spin_lock_init(&ep->lock);
 	mutex_init(&ep->sendlock);
 	mutex_init(&ep->recvlock);
 
+	scif_rma_ep_init(ep);
 	ep->state = SCIFEP_UNBOUND;
 	dev_dbg(scif_info.mdev.this_device,
 		"SCIFAPI open: ep %p success\n", ep);
 	return ep;
 
+err_anon_inode:
+	kfree(ep->qp_info.qp);
 err_qp_alloc:
 	kfree(ep);
 err_ep_alloc:
@@ -166,8 +185,11 @@ int scif_close(scif_epd_t epd)
 
 	switch (oldstate) {
 	case SCIFEP_ZOMBIE:
+		dev_err(scif_info.mdev.this_device,
+			"SCIFAPI close: zombie state unexpected\n");
 	case SCIFEP_DISCONNECTED:
 		spin_unlock(&ep->lock);
+		scif_unregister_all_windows(epd);
 		/* Remove from the disconnected list */
 		mutex_lock(&scif_info.connlock);
 		list_for_each_safe(pos, tmpq, &scif_info.disconnected) {
@@ -189,6 +211,7 @@ int scif_close(scif_epd_t epd)
 	case SCIFEP_CLOSING:
 	{
 		spin_unlock(&ep->lock);
+		scif_unregister_all_windows(epd);
 		scif_disconnect_ep(ep);
 		break;
 	}
@@ -200,7 +223,7 @@ int scif_close(scif_epd_t epd)
 		struct scif_endpt *aep;
 
 		spin_unlock(&ep->lock);
-		spin_lock(&scif_info.eplock);
+		mutex_lock(&scif_info.eplock);
 
 		/* remove from listen list */
 		list_for_each_safe(pos, tmpq, &scif_info.listen) {
@@ -222,7 +245,7 @@ int scif_close(scif_epd_t epd)
 					break;
 				}
 			}
-			spin_unlock(&scif_info.eplock);
+			mutex_unlock(&scif_info.eplock);
 			mutex_lock(&scif_info.connlock);
 			list_for_each_safe(pos, tmpq, &scif_info.connected) {
 				tmpep = list_entry(pos,
@@ -242,13 +265,13 @@ int scif_close(scif_epd_t epd)
 			}
 			mutex_unlock(&scif_info.connlock);
 			scif_teardown_ep(aep);
-			spin_lock(&scif_info.eplock);
+			mutex_lock(&scif_info.eplock);
 			scif_add_epd_to_zombie_list(aep, SCIF_EPLOCK_HELD);
 			ep->acceptcnt--;
 		}
 
 		spin_lock(&ep->lock);
-		spin_unlock(&scif_info.eplock);
+		mutex_unlock(&scif_info.eplock);
 
 		/* Remove and reject any pending connection requests. */
 		while (ep->conreqcnt) {
@@ -279,6 +302,7 @@ int scif_close(scif_epd_t epd)
 	}
 	}
 	scif_put_port(ep->port.port);
+	scif_anon_inode_fput(ep);
 	scif_teardown_ep(ep);
 	scif_add_epd_to_zombie_list(ep, !SCIF_EPLOCK_HELD);
 	return 0;
@@ -409,9 +433,9 @@ int scif_listen(scif_epd_t epd, int backlog)
 	scif_teardown_ep(ep);
 	ep->qp_info.qp = NULL;
 
-	spin_lock(&scif_info.eplock);
+	mutex_lock(&scif_info.eplock);
 	list_add_tail(&ep->list, &scif_info.listen);
-	spin_unlock(&scif_info.eplock);
+	mutex_unlock(&scif_info.eplock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(scif_listen);
@@ -450,6 +474,13 @@ static int scif_conn_func(struct scif_endpt *ep)
 	struct scifmsg msg;
 	struct device *spdev;
 
+	err = scif_reserve_dma_chan(ep);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		ep->state = SCIFEP_BOUND;
+		goto connect_error_simple;
+	}
 	/* Initiate the first part of the endpoint QP setup */
 	err = scif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset,
 				    SCIF_ENDPT_QP_SIZE, ep->remote_dev);
@@ -558,8 +589,10 @@ void scif_conn_handler(struct work_struct *work)
 			list_del(&ep->conn_list);
 		}
 		spin_unlock(&scif_info.nb_connect_lock);
-		if (ep)
+		if (ep) {
 			ep->conn_err = scif_conn_func(ep);
+			wake_up_interruptible(&ep->conn_pend_wq);
+		}
 	} while (ep);
 }
 
@@ -660,6 +693,7 @@ int __scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool non_block)
 	ep->remote_dev = &scif_dev[dst->node];
 	ep->qp_info.qp->magic = SCIFEP_MAGIC;
 	if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
+		init_waitqueue_head(&ep->conn_pend_wq);
 		spin_lock(&scif_info.nb_connect_lock);
 		list_add_tail(&ep->conn_list, &scif_info.nb_connect_list);
 		spin_unlock(&scif_info.nb_connect_lock);
@@ -782,12 +816,25 @@ retry_connection:
 	cep->remote_dev = &scif_dev[peer->node];
 	cep->remote_ep = conreq->msg.payload[0];
 
+	scif_rma_ep_init(cep);
+
+	err = scif_reserve_dma_chan(cep);
+	if (err) {
+		dev_err(scif_info.mdev.this_device,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto scif_accept_error_qpalloc;
+	}
+
 	cep->qp_info.qp = kzalloc(sizeof(*cep->qp_info.qp), GFP_KERNEL);
 	if (!cep->qp_info.qp) {
 		err = -ENOMEM;
 		goto scif_accept_error_qpalloc;
 	}
 
+	err = scif_anon_inode_getfile(cep);
+	if (err)
+		goto scif_accept_error_anon_inode;
+
 	cep->qp_info.qp->magic = SCIFEP_MAGIC;
 	spdev = scif_get_peer_dev(cep->remote_dev);
 	if (IS_ERR(spdev)) {
@@ -858,6 +905,8 @@ retry:
 	spin_unlock(&cep->lock);
 	return 0;
 scif_accept_error_map:
+	scif_anon_inode_fput(cep);
+scif_accept_error_anon_inode:
 	scif_teardown_ep(cep);
 scif_accept_error_qpalloc:
 	kfree(cep);
@@ -1247,6 +1296,134 @@ int scif_recv(scif_epd_t epd, void *msg, int len, int flags)
 }
 EXPORT_SYMBOL_GPL(scif_recv);
 
+static inline void _scif_poll_wait(struct file *f, wait_queue_head_t *wq,
+				   poll_table *p, struct scif_endpt *ep)
+{
+	/*
+	 * Because poll_wait makes a GFP_KERNEL allocation, give up the lock
+	 * and regrab it afterwards. Because the endpoint state might have
+	 * changed while the lock was given up, the state must be checked
+	 * again after re-acquiring the lock. The code in __scif_pollfd(..)
+	 * does this.
+	 */
+	spin_unlock(&ep->lock);
+	poll_wait(f, wq, p);
+	spin_lock(&ep->lock);
+}
+
+unsigned int
+__scif_pollfd(struct file *f, poll_table *wait, struct scif_endpt *ep)
+{
+	unsigned int mask = 0;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]);
+
+	spin_lock(&ep->lock);
+
+	/* Endpoint is waiting for a non-blocking connect to complete */
+	if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
+		_scif_poll_wait(f, &ep->conn_pend_wq, wait, ep);
+		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
+			if (ep->state == SCIFEP_CONNECTED ||
+			    ep->state == SCIFEP_DISCONNECTED ||
+			    ep->conn_err)
+				mask |= POLLOUT;
+			goto exit;
+		}
+	}
+
+	/* Endpoint is listening for incoming connection requests */
+	if (ep->state == SCIFEP_LISTENING) {
+		_scif_poll_wait(f, &ep->conwq, wait, ep);
+		if (ep->state == SCIFEP_LISTENING) {
+			if (ep->conreqcnt)
+				mask |= POLLIN;
+			goto exit;
+		}
+	}
+
+	/* Endpoint is connected or disconnected */
+	if (ep->state == SCIFEP_CONNECTED || ep->state == SCIFEP_DISCONNECTED) {
+		if (poll_requested_events(wait) & POLLIN)
+			_scif_poll_wait(f, &ep->recvwq, wait, ep);
+		if (poll_requested_events(wait) & POLLOUT)
+			_scif_poll_wait(f, &ep->sendwq, wait, ep);
+		if (ep->state == SCIFEP_CONNECTED ||
+		    ep->state == SCIFEP_DISCONNECTED) {
+			/* Data can be read without blocking */
+			if (scif_rb_count(&ep->qp_info.qp->inbound_q, 1))
+				mask |= POLLIN;
+			/* Data can be written without blocking */
+			if (scif_rb_space(&ep->qp_info.qp->outbound_q))
+				mask |= POLLOUT;
+			/* Return POLLHUP if endpoint is disconnected */
+			if (ep->state == SCIFEP_DISCONNECTED)
+				mask |= POLLHUP;
+			goto exit;
+		}
+	}
+
+	/* Return POLLERR if the endpoint is in none of the above states */
+	mask |= POLLERR;
+exit:
+	spin_unlock(&ep->lock);
+	return mask;
+}
+
+/**
+ * scif_poll() - Kernel mode SCIF poll
+ * @ufds: Array of scif_pollepd structures containing the end points
+ *	  and events to poll on
+ * @nfds: Size of the ufds array
+ * @timeout_msecs: Timeout in msecs, -ve implies infinite timeout
+ *
+ * The code flow in this function is based on do_poll(..) in select.c
+ *
+ * Returns the number of endpoints which have pending events or 0 in
+ * the event of a timeout. If a signal is used for wake up, -EINTR is
+ * returned.
+ */
+int
+scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs)
+{
+	struct poll_wqueues table;
+	poll_table *pt;
+	int i, mask, count = 0, timed_out = timeout_msecs == 0;
+	u64 timeout = timeout_msecs < 0 ? MAX_SCHEDULE_TIMEOUT
+		: msecs_to_jiffies(timeout_msecs);
+
+	poll_initwait(&table);
+	pt = &table.pt;
+	while (1) {
+		for (i = 0; i < nfds; i++) {
+			pt->_key = ufds[i].events | POLLERR | POLLHUP;
+			mask = __scif_pollfd(ufds[i].epd->anon,
+					     pt, ufds[i].epd);
+			mask &= ufds[i].events | POLLERR | POLLHUP;
+			if (mask) {
+				count++;
+				pt->_qproc = NULL;
+			}
+			ufds[i].revents = mask;
+		}
+		pt->_qproc = NULL;
+		if (!count) {
+			count = table.error;
+			if (signal_pending(current))
+				count = -EINTR;
+		}
+		if (count || timed_out)
+			break;
+
+		if (!schedule_timeout_interruptible(timeout))
+			timed_out = 1;
+	}
+	poll_freewait(&table);
+	return count;
+}
+EXPORT_SYMBOL_GPL(scif_poll);
+
 int scif_get_node_ids(u16 *nodes, int len, u16 *self)
 {
 	int online = 0;
@@ -1274,3 +1451,46 @@ int scif_get_node_ids(u16 *nodes, int len, u16 *self)
 	return online;
 }
 EXPORT_SYMBOL_GPL(scif_get_node_ids);
+
+static int scif_add_client_dev(struct device *dev, struct subsys_interface *si)
+{
+	struct scif_client *client =
+		container_of(si, struct scif_client, si);
+	struct scif_peer_dev *spdev =
+		container_of(dev, struct scif_peer_dev, dev);
+
+	if (client->probe)
+		client->probe(spdev);
+	return 0;
+}
+
+static void scif_remove_client_dev(struct device *dev,
+				   struct subsys_interface *si)
+{
+	struct scif_client *client =
+		container_of(si, struct scif_client, si);
+	struct scif_peer_dev *spdev =
+		container_of(dev, struct scif_peer_dev, dev);
+
+	if (client->remove)
+		client->remove(spdev);
+}
+
+void scif_client_unregister(struct scif_client *client)
+{
+	subsys_interface_unregister(&client->si);
+}
+EXPORT_SYMBOL_GPL(scif_client_unregister);
+
+int scif_client_register(struct scif_client *client)
+{
+	struct subsys_interface *si = &client->si;
+
+	si->name = client->name;
+	si->subsys = &scif_peer_bus;
+	si->add_dev = scif_add_client_dev;
+	si->remove_dev = scif_remove_client_dev;
+
+	return subsys_interface_register(&client->si);
+}
+EXPORT_SYMBOL_GPL(scif_client_register);
diff --git a/drivers/misc/mic/scif/scif_debugfs.c b/drivers/misc/mic/scif/scif_debugfs.c
index 51f14e2a1196..6884dad97e17 100644
--- a/drivers/misc/mic/scif/scif_debugfs.c
+++ b/drivers/misc/mic/scif/scif_debugfs.c
@@ -62,10 +62,87 @@ static const struct file_operations scif_dev_ops = {
 	.release = scif_dev_test_release
 };
 
-void __init scif_init_debugfs(void)
+static void scif_display_window(struct scif_window *window, struct seq_file *s)
+{
+	int j;
+	struct scatterlist *sg;
+	scif_pinned_pages_t pin = window->pinned_pages;
+
+	seq_printf(s, "window %p type %d temp %d offset 0x%llx ",
+		   window, window->type, window->temp, window->offset);
+	seq_printf(s, "nr_pages 0x%llx nr_contig_chunks 0x%x prot %d ",
+		   window->nr_pages, window->nr_contig_chunks, window->prot);
+	seq_printf(s, "ref_count %d magic 0x%llx peer_window 0x%llx ",
+		   window->ref_count, window->magic, window->peer_window);
+	seq_printf(s, "unreg_state 0x%x va_for_temp 0x%lx\n",
+		   window->unreg_state, window->va_for_temp);
+
+	for (j = 0; j < window->nr_contig_chunks; j++)
+		seq_printf(s, "page[%d] dma_addr 0x%llx num_pages 0x%llx\n", j,
+			   window->dma_addr[j], window->num_pages[j]);
+
+	if (window->type == SCIF_WINDOW_SELF && pin)
+		for (j = 0; j < window->nr_pages; j++)
+			seq_printf(s, "page[%d] = pinned_pages %p address %p\n",
+				   j, pin->pages[j],
+				   page_address(pin->pages[j]));
+
+	if (window->st)
+		for_each_sg(window->st->sgl, sg, window->st->nents, j)
+			seq_printf(s, "sg[%d] dma addr 0x%llx length 0x%x\n",
+				   j, sg_dma_address(sg), sg_dma_len(sg));
+}
+
+static void scif_display_all_windows(struct list_head *head, struct seq_file *s)
 {
-	struct dentry *d;
+	struct list_head *item;
+	struct scif_window *window;
 
+	list_for_each(item, head) {
+		window = list_entry(item, struct scif_window, list);
+		scif_display_window(window, s);
+	}
+}
+
+static int scif_rma_test(struct seq_file *s, void *unused)
+{
+	struct scif_endpt *ep;
+	struct list_head *pos;
+
+	mutex_lock(&scif_info.connlock);
+	list_for_each(pos, &scif_info.connected) {
+		ep = list_entry(pos, struct scif_endpt, list);
+		seq_printf(s, "ep %p self windows\n", ep);
+		mutex_lock(&ep->rma_info.rma_lock);
+		scif_display_all_windows(&ep->rma_info.reg_list, s);
+		seq_printf(s, "ep %p remote windows\n", ep);
+		scif_display_all_windows(&ep->rma_info.remote_reg_list, s);
+		mutex_unlock(&ep->rma_info.rma_lock);
+	}
+	mutex_unlock(&scif_info.connlock);
+	return 0;
+}
+
+static int scif_rma_test_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, scif_rma_test, inode->i_private);
+}
+
+static int scif_rma_test_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations scif_rma_ops = {
+	.owner   = THIS_MODULE,
+	.open    = scif_rma_test_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = scif_rma_test_release
+};
+
+void __init scif_init_debugfs(void)
+{
 	scif_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL);
 	if (!scif_dbg) {
 		dev_err(scif_info.mdev.this_device,
@@ -73,8 +150,8 @@ void __init scif_init_debugfs(void)
 		return;
 	}
 
-	d = debugfs_create_file("scif_dev", 0444, scif_dbg,
-				NULL, &scif_dev_ops);
+	debugfs_create_file("scif_dev", 0444, scif_dbg, NULL, &scif_dev_ops);
+	debugfs_create_file("scif_rma", 0444, scif_dbg, NULL, &scif_rma_ops);
 	debugfs_create_u8("en_msg_log", 0666, scif_dbg, &scif_info.en_msg_log);
 	debugfs_create_u8("p2p_enable", 0666, scif_dbg, &scif_info.p2p_enable);
 }
diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
new file mode 100644
index 000000000000..95a13c629a8e
--- /dev/null
+++ b/drivers/misc/mic/scif/scif_dma.c
@@ -0,0 +1,1979 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Intel SCIF driver.
+ *
+ */
+#include "scif_main.h"
+#include "scif_map.h"
+
+/*
+ * struct scif_dma_comp_cb - SCIF DMA completion callback
+ *
+ * @dma_completion_func: DMA completion callback
+ * @cb_cookie: DMA completion callback cookie
+ * @temp_buf: Temporary buffer
+ * @temp_buf_to_free: Temporary buffer to be freed
+ * @is_cache: Is a kmem_cache allocated buffer
+ * @dst_offset: Destination registration offset
+ * @dst_window: Destination registration window
+ * @len: Length of the temp buffer
+ * @temp_phys: DMA address of the temp buffer
+ * @sdev: The SCIF device
+ * @header_padding: padding for cache line alignment
+ */
+struct scif_dma_comp_cb {
+	void (*dma_completion_func)(void *cookie);
+	void *cb_cookie;
+	u8 *temp_buf;
+	u8 *temp_buf_to_free;
+	bool is_cache;
+	s64 dst_offset;
+	struct scif_window *dst_window;
+	size_t len;
+	dma_addr_t temp_phys;
+	struct scif_dev *sdev;
+	int header_padding;
+};
+
+/**
+ * struct scif_copy_work - Work for DMA copy
+ *
+ * @src_offset: Starting source offset
+ * @dst_offset: Starting destination offset
+ * @src_window: Starting src registered window
+ * @dst_window: Starting dst registered window
+ * @loopback: true if this is a loopback DMA transfer
+ * @len: Length of the transfer
+ * @comp_cb: DMA copy completion callback
+ * @remote_dev: The remote SCIF peer device
+ * @fence_type: polling or interrupt based
+ * @ordered: is this a tail byte ordered DMA transfer
+ */
+struct scif_copy_work {
+	s64 src_offset;
+	s64 dst_offset;
+	struct scif_window *src_window;
+	struct scif_window *dst_window;
+	int loopback;
+	size_t len;
+	struct scif_dma_comp_cb   *comp_cb;
+	struct scif_dev	*remote_dev;
+	int fence_type;
+	bool ordered;
+};
+
+#ifndef list_entry_next
+#define list_entry_next(pos, member) \
+	list_entry(pos->member.next, typeof(*pos), member)
+#endif
+
+/**
+ * scif_reserve_dma_chan:
+ * @ep: Endpoint Descriptor.
+ *
+ * This routine reserves a DMA channel for a particular
+ * endpoint. All DMA transfers for an endpoint are always
+ * programmed on the same DMA channel.
+ */
+int scif_reserve_dma_chan(struct scif_endpt *ep)
+{
+	int err = 0;
+	struct scif_dev *scifdev;
+	struct scif_hw_dev *sdev;
+	struct dma_chan *chan;
+
+	/* Loopback DMAs are not supported on the management node */
+	if (!scif_info.nodeid && scifdev_self(ep->remote_dev))
+		return 0;
+	if (scif_info.nodeid)
+		scifdev = &scif_dev[0];
+	else
+		scifdev = ep->remote_dev;
+	sdev = scifdev->sdev;
+	if (!sdev->num_dma_ch)
+		return -ENODEV;
+	chan = sdev->dma_ch[scifdev->dma_ch_idx];
+	scifdev->dma_ch_idx = (scifdev->dma_ch_idx + 1) % sdev->num_dma_ch;
+	mutex_lock(&ep->rma_info.rma_lock);
+	ep->rma_info.dma_chan = chan;
+	mutex_unlock(&ep->rma_info.rma_lock);
+	return err;
+}
+
+#ifdef CONFIG_MMU_NOTIFIER
+/**
+ * scif_rma_destroy_tcw:
+ *
+ * This routine destroys temporary cached windows
+ */
+static
+void __scif_rma_destroy_tcw(struct scif_mmu_notif *mmn,
+			    struct scif_endpt *ep,
+			    u64 start, u64 len)
+{
+	struct list_head *item, *tmp;
+	struct scif_window *window;
+	u64 start_va, end_va;
+	u64 end = start + len;
+
+	if (end <= start)
+		return;
+
+	list_for_each_safe(item, tmp, &mmn->tc_reg_list) {
+		window = list_entry(item, struct scif_window, list);
+		ep = (struct scif_endpt *)window->ep;
+		if (!len)
+			break;
+		start_va = window->va_for_temp;
+		end_va = start_va + (window->nr_pages << PAGE_SHIFT);
+		if (start < start_va && end <= start_va)
+			break;
+		if (start >= end_va)
+			continue;
+		__scif_rma_destroy_tcw_helper(window);
+	}
+}
+
+static void scif_rma_destroy_tcw(struct scif_mmu_notif *mmn, u64 start, u64 len)
+{
+	struct scif_endpt *ep = mmn->ep;
+
+	spin_lock(&ep->rma_info.tc_lock);
+	__scif_rma_destroy_tcw(mmn, ep, start, len);
+	spin_unlock(&ep->rma_info.tc_lock);
+}
+
+static void scif_rma_destroy_tcw_ep(struct scif_endpt *ep)
+{
+	struct list_head *item, *tmp;
+	struct scif_mmu_notif *mmn;
+
+	list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) {
+		mmn = list_entry(item, struct scif_mmu_notif, list);
+		scif_rma_destroy_tcw(mmn, 0, ULONG_MAX);
+	}
+}
+
+static void __scif_rma_destroy_tcw_ep(struct scif_endpt *ep)
+{
+	struct list_head *item, *tmp;
+	struct scif_mmu_notif *mmn;
+
+	spin_lock(&ep->rma_info.tc_lock);
+	list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) {
+		mmn = list_entry(item, struct scif_mmu_notif, list);
+		__scif_rma_destroy_tcw(mmn, ep, 0, ULONG_MAX);
+	}
+	spin_unlock(&ep->rma_info.tc_lock);
+}
+
+static bool scif_rma_tc_can_cache(struct scif_endpt *ep, size_t cur_bytes)
+{
+	if ((cur_bytes >> PAGE_SHIFT) > scif_info.rma_tc_limit)
+		return false;
+	if ((atomic_read(&ep->rma_info.tcw_total_pages)
+			+ (cur_bytes >> PAGE_SHIFT)) >
+			scif_info.rma_tc_limit) {
+		dev_info(scif_info.mdev.this_device,
+			 "%s %d total=%d, current=%zu reached max\n",
+			 __func__, __LINE__,
+			 atomic_read(&ep->rma_info.tcw_total_pages),
+			 (1 + (cur_bytes >> PAGE_SHIFT)));
+		scif_rma_destroy_tcw_invalid();
+		__scif_rma_destroy_tcw_ep(ep);
+	}
+	return true;
+}
+
+static void scif_mmu_notifier_release(struct mmu_notifier *mn,
+				      struct mm_struct *mm)
+{
+	struct scif_mmu_notif	*mmn;
+
+	mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
+	scif_rma_destroy_tcw(mmn, 0, ULONG_MAX);
+	schedule_work(&scif_info.misc_work);
+}
+
+static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+					      struct mm_struct *mm,
+					      unsigned long address)
+{
+	struct scif_mmu_notif	*mmn;
+
+	mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
+	scif_rma_destroy_tcw(mmn, address, PAGE_SIZE);
+}
+
+static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+						     struct mm_struct *mm,
+						     unsigned long start,
+						     unsigned long end)
+{
+	struct scif_mmu_notif	*mmn;
+
+	mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
+	scif_rma_destroy_tcw(mmn, start, end - start);
+}
+
+static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+						   struct mm_struct *mm,
+						   unsigned long start,
+						   unsigned long end)
+{
+	/*
+	 * Nothing to do here, everything needed was done in
+	 * invalidate_range_start.
+	 */
+}
+
+static const struct mmu_notifier_ops scif_mmu_notifier_ops = {
+	.release = scif_mmu_notifier_release,
+	.clear_flush_young = NULL,
+	.invalidate_page = scif_mmu_notifier_invalidate_page,
+	.invalidate_range_start = scif_mmu_notifier_invalidate_range_start,
+	.invalidate_range_end = scif_mmu_notifier_invalidate_range_end};
+
+static void scif_ep_unregister_mmu_notifier(struct scif_endpt *ep)
+{
+	struct scif_endpt_rma_info *rma = &ep->rma_info;
+	struct scif_mmu_notif *mmn = NULL;
+	struct list_head *item, *tmp;
+
+	mutex_lock(&ep->rma_info.mmn_lock);
+	list_for_each_safe(item, tmp, &rma->mmn_list) {
+		mmn = list_entry(item, struct scif_mmu_notif, list);
+		mmu_notifier_unregister(&mmn->ep_mmu_notifier, mmn->mm);
+		list_del(item);
+		kfree(mmn);
+	}
+	mutex_unlock(&ep->rma_info.mmn_lock);
+}
+
+static void scif_init_mmu_notifier(struct scif_mmu_notif *mmn,
+				   struct mm_struct *mm, struct scif_endpt *ep)
+{
+	mmn->ep = ep;
+	mmn->mm = mm;
+	mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops;
+	INIT_LIST_HEAD(&mmn->list);
+	INIT_LIST_HEAD(&mmn->tc_reg_list);
+}
+
+static struct scif_mmu_notif *
+scif_find_mmu_notifier(struct mm_struct *mm, struct scif_endpt_rma_info *rma)
+{
+	struct scif_mmu_notif *mmn;
+	struct list_head *item;
+
+	list_for_each(item, &rma->mmn_list) {
+		mmn = list_entry(item, struct scif_mmu_notif, list);
+		if (mmn->mm == mm)
+			return mmn;
+	}
+	return NULL;
+}
+
+static struct scif_mmu_notif *
+scif_add_mmu_notifier(struct mm_struct *mm, struct scif_endpt *ep)
+{
+	struct scif_mmu_notif *mmn
+		 = kzalloc(sizeof(*mmn), GFP_KERNEL);
+
+	if (!mmn)
+		return ERR_PTR(ENOMEM);
+
+	scif_init_mmu_notifier(mmn, current->mm, ep);
+	if (mmu_notifier_register(&mmn->ep_mmu_notifier,
+				  current->mm)) {
+		kfree(mmn);
+		return ERR_PTR(EBUSY);
+	}
+	list_add(&mmn->list, &ep->rma_info.mmn_list);
+	return mmn;
+}
+
+/*
+ * Called from the misc thread to destroy temporary cached windows and
+ * unregister the MMU notifier for the SCIF endpoint.
+ */
+void scif_mmu_notif_handler(struct work_struct *work)
+{
+	struct list_head *pos, *tmpq;
+	struct scif_endpt *ep;
+restart:
+	scif_rma_destroy_tcw_invalid();
+	spin_lock(&scif_info.rmalock);
+	list_for_each_safe(pos, tmpq, &scif_info.mmu_notif_cleanup) {
+		ep = list_entry(pos, struct scif_endpt, mmu_list);
+		list_del(&ep->mmu_list);
+		spin_unlock(&scif_info.rmalock);
+		scif_rma_destroy_tcw_ep(ep);
+		scif_ep_unregister_mmu_notifier(ep);
+		goto restart;
+	}
+	spin_unlock(&scif_info.rmalock);
+}
+
+static bool scif_is_set_reg_cache(int flags)
+{
+	return !!(flags & SCIF_RMA_USECACHE);
+}
+#else
+static struct scif_mmu_notif *
+scif_find_mmu_notifier(struct mm_struct *mm,
+		       struct scif_endpt_rma_info *rma)
+{
+	return NULL;
+}
+
+static struct scif_mmu_notif *
+scif_add_mmu_notifier(struct mm_struct *mm, struct scif_endpt *ep)
+{
+	return NULL;
+}
+
+void scif_mmu_notif_handler(struct work_struct *work)
+{
+}
+
+static bool scif_is_set_reg_cache(int flags)
+{
+	return false;
+}
+
+static bool scif_rma_tc_can_cache(struct scif_endpt *ep, size_t cur_bytes)
+{
+	return false;
+}
+#endif
+
+/**
+ * scif_register_temp:
+ * @epd: End Point Descriptor.
+ * @addr: virtual address to/from which to copy
+ * @len: length of range to copy
+ * @out_offset: computed offset returned by reference.
+ * @out_window: allocated registered window returned by reference.
+ *
+ * Create a temporary registered window. The peer will not know about this
+ * window. This API is used for scif_vreadfrom()/scif_vwriteto() API's.
+ */
+static int
+scif_register_temp(scif_epd_t epd, unsigned long addr, size_t len, int prot,
+		   off_t *out_offset, struct scif_window **out_window)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	int err;
+	scif_pinned_pages_t pinned_pages;
+	size_t aligned_len;
+
+	aligned_len = ALIGN(len, PAGE_SIZE);
+
+	err = __scif_pin_pages((void *)(addr & PAGE_MASK),
+			       aligned_len, &prot, 0, &pinned_pages);
+	if (err)
+		return err;
+
+	pinned_pages->prot = prot;
+
+	/* Compute the offset for this registration */
+	err = scif_get_window_offset(ep, 0, 0,
+				     aligned_len >> PAGE_SHIFT,
+				     (s64 *)out_offset);
+	if (err)
+		goto error_unpin;
+
+	/* Allocate and prepare self registration window */
+	*out_window = scif_create_window(ep, aligned_len >> PAGE_SHIFT,
+					*out_offset, true);
+	if (!*out_window) {
+		scif_free_window_offset(ep, NULL, *out_offset);
+		err = -ENOMEM;
+		goto error_unpin;
+	}
+
+	(*out_window)->pinned_pages = pinned_pages;
+	(*out_window)->nr_pages = pinned_pages->nr_pages;
+	(*out_window)->prot = pinned_pages->prot;
+
+	(*out_window)->va_for_temp = addr & PAGE_MASK;
+	err = scif_map_window(ep->remote_dev, *out_window);
+	if (err) {
+		/* Something went wrong! Rollback */
+		scif_destroy_window(ep, *out_window);
+		*out_window = NULL;
+	} else {
+		*out_offset |= (addr - (*out_window)->va_for_temp);
+	}
+	return err;
+error_unpin:
+	if (err)
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+	scif_unpin_pages(pinned_pages);
+	return err;
+}
+
+#define SCIF_DMA_TO (3 * HZ)
+
+/*
+ * scif_sync_dma - Program a DMA without an interrupt descriptor
+ *
+ * @dev - The address of the pointer to the device instance used
+ * for DMA registration.
+ * @chan - DMA channel to be used.
+ * @sync_wait: Wait for DMA to complete?
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int scif_sync_dma(struct scif_hw_dev *sdev, struct dma_chan *chan,
+			 bool sync_wait)
+{
+	int err = 0;
+	struct dma_async_tx_descriptor *tx = NULL;
+	enum dma_ctrl_flags flags = DMA_PREP_FENCE;
+	dma_cookie_t cookie;
+	struct dma_device *ddev;
+
+	if (!chan) {
+		err = -EIO;
+		dev_err(&sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		return err;
+	}
+	ddev = chan->device;
+
+	tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, flags);
+	if (!tx) {
+		err = -ENOMEM;
+		dev_err(&sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto release;
+	}
+	cookie = tx->tx_submit(tx);
+
+	if (dma_submit_error(cookie)) {
+		err = -ENOMEM;
+		dev_err(&sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto release;
+	}
+	if (!sync_wait) {
+		dma_async_issue_pending(chan);
+	} else {
+		if (dma_sync_wait(chan, cookie) == DMA_COMPLETE) {
+			err = 0;
+		} else {
+			err = -EIO;
+			dev_err(&sdev->dev, "%s %d err %d\n",
+				__func__, __LINE__, err);
+		}
+	}
+release:
+	return err;
+}
+
+static void scif_dma_callback(void *arg)
+{
+	struct completion *done = (struct completion *)arg;
+
+	complete(done);
+}
+
+#define SCIF_DMA_SYNC_WAIT true
+#define SCIF_DMA_POLL BIT(0)
+#define SCIF_DMA_INTR BIT(1)
+
+/*
+ * scif_async_dma - Program a DMA with an interrupt descriptor
+ *
+ * @dev - The address of the pointer to the device instance used
+ * for DMA registration.
+ * @chan - DMA channel to be used.
+ * Return 0 on success and -errno on error.
+ */
+static int scif_async_dma(struct scif_hw_dev *sdev, struct dma_chan *chan)
+{
+	int err = 0;
+	struct dma_device *ddev;
+	struct dma_async_tx_descriptor *tx = NULL;
+	enum dma_ctrl_flags flags = DMA_PREP_INTERRUPT | DMA_PREP_FENCE;
+	DECLARE_COMPLETION_ONSTACK(done_wait);
+	dma_cookie_t cookie;
+	enum dma_status status;
+
+	if (!chan) {
+		err = -EIO;
+		dev_err(&sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		return err;
+	}
+	ddev = chan->device;
+
+	tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, flags);
+	if (!tx) {
+		err = -ENOMEM;
+		dev_err(&sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto release;
+	}
+	reinit_completion(&done_wait);
+	tx->callback = scif_dma_callback;
+	tx->callback_param = &done_wait;
+	cookie = tx->tx_submit(tx);
+
+	if (dma_submit_error(cookie)) {
+		err = -ENOMEM;
+		dev_err(&sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto release;
+	}
+	dma_async_issue_pending(chan);
+
+	err = wait_for_completion_timeout(&done_wait, SCIF_DMA_TO);
+	if (!err) {
+		err = -EIO;
+		dev_err(&sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto release;
+	}
+	err = 0;
+	status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
+	if (status != DMA_COMPLETE) {
+		err = -EIO;
+		dev_err(&sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto release;
+	}
+release:
+	return err;
+}
+
+/*
+ * scif_drain_dma_poll - Drain all outstanding DMA operations for a particular
+ * DMA channel via polling.
+ *
+ * @sdev - The SCIF device
+ * @chan - DMA channel
+ * Return 0 on success and -errno on error.
+ */
+static int scif_drain_dma_poll(struct scif_hw_dev *sdev, struct dma_chan *chan)
+{
+	if (!chan)
+		return -EINVAL;
+	return scif_sync_dma(sdev, chan, SCIF_DMA_SYNC_WAIT);
+}
+
+/*
+ * scif_drain_dma_intr - Drain all outstanding DMA operations for a particular
+ * DMA channel via interrupt based blocking wait.
+ *
+ * @sdev - The SCIF device
+ * @chan - DMA channel
+ * Return 0 on success and -errno on error.
+ */
+int scif_drain_dma_intr(struct scif_hw_dev *sdev, struct dma_chan *chan)
+{
+	if (!chan)
+		return -EINVAL;
+	return scif_async_dma(sdev, chan);
+}
+
+/**
+ * scif_rma_destroy_windows:
+ *
+ * This routine destroys all windows queued for cleanup
+ */
+void scif_rma_destroy_windows(void)
+{
+	struct list_head *item, *tmp;
+	struct scif_window *window;
+	struct scif_endpt *ep;
+	struct dma_chan *chan;
+
+	might_sleep();
+restart:
+	spin_lock(&scif_info.rmalock);
+	list_for_each_safe(item, tmp, &scif_info.rma) {
+		window = list_entry(item, struct scif_window,
+				    list);
+		ep = (struct scif_endpt *)window->ep;
+		chan = ep->rma_info.dma_chan;
+
+		list_del_init(&window->list);
+		spin_unlock(&scif_info.rmalock);
+		if (!chan || !scifdev_alive(ep) ||
+		    !scif_drain_dma_intr(ep->remote_dev->sdev,
+					 ep->rma_info.dma_chan))
+			/* Remove window from global list */
+			window->unreg_state = OP_COMPLETED;
+		else
+			dev_warn(&ep->remote_dev->sdev->dev,
+				 "DMA engine hung?\n");
+		if (window->unreg_state == OP_COMPLETED) {
+			if (window->type == SCIF_WINDOW_SELF)
+				scif_destroy_window(ep, window);
+			else
+				scif_destroy_remote_window(window);
+			atomic_dec(&ep->rma_info.tw_refcount);
+		}
+		goto restart;
+	}
+	spin_unlock(&scif_info.rmalock);
+}
+
+/**
+ * scif_rma_destroy_tcw:
+ *
+ * This routine destroys temporary cached registered windows
+ * which have been queued for cleanup.
+ */
+void scif_rma_destroy_tcw_invalid(void)
+{
+	struct list_head *item, *tmp;
+	struct scif_window *window;
+	struct scif_endpt *ep;
+	struct dma_chan *chan;
+
+	might_sleep();
+restart:
+	spin_lock(&scif_info.rmalock);
+	list_for_each_safe(item, tmp, &scif_info.rma_tc) {
+		window = list_entry(item, struct scif_window, list);
+		ep = (struct scif_endpt *)window->ep;
+		chan = ep->rma_info.dma_chan;
+		list_del_init(&window->list);
+		spin_unlock(&scif_info.rmalock);
+		mutex_lock(&ep->rma_info.rma_lock);
+		if (!chan || !scifdev_alive(ep) ||
+		    !scif_drain_dma_intr(ep->remote_dev->sdev,
+					 ep->rma_info.dma_chan)) {
+			atomic_sub(window->nr_pages,
+				   &ep->rma_info.tcw_total_pages);
+			scif_destroy_window(ep, window);
+			atomic_dec(&ep->rma_info.tcw_refcount);
+		} else {
+			dev_warn(&ep->remote_dev->sdev->dev,
+				 "DMA engine hung?\n");
+		}
+		mutex_unlock(&ep->rma_info.rma_lock);
+		goto restart;
+	}
+	spin_unlock(&scif_info.rmalock);
+}
+
+static inline
+void *_get_local_va(off_t off, struct scif_window *window, size_t len)
+{
+	int page_nr = (off - window->offset) >> PAGE_SHIFT;
+	off_t page_off = off & ~PAGE_MASK;
+	void *va = NULL;
+
+	if (window->type == SCIF_WINDOW_SELF) {
+		struct page **pages = window->pinned_pages->pages;
+
+		va = page_address(pages[page_nr]) + page_off;
+	}
+	return va;
+}
+
+static inline
+void *ioremap_remote(off_t off, struct scif_window *window,
+		     size_t len, struct scif_dev *dev,
+		     struct scif_window_iter *iter)
+{
+	dma_addr_t phys = scif_off_to_dma_addr(window, off, NULL, iter);
+
+	/*
+	 * If the DMA address is not card relative then we need the DMA
+	 * addresses to be an offset into the bar. The aperture base was already
+	 * added so subtract it here since scif_ioremap is going to add it again
+	 */
+	if (!scifdev_self(dev) && window->type == SCIF_WINDOW_PEER &&
+	    dev->sdev->aper && !dev->sdev->card_rel_da)
+		phys = phys - dev->sdev->aper->pa;
+	return scif_ioremap(phys, len, dev);
+}
+
+static inline void
+iounmap_remote(void *virt, size_t size, struct scif_copy_work *work)
+{
+	scif_iounmap(virt, size, work->remote_dev);
+}
+
+/*
+ * Takes care of ordering issue caused by
+ * 1. Hardware:  Only in the case of cpu copy from mgmt node to card
+ * because of WC memory.
+ * 2. Software: If memcpy reorders copy instructions for optimization.
+ * This could happen at both mgmt node and card.
+ */
+static inline void
+scif_ordered_memcpy_toio(char *dst, const char *src, size_t count)
+{
+	if (!count)
+		return;
+
+	memcpy_toio((void __iomem __force *)dst, src, --count);
+	/* Order the last byte with the previous stores */
+	wmb();
+	*(dst + count) = *(src + count);
+}
+
+static inline void scif_unaligned_cpy_toio(char *dst, const char *src,
+					   size_t count, bool ordered)
+{
+	if (ordered)
+		scif_ordered_memcpy_toio(dst, src, count);
+	else
+		memcpy_toio((void __iomem __force *)dst, src, count);
+}
+
+static inline
+void scif_ordered_memcpy_fromio(char *dst, const char *src, size_t count)
+{
+	if (!count)
+		return;
+
+	memcpy_fromio(dst, (void __iomem __force *)src, --count);
+	/* Order the last byte with the previous loads */
+	rmb();
+	*(dst + count) = *(src + count);
+}
+
+static inline void scif_unaligned_cpy_fromio(char *dst, const char *src,
+					     size_t count, bool ordered)
+{
+	if (ordered)
+		scif_ordered_memcpy_fromio(dst, src, count);
+	else
+		memcpy_fromio(dst, (void __iomem __force *)src, count);
+}
+
+#define SCIF_RMA_ERROR_CODE (~(dma_addr_t)0x0)
+
+/*
+ * scif_off_to_dma_addr:
+ * Obtain the dma_addr given the window and the offset.
+ * @window: Registered window.
+ * @off: Window offset.
+ * @nr_bytes: Return the number of contiguous bytes till next DMA addr index.
+ * @index: Return the index of the dma_addr array found.
+ * @start_off: start offset of index of the dma addr array found.
+ * The nr_bytes provides the callee an estimate of the maximum possible
+ * DMA xfer possible while the index/start_off provide faster lookups
+ * for the next iteration.
+ */
+dma_addr_t scif_off_to_dma_addr(struct scif_window *window, s64 off,
+				size_t *nr_bytes, struct scif_window_iter *iter)
+{
+	int i, page_nr;
+	s64 start, end;
+	off_t page_off;
+
+	if (window->nr_pages == window->nr_contig_chunks) {
+		page_nr = (off - window->offset) >> PAGE_SHIFT;
+		page_off = off & ~PAGE_MASK;
+
+		if (nr_bytes)
+			*nr_bytes = PAGE_SIZE - page_off;
+		return window->dma_addr[page_nr] | page_off;
+	}
+	if (iter) {
+		i = iter->index;
+		start = iter->offset;
+	} else {
+		i =  0;
+		start =  window->offset;
+	}
+	for (; i < window->nr_contig_chunks; i++) {
+		end = start + (window->num_pages[i] << PAGE_SHIFT);
+		if (off >= start && off < end) {
+			if (iter) {
+				iter->index = i;
+				iter->offset = start;
+			}
+			if (nr_bytes)
+				*nr_bytes = end - off;
+			return (window->dma_addr[i] + (off - start));
+		}
+		start += (window->num_pages[i] << PAGE_SHIFT);
+	}
+	dev_err(scif_info.mdev.this_device,
+		"%s %d BUG. Addr not found? window %p off 0x%llx\n",
+		__func__, __LINE__, window, off);
+	return SCIF_RMA_ERROR_CODE;
+}
+
+/*
+ * Copy between rma window and temporary buffer
+ */
+static void scif_rma_local_cpu_copy(s64 offset, struct scif_window *window,
+				    u8 *temp, size_t rem_len, bool to_temp)
+{
+	void *window_virt;
+	size_t loop_len;
+	int offset_in_page;
+	s64 end_offset;
+
+	offset_in_page = offset & ~PAGE_MASK;
+	loop_len = PAGE_SIZE - offset_in_page;
+
+	if (rem_len < loop_len)
+		loop_len = rem_len;
+
+	window_virt = _get_local_va(offset, window, loop_len);
+	if (!window_virt)
+		return;
+	if (to_temp)
+		memcpy(temp, window_virt, loop_len);
+	else
+		memcpy(window_virt, temp, loop_len);
+
+	offset += loop_len;
+	temp += loop_len;
+	rem_len -= loop_len;
+
+	end_offset = window->offset +
+		(window->nr_pages << PAGE_SHIFT);
+	while (rem_len) {
+		if (offset == end_offset) {
+			window = list_entry_next(window, list);
+			end_offset = window->offset +
+				(window->nr_pages << PAGE_SHIFT);
+		}
+		loop_len = min(PAGE_SIZE, rem_len);
+		window_virt = _get_local_va(offset, window, loop_len);
+		if (!window_virt)
+			return;
+		if (to_temp)
+			memcpy(temp, window_virt, loop_len);
+		else
+			memcpy(window_virt, temp, loop_len);
+		offset	+= loop_len;
+		temp	+= loop_len;
+		rem_len	-= loop_len;
+	}
+}
+
+/**
+ * scif_rma_completion_cb:
+ * @data: RMA cookie
+ *
+ * RMA interrupt completion callback.
+ */
+static void scif_rma_completion_cb(void *data)
+{
+	struct scif_dma_comp_cb *comp_cb = data;
+
+	/* Free DMA Completion CB. */
+	if (comp_cb->dst_window)
+		scif_rma_local_cpu_copy(comp_cb->dst_offset,
+					comp_cb->dst_window,
+					comp_cb->temp_buf +
+					comp_cb->header_padding,
+					comp_cb->len, false);
+	scif_unmap_single(comp_cb->temp_phys, comp_cb->sdev,
+			  SCIF_KMEM_UNALIGNED_BUF_SIZE);
+	if (comp_cb->is_cache)
+		kmem_cache_free(unaligned_cache,
+				comp_cb->temp_buf_to_free);
+	else
+		kfree(comp_cb->temp_buf_to_free);
+}
+
+/* Copies between temporary buffer and offsets provided in work */
+static int
+scif_rma_list_dma_copy_unaligned(struct scif_copy_work *work,
+				 u8 *temp, struct dma_chan *chan,
+				 bool src_local)
+{
+	struct scif_dma_comp_cb *comp_cb = work->comp_cb;
+	dma_addr_t window_dma_addr, temp_dma_addr;
+	dma_addr_t temp_phys = comp_cb->temp_phys;
+	size_t loop_len, nr_contig_bytes = 0, remaining_len = work->len;
+	int offset_in_ca, ret = 0;
+	s64 end_offset, offset;
+	struct scif_window *window;
+	void *window_virt_addr;
+	size_t tail_len;
+	struct dma_async_tx_descriptor *tx;
+	struct dma_device *dev = chan->device;
+	dma_cookie_t cookie;
+
+	if (src_local) {
+		offset = work->dst_offset;
+		window = work->dst_window;
+	} else {
+		offset = work->src_offset;
+		window = work->src_window;
+	}
+
+	offset_in_ca = offset & (L1_CACHE_BYTES - 1);
+	if (offset_in_ca) {
+		loop_len = L1_CACHE_BYTES - offset_in_ca;
+		loop_len = min(loop_len, remaining_len);
+		window_virt_addr = ioremap_remote(offset, window,
+						  loop_len,
+						  work->remote_dev,
+						  NULL);
+		if (!window_virt_addr)
+			return -ENOMEM;
+		if (src_local)
+			scif_unaligned_cpy_toio(window_virt_addr, temp,
+						loop_len,
+						work->ordered &&
+						!(remaining_len - loop_len));
+		else
+			scif_unaligned_cpy_fromio(temp, window_virt_addr,
+						  loop_len, work->ordered &&
+						  !(remaining_len - loop_len));
+		iounmap_remote(window_virt_addr, loop_len, work);
+
+		offset += loop_len;
+		temp += loop_len;
+		temp_phys += loop_len;
+		remaining_len -= loop_len;
+	}
+
+	offset_in_ca = offset & ~PAGE_MASK;
+	end_offset = window->offset +
+		(window->nr_pages << PAGE_SHIFT);
+
+	tail_len = remaining_len & (L1_CACHE_BYTES - 1);
+	remaining_len -= tail_len;
+	while (remaining_len) {
+		if (offset == end_offset) {
+			window = list_entry_next(window, list);
+			end_offset = window->offset +
+				(window->nr_pages << PAGE_SHIFT);
+		}
+		if (scif_is_mgmt_node())
+			temp_dma_addr = temp_phys;
+		else
+			/* Fix if we ever enable IOMMU on the card */
+			temp_dma_addr = (dma_addr_t)virt_to_phys(temp);
+		window_dma_addr = scif_off_to_dma_addr(window, offset,
+						       &nr_contig_bytes,
+						       NULL);
+		loop_len = min(nr_contig_bytes, remaining_len);
+		if (src_local) {
+			if (work->ordered && !tail_len &&
+			    !(remaining_len - loop_len) &&
+			    loop_len != L1_CACHE_BYTES) {
+				/*
+				 * Break up the last chunk of the transfer into
+				 * two steps. if there is no tail to guarantee
+				 * DMA ordering. SCIF_DMA_POLLING inserts
+				 * a status update descriptor in step 1 which
+				 * acts as a double sided synchronization fence
+				 * for the DMA engine to ensure that the last
+				 * cache line in step 2 is updated last.
+				 */
+				/* Step 1) DMA: Body Length - L1_CACHE_BYTES. */
+				tx =
+				dev->device_prep_dma_memcpy(chan,
+							    window_dma_addr,
+							    temp_dma_addr,
+							    loop_len -
+							    L1_CACHE_BYTES,
+							    DMA_PREP_FENCE);
+				if (!tx) {
+					ret = -ENOMEM;
+					goto err;
+				}
+				cookie = tx->tx_submit(tx);
+				if (dma_submit_error(cookie)) {
+					ret = -ENOMEM;
+					goto err;
+				}
+				dma_async_issue_pending(chan);
+				offset += (loop_len - L1_CACHE_BYTES);
+				temp_dma_addr += (loop_len - L1_CACHE_BYTES);
+				window_dma_addr += (loop_len - L1_CACHE_BYTES);
+				remaining_len -= (loop_len - L1_CACHE_BYTES);
+				loop_len = remaining_len;
+
+				/* Step 2) DMA: L1_CACHE_BYTES */
+				tx =
+				dev->device_prep_dma_memcpy(chan,
+							    window_dma_addr,
+							    temp_dma_addr,
+							    loop_len, 0);
+				if (!tx) {
+					ret = -ENOMEM;
+					goto err;
+				}
+				cookie = tx->tx_submit(tx);
+				if (dma_submit_error(cookie)) {
+					ret = -ENOMEM;
+					goto err;
+				}
+				dma_async_issue_pending(chan);
+			} else {
+				tx =
+				dev->device_prep_dma_memcpy(chan,
+							    window_dma_addr,
+							    temp_dma_addr,
+							    loop_len, 0);
+				if (!tx) {
+					ret = -ENOMEM;
+					goto err;
+				}
+				cookie = tx->tx_submit(tx);
+				if (dma_submit_error(cookie)) {
+					ret = -ENOMEM;
+					goto err;
+				}
+				dma_async_issue_pending(chan);
+			}
+		} else {
+			tx = dev->device_prep_dma_memcpy(chan, temp_dma_addr,
+					window_dma_addr, loop_len, 0);
+			if (!tx) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			cookie = tx->tx_submit(tx);
+			if (dma_submit_error(cookie)) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			dma_async_issue_pending(chan);
+		}
+		if (ret < 0)
+			goto err;
+		offset += loop_len;
+		temp += loop_len;
+		temp_phys += loop_len;
+		remaining_len -= loop_len;
+		offset_in_ca = 0;
+	}
+	if (tail_len) {
+		if (offset == end_offset) {
+			window = list_entry_next(window, list);
+			end_offset = window->offset +
+				(window->nr_pages << PAGE_SHIFT);
+		}
+		window_virt_addr = ioremap_remote(offset, window, tail_len,
+						  work->remote_dev,
+						  NULL);
+		if (!window_virt_addr)
+			return -ENOMEM;
+		/*
+		 * The CPU copy for the tail bytes must be initiated only once
+		 * previous DMA transfers for this endpoint have completed
+		 * to guarantee ordering.
+		 */
+		if (work->ordered) {
+			struct scif_dev *rdev = work->remote_dev;
+
+			ret = scif_drain_dma_intr(rdev->sdev, chan);
+			if (ret)
+				return ret;
+		}
+		if (src_local)
+			scif_unaligned_cpy_toio(window_virt_addr, temp,
+						tail_len, work->ordered);
+		else
+			scif_unaligned_cpy_fromio(temp, window_virt_addr,
+						  tail_len, work->ordered);
+		iounmap_remote(window_virt_addr, tail_len, work);
+	}
+	tx = dev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_INTERRUPT);
+	if (!tx) {
+		ret = -ENOMEM;
+		return ret;
+	}
+	tx->callback = &scif_rma_completion_cb;
+	tx->callback_param = comp_cb;
+	cookie = tx->tx_submit(tx);
+
+	if (dma_submit_error(cookie)) {
+		ret = -ENOMEM;
+		return ret;
+	}
+	dma_async_issue_pending(chan);
+	return 0;
+err:
+	dev_err(scif_info.mdev.this_device,
+		"%s %d Desc Prog Failed ret %d\n",
+		__func__, __LINE__, ret);
+	return ret;
+}
+
+/*
+ * _scif_rma_list_dma_copy_aligned:
+ *
+ * Traverse all the windows and perform DMA copy.
+ */
+static int _scif_rma_list_dma_copy_aligned(struct scif_copy_work *work,
+					   struct dma_chan *chan)
+{
+	dma_addr_t src_dma_addr, dst_dma_addr;
+	size_t loop_len, remaining_len, src_contig_bytes = 0;
+	size_t dst_contig_bytes = 0;
+	struct scif_window_iter src_win_iter;
+	struct scif_window_iter dst_win_iter;
+	s64 end_src_offset, end_dst_offset;
+	struct scif_window *src_window = work->src_window;
+	struct scif_window *dst_window = work->dst_window;
+	s64 src_offset = work->src_offset, dst_offset = work->dst_offset;
+	int ret = 0;
+	struct dma_async_tx_descriptor *tx;
+	struct dma_device *dev = chan->device;
+	dma_cookie_t cookie;
+
+	remaining_len = work->len;
+
+	scif_init_window_iter(src_window, &src_win_iter);
+	scif_init_window_iter(dst_window, &dst_win_iter);
+	end_src_offset = src_window->offset +
+		(src_window->nr_pages << PAGE_SHIFT);
+	end_dst_offset = dst_window->offset +
+		(dst_window->nr_pages << PAGE_SHIFT);
+	while (remaining_len) {
+		if (src_offset == end_src_offset) {
+			src_window = list_entry_next(src_window, list);
+			end_src_offset = src_window->offset +
+				(src_window->nr_pages << PAGE_SHIFT);
+			scif_init_window_iter(src_window, &src_win_iter);
+		}
+		if (dst_offset == end_dst_offset) {
+			dst_window = list_entry_next(dst_window, list);
+			end_dst_offset = dst_window->offset +
+				(dst_window->nr_pages << PAGE_SHIFT);
+			scif_init_window_iter(dst_window, &dst_win_iter);
+		}
+
+		/* compute dma addresses for transfer */
+		src_dma_addr = scif_off_to_dma_addr(src_window, src_offset,
+						    &src_contig_bytes,
+						    &src_win_iter);
+		dst_dma_addr = scif_off_to_dma_addr(dst_window, dst_offset,
+						    &dst_contig_bytes,
+						    &dst_win_iter);
+		loop_len = min(src_contig_bytes, dst_contig_bytes);
+		loop_len = min(loop_len, remaining_len);
+		if (work->ordered && !(remaining_len - loop_len)) {
+			/*
+			 * Break up the last chunk of the transfer into two
+			 * steps to ensure that the last byte in step 2 is
+			 * updated last.
+			 */
+			/* Step 1) DMA: Body Length - 1 */
+			tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr,
+							 src_dma_addr,
+							 loop_len - 1,
+							 DMA_PREP_FENCE);
+			if (!tx) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			cookie = tx->tx_submit(tx);
+			if (dma_submit_error(cookie)) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			src_offset += (loop_len - 1);
+			dst_offset += (loop_len - 1);
+			src_dma_addr += (loop_len - 1);
+			dst_dma_addr += (loop_len - 1);
+			remaining_len -= (loop_len - 1);
+			loop_len = remaining_len;
+
+			/* Step 2) DMA: 1 BYTES */
+			tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr,
+					src_dma_addr, loop_len, 0);
+			if (!tx) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			cookie = tx->tx_submit(tx);
+			if (dma_submit_error(cookie)) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			dma_async_issue_pending(chan);
+		} else {
+			tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr,
+					src_dma_addr, loop_len, 0);
+			if (!tx) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			cookie = tx->tx_submit(tx);
+			if (dma_submit_error(cookie)) {
+				ret = -ENOMEM;
+				goto err;
+			}
+		}
+		src_offset += loop_len;
+		dst_offset += loop_len;
+		remaining_len -= loop_len;
+	}
+	return ret;
+err:
+	dev_err(scif_info.mdev.this_device,
+		"%s %d Desc Prog Failed ret %d\n",
+		__func__, __LINE__, ret);
+	return ret;
+}
+
+/*
+ * scif_rma_list_dma_copy_aligned:
+ *
+ * Traverse all the windows and perform DMA copy.
+ */
+static int scif_rma_list_dma_copy_aligned(struct scif_copy_work *work,
+					  struct dma_chan *chan)
+{
+	dma_addr_t src_dma_addr, dst_dma_addr;
+	size_t loop_len, remaining_len, tail_len, src_contig_bytes = 0;
+	size_t dst_contig_bytes = 0;
+	int src_cache_off;
+	s64 end_src_offset, end_dst_offset;
+	struct scif_window_iter src_win_iter;
+	struct scif_window_iter dst_win_iter;
+	void *src_virt, *dst_virt;
+	struct scif_window *src_window = work->src_window;
+	struct scif_window *dst_window = work->dst_window;
+	s64 src_offset = work->src_offset, dst_offset = work->dst_offset;
+	int ret = 0;
+	struct dma_async_tx_descriptor *tx;
+	struct dma_device *dev = chan->device;
+	dma_cookie_t cookie;
+
+	remaining_len = work->len;
+	scif_init_window_iter(src_window, &src_win_iter);
+	scif_init_window_iter(dst_window, &dst_win_iter);
+
+	src_cache_off = src_offset & (L1_CACHE_BYTES - 1);
+	if (src_cache_off != 0) {
+		/* Head */
+		loop_len = L1_CACHE_BYTES - src_cache_off;
+		loop_len = min(loop_len, remaining_len);
+		src_dma_addr = __scif_off_to_dma_addr(src_window, src_offset);
+		dst_dma_addr = __scif_off_to_dma_addr(dst_window, dst_offset);
+		if (src_window->type == SCIF_WINDOW_SELF)
+			src_virt = _get_local_va(src_offset, src_window,
+						 loop_len);
+		else
+			src_virt = ioremap_remote(src_offset, src_window,
+						  loop_len,
+						  work->remote_dev, NULL);
+		if (!src_virt)
+			return -ENOMEM;
+		if (dst_window->type == SCIF_WINDOW_SELF)
+			dst_virt = _get_local_va(dst_offset, dst_window,
+						 loop_len);
+		else
+			dst_virt = ioremap_remote(dst_offset, dst_window,
+						  loop_len,
+						  work->remote_dev, NULL);
+		if (!dst_virt) {
+			if (src_window->type != SCIF_WINDOW_SELF)
+				iounmap_remote(src_virt, loop_len, work);
+			return -ENOMEM;
+		}
+		if (src_window->type == SCIF_WINDOW_SELF)
+			scif_unaligned_cpy_toio(dst_virt, src_virt, loop_len,
+						remaining_len == loop_len ?
+						work->ordered : false);
+		else
+			scif_unaligned_cpy_fromio(dst_virt, src_virt, loop_len,
+						  remaining_len == loop_len ?
+						  work->ordered : false);
+		if (src_window->type != SCIF_WINDOW_SELF)
+			iounmap_remote(src_virt, loop_len, work);
+		if (dst_window->type != SCIF_WINDOW_SELF)
+			iounmap_remote(dst_virt, loop_len, work);
+		src_offset += loop_len;
+		dst_offset += loop_len;
+		remaining_len -= loop_len;
+	}
+
+	end_src_offset = src_window->offset +
+		(src_window->nr_pages << PAGE_SHIFT);
+	end_dst_offset = dst_window->offset +
+		(dst_window->nr_pages << PAGE_SHIFT);
+	tail_len = remaining_len & (L1_CACHE_BYTES - 1);
+	remaining_len -= tail_len;
+	while (remaining_len) {
+		if (src_offset == end_src_offset) {
+			src_window = list_entry_next(src_window, list);
+			end_src_offset = src_window->offset +
+				(src_window->nr_pages << PAGE_SHIFT);
+			scif_init_window_iter(src_window, &src_win_iter);
+		}
+		if (dst_offset == end_dst_offset) {
+			dst_window = list_entry_next(dst_window, list);
+			end_dst_offset = dst_window->offset +
+				(dst_window->nr_pages << PAGE_SHIFT);
+			scif_init_window_iter(dst_window, &dst_win_iter);
+		}
+
+		/* compute dma addresses for transfer */
+		src_dma_addr = scif_off_to_dma_addr(src_window, src_offset,
+						    &src_contig_bytes,
+						    &src_win_iter);
+		dst_dma_addr = scif_off_to_dma_addr(dst_window, dst_offset,
+						    &dst_contig_bytes,
+						    &dst_win_iter);
+		loop_len = min(src_contig_bytes, dst_contig_bytes);
+		loop_len = min(loop_len, remaining_len);
+		if (work->ordered && !tail_len &&
+		    !(remaining_len - loop_len)) {
+			/*
+			 * Break up the last chunk of the transfer into two
+			 * steps. if there is no tail to gurantee DMA ordering.
+			 * Passing SCIF_DMA_POLLING inserts a status update
+			 * descriptor in step 1 which acts as a double sided
+			 * synchronization fence for the DMA engine to ensure
+			 * that the last cache line in step 2 is updated last.
+			 */
+			/* Step 1) DMA: Body Length - L1_CACHE_BYTES. */
+			tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr,
+							 src_dma_addr,
+							 loop_len -
+							 L1_CACHE_BYTES,
+							 DMA_PREP_FENCE);
+			if (!tx) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			cookie = tx->tx_submit(tx);
+			if (dma_submit_error(cookie)) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			dma_async_issue_pending(chan);
+			src_offset += (loop_len - L1_CACHE_BYTES);
+			dst_offset += (loop_len - L1_CACHE_BYTES);
+			src_dma_addr += (loop_len - L1_CACHE_BYTES);
+			dst_dma_addr += (loop_len - L1_CACHE_BYTES);
+			remaining_len -= (loop_len - L1_CACHE_BYTES);
+			loop_len = remaining_len;
+
+			/* Step 2) DMA: L1_CACHE_BYTES */
+			tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr,
+							 src_dma_addr,
+							 loop_len, 0);
+			if (!tx) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			cookie = tx->tx_submit(tx);
+			if (dma_submit_error(cookie)) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			dma_async_issue_pending(chan);
+		} else {
+			tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr,
+							 src_dma_addr,
+							 loop_len, 0);
+			if (!tx) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			cookie = tx->tx_submit(tx);
+			if (dma_submit_error(cookie)) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			dma_async_issue_pending(chan);
+		}
+		src_offset += loop_len;
+		dst_offset += loop_len;
+		remaining_len -= loop_len;
+	}
+	remaining_len = tail_len;
+	if (remaining_len) {
+		loop_len = remaining_len;
+		if (src_offset == end_src_offset)
+			src_window = list_entry_next(src_window, list);
+		if (dst_offset == end_dst_offset)
+			dst_window = list_entry_next(dst_window, list);
+
+		src_dma_addr = __scif_off_to_dma_addr(src_window, src_offset);
+		dst_dma_addr = __scif_off_to_dma_addr(dst_window, dst_offset);
+		/*
+		 * The CPU copy for the tail bytes must be initiated only once
+		 * previous DMA transfers for this endpoint have completed to
+		 * guarantee ordering.
+		 */
+		if (work->ordered) {
+			struct scif_dev *rdev = work->remote_dev;
+
+			ret = scif_drain_dma_poll(rdev->sdev, chan);
+			if (ret)
+				return ret;
+		}
+		if (src_window->type == SCIF_WINDOW_SELF)
+			src_virt = _get_local_va(src_offset, src_window,
+						 loop_len);
+		else
+			src_virt = ioremap_remote(src_offset, src_window,
+						  loop_len,
+						  work->remote_dev, NULL);
+		if (!src_virt)
+			return -ENOMEM;
+
+		if (dst_window->type == SCIF_WINDOW_SELF)
+			dst_virt = _get_local_va(dst_offset, dst_window,
+						 loop_len);
+		else
+			dst_virt = ioremap_remote(dst_offset, dst_window,
+						  loop_len,
+						  work->remote_dev, NULL);
+		if (!dst_virt) {
+			if (src_window->type != SCIF_WINDOW_SELF)
+				iounmap_remote(src_virt, loop_len, work);
+			return -ENOMEM;
+		}
+
+		if (src_window->type == SCIF_WINDOW_SELF)
+			scif_unaligned_cpy_toio(dst_virt, src_virt, loop_len,
+						work->ordered);
+		else
+			scif_unaligned_cpy_fromio(dst_virt, src_virt,
+						  loop_len, work->ordered);
+		if (src_window->type != SCIF_WINDOW_SELF)
+			iounmap_remote(src_virt, loop_len, work);
+
+		if (dst_window->type != SCIF_WINDOW_SELF)
+			iounmap_remote(dst_virt, loop_len, work);
+		remaining_len -= loop_len;
+	}
+	return ret;
+err:
+	dev_err(scif_info.mdev.this_device,
+		"%s %d Desc Prog Failed ret %d\n",
+		__func__, __LINE__, ret);
+	return ret;
+}
+
+/*
+ * scif_rma_list_cpu_copy:
+ *
+ * Traverse all the windows and perform CPU copy.
+ */
+static int scif_rma_list_cpu_copy(struct scif_copy_work *work)
+{
+	void *src_virt, *dst_virt;
+	size_t loop_len, remaining_len;
+	int src_page_off, dst_page_off;
+	s64 src_offset = work->src_offset, dst_offset = work->dst_offset;
+	struct scif_window *src_window = work->src_window;
+	struct scif_window *dst_window = work->dst_window;
+	s64 end_src_offset, end_dst_offset;
+	int ret = 0;
+	struct scif_window_iter src_win_iter;
+	struct scif_window_iter dst_win_iter;
+
+	remaining_len = work->len;
+
+	scif_init_window_iter(src_window, &src_win_iter);
+	scif_init_window_iter(dst_window, &dst_win_iter);
+	while (remaining_len) {
+		src_page_off = src_offset & ~PAGE_MASK;
+		dst_page_off = dst_offset & ~PAGE_MASK;
+		loop_len = min(PAGE_SIZE -
+			       max(src_page_off, dst_page_off),
+			       remaining_len);
+
+		if (src_window->type == SCIF_WINDOW_SELF)
+			src_virt = _get_local_va(src_offset, src_window,
+						 loop_len);
+		else
+			src_virt = ioremap_remote(src_offset, src_window,
+						  loop_len,
+						  work->remote_dev,
+						  &src_win_iter);
+		if (!src_virt) {
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		if (dst_window->type == SCIF_WINDOW_SELF)
+			dst_virt = _get_local_va(dst_offset, dst_window,
+						 loop_len);
+		else
+			dst_virt = ioremap_remote(dst_offset, dst_window,
+						  loop_len,
+						  work->remote_dev,
+						  &dst_win_iter);
+		if (!dst_virt) {
+			if (src_window->type == SCIF_WINDOW_PEER)
+				iounmap_remote(src_virt, loop_len, work);
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		if (work->loopback) {
+			memcpy(dst_virt, src_virt, loop_len);
+		} else {
+			if (src_window->type == SCIF_WINDOW_SELF)
+				memcpy_toio((void __iomem __force *)dst_virt,
+					    src_virt, loop_len);
+			else
+				memcpy_fromio(dst_virt,
+					      (void __iomem __force *)src_virt,
+					      loop_len);
+		}
+		if (src_window->type == SCIF_WINDOW_PEER)
+			iounmap_remote(src_virt, loop_len, work);
+
+		if (dst_window->type == SCIF_WINDOW_PEER)
+			iounmap_remote(dst_virt, loop_len, work);
+
+		src_offset += loop_len;
+		dst_offset += loop_len;
+		remaining_len -= loop_len;
+		if (remaining_len) {
+			end_src_offset = src_window->offset +
+				(src_window->nr_pages << PAGE_SHIFT);
+			end_dst_offset = dst_window->offset +
+				(dst_window->nr_pages << PAGE_SHIFT);
+			if (src_offset == end_src_offset) {
+				src_window = list_entry_next(src_window, list);
+				scif_init_window_iter(src_window,
+						      &src_win_iter);
+			}
+			if (dst_offset == end_dst_offset) {
+				dst_window = list_entry_next(dst_window, list);
+				scif_init_window_iter(dst_window,
+						      &dst_win_iter);
+			}
+		}
+	}
+error:
+	return ret;
+}
+
+static int scif_rma_list_dma_copy_wrapper(struct scif_endpt *epd,
+					  struct scif_copy_work *work,
+					  struct dma_chan *chan, off_t loffset)
+{
+	int src_cache_off, dst_cache_off;
+	s64 src_offset = work->src_offset, dst_offset = work->dst_offset;
+	u8 *temp = NULL;
+	bool src_local = true, dst_local = false;
+	struct scif_dma_comp_cb *comp_cb;
+	dma_addr_t src_dma_addr, dst_dma_addr;
+	int err;
+
+	if (is_dma_copy_aligned(chan->device, 1, 1, 1))
+		return _scif_rma_list_dma_copy_aligned(work, chan);
+
+	src_cache_off = src_offset & (L1_CACHE_BYTES - 1);
+	dst_cache_off = dst_offset & (L1_CACHE_BYTES - 1);
+
+	if (dst_cache_off == src_cache_off)
+		return scif_rma_list_dma_copy_aligned(work, chan);
+
+	if (work->loopback)
+		return scif_rma_list_cpu_copy(work);
+	src_dma_addr = __scif_off_to_dma_addr(work->src_window, src_offset);
+	dst_dma_addr = __scif_off_to_dma_addr(work->dst_window, dst_offset);
+	src_local = work->src_window->type == SCIF_WINDOW_SELF;
+	dst_local = work->dst_window->type == SCIF_WINDOW_SELF;
+
+	dst_local = dst_local;
+	/* Allocate dma_completion cb */
+	comp_cb = kzalloc(sizeof(*comp_cb), GFP_KERNEL);
+	if (!comp_cb)
+		goto error;
+
+	work->comp_cb = comp_cb;
+	comp_cb->cb_cookie = comp_cb;
+	comp_cb->dma_completion_func = &scif_rma_completion_cb;
+
+	if (work->len + (L1_CACHE_BYTES << 1) < SCIF_KMEM_UNALIGNED_BUF_SIZE) {
+		comp_cb->is_cache = false;
+		/* Allocate padding bytes to align to a cache line */
+		temp = kmalloc(work->len + (L1_CACHE_BYTES << 1),
+			       GFP_KERNEL);
+		if (!temp)
+			goto free_comp_cb;
+		comp_cb->temp_buf_to_free = temp;
+		/* kmalloc(..) does not guarantee cache line alignment */
+		if (!IS_ALIGNED((u64)temp, L1_CACHE_BYTES))
+			temp = PTR_ALIGN(temp, L1_CACHE_BYTES);
+	} else {
+		comp_cb->is_cache = true;
+		temp = kmem_cache_alloc(unaligned_cache, GFP_KERNEL);
+		if (!temp)
+			goto free_comp_cb;
+		comp_cb->temp_buf_to_free = temp;
+	}
+
+	if (src_local) {
+		temp += dst_cache_off;
+		scif_rma_local_cpu_copy(work->src_offset, work->src_window,
+					temp, work->len, true);
+	} else {
+		comp_cb->dst_window = work->dst_window;
+		comp_cb->dst_offset = work->dst_offset;
+		work->src_offset = work->src_offset - src_cache_off;
+		comp_cb->len = work->len;
+		work->len = ALIGN(work->len + src_cache_off, L1_CACHE_BYTES);
+		comp_cb->header_padding = src_cache_off;
+	}
+	comp_cb->temp_buf = temp;
+
+	err = scif_map_single(&comp_cb->temp_phys, temp,
+			      work->remote_dev, SCIF_KMEM_UNALIGNED_BUF_SIZE);
+	if (err)
+		goto free_temp_buf;
+	comp_cb->sdev = work->remote_dev;
+	if (scif_rma_list_dma_copy_unaligned(work, temp, chan, src_local) < 0)
+		goto free_temp_buf;
+	if (!src_local)
+		work->fence_type = SCIF_DMA_INTR;
+	return 0;
+free_temp_buf:
+	if (comp_cb->is_cache)
+		kmem_cache_free(unaligned_cache, comp_cb->temp_buf_to_free);
+	else
+		kfree(comp_cb->temp_buf_to_free);
+free_comp_cb:
+	kfree(comp_cb);
+error:
+	return -ENOMEM;
+}
+
+/**
+ * scif_rma_copy:
+ * @epd: end point descriptor.
+ * @loffset: offset in local registered address space to/from which to copy
+ * @addr: user virtual address to/from which to copy
+ * @len: length of range to copy
+ * @roffset: offset in remote registered address space to/from which to copy
+ * @flags: flags
+ * @dir: LOCAL->REMOTE or vice versa.
+ * @last_chunk: true if this is the last chunk of a larger transfer
+ *
+ * Validate parameters, check if src/dst registered ranges requested for copy
+ * are valid and initiate either CPU or DMA copy.
+ */
+static int scif_rma_copy(scif_epd_t epd, off_t loffset, unsigned long addr,
+			 size_t len, off_t roffset, int flags,
+			 enum scif_rma_dir dir, bool last_chunk)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	struct scif_rma_req remote_req;
+	struct scif_rma_req req;
+	struct scif_window *local_window = NULL;
+	struct scif_window *remote_window = NULL;
+	struct scif_copy_work copy_work;
+	bool loopback;
+	int err = 0;
+	struct dma_chan *chan;
+	struct scif_mmu_notif *mmn = NULL;
+	bool cache = false;
+	struct device *spdev;
+
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+
+	if (flags && !(flags & (SCIF_RMA_USECPU | SCIF_RMA_USECACHE |
+				SCIF_RMA_SYNC | SCIF_RMA_ORDERED)))
+		return -EINVAL;
+
+	loopback = scifdev_self(ep->remote_dev) ? true : false;
+	copy_work.fence_type = ((flags & SCIF_RMA_SYNC) && last_chunk) ?
+				SCIF_DMA_POLL : 0;
+	copy_work.ordered = !!((flags & SCIF_RMA_ORDERED) && last_chunk);
+
+	/* Use CPU for Mgmt node <-> Mgmt node copies */
+	if (loopback && scif_is_mgmt_node()) {
+		flags |= SCIF_RMA_USECPU;
+		copy_work.fence_type = 0x0;
+	}
+
+	cache = scif_is_set_reg_cache(flags);
+
+	remote_req.out_window = &remote_window;
+	remote_req.offset = roffset;
+	remote_req.nr_bytes = len;
+	/*
+	 * If transfer is from local to remote then the remote window
+	 * must be writeable and vice versa.
+	 */
+	remote_req.prot = dir == SCIF_LOCAL_TO_REMOTE ? VM_WRITE : VM_READ;
+	remote_req.type = SCIF_WINDOW_PARTIAL;
+	remote_req.head = &ep->rma_info.remote_reg_list;
+
+	spdev = scif_get_peer_dev(ep->remote_dev);
+	if (IS_ERR(spdev)) {
+		err = PTR_ERR(spdev);
+		return err;
+	}
+
+	if (addr && cache) {
+		mutex_lock(&ep->rma_info.mmn_lock);
+		mmn = scif_find_mmu_notifier(current->mm, &ep->rma_info);
+		if (!mmn)
+			scif_add_mmu_notifier(current->mm, ep);
+		mutex_unlock(&ep->rma_info.mmn_lock);
+		if (IS_ERR(mmn)) {
+			scif_put_peer_dev(spdev);
+			return PTR_ERR(mmn);
+		}
+		cache = cache && !scif_rma_tc_can_cache(ep, len);
+	}
+	mutex_lock(&ep->rma_info.rma_lock);
+	if (addr) {
+		req.out_window = &local_window;
+		req.nr_bytes = ALIGN(len + (addr & ~PAGE_MASK),
+				     PAGE_SIZE);
+		req.va_for_temp = addr & PAGE_MASK;
+		req.prot = (dir == SCIF_LOCAL_TO_REMOTE ?
+			    VM_READ : VM_WRITE | VM_READ);
+		/* Does a valid local window exist? */
+		if (mmn) {
+			spin_lock(&ep->rma_info.tc_lock);
+			req.head = &mmn->tc_reg_list;
+			err = scif_query_tcw(ep, &req);
+			spin_unlock(&ep->rma_info.tc_lock);
+		}
+		if (!mmn || err) {
+			err = scif_register_temp(epd, req.va_for_temp,
+						 req.nr_bytes, req.prot,
+						 &loffset, &local_window);
+			if (err) {
+				mutex_unlock(&ep->rma_info.rma_lock);
+				goto error;
+			}
+			if (!cache)
+				goto skip_cache;
+			atomic_inc(&ep->rma_info.tcw_refcount);
+			atomic_add_return(local_window->nr_pages,
+					  &ep->rma_info.tcw_total_pages);
+			if (mmn) {
+				spin_lock(&ep->rma_info.tc_lock);
+				scif_insert_tcw(local_window,
+						&mmn->tc_reg_list);
+				spin_unlock(&ep->rma_info.tc_lock);
+			}
+		}
+skip_cache:
+		loffset = local_window->offset +
+				(addr - local_window->va_for_temp);
+	} else {
+		req.out_window = &local_window;
+		req.offset = loffset;
+		/*
+		 * If transfer is from local to remote then the self window
+		 * must be readable and vice versa.
+		 */
+		req.prot = dir == SCIF_LOCAL_TO_REMOTE ? VM_READ : VM_WRITE;
+		req.nr_bytes = len;
+		req.type = SCIF_WINDOW_PARTIAL;
+		req.head = &ep->rma_info.reg_list;
+		/* Does a valid local window exist? */
+		err = scif_query_window(&req);
+		if (err) {
+			mutex_unlock(&ep->rma_info.rma_lock);
+			goto error;
+		}
+	}
+
+	/* Does a valid remote window exist? */
+	err = scif_query_window(&remote_req);
+	if (err) {
+		mutex_unlock(&ep->rma_info.rma_lock);
+		goto error;
+	}
+
+	/*
+	 * Prepare copy_work for submitting work to the DMA kernel thread
+	 * or CPU copy routine.
+	 */
+	copy_work.len = len;
+	copy_work.loopback = loopback;
+	copy_work.remote_dev = ep->remote_dev;
+	if (dir == SCIF_LOCAL_TO_REMOTE) {
+		copy_work.src_offset = loffset;
+		copy_work.src_window = local_window;
+		copy_work.dst_offset = roffset;
+		copy_work.dst_window = remote_window;
+	} else {
+		copy_work.src_offset = roffset;
+		copy_work.src_window = remote_window;
+		copy_work.dst_offset = loffset;
+		copy_work.dst_window = local_window;
+	}
+
+	if (flags & SCIF_RMA_USECPU) {
+		scif_rma_list_cpu_copy(&copy_work);
+	} else {
+		chan = ep->rma_info.dma_chan;
+		err = scif_rma_list_dma_copy_wrapper(epd, &copy_work,
+						     chan, loffset);
+	}
+	if (addr && !cache)
+		atomic_inc(&ep->rma_info.tw_refcount);
+
+	mutex_unlock(&ep->rma_info.rma_lock);
+
+	if (last_chunk) {
+		struct scif_dev *rdev = ep->remote_dev;
+
+		if (copy_work.fence_type == SCIF_DMA_POLL)
+			err = scif_drain_dma_poll(rdev->sdev,
+						  ep->rma_info.dma_chan);
+		else if (copy_work.fence_type == SCIF_DMA_INTR)
+			err = scif_drain_dma_intr(rdev->sdev,
+						  ep->rma_info.dma_chan);
+	}
+
+	if (addr && !cache)
+		scif_queue_for_cleanup(local_window, &scif_info.rma);
+	scif_put_peer_dev(spdev);
+	return err;
+error:
+	if (err) {
+		if (addr && local_window && !cache)
+			scif_destroy_window(ep, local_window);
+		dev_err(scif_info.mdev.this_device,
+			"%s %d err %d len 0x%lx\n",
+			__func__, __LINE__, err, len);
+	}
+	scif_put_peer_dev(spdev);
+	return err;
+}
+
+int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len,
+		  off_t roffset, int flags)
+{
+	int err;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI readfrom: ep %p loffset 0x%lx len 0x%lx offset 0x%lx flags 0x%x\n",
+		epd, loffset, len, roffset, flags);
+	if (scif_unaligned(loffset, roffset)) {
+		while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) {
+			err = scif_rma_copy(epd, loffset, 0x0,
+					    SCIF_MAX_UNALIGNED_BUF_SIZE,
+					    roffset, flags,
+					    SCIF_REMOTE_TO_LOCAL, false);
+			if (err)
+				goto readfrom_err;
+			loffset += SCIF_MAX_UNALIGNED_BUF_SIZE;
+			roffset += SCIF_MAX_UNALIGNED_BUF_SIZE;
+			len -= SCIF_MAX_UNALIGNED_BUF_SIZE;
+		}
+	}
+	err = scif_rma_copy(epd, loffset, 0x0, len,
+			    roffset, flags, SCIF_REMOTE_TO_LOCAL, true);
+readfrom_err:
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_readfrom);
+
+int scif_writeto(scif_epd_t epd, off_t loffset, size_t len,
+		 off_t roffset, int flags)
+{
+	int err;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI writeto: ep %p loffset 0x%lx len 0x%lx roffset 0x%lx flags 0x%x\n",
+		epd, loffset, len, roffset, flags);
+	if (scif_unaligned(loffset, roffset)) {
+		while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) {
+			err = scif_rma_copy(epd, loffset, 0x0,
+					    SCIF_MAX_UNALIGNED_BUF_SIZE,
+					    roffset, flags,
+					    SCIF_LOCAL_TO_REMOTE, false);
+			if (err)
+				goto writeto_err;
+			loffset += SCIF_MAX_UNALIGNED_BUF_SIZE;
+			roffset += SCIF_MAX_UNALIGNED_BUF_SIZE;
+			len -= SCIF_MAX_UNALIGNED_BUF_SIZE;
+		}
+	}
+	err = scif_rma_copy(epd, loffset, 0x0, len,
+			    roffset, flags, SCIF_LOCAL_TO_REMOTE, true);
+writeto_err:
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_writeto);
+
+int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len,
+		   off_t roffset, int flags)
+{
+	int err;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI vreadfrom: ep %p addr %p len 0x%lx roffset 0x%lx flags 0x%x\n",
+		epd, addr, len, roffset, flags);
+	if (scif_unaligned((off_t __force)addr, roffset)) {
+		if (len > SCIF_MAX_UNALIGNED_BUF_SIZE)
+			flags &= ~SCIF_RMA_USECACHE;
+
+		while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) {
+			err = scif_rma_copy(epd, 0, (u64)addr,
+					    SCIF_MAX_UNALIGNED_BUF_SIZE,
+					    roffset, flags,
+					    SCIF_REMOTE_TO_LOCAL, false);
+			if (err)
+				goto vreadfrom_err;
+			addr += SCIF_MAX_UNALIGNED_BUF_SIZE;
+			roffset += SCIF_MAX_UNALIGNED_BUF_SIZE;
+			len -= SCIF_MAX_UNALIGNED_BUF_SIZE;
+		}
+	}
+	err = scif_rma_copy(epd, 0, (u64)addr, len,
+			    roffset, flags, SCIF_REMOTE_TO_LOCAL, true);
+vreadfrom_err:
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_vreadfrom);
+
+int scif_vwriteto(scif_epd_t epd, void *addr, size_t len,
+		  off_t roffset, int flags)
+{
+	int err;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI vwriteto: ep %p addr %p len 0x%lx roffset 0x%lx flags 0x%x\n",
+		epd, addr, len, roffset, flags);
+	if (scif_unaligned((off_t __force)addr, roffset)) {
+		if (len > SCIF_MAX_UNALIGNED_BUF_SIZE)
+			flags &= ~SCIF_RMA_USECACHE;
+
+		while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) {
+			err = scif_rma_copy(epd, 0, (u64)addr,
+					    SCIF_MAX_UNALIGNED_BUF_SIZE,
+					    roffset, flags,
+					    SCIF_LOCAL_TO_REMOTE, false);
+			if (err)
+				goto vwriteto_err;
+			addr += SCIF_MAX_UNALIGNED_BUF_SIZE;
+			roffset += SCIF_MAX_UNALIGNED_BUF_SIZE;
+			len -= SCIF_MAX_UNALIGNED_BUF_SIZE;
+		}
+	}
+	err = scif_rma_copy(epd, 0, (u64)addr, len,
+			    roffset, flags, SCIF_LOCAL_TO_REMOTE, true);
+vwriteto_err:
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_vwriteto);
diff --git a/drivers/misc/mic/scif/scif_epd.c b/drivers/misc/mic/scif/scif_epd.c
index b4bfbb08a8e3..00e5d6d66e7b 100644
--- a/drivers/misc/mic/scif/scif_epd.c
+++ b/drivers/misc/mic/scif/scif_epd.c
@@ -65,14 +65,14 @@ void scif_teardown_ep(void *endpt)
 void scif_add_epd_to_zombie_list(struct scif_endpt *ep, bool eplock_held)
 {
 	if (!eplock_held)
-		spin_lock(&scif_info.eplock);
+		mutex_lock(&scif_info.eplock);
 	spin_lock(&ep->lock);
 	ep->state = SCIFEP_ZOMBIE;
 	spin_unlock(&ep->lock);
 	list_add_tail(&ep->list, &scif_info.zombie);
 	scif_info.nr_zombies++;
 	if (!eplock_held)
-		spin_unlock(&scif_info.eplock);
+		mutex_unlock(&scif_info.eplock);
 	schedule_work(&scif_info.misc_work);
 }
 
@@ -81,16 +81,15 @@ static struct scif_endpt *scif_find_listen_ep(u16 port)
 	struct scif_endpt *ep = NULL;
 	struct list_head *pos, *tmpq;
 
-	spin_lock(&scif_info.eplock);
+	mutex_lock(&scif_info.eplock);
 	list_for_each_safe(pos, tmpq, &scif_info.listen) {
 		ep = list_entry(pos, struct scif_endpt, list);
 		if (ep->port.port == port) {
-			spin_lock(&ep->lock);
-			spin_unlock(&scif_info.eplock);
+			mutex_unlock(&scif_info.eplock);
 			return ep;
 		}
 	}
-	spin_unlock(&scif_info.eplock);
+	mutex_unlock(&scif_info.eplock);
 	return NULL;
 }
 
@@ -99,14 +98,17 @@ void scif_cleanup_zombie_epd(void)
 	struct list_head *pos, *tmpq;
 	struct scif_endpt *ep;
 
-	spin_lock(&scif_info.eplock);
+	mutex_lock(&scif_info.eplock);
 	list_for_each_safe(pos, tmpq, &scif_info.zombie) {
 		ep = list_entry(pos, struct scif_endpt, list);
-		list_del(pos);
-		scif_info.nr_zombies--;
-		kfree(ep);
+		if (scif_rma_ep_can_uninit(ep)) {
+			list_del(pos);
+			scif_info.nr_zombies--;
+			put_iova_domain(&ep->rma_info.iovad);
+			kfree(ep);
+		}
 	}
-	spin_unlock(&scif_info.eplock);
+	mutex_unlock(&scif_info.eplock);
 }
 
 /**
@@ -137,6 +139,8 @@ void scif_cnctreq(struct scif_dev *scifdev, struct scifmsg *msg)
 	if (!ep)
 		/*  Send reject due to no listening ports */
 		goto conreq_sendrej_free;
+	else
+		spin_lock(&ep->lock);
 
 	if (ep->backlog <= ep->conreqcnt) {
 		/*  Send reject due to too many pending requests */
diff --git a/drivers/misc/mic/scif/scif_epd.h b/drivers/misc/mic/scif/scif_epd.h
index 331322a25213..1771d7a9b8d0 100644
--- a/drivers/misc/mic/scif/scif_epd.h
+++ b/drivers/misc/mic/scif/scif_epd.h
@@ -96,7 +96,11 @@ struct scif_endpt_qp_info {
  * @conn_port: Connection port
  * @conn_err: Errors during connection
  * @conn_async_state: Async connection
+ * @conn_pend_wq: Used by poll while waiting for incoming connections
  * @conn_list: List of async connection requests
+ * @rma_info: Information for triggering SCIF RMA and DMA operations
+ * @mmu_list: link to list of MMU notifier cleanup work
+ * @anon: anonymous file for use in kernel mode scif poll
  */
 struct scif_endpt {
 	enum scif_epd_state state;
@@ -125,7 +129,11 @@ struct scif_endpt {
 	struct scif_port_id conn_port;
 	int conn_err;
 	int conn_async_state;
+	wait_queue_head_t conn_pend_wq;
 	struct list_head conn_list;
+	struct scif_endpt_rma_info rma_info;
+	struct list_head mmu_list;
+	struct file *anon;
 };
 
 static inline int scifdev_alive(struct scif_endpt *ep)
@@ -133,6 +141,43 @@ static inline int scifdev_alive(struct scif_endpt *ep)
 	return _scifdev_alive(ep->remote_dev);
 }
 
+/*
+ * scif_verify_epd:
+ * ep: SCIF endpoint
+ *
+ * Checks several generic error conditions and returns the
+ * appropriate error.
+ */
+static inline int scif_verify_epd(struct scif_endpt *ep)
+{
+	if (ep->state == SCIFEP_DISCONNECTED)
+		return -ECONNRESET;
+
+	if (ep->state != SCIFEP_CONNECTED)
+		return -ENOTCONN;
+
+	if (!scifdev_alive(ep))
+		return -ENODEV;
+
+	return 0;
+}
+
+static inline int scif_anon_inode_getfile(scif_epd_t epd)
+{
+	epd->anon = anon_inode_getfile("scif", &scif_anon_fops, NULL, 0);
+	if (IS_ERR(epd->anon))
+		return PTR_ERR(epd->anon);
+	return 0;
+}
+
+static inline void scif_anon_inode_fput(scif_epd_t epd)
+{
+	if (epd->anon) {
+		fput(epd->anon);
+		epd->anon = NULL;
+	}
+}
+
 void scif_cleanup_zombie_epd(void);
 void scif_teardown_ep(void *endpt);
 void scif_cleanup_ep_qp(struct scif_endpt *ep);
@@ -157,4 +202,9 @@ void scif_clientsend(struct scif_dev *scifdev, struct scifmsg *msg);
 void scif_clientrcvd(struct scif_dev *scifdev, struct scifmsg *msg);
 int __scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool non_block);
 int __scif_flush(scif_epd_t epd);
+int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd);
+unsigned int __scif_pollfd(struct file *f, poll_table *wait,
+			   struct scif_endpt *ep);
+int __scif_pin_pages(void *addr, size_t len, int *out_prot,
+		     int map_flags, scif_pinned_pages_t *pages);
 #endif /* SCIF_EPD_H */
diff --git a/drivers/misc/mic/scif/scif_fd.c b/drivers/misc/mic/scif/scif_fd.c
index eccf7e7135f9..f7e826142a72 100644
--- a/drivers/misc/mic/scif/scif_fd.c
+++ b/drivers/misc/mic/scif/scif_fd.c
@@ -34,6 +34,20 @@ static int scif_fdclose(struct inode *inode, struct file *f)
 	return scif_close(priv);
 }
 
+static int scif_fdmmap(struct file *f, struct vm_area_struct *vma)
+{
+	struct scif_endpt *priv = f->private_data;
+
+	return scif_mmap(vma, priv);
+}
+
+static unsigned int scif_fdpoll(struct file *f, poll_table *wait)
+{
+	struct scif_endpt *priv = f->private_data;
+
+	return __scif_pollfd(f, wait, priv);
+}
+
 static int scif_fdflush(struct file *f, fl_owner_t id)
 {
 	struct scif_endpt *ep = f->private_data;
@@ -140,12 +154,12 @@ static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		 * Add to the list of user mode eps where the second half
 		 * of the accept is not yet completed.
 		 */
-		spin_lock(&scif_info.eplock);
+		mutex_lock(&scif_info.eplock);
 		list_add_tail(&((*ep)->miacceptlist), &scif_info.uaccept);
 		list_add_tail(&((*ep)->liacceptlist), &priv->li_accept);
 		(*ep)->listenep = priv;
 		priv->acceptcnt++;
-		spin_unlock(&scif_info.eplock);
+		mutex_unlock(&scif_info.eplock);
 
 		return 0;
 	}
@@ -163,7 +177,7 @@ static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg)
 			return -EFAULT;
 
 		/* Remove form the user accept queue */
-		spin_lock(&scif_info.eplock);
+		mutex_lock(&scif_info.eplock);
 		list_for_each_safe(pos, tmpq, &scif_info.uaccept) {
 			tmpep = list_entry(pos,
 					   struct scif_endpt, miacceptlist);
@@ -175,7 +189,7 @@ static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		}
 
 		if (!fep) {
-			spin_unlock(&scif_info.eplock);
+			mutex_unlock(&scif_info.eplock);
 			return -ENOENT;
 		}
 
@@ -190,9 +204,10 @@ static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg)
 			}
 		}
 
-		spin_unlock(&scif_info.eplock);
+		mutex_unlock(&scif_info.eplock);
 
 		/* Free the resources automatically created from the open. */
+		scif_anon_inode_fput(priv);
 		scif_teardown_ep(priv);
 		scif_add_epd_to_zombie_list(priv, !SCIF_EPLOCK_HELD);
 		f->private_data = newep;
@@ -290,6 +305,157 @@ getnodes_err1:
 getnodes_err2:
 		return err;
 	}
+	case SCIF_REG:
+	{
+		struct scif_endpt *priv = f->private_data;
+		struct scifioctl_reg reg;
+		off_t ret;
+
+		if (copy_from_user(&reg, argp, sizeof(reg))) {
+			err = -EFAULT;
+			goto reg_err;
+		}
+		if (reg.flags & SCIF_MAP_KERNEL) {
+			err = -EINVAL;
+			goto reg_err;
+		}
+		ret = scif_register(priv, (void *)reg.addr, reg.len,
+				    reg.offset, reg.prot, reg.flags);
+		if (ret < 0) {
+			err = (int)ret;
+			goto reg_err;
+		}
+
+		if (copy_to_user(&((struct scifioctl_reg __user *)argp)
+				 ->out_offset, &ret, sizeof(reg.out_offset))) {
+			err = -EFAULT;
+			goto reg_err;
+		}
+		err = 0;
+reg_err:
+		scif_err_debug(err, "scif_register");
+		return err;
+	}
+	case SCIF_UNREG:
+	{
+		struct scif_endpt *priv = f->private_data;
+		struct scifioctl_unreg unreg;
+
+		if (copy_from_user(&unreg, argp, sizeof(unreg))) {
+			err = -EFAULT;
+			goto unreg_err;
+		}
+		err = scif_unregister(priv, unreg.offset, unreg.len);
+unreg_err:
+		scif_err_debug(err, "scif_unregister");
+		return err;
+	}
+	case SCIF_READFROM:
+	{
+		struct scif_endpt *priv = f->private_data;
+		struct scifioctl_copy copy;
+
+		if (copy_from_user(&copy, argp, sizeof(copy))) {
+			err = -EFAULT;
+			goto readfrom_err;
+		}
+		err = scif_readfrom(priv, copy.loffset, copy.len, copy.roffset,
+				    copy.flags);
+readfrom_err:
+		scif_err_debug(err, "scif_readfrom");
+		return err;
+	}
+	case SCIF_WRITETO:
+	{
+		struct scif_endpt *priv = f->private_data;
+		struct scifioctl_copy copy;
+
+		if (copy_from_user(&copy, argp, sizeof(copy))) {
+			err = -EFAULT;
+			goto writeto_err;
+		}
+		err = scif_writeto(priv, copy.loffset, copy.len, copy.roffset,
+				   copy.flags);
+writeto_err:
+		scif_err_debug(err, "scif_writeto");
+		return err;
+	}
+	case SCIF_VREADFROM:
+	{
+		struct scif_endpt *priv = f->private_data;
+		struct scifioctl_copy copy;
+
+		if (copy_from_user(&copy, argp, sizeof(copy))) {
+			err = -EFAULT;
+			goto vreadfrom_err;
+		}
+		err = scif_vreadfrom(priv, (void __force *)copy.addr, copy.len,
+				     copy.roffset, copy.flags);
+vreadfrom_err:
+		scif_err_debug(err, "scif_vreadfrom");
+		return err;
+	}
+	case SCIF_VWRITETO:
+	{
+		struct scif_endpt *priv = f->private_data;
+		struct scifioctl_copy copy;
+
+		if (copy_from_user(&copy, argp, sizeof(copy))) {
+			err = -EFAULT;
+			goto vwriteto_err;
+		}
+		err = scif_vwriteto(priv, (void __force *)copy.addr, copy.len,
+				    copy.roffset, copy.flags);
+vwriteto_err:
+		scif_err_debug(err, "scif_vwriteto");
+		return err;
+	}
+	case SCIF_FENCE_MARK:
+	{
+		struct scif_endpt *priv = f->private_data;
+		struct scifioctl_fence_mark mark;
+		int tmp_mark = 0;
+
+		if (copy_from_user(&mark, argp, sizeof(mark))) {
+			err = -EFAULT;
+			goto fence_mark_err;
+		}
+		err = scif_fence_mark(priv, mark.flags, &tmp_mark);
+		if (err)
+			goto fence_mark_err;
+		if (copy_to_user((void __user *)mark.mark, &tmp_mark,
+				 sizeof(tmp_mark))) {
+			err = -EFAULT;
+			goto fence_mark_err;
+		}
+fence_mark_err:
+		scif_err_debug(err, "scif_fence_mark");
+		return err;
+	}
+	case SCIF_FENCE_WAIT:
+	{
+		struct scif_endpt *priv = f->private_data;
+
+		err = scif_fence_wait(priv, arg);
+		scif_err_debug(err, "scif_fence_wait");
+		return err;
+	}
+	case SCIF_FENCE_SIGNAL:
+	{
+		struct scif_endpt *priv = f->private_data;
+		struct scifioctl_fence_signal signal;
+
+		if (copy_from_user(&signal, argp, sizeof(signal))) {
+			err = -EFAULT;
+			goto fence_signal_err;
+		}
+
+		err = scif_fence_signal(priv, signal.loff, signal.lval,
+					signal.roff, signal.rval, signal.flags);
+fence_signal_err:
+		scif_err_debug(err, "scif_fence_signal");
+		return err;
+	}
 	}
 	return -EINVAL;
 }
@@ -298,6 +464,8 @@ const struct file_operations scif_fops = {
 	.open = scif_fdopen,
 	.release = scif_fdclose,
 	.unlocked_ioctl = scif_fdioctl,
+	.mmap = scif_fdmmap,
+	.poll = scif_fdpoll,
 	.flush = scif_fdflush,
 	.owner = THIS_MODULE,
 };
diff --git a/drivers/misc/mic/scif/scif_fence.c b/drivers/misc/mic/scif/scif_fence.c
new file mode 100644
index 000000000000..7f2c96f57066
--- /dev/null
+++ b/drivers/misc/mic/scif/scif_fence.c
@@ -0,0 +1,771 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Intel SCIF driver.
+ *
+ */
+
+#include "scif_main.h"
+
+/**
+ * scif_recv_mark: Handle SCIF_MARK request
+ * @msg:	Interrupt message
+ *
+ * The peer has requested a mark.
+ */
+void scif_recv_mark(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0];
+	int mark, err;
+
+	err = _scif_fence_mark(ep, &mark);
+	if (err)
+		msg->uop = SCIF_MARK_NACK;
+	else
+		msg->uop = SCIF_MARK_ACK;
+	msg->payload[0] = ep->remote_ep;
+	msg->payload[2] = mark;
+	scif_nodeqp_send(ep->remote_dev, msg);
+}
+
+/**
+ * scif_recv_mark_resp: Handle SCIF_MARK_(N)ACK messages.
+ * @msg:	Interrupt message
+ *
+ * The peer has responded to a SCIF_MARK message.
+ */
+void scif_recv_mark_resp(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0];
+	struct scif_fence_info *fence_req =
+		(struct scif_fence_info *)msg->payload[1];
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	if (msg->uop == SCIF_MARK_ACK) {
+		fence_req->state = OP_COMPLETED;
+		fence_req->dma_mark = (int)msg->payload[2];
+	} else {
+		fence_req->state = OP_FAILED;
+	}
+	mutex_unlock(&ep->rma_info.rma_lock);
+	complete(&fence_req->comp);
+}
+
+/**
+ * scif_recv_wait: Handle SCIF_WAIT request
+ * @msg:	Interrupt message
+ *
+ * The peer has requested waiting on a fence.
+ */
+void scif_recv_wait(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0];
+	struct scif_remote_fence_info *fence;
+
+	/*
+	 * Allocate structure for remote fence information and
+	 * send a NACK if the allocation failed. The peer will
+	 * return ENOMEM upon receiving a NACK.
+	 */
+	fence = kmalloc(sizeof(*fence), GFP_KERNEL);
+	if (!fence) {
+		msg->payload[0] = ep->remote_ep;
+		msg->uop = SCIF_WAIT_NACK;
+		scif_nodeqp_send(ep->remote_dev, msg);
+		return;
+	}
+
+	/* Prepare the fence request */
+	memcpy(&fence->msg, msg, sizeof(struct scifmsg));
+	INIT_LIST_HEAD(&fence->list);
+
+	/* Insert to the global remote fence request list */
+	mutex_lock(&scif_info.fencelock);
+	atomic_inc(&ep->rma_info.fence_refcount);
+	list_add_tail(&fence->list, &scif_info.fence);
+	mutex_unlock(&scif_info.fencelock);
+
+	schedule_work(&scif_info.misc_work);
+}
+
+/**
+ * scif_recv_wait_resp: Handle SCIF_WAIT_(N)ACK messages.
+ * @msg:	Interrupt message
+ *
+ * The peer has responded to a SCIF_WAIT message.
+ */
+void scif_recv_wait_resp(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0];
+	struct scif_fence_info *fence_req =
+		(struct scif_fence_info *)msg->payload[1];
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	if (msg->uop == SCIF_WAIT_ACK)
+		fence_req->state = OP_COMPLETED;
+	else
+		fence_req->state = OP_FAILED;
+	mutex_unlock(&ep->rma_info.rma_lock);
+	complete(&fence_req->comp);
+}
+
+/**
+ * scif_recv_sig_local: Handle SCIF_SIG_LOCAL request
+ * @msg:	Interrupt message
+ *
+ * The peer has requested a signal on a local offset.
+ */
+void scif_recv_sig_local(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0];
+	int err;
+
+	err = scif_prog_signal(ep, msg->payload[1], msg->payload[2],
+			       SCIF_WINDOW_SELF);
+	if (err)
+		msg->uop = SCIF_SIG_NACK;
+	else
+		msg->uop = SCIF_SIG_ACK;
+	msg->payload[0] = ep->remote_ep;
+	scif_nodeqp_send(ep->remote_dev, msg);
+}
+
+/**
+ * scif_recv_sig_remote: Handle SCIF_SIGNAL_REMOTE request
+ * @msg:	Interrupt message
+ *
+ * The peer has requested a signal on a remote offset.
+ */
+void scif_recv_sig_remote(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0];
+	int err;
+
+	err = scif_prog_signal(ep, msg->payload[1], msg->payload[2],
+			       SCIF_WINDOW_PEER);
+	if (err)
+		msg->uop = SCIF_SIG_NACK;
+	else
+		msg->uop = SCIF_SIG_ACK;
+	msg->payload[0] = ep->remote_ep;
+	scif_nodeqp_send(ep->remote_dev, msg);
+}
+
+/**
+ * scif_recv_sig_resp: Handle SCIF_SIG_(N)ACK messages.
+ * @msg:	Interrupt message
+ *
+ * The peer has responded to a signal request.
+ */
+void scif_recv_sig_resp(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0];
+	struct scif_fence_info *fence_req =
+		(struct scif_fence_info *)msg->payload[3];
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	if (msg->uop == SCIF_SIG_ACK)
+		fence_req->state = OP_COMPLETED;
+	else
+		fence_req->state = OP_FAILED;
+	mutex_unlock(&ep->rma_info.rma_lock);
+	complete(&fence_req->comp);
+}
+
+static inline void *scif_get_local_va(off_t off, struct scif_window *window)
+{
+	struct page **pages = window->pinned_pages->pages;
+	int page_nr = (off - window->offset) >> PAGE_SHIFT;
+	off_t page_off = off & ~PAGE_MASK;
+
+	return page_address(pages[page_nr]) + page_off;
+}
+
+static void scif_prog_signal_cb(void *arg)
+{
+	struct scif_status *status = arg;
+
+	dma_pool_free(status->ep->remote_dev->signal_pool, status,
+		      status->src_dma_addr);
+}
+
+static int _scif_prog_signal(scif_epd_t epd, dma_addr_t dst, u64 val)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	struct dma_chan *chan = ep->rma_info.dma_chan;
+	struct dma_device *ddev = chan->device;
+	bool x100 = !is_dma_copy_aligned(chan->device, 1, 1, 1);
+	struct dma_async_tx_descriptor *tx;
+	struct scif_status *status = NULL;
+	dma_addr_t src;
+	dma_cookie_t cookie;
+	int err;
+
+	tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_FENCE);
+	if (!tx) {
+		err = -ENOMEM;
+		dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto alloc_fail;
+	}
+	cookie = tx->tx_submit(tx);
+	if (dma_submit_error(cookie)) {
+		err = (int)cookie;
+		dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto alloc_fail;
+	}
+	dma_async_issue_pending(chan);
+	if (x100) {
+		/*
+		 * For X100 use the status descriptor to write the value to
+		 * the destination.
+		 */
+		tx = ddev->device_prep_dma_imm_data(chan, dst, val, 0);
+	} else {
+		status = dma_pool_alloc(ep->remote_dev->signal_pool, GFP_KERNEL,
+					&src);
+		if (!status) {
+			err = -ENOMEM;
+			dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+				__func__, __LINE__, err);
+			goto alloc_fail;
+		}
+		status->val = val;
+		status->src_dma_addr = src;
+		status->ep = ep;
+		src += offsetof(struct scif_status, val);
+		tx = ddev->device_prep_dma_memcpy(chan, dst, src, sizeof(val),
+						  DMA_PREP_INTERRUPT);
+	}
+	if (!tx) {
+		err = -ENOMEM;
+		dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto dma_fail;
+	}
+	if (!x100) {
+		tx->callback = scif_prog_signal_cb;
+		tx->callback_param = status;
+	}
+	cookie = tx->tx_submit(tx);
+	if (dma_submit_error(cookie)) {
+		err = -EIO;
+		dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		goto dma_fail;
+	}
+	dma_async_issue_pending(chan);
+	return 0;
+dma_fail:
+	if (!x100)
+		dma_pool_free(ep->remote_dev->signal_pool, status,
+			      status->src_dma_addr);
+alloc_fail:
+	return err;
+}
+
+/*
+ * scif_prog_signal:
+ * @epd - Endpoint Descriptor
+ * @offset - registered address to write @val to
+ * @val - Value to be written at @offset
+ * @type - Type of the window.
+ *
+ * Arrange to write a value to the registered offset after ensuring that the
+ * offset provided is indeed valid.
+ */
+int scif_prog_signal(scif_epd_t epd, off_t offset, u64 val,
+		     enum scif_window_type type)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	struct scif_window *window = NULL;
+	struct scif_rma_req req;
+	dma_addr_t dst_dma_addr;
+	int err;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	req.out_window = &window;
+	req.offset = offset;
+	req.nr_bytes = sizeof(u64);
+	req.prot = SCIF_PROT_WRITE;
+	req.type = SCIF_WINDOW_SINGLE;
+	if (type == SCIF_WINDOW_SELF)
+		req.head = &ep->rma_info.reg_list;
+	else
+		req.head = &ep->rma_info.remote_reg_list;
+	/* Does a valid window exist? */
+	err = scif_query_window(&req);
+	if (err) {
+		dev_err(scif_info.mdev.this_device,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto unlock_ret;
+	}
+
+	if (scif_is_mgmt_node() && scifdev_self(ep->remote_dev)) {
+		u64 *dst_virt;
+
+		if (type == SCIF_WINDOW_SELF)
+			dst_virt = scif_get_local_va(offset, window);
+		else
+			dst_virt =
+			scif_get_local_va(offset, (struct scif_window *)
+					  window->peer_window);
+		*dst_virt = val;
+	} else {
+		dst_dma_addr = __scif_off_to_dma_addr(window, offset);
+		err = _scif_prog_signal(epd, dst_dma_addr, val);
+	}
+unlock_ret:
+	mutex_unlock(&ep->rma_info.rma_lock);
+	return err;
+}
+
+static int _scif_fence_wait(scif_epd_t epd, int mark)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	dma_cookie_t cookie = mark & ~SCIF_REMOTE_FENCE;
+	int err;
+
+	/* Wait for DMA callback in scif_fence_mark_cb(..) */
+	err = wait_event_interruptible_timeout(ep->rma_info.markwq,
+					       dma_async_is_tx_complete(
+					       ep->rma_info.dma_chan,
+					       cookie, NULL, NULL) ==
+					       DMA_COMPLETE,
+					       SCIF_NODE_ALIVE_TIMEOUT);
+	if (!err)
+		err = -ETIMEDOUT;
+	else if (err > 0)
+		err = 0;
+	return err;
+}
+
+/**
+ * scif_rma_handle_remote_fences:
+ *
+ * This routine services remote fence requests.
+ */
+void scif_rma_handle_remote_fences(void)
+{
+	struct list_head *item, *tmp;
+	struct scif_remote_fence_info *fence;
+	struct scif_endpt *ep;
+	int mark, err;
+
+	might_sleep();
+	mutex_lock(&scif_info.fencelock);
+	list_for_each_safe(item, tmp, &scif_info.fence) {
+		fence = list_entry(item, struct scif_remote_fence_info,
+				   list);
+		/* Remove fence from global list */
+		list_del(&fence->list);
+
+		/* Initiate the fence operation */
+		ep = (struct scif_endpt *)fence->msg.payload[0];
+		mark = fence->msg.payload[2];
+		err = _scif_fence_wait(ep, mark);
+		if (err)
+			fence->msg.uop = SCIF_WAIT_NACK;
+		else
+			fence->msg.uop = SCIF_WAIT_ACK;
+		fence->msg.payload[0] = ep->remote_ep;
+		scif_nodeqp_send(ep->remote_dev, &fence->msg);
+		kfree(fence);
+		if (!atomic_sub_return(1, &ep->rma_info.fence_refcount))
+			schedule_work(&scif_info.misc_work);
+	}
+	mutex_unlock(&scif_info.fencelock);
+}
+
+static int _scif_send_fence(scif_epd_t epd, int uop, int mark, int *out_mark)
+{
+	int err;
+	struct scifmsg msg;
+	struct scif_fence_info *fence_req;
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+
+	fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL);
+	if (!fence_req) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	fence_req->state = OP_IN_PROGRESS;
+	init_completion(&fence_req->comp);
+
+	msg.src = ep->port;
+	msg.uop = uop;
+	msg.payload[0] = ep->remote_ep;
+	msg.payload[1] = (u64)fence_req;
+	if (uop == SCIF_WAIT)
+		msg.payload[2] = mark;
+	spin_lock(&ep->lock);
+	if (ep->state == SCIFEP_CONNECTED)
+		err = scif_nodeqp_send(ep->remote_dev, &msg);
+	else
+		err = -ENOTCONN;
+	spin_unlock(&ep->lock);
+	if (err)
+		goto error_free;
+retry:
+	/* Wait for a SCIF_WAIT_(N)ACK message */
+	err = wait_for_completion_timeout(&fence_req->comp,
+					  SCIF_NODE_ALIVE_TIMEOUT);
+	if (!err && scifdev_alive(ep))
+		goto retry;
+	if (!err)
+		err = -ENODEV;
+	if (err > 0)
+		err = 0;
+	mutex_lock(&ep->rma_info.rma_lock);
+	if (err < 0) {
+		if (fence_req->state == OP_IN_PROGRESS)
+			fence_req->state = OP_FAILED;
+	}
+	if (fence_req->state == OP_FAILED && !err)
+		err = -ENOMEM;
+	if (uop == SCIF_MARK && fence_req->state == OP_COMPLETED)
+		*out_mark = SCIF_REMOTE_FENCE | fence_req->dma_mark;
+	mutex_unlock(&ep->rma_info.rma_lock);
+error_free:
+	kfree(fence_req);
+error:
+	return err;
+}
+
+/**
+ * scif_send_fence_mark:
+ * @epd: end point descriptor.
+ * @out_mark: Output DMA mark reported by peer.
+ *
+ * Send a remote fence mark request.
+ */
+static int scif_send_fence_mark(scif_epd_t epd, int *out_mark)
+{
+	return _scif_send_fence(epd, SCIF_MARK, 0, out_mark);
+}
+
+/**
+ * scif_send_fence_wait:
+ * @epd: end point descriptor.
+ * @mark: DMA mark to wait for.
+ *
+ * Send a remote fence wait request.
+ */
+static int scif_send_fence_wait(scif_epd_t epd, int mark)
+{
+	return _scif_send_fence(epd, SCIF_WAIT, mark, NULL);
+}
+
+static int _scif_send_fence_signal_wait(struct scif_endpt *ep,
+					struct scif_fence_info *fence_req)
+{
+	int err;
+
+retry:
+	/* Wait for a SCIF_SIG_(N)ACK message */
+	err = wait_for_completion_timeout(&fence_req->comp,
+					  SCIF_NODE_ALIVE_TIMEOUT);
+	if (!err && scifdev_alive(ep))
+		goto retry;
+	if (!err)
+		err = -ENODEV;
+	if (err > 0)
+		err = 0;
+	if (err < 0) {
+		mutex_lock(&ep->rma_info.rma_lock);
+		if (fence_req->state == OP_IN_PROGRESS)
+			fence_req->state = OP_FAILED;
+		mutex_unlock(&ep->rma_info.rma_lock);
+	}
+	if (fence_req->state == OP_FAILED && !err)
+		err = -ENXIO;
+	return err;
+}
+
+/**
+ * scif_send_fence_signal:
+ * @epd - endpoint descriptor
+ * @loff - local offset
+ * @lval - local value to write to loffset
+ * @roff - remote offset
+ * @rval - remote value to write to roffset
+ * @flags - flags
+ *
+ * Sends a remote fence signal request
+ */
+static int scif_send_fence_signal(scif_epd_t epd, off_t roff, u64 rval,
+				  off_t loff, u64 lval, int flags)
+{
+	int err = 0;
+	struct scifmsg msg;
+	struct scif_fence_info *fence_req;
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+
+	fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL);
+	if (!fence_req) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	fence_req->state = OP_IN_PROGRESS;
+	init_completion(&fence_req->comp);
+	msg.src = ep->port;
+	if (flags & SCIF_SIGNAL_LOCAL) {
+		msg.uop = SCIF_SIG_LOCAL;
+		msg.payload[0] = ep->remote_ep;
+		msg.payload[1] = roff;
+		msg.payload[2] = rval;
+		msg.payload[3] = (u64)fence_req;
+		spin_lock(&ep->lock);
+		if (ep->state == SCIFEP_CONNECTED)
+			err = scif_nodeqp_send(ep->remote_dev, &msg);
+		else
+			err = -ENOTCONN;
+		spin_unlock(&ep->lock);
+		if (err)
+			goto error_free;
+		err = _scif_send_fence_signal_wait(ep, fence_req);
+		if (err)
+			goto error_free;
+	}
+	fence_req->state = OP_IN_PROGRESS;
+
+	if (flags & SCIF_SIGNAL_REMOTE) {
+		msg.uop = SCIF_SIG_REMOTE;
+		msg.payload[0] = ep->remote_ep;
+		msg.payload[1] = loff;
+		msg.payload[2] = lval;
+		msg.payload[3] = (u64)fence_req;
+		spin_lock(&ep->lock);
+		if (ep->state == SCIFEP_CONNECTED)
+			err = scif_nodeqp_send(ep->remote_dev, &msg);
+		else
+			err = -ENOTCONN;
+		spin_unlock(&ep->lock);
+		if (err)
+			goto error_free;
+		err = _scif_send_fence_signal_wait(ep, fence_req);
+	}
+error_free:
+	kfree(fence_req);
+error:
+	return err;
+}
+
+static void scif_fence_mark_cb(void *arg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)arg;
+
+	wake_up_interruptible(&ep->rma_info.markwq);
+	atomic_dec(&ep->rma_info.fence_refcount);
+}
+
+/*
+ * _scif_fence_mark:
+ *
+ * @epd - endpoint descriptor
+ * Set up a mark for this endpoint and return the value of the mark.
+ */
+int _scif_fence_mark(scif_epd_t epd, int *mark)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	struct dma_chan *chan = ep->rma_info.dma_chan;
+	struct dma_device *ddev = chan->device;
+	struct dma_async_tx_descriptor *tx;
+	dma_cookie_t cookie;
+	int err;
+
+	tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_FENCE);
+	if (!tx) {
+		err = -ENOMEM;
+		dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		return err;
+	}
+	cookie = tx->tx_submit(tx);
+	if (dma_submit_error(cookie)) {
+		err = (int)cookie;
+		dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		return err;
+	}
+	dma_async_issue_pending(chan);
+	tx = ddev->device_prep_dma_interrupt(chan, DMA_PREP_INTERRUPT);
+	if (!tx) {
+		err = -ENOMEM;
+		dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		return err;
+	}
+	tx->callback = scif_fence_mark_cb;
+	tx->callback_param = ep;
+	*mark = cookie = tx->tx_submit(tx);
+	if (dma_submit_error(cookie)) {
+		err = (int)cookie;
+		dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n",
+			__func__, __LINE__, err);
+		return err;
+	}
+	atomic_inc(&ep->rma_info.fence_refcount);
+	dma_async_issue_pending(chan);
+	return 0;
+}
+
+#define SCIF_LOOPB_MAGIC_MARK 0xdead
+
+int scif_fence_mark(scif_epd_t epd, int flags, int *mark)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	int err = 0;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI fence_mark: ep %p flags 0x%x mark 0x%x\n",
+		ep, flags, *mark);
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+
+	/* Invalid flags? */
+	if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))
+		return -EINVAL;
+
+	/* At least one of init self or peer RMA should be set */
+	if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
+		return -EINVAL;
+
+	/* Exactly one of init self or peer RMA should be set but not both */
+	if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
+		return -EINVAL;
+
+	/*
+	 * Management node loopback does not need to use DMA.
+	 * Return a valid mark to be symmetric.
+	 */
+	if (scifdev_self(ep->remote_dev) && scif_is_mgmt_node()) {
+		*mark = SCIF_LOOPB_MAGIC_MARK;
+		return 0;
+	}
+
+	if (flags & SCIF_FENCE_INIT_SELF)
+		err = _scif_fence_mark(epd, mark);
+	else
+		err = scif_send_fence_mark(ep, mark);
+
+	if (err)
+		dev_err(scif_info.mdev.this_device,
+			"%s %d err %d\n", __func__, __LINE__, err);
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI fence_mark: ep %p flags 0x%x mark 0x%x err %d\n",
+		ep, flags, *mark, err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_fence_mark);
+
+int scif_fence_wait(scif_epd_t epd, int mark)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	int err = 0;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI fence_wait: ep %p mark 0x%x\n",
+		ep, mark);
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+	/*
+	 * Management node loopback does not need to use DMA.
+	 * The only valid mark provided is 0 so simply
+	 * return success if the mark is valid.
+	 */
+	if (scifdev_self(ep->remote_dev) && scif_is_mgmt_node()) {
+		if (mark == SCIF_LOOPB_MAGIC_MARK)
+			return 0;
+		else
+			return -EINVAL;
+	}
+	if (mark & SCIF_REMOTE_FENCE)
+		err = scif_send_fence_wait(epd, mark);
+	else
+		err = _scif_fence_wait(epd, mark);
+	if (err < 0)
+		dev_err(scif_info.mdev.this_device,
+			"%s %d err %d\n", __func__, __LINE__, err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_fence_wait);
+
+int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval,
+		      off_t roff, u64 rval, int flags)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	int err = 0;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI fence_signal: ep %p loff 0x%lx lval 0x%llx roff 0x%lx rval 0x%llx flags 0x%x\n",
+		ep, loff, lval, roff, rval, flags);
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+
+	/* Invalid flags? */
+	if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER |
+			SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))
+		return -EINVAL;
+
+	/* At least one of init self or peer RMA should be set */
+	if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)))
+		return -EINVAL;
+
+	/* Exactly one of init self or peer RMA should be set but not both */
+	if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER))
+		return -EINVAL;
+
+	/* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */
+	if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)))
+		return -EINVAL;
+
+	/* Only Dword offsets allowed */
+	if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(u32) - 1)))
+		return -EINVAL;
+
+	/* Only Dword aligned offsets allowed */
+	if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(u32) - 1)))
+		return -EINVAL;
+
+	if (flags & SCIF_FENCE_INIT_PEER) {
+		err = scif_send_fence_signal(epd, roff, rval, loff,
+					     lval, flags);
+	} else {
+		/* Local Signal in Local RAS */
+		if (flags & SCIF_SIGNAL_LOCAL) {
+			err = scif_prog_signal(epd, loff, lval,
+					       SCIF_WINDOW_SELF);
+			if (err)
+				goto error_ret;
+		}
+
+		/* Signal in Remote RAS */
+		if (flags & SCIF_SIGNAL_REMOTE)
+			err = scif_prog_signal(epd, roff,
+					       rval, SCIF_WINDOW_PEER);
+	}
+error_ret:
+	if (err)
+		dev_err(scif_info.mdev.this_device,
+			"%s %d err %d\n", __func__, __LINE__, err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_fence_signal);
diff --git a/drivers/misc/mic/scif/scif_main.c b/drivers/misc/mic/scif/scif_main.c
index 6ce851f5c7e6..36d847af1209 100644
--- a/drivers/misc/mic/scif/scif_main.c
+++ b/drivers/misc/mic/scif/scif_main.c
@@ -34,6 +34,7 @@ struct scif_info scif_info = {
 };
 
 struct scif_dev *scif_dev;
+struct kmem_cache *unaligned_cache;
 static atomic_t g_loopb_cnt;
 
 /* Runs in the context of intr_wq */
@@ -80,35 +81,6 @@ irqreturn_t scif_intr_handler(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static int scif_peer_probe(struct scif_peer_dev *spdev)
-{
-	struct scif_dev *scifdev = &scif_dev[spdev->dnode];
-
-	mutex_lock(&scif_info.conflock);
-	scif_info.total++;
-	scif_info.maxid = max_t(u32, spdev->dnode, scif_info.maxid);
-	mutex_unlock(&scif_info.conflock);
-	rcu_assign_pointer(scifdev->spdev, spdev);
-
-	/* In the future SCIF kernel client devices will be added here */
-	return 0;
-}
-
-static void scif_peer_remove(struct scif_peer_dev *spdev)
-{
-	struct scif_dev *scifdev = &scif_dev[spdev->dnode];
-
-	/* In the future SCIF kernel client devices will be removed here */
-	spdev = rcu_dereference(scifdev->spdev);
-	if (spdev)
-		RCU_INIT_POINTER(scifdev->spdev, NULL);
-	synchronize_rcu();
-
-	mutex_lock(&scif_info.conflock);
-	scif_info.total--;
-	mutex_unlock(&scif_info.conflock);
-}
-
 static void scif_qp_setup_handler(struct work_struct *work)
 {
 	struct scif_dev *scifdev = container_of(work, struct scif_dev,
@@ -139,20 +111,13 @@ static void scif_qp_setup_handler(struct work_struct *work)
 	}
 }
 
-static int scif_setup_scifdev(struct scif_hw_dev *sdev)
+static int scif_setup_scifdev(void)
 {
+	/* We support a maximum of 129 SCIF nodes including the mgmt node */
+#define MAX_SCIF_NODES 129
 	int i;
-	u8 num_nodes;
-
-	if (sdev->snode) {
-		struct mic_bootparam __iomem *bp = sdev->rdp;
-
-		num_nodes = ioread8(&bp->tot_nodes);
-	} else {
-		struct mic_bootparam *bp = sdev->dp;
+	u8 num_nodes = MAX_SCIF_NODES;
 
-		num_nodes = bp->tot_nodes;
-	}
 	scif_dev = kcalloc(num_nodes, sizeof(*scif_dev), GFP_KERNEL);
 	if (!scif_dev)
 		return -ENOMEM;
@@ -163,7 +128,7 @@ static int scif_setup_scifdev(struct scif_hw_dev *sdev)
 		scifdev->exit = OP_IDLE;
 		init_waitqueue_head(&scifdev->disconn_wq);
 		mutex_init(&scifdev->lock);
-		INIT_WORK(&scifdev->init_msg_work, scif_qp_response_ack);
+		INIT_WORK(&scifdev->peer_add_work, scif_add_peer_device);
 		INIT_DELAYED_WORK(&scifdev->p2p_dwork,
 				  scif_poll_qp_state);
 		INIT_DELAYED_WORK(&scifdev->qp_dwork,
@@ -181,27 +146,21 @@ static void scif_destroy_scifdev(void)
 
 static int scif_probe(struct scif_hw_dev *sdev)
 {
-	struct scif_dev *scifdev;
+	struct scif_dev *scifdev = &scif_dev[sdev->dnode];
 	int rc;
 
 	dev_set_drvdata(&sdev->dev, sdev);
+	scifdev->sdev = sdev;
+
 	if (1 == atomic_add_return(1, &g_loopb_cnt)) {
-		struct scif_dev *loopb_dev;
+		struct scif_dev *loopb_dev = &scif_dev[sdev->snode];
 
-		rc = scif_setup_scifdev(sdev);
-		if (rc)
-			goto exit;
-		scifdev = &scif_dev[sdev->dnode];
-		scifdev->sdev = sdev;
-		loopb_dev = &scif_dev[sdev->snode];
 		loopb_dev->sdev = sdev;
 		rc = scif_setup_loopback_qp(loopb_dev);
 		if (rc)
-			goto free_sdev;
-	} else {
-		scifdev = &scif_dev[sdev->dnode];
-		scifdev->sdev = sdev;
+			goto exit;
 	}
+
 	rc = scif_setup_intr_wq(scifdev);
 	if (rc)
 		goto destroy_loopb;
@@ -237,8 +196,6 @@ destroy_intr:
 destroy_loopb:
 	if (atomic_dec_and_test(&g_loopb_cnt))
 		scif_destroy_loopback_qp(&scif_dev[sdev->snode]);
-free_sdev:
-	scif_destroy_scifdev();
 exit:
 	return rc;
 }
@@ -290,13 +247,6 @@ static void scif_remove(struct scif_hw_dev *sdev)
 	scifdev->sdev = NULL;
 }
 
-static struct scif_peer_driver scif_peer_driver = {
-	.driver.name =	KBUILD_MODNAME,
-	.driver.owner =	THIS_MODULE,
-	.probe = scif_peer_probe,
-	.remove = scif_peer_remove,
-};
-
 static struct scif_hw_dev_id id_table[] = {
 	{ MIC_SCIF_DEV, SCIF_DEV_ANY_ID },
 	{ 0 },
@@ -312,29 +262,54 @@ static struct scif_driver scif_driver = {
 
 static int _scif_init(void)
 {
-	spin_lock_init(&scif_info.eplock);
+	int rc;
+
+	mutex_init(&scif_info.eplock);
+	spin_lock_init(&scif_info.rmalock);
 	spin_lock_init(&scif_info.nb_connect_lock);
 	spin_lock_init(&scif_info.port_lock);
 	mutex_init(&scif_info.conflock);
 	mutex_init(&scif_info.connlock);
+	mutex_init(&scif_info.fencelock);
 	INIT_LIST_HEAD(&scif_info.uaccept);
 	INIT_LIST_HEAD(&scif_info.listen);
 	INIT_LIST_HEAD(&scif_info.zombie);
 	INIT_LIST_HEAD(&scif_info.connected);
 	INIT_LIST_HEAD(&scif_info.disconnected);
+	INIT_LIST_HEAD(&scif_info.rma);
+	INIT_LIST_HEAD(&scif_info.rma_tc);
+	INIT_LIST_HEAD(&scif_info.mmu_notif_cleanup);
+	INIT_LIST_HEAD(&scif_info.fence);
 	INIT_LIST_HEAD(&scif_info.nb_connect_list);
 	init_waitqueue_head(&scif_info.exitwq);
+	scif_info.rma_tc_limit = SCIF_RMA_TEMP_CACHE_LIMIT;
 	scif_info.en_msg_log = 0;
 	scif_info.p2p_enable = 1;
+	rc = scif_setup_scifdev();
+	if (rc)
+		goto error;
+	unaligned_cache = kmem_cache_create("Unaligned_DMA",
+					    SCIF_KMEM_UNALIGNED_BUF_SIZE,
+					    0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!unaligned_cache) {
+		rc = -ENOMEM;
+		goto free_sdev;
+	}
 	INIT_WORK(&scif_info.misc_work, scif_misc_handler);
+	INIT_WORK(&scif_info.mmu_notif_work, scif_mmu_notif_handler);
 	INIT_WORK(&scif_info.conn_work, scif_conn_handler);
 	idr_init(&scif_ports);
 	return 0;
+free_sdev:
+	scif_destroy_scifdev();
+error:
+	return rc;
 }
 
 static void _scif_exit(void)
 {
 	idr_destroy(&scif_ports);
+	kmem_cache_destroy(unaligned_cache);
 	scif_destroy_scifdev();
 }
 
@@ -344,15 +319,13 @@ static int __init scif_init(void)
 	int rc;
 
 	_scif_init();
+	iova_cache_get();
 	rc = scif_peer_bus_init();
 	if (rc)
 		goto exit;
-	rc = scif_peer_register_driver(&scif_peer_driver);
-	if (rc)
-		goto peer_bus_exit;
 	rc = scif_register_driver(&scif_driver);
 	if (rc)
-		goto unreg_scif_peer;
+		goto peer_bus_exit;
 	rc = misc_register(mdev);
 	if (rc)
 		goto unreg_scif;
@@ -360,8 +333,6 @@ static int __init scif_init(void)
 	return 0;
 unreg_scif:
 	scif_unregister_driver(&scif_driver);
-unreg_scif_peer:
-	scif_peer_unregister_driver(&scif_peer_driver);
 peer_bus_exit:
 	scif_peer_bus_exit();
 exit:
@@ -374,8 +345,8 @@ static void __exit scif_exit(void)
 	scif_exit_debugfs();
 	misc_deregister(&scif_info.mdev);
 	scif_unregister_driver(&scif_driver);
-	scif_peer_unregister_driver(&scif_peer_driver);
 	scif_peer_bus_exit();
+	iova_cache_put();
 	_scif_exit();
 }
 
diff --git a/drivers/misc/mic/scif/scif_main.h b/drivers/misc/mic/scif/scif_main.h
index 580bc63e1b23..a08f0b600a9e 100644
--- a/drivers/misc/mic/scif/scif_main.h
+++ b/drivers/misc/mic/scif/scif_main.h
@@ -22,15 +22,18 @@
 #include <linux/pci.h>
 #include <linux/miscdevice.h>
 #include <linux/dmaengine.h>
+#include <linux/iova.h>
+#include <linux/anon_inodes.h>
 #include <linux/file.h>
+#include <linux/vmalloc.h>
 #include <linux/scif.h>
-
 #include "../common/mic_dev.h"
 
 #define SCIF_MGMT_NODE 0
 #define SCIF_DEFAULT_WATCHDOG_TO 30
 #define SCIF_NODE_ACCEPT_TIMEOUT (3 * HZ)
 #define SCIF_NODE_ALIVE_TIMEOUT (SCIF_DEFAULT_WATCHDOG_TO * HZ)
+#define SCIF_RMA_TEMP_CACHE_LIMIT 0x20000
 
 /*
  * Generic state used for certain node QP message exchanges
@@ -73,13 +76,21 @@ enum scif_msg_state {
  * @loopb_work: Used for submitting work to loopb_wq
  * @loopb_recv_q: List of messages received on the loopb_wq
  * @card_initiated_exit: set when the card has initiated the exit
+ * @rmalock: Synchronize access to RMA operations
+ * @fencelock: Synchronize access to list of remote fences requested.
+ * @rma: List of temporary registered windows to be destroyed.
+ * @rma_tc: List of temporary registered & cached Windows to be destroyed
+ * @fence: List of remote fence requests
+ * @mmu_notif_work: Work for registration caching MMU notifier workqueue
+ * @mmu_notif_cleanup: List of temporary cached windows for reg cache
+ * @rma_tc_limit: RMA temporary cache limit
  */
 struct scif_info {
 	u8 nodeid;
 	u8 maxid;
 	u8 total;
 	u32 nr_zombies;
-	spinlock_t eplock;
+	struct mutex eplock;
 	struct mutex connlock;
 	spinlock_t nb_connect_lock;
 	spinlock_t port_lock;
@@ -102,6 +113,14 @@ struct scif_info {
 	struct work_struct loopb_work;
 	struct list_head loopb_recv_q;
 	bool card_initiated_exit;
+	spinlock_t rmalock;
+	struct mutex fencelock;
+	struct list_head rma;
+	struct list_head rma_tc;
+	struct list_head fence;
+	struct work_struct mmu_notif_work;
+	struct list_head mmu_notif_cleanup;
+	unsigned long rma_tc_limit;
 };
 
 /*
@@ -139,7 +158,7 @@ struct scif_p2p_info {
  * @db: doorbell the peer will trigger to generate an interrupt on self
  * @rdb: Doorbell to trigger on the peer to generate an interrupt on the peer
  * @cookie: Cookie received while registering the interrupt handler
- * init_msg_work: work scheduled for SCIF_INIT message processing
+ * @peer_add_work: Work for handling device_add for peer devices
  * @p2p_dwork: Delayed work to enable polling for P2P state
  * @qp_dwork: Delayed work for enabling polling for remote QP information
  * @p2p_retry: Number of times to retry polling of P2P state
@@ -152,6 +171,8 @@ struct scif_p2p_info {
  * @disconn_rescnt: Keeps track of number of node remove requests sent
  * @exit: Status of exit message
  * @qp_dma_addr: Queue pair DMA address passed to the peer
+ * @dma_ch_idx: Round robin index for DMA channels
+ * @signal_pool: DMA pool used for scheduling scif_fence_signal DMA's
 */
 struct scif_dev {
 	u8 node;
@@ -165,7 +186,7 @@ struct scif_dev {
 	int db;
 	int rdb;
 	struct mic_irq *cookie;
-	struct work_struct init_msg_work;
+	struct work_struct peer_add_work;
 	struct delayed_work p2p_dwork;
 	struct delayed_work qp_dwork;
 	int p2p_retry;
@@ -178,17 +199,25 @@ struct scif_dev {
 	atomic_t disconn_rescnt;
 	enum scif_msg_state exit;
 	dma_addr_t qp_dma_addr;
+	int dma_ch_idx;
+	struct dma_pool *signal_pool;
 };
 
+extern bool scif_reg_cache_enable;
+extern bool scif_ulimit_check;
 extern struct scif_info scif_info;
 extern struct idr scif_ports;
+extern struct bus_type scif_peer_bus;
 extern struct scif_dev *scif_dev;
 extern const struct file_operations scif_fops;
+extern const struct file_operations scif_anon_fops;
 
 /* Size of the RB for the Node QP */
 #define SCIF_NODE_QP_SIZE 0x10000
 
 #include "scif_nodeqp.h"
+#include "scif_rma.h"
+#include "scif_rma_list.h"
 
 /*
  * scifdev_self:
diff --git a/drivers/misc/mic/scif/scif_map.h b/drivers/misc/mic/scif/scif_map.h
index 20e50b4e19b2..3e86360ba5a6 100644
--- a/drivers/misc/mic/scif/scif_map.h
+++ b/drivers/misc/mic/scif/scif_map.h
@@ -80,7 +80,7 @@ scif_unmap_single(dma_addr_t local, struct scif_dev *scifdev,
 		  size_t size)
 {
 	if (!scifdev_self(scifdev)) {
-		if (scifdev_is_p2p(scifdev) && local > scifdev->base_addr)
+		if (scifdev_is_p2p(scifdev))
 			local = local - scifdev->base_addr;
 		dma_unmap_single(&scifdev->sdev->dev, local,
 				 size, DMA_BIDIRECTIONAL);
@@ -110,4 +110,27 @@ scif_iounmap(void *virt, size_t len, struct scif_dev *scifdev)
 		sdev->hw_ops->iounmap(sdev, (void __force __iomem *)virt);
 	}
 }
+
+static __always_inline int
+scif_map_page(dma_addr_t *dma_handle, struct page *page,
+	      struct scif_dev *scifdev)
+{
+	int err = 0;
+
+	if (scifdev_self(scifdev)) {
+		*dma_handle = page_to_phys(page);
+	} else {
+		struct scif_hw_dev *sdev = scifdev->sdev;
+		*dma_handle = dma_map_page(&sdev->dev,
+					   page, 0x0, PAGE_SIZE,
+					   DMA_BIDIRECTIONAL);
+		if (dma_mapping_error(&sdev->dev, *dma_handle))
+			err = -ENOMEM;
+		else if (scifdev_is_p2p(scifdev))
+			*dma_handle = *dma_handle + scifdev->base_addr;
+	}
+	if (err)
+		*dma_handle = 0;
+	return err;
+}
 #endif  /* SCIF_MAP_H */
diff --git a/drivers/misc/mic/scif/scif_mmap.c b/drivers/misc/mic/scif/scif_mmap.c
new file mode 100644
index 000000000000..49cb8f7b4672
--- /dev/null
+++ b/drivers/misc/mic/scif/scif_mmap.c
@@ -0,0 +1,699 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Intel SCIF driver.
+ *
+ */
+#include "scif_main.h"
+
+/*
+ * struct scif_vma_info - Information about a remote memory mapping
+ *			  created via scif_mmap(..)
+ * @vma: VM area struct
+ * @list: link to list of active vmas
+ */
+struct scif_vma_info {
+	struct vm_area_struct *vma;
+	struct list_head list;
+};
+
+void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_rma_req req;
+	struct scif_window *window = NULL;
+	struct scif_window *recv_window =
+		(struct scif_window *)msg->payload[0];
+	struct scif_endpt *ep;
+
+	ep = (struct scif_endpt *)recv_window->ep;
+	req.out_window = &window;
+	req.offset = recv_window->offset;
+	req.prot = recv_window->prot;
+	req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT;
+	req.type = SCIF_WINDOW_FULL;
+	req.head = &ep->rma_info.reg_list;
+	msg->payload[0] = ep->remote_ep;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	/* Does a valid window exist? */
+	if (scif_query_window(&req)) {
+		dev_err(&scifdev->sdev->dev,
+			"%s %d -ENXIO\n", __func__, __LINE__);
+		msg->uop = SCIF_UNREGISTER_ACK;
+		goto error;
+	}
+
+	scif_put_window(window, window->nr_pages);
+
+	if (!window->ref_count) {
+		atomic_inc(&ep->rma_info.tw_refcount);
+		ep->rma_info.async_list_del = 1;
+		list_del_init(&window->list);
+		scif_free_window_offset(ep, window, window->offset);
+	}
+error:
+	mutex_unlock(&ep->rma_info.rma_lock);
+	if (window && !window->ref_count)
+		scif_queue_for_cleanup(window, &scif_info.rma);
+}
+
+/*
+ * Remove valid remote memory mappings created via scif_mmap(..) from the
+ * process address space since the remote node is lost
+ */
+static void __scif_zap_mmaps(struct scif_endpt *ep)
+{
+	struct list_head *item;
+	struct scif_vma_info *info;
+	struct vm_area_struct *vma;
+	unsigned long size;
+
+	spin_lock(&ep->lock);
+	list_for_each(item, &ep->rma_info.vma_list) {
+		info = list_entry(item, struct scif_vma_info, list);
+		vma = info->vma;
+		size = vma->vm_end - vma->vm_start;
+		zap_vma_ptes(vma, vma->vm_start, size);
+		dev_dbg(scif_info.mdev.this_device,
+			"%s ep %p zap vma %p size 0x%lx\n",
+			__func__, ep, info->vma, size);
+	}
+	spin_unlock(&ep->lock);
+}
+
+/*
+ * Traverse the list of endpoints for a particular remote node and
+ * zap valid remote memory mappings since the remote node is lost
+ */
+static void _scif_zap_mmaps(int node, struct list_head *head)
+{
+	struct scif_endpt *ep;
+	struct list_head *item;
+
+	mutex_lock(&scif_info.connlock);
+	list_for_each(item, head) {
+		ep = list_entry(item, struct scif_endpt, list);
+		if (ep->remote_dev->node == node)
+			__scif_zap_mmaps(ep);
+	}
+	mutex_unlock(&scif_info.connlock);
+}
+
+/*
+ * Wrapper for removing remote memory mappings for a particular node. This API
+ * is called by peer nodes as part of handling a lost node.
+ */
+void scif_zap_mmaps(int node)
+{
+	_scif_zap_mmaps(node, &scif_info.connected);
+	_scif_zap_mmaps(node, &scif_info.disconnected);
+}
+
+/*
+ * This API is only called while handling a lost node:
+ * a) Remote node is dead.
+ * b) Remote memory mappings have been zapped
+ * So we can traverse the remote_reg_list without any locks. Since
+ * the window has not yet been unregistered we can drop the ref count
+ * and queue it to the cleanup thread.
+ */
+static void __scif_cleanup_rma_for_zombies(struct scif_endpt *ep)
+{
+	struct list_head *pos, *tmp;
+	struct scif_window *window;
+
+	list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) {
+		window = list_entry(pos, struct scif_window, list);
+		if (window->ref_count)
+			scif_put_window(window, window->nr_pages);
+		else
+			dev_err(scif_info.mdev.this_device,
+				"%s %d unexpected\n",
+				__func__, __LINE__);
+		if (!window->ref_count) {
+			atomic_inc(&ep->rma_info.tw_refcount);
+			list_del_init(&window->list);
+			scif_queue_for_cleanup(window, &scif_info.rma);
+		}
+	}
+}
+
+/* Cleanup remote registration lists for zombie endpoints */
+void scif_cleanup_rma_for_zombies(int node)
+{
+	struct scif_endpt *ep;
+	struct list_head *item;
+
+	mutex_lock(&scif_info.eplock);
+	list_for_each(item, &scif_info.zombie) {
+		ep = list_entry(item, struct scif_endpt, list);
+		if (ep->remote_dev && ep->remote_dev->node == node)
+			__scif_cleanup_rma_for_zombies(ep);
+	}
+	mutex_unlock(&scif_info.eplock);
+	flush_work(&scif_info.misc_work);
+}
+
+/* Insert the VMA into the per endpoint VMA list */
+static int scif_insert_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
+{
+	struct scif_vma_info *info;
+	int err = 0;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		err = -ENOMEM;
+		goto done;
+	}
+	info->vma = vma;
+	spin_lock(&ep->lock);
+	list_add_tail(&info->list, &ep->rma_info.vma_list);
+	spin_unlock(&ep->lock);
+done:
+	return err;
+}
+
+/* Delete the VMA from the per endpoint VMA list */
+static void scif_delete_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
+{
+	struct list_head *item;
+	struct scif_vma_info *info;
+
+	spin_lock(&ep->lock);
+	list_for_each(item, &ep->rma_info.vma_list) {
+		info = list_entry(item, struct scif_vma_info, list);
+		if (info->vma == vma) {
+			list_del(&info->list);
+			kfree(info);
+			break;
+		}
+	}
+	spin_unlock(&ep->lock);
+}
+
+static phys_addr_t scif_get_phys(phys_addr_t phys, struct scif_endpt *ep)
+{
+	struct scif_dev *scifdev = (struct scif_dev *)ep->remote_dev;
+	struct scif_hw_dev *sdev = scifdev->sdev;
+	phys_addr_t out_phys, apt_base = 0;
+
+	/*
+	 * If the DMA address is card relative then we need to add the
+	 * aperture base for mmap to work correctly
+	 */
+	if (!scifdev_self(scifdev) && sdev->aper && sdev->card_rel_da)
+		apt_base = sdev->aper->pa;
+	out_phys = apt_base + phys;
+	return out_phys;
+}
+
+int scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
+		   struct scif_range **pages)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	struct scif_rma_req req;
+	struct scif_window *window = NULL;
+	int nr_pages, err, i;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI get_pinned_pages: ep %p offset 0x%lx len 0x%lx\n",
+		ep, offset, len);
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+
+	if (!len || (offset < 0) ||
+	    (offset + len < offset) ||
+	    (ALIGN(offset, PAGE_SIZE) != offset) ||
+	    (ALIGN(len, PAGE_SIZE) != len))
+		return -EINVAL;
+
+	nr_pages = len >> PAGE_SHIFT;
+
+	req.out_window = &window;
+	req.offset = offset;
+	req.prot = 0;
+	req.nr_bytes = len;
+	req.type = SCIF_WINDOW_SINGLE;
+	req.head = &ep->rma_info.remote_reg_list;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	/* Does a valid window exist? */
+	err = scif_query_window(&req);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto error;
+	}
+
+	/* Allocate scif_range */
+	*pages = kzalloc(sizeof(**pages), GFP_KERNEL);
+	if (!*pages) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	/* Allocate phys addr array */
+	(*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t));
+	if (!((*pages)->phys_addr)) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) {
+		/* Allocate virtual address array */
+		((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)));
+		if (!(*pages)->va) {
+			err = -ENOMEM;
+			goto error;
+		}
+	}
+	/* Populate the values */
+	(*pages)->cookie = window;
+	(*pages)->nr_pages = nr_pages;
+	(*pages)->prot_flags = window->prot;
+
+	for (i = 0; i < nr_pages; i++) {
+		(*pages)->phys_addr[i] =
+			__scif_off_to_dma_addr(window, offset +
+					       (i * PAGE_SIZE));
+		(*pages)->phys_addr[i] = scif_get_phys((*pages)->phys_addr[i],
+							ep);
+		if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev))
+			(*pages)->va[i] =
+				ep->remote_dev->sdev->aper->va +
+				(*pages)->phys_addr[i] -
+				ep->remote_dev->sdev->aper->pa;
+	}
+
+	scif_get_window(window, nr_pages);
+error:
+	mutex_unlock(&ep->rma_info.rma_lock);
+	if (err) {
+		if (*pages) {
+			scif_free((*pages)->phys_addr,
+				  nr_pages * sizeof(dma_addr_t));
+			scif_free((*pages)->va,
+				  nr_pages * sizeof(void *));
+			kfree(*pages);
+			*pages = NULL;
+		}
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_get_pages);
+
+int scif_put_pages(struct scif_range *pages)
+{
+	struct scif_endpt *ep;
+	struct scif_window *window;
+	struct scifmsg msg;
+
+	if (!pages || !pages->cookie)
+		return -EINVAL;
+
+	window = pages->cookie;
+
+	if (!window || window->magic != SCIFEP_MAGIC)
+		return -EINVAL;
+
+	ep = (struct scif_endpt *)window->ep;
+	/*
+	 * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
+	 * callee should be allowed to release references to the pages,
+	 * else the endpoint was not connected in the first place,
+	 * hence the ENOTCONN.
+	 */
+	if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
+		return -ENOTCONN;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+
+	scif_put_window(window, pages->nr_pages);
+
+	/* Initiate window destruction if ref count is zero */
+	if (!window->ref_count) {
+		list_del(&window->list);
+		mutex_unlock(&ep->rma_info.rma_lock);
+		scif_drain_dma_intr(ep->remote_dev->sdev,
+				    ep->rma_info.dma_chan);
+		/* Inform the peer about this window being destroyed. */
+		msg.uop = SCIF_MUNMAP;
+		msg.src = ep->port;
+		msg.payload[0] = window->peer_window;
+		/* No error handling for notification messages */
+		scif_nodeqp_send(ep->remote_dev, &msg);
+		/* Destroy this window from the peer's registered AS */
+		scif_destroy_remote_window(window);
+	} else {
+		mutex_unlock(&ep->rma_info.rma_lock);
+	}
+
+	scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
+	scif_free(pages->va, pages->nr_pages * sizeof(void *));
+	kfree(pages);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(scif_put_pages);
+
+/*
+ * scif_rma_list_mmap:
+ *
+ * Traverse the remote registration list starting from start_window:
+ * 1) Create VtoP mappings via remap_pfn_range(..)
+ * 2) Once step 1) and 2) complete successfully then traverse the range of
+ *    windows again and bump the reference count.
+ * RMA lock must be held.
+ */
+static int scif_rma_list_mmap(struct scif_window *start_window, s64 offset,
+			      int nr_pages, struct vm_area_struct *vma)
+{
+	s64 end_offset, loop_offset = offset;
+	struct scif_window *window = start_window;
+	int loop_nr_pages, nr_pages_left = nr_pages;
+	struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
+	struct list_head *head = &ep->rma_info.remote_reg_list;
+	int i, err = 0;
+	dma_addr_t phys_addr;
+	struct scif_window_iter src_win_iter;
+	size_t contig_bytes = 0;
+
+	might_sleep();
+	list_for_each_entry_from(window, head, list) {
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		loop_nr_pages = min_t(int,
+				      (end_offset - loop_offset) >> PAGE_SHIFT,
+				      nr_pages_left);
+		scif_init_window_iter(window, &src_win_iter);
+		for (i = 0; i < loop_nr_pages; i++) {
+			phys_addr = scif_off_to_dma_addr(window, loop_offset,
+							 &contig_bytes,
+							 &src_win_iter);
+			phys_addr = scif_get_phys(phys_addr, ep);
+			err = remap_pfn_range(vma,
+					      vma->vm_start +
+					      loop_offset - offset,
+					      phys_addr >> PAGE_SHIFT,
+					      PAGE_SIZE,
+					      vma->vm_page_prot);
+			if (err)
+				goto error;
+			loop_offset += PAGE_SIZE;
+		}
+		nr_pages_left -= loop_nr_pages;
+		if (!nr_pages_left)
+			break;
+	}
+	/*
+	 * No more failures expected. Bump up the ref count for all
+	 * the windows. Another traversal from start_window required
+	 * for handling errors encountered across windows during
+	 * remap_pfn_range(..).
+	 */
+	loop_offset = offset;
+	nr_pages_left = nr_pages;
+	window = start_window;
+	head = &ep->rma_info.remote_reg_list;
+	list_for_each_entry_from(window, head, list) {
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		loop_nr_pages = min_t(int,
+				      (end_offset - loop_offset) >> PAGE_SHIFT,
+				      nr_pages_left);
+		scif_get_window(window, loop_nr_pages);
+		nr_pages_left -= loop_nr_pages;
+		loop_offset += (loop_nr_pages << PAGE_SHIFT);
+		if (!nr_pages_left)
+			break;
+	}
+error:
+	if (err)
+		dev_err(scif_info.mdev.this_device,
+			"%s %d err %d\n", __func__, __LINE__, err);
+	return err;
+}
+
+/*
+ * scif_rma_list_munmap:
+ *
+ * Traverse the remote registration list starting from window:
+ * 1) Decrement ref count.
+ * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer.
+ * RMA lock must be held.
+ */
+static void scif_rma_list_munmap(struct scif_window *start_window,
+				 s64 offset, int nr_pages)
+{
+	struct scifmsg msg;
+	s64 loop_offset = offset, end_offset;
+	int loop_nr_pages, nr_pages_left = nr_pages;
+	struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
+	struct list_head *head = &ep->rma_info.remote_reg_list;
+	struct scif_window *window = start_window, *_window;
+
+	msg.uop = SCIF_MUNMAP;
+	msg.src = ep->port;
+	loop_offset = offset;
+	nr_pages_left = nr_pages;
+	list_for_each_entry_safe_from(window, _window, head, list) {
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		loop_nr_pages = min_t(int,
+				      (end_offset - loop_offset) >> PAGE_SHIFT,
+				      nr_pages_left);
+		scif_put_window(window, loop_nr_pages);
+		if (!window->ref_count) {
+			struct scif_dev *rdev = ep->remote_dev;
+
+			scif_drain_dma_intr(rdev->sdev,
+					    ep->rma_info.dma_chan);
+			/* Inform the peer about this munmap */
+			msg.payload[0] = window->peer_window;
+			/* No error handling for Notification messages. */
+			scif_nodeqp_send(ep->remote_dev, &msg);
+			list_del(&window->list);
+			/* Destroy this window from the peer's registered AS */
+			scif_destroy_remote_window(window);
+		}
+		nr_pages_left -= loop_nr_pages;
+		loop_offset += (loop_nr_pages << PAGE_SHIFT);
+		if (!nr_pages_left)
+			break;
+	}
+}
+
+/*
+ * The private data field of each VMA used to mmap a remote window
+ * points to an instance of struct vma_pvt
+ */
+struct vma_pvt {
+	struct scif_endpt *ep;	/* End point for remote window */
+	s64 offset;		/* offset within remote window */
+	bool valid_offset;	/* offset is valid only if the original
+				 * mmap request was for a single page
+				 * else the offset within the vma is
+				 * the correct offset
+				 */
+	struct kref ref;
+};
+
+static void vma_pvt_release(struct kref *ref)
+{
+	struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref);
+
+	kfree(vmapvt);
+}
+
+/**
+ * scif_vma_open - VMA open driver callback
+ * @vma: VMM memory area.
+ * The open method is called by the kernel to allow the subsystem implementing
+ * the VMA to initialize the area. This method is invoked any time a new
+ * reference to the VMA is made (when a process forks, for example).
+ * The one exception happens when the VMA is first created by mmap;
+ * in this case, the driver's mmap method is called instead.
+ * This function is also invoked when an existing VMA is split by the kernel
+ * due to a call to munmap on a subset of the VMA resulting in two VMAs.
+ * The kernel invokes this function only on one of the two VMAs.
+ */
+static void scif_vma_open(struct vm_area_struct *vma)
+{
+	struct vma_pvt *vmapvt = vma->vm_private_data;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
+		vma->vm_start, vma->vm_end);
+	scif_insert_vma(vmapvt->ep, vma);
+	kref_get(&vmapvt->ref);
+}
+
+/**
+ * scif_munmap - VMA close driver callback.
+ * @vma: VMM memory area.
+ * When an area is destroyed, the kernel calls its close operation.
+ * Note that there's no usage count associated with VMA's; the area
+ * is opened and closed exactly once by each process that uses it.
+ */
+static void scif_munmap(struct vm_area_struct *vma)
+{
+	struct scif_endpt *ep;
+	struct vma_pvt *vmapvt = vma->vm_private_data;
+	int nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	s64 offset;
+	struct scif_rma_req req;
+	struct scif_window *window = NULL;
+	int err;
+
+	might_sleep();
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
+		vma->vm_start, vma->vm_end);
+	ep = vmapvt->ep;
+	offset = vmapvt->valid_offset ? vmapvt->offset :
+		(vma->vm_pgoff) << PAGE_SHIFT;
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI munmap: ep %p nr_pages 0x%x offset 0x%llx\n",
+		ep, nr_pages, offset);
+	req.out_window = &window;
+	req.offset = offset;
+	req.nr_bytes = vma->vm_end - vma->vm_start;
+	req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
+	req.type = SCIF_WINDOW_PARTIAL;
+	req.head = &ep->rma_info.remote_reg_list;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+
+	err = scif_query_window(&req);
+	if (err)
+		dev_err(scif_info.mdev.this_device,
+			"%s %d err %d\n", __func__, __LINE__, err);
+	else
+		scif_rma_list_munmap(window, offset, nr_pages);
+
+	mutex_unlock(&ep->rma_info.rma_lock);
+	/*
+	 * The kernel probably zeroes these out but we still want
+	 * to clean up our own mess just in case.
+	 */
+	vma->vm_ops = NULL;
+	vma->vm_private_data = NULL;
+	kref_put(&vmapvt->ref, vma_pvt_release);
+	scif_delete_vma(ep, vma);
+}
+
+static const struct vm_operations_struct scif_vm_ops = {
+	.open = scif_vma_open,
+	.close = scif_munmap,
+};
+
+/**
+ * scif_mmap - Map pages in virtual address space to a remote window.
+ * @vma: VMM memory area.
+ * @epd: endpoint descriptor
+ *
+ * Return: Upon successful completion, scif_mmap() returns zero
+ * else an apt error is returned as documented in scif.h
+ */
+int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
+{
+	struct scif_rma_req req;
+	struct scif_window *window = NULL;
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	s64 start_offset = vma->vm_pgoff << PAGE_SHIFT;
+	int nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	int err;
+	struct vma_pvt *vmapvt;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI mmap: ep %p start_offset 0x%llx nr_pages 0x%x\n",
+		ep, start_offset, nr_pages);
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+
+	might_sleep();
+
+	err = scif_insert_vma(ep, vma);
+	if (err)
+		return err;
+
+	vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL);
+	if (!vmapvt) {
+		scif_delete_vma(ep, vma);
+		return -ENOMEM;
+	}
+
+	vmapvt->ep = ep;
+	kref_init(&vmapvt->ref);
+
+	req.out_window = &window;
+	req.offset = start_offset;
+	req.nr_bytes = vma->vm_end - vma->vm_start;
+	req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
+	req.type = SCIF_WINDOW_PARTIAL;
+	req.head = &ep->rma_info.remote_reg_list;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	/* Does a valid window exist? */
+	err = scif_query_window(&req);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto error_unlock;
+	}
+
+	/* Default prot for loopback */
+	if (!scifdev_self(ep->remote_dev))
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+	/*
+	 * VM_DONTCOPY - Do not copy this vma on fork
+	 * VM_DONTEXPAND - Cannot expand with mremap()
+	 * VM_RESERVED - Count as reserved_vm like IO
+	 * VM_PFNMAP - Page-ranges managed without "struct page"
+	 * VM_IO - Memory mapped I/O or similar
+	 *
+	 * We do not want to copy this VMA automatically on a fork(),
+	 * expand this VMA due to mremap() or swap out these pages since
+	 * the VMA is actually backed by physical pages in the remote
+	 * node's physical memory and not via a struct page.
+	 */
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+
+	if (!scifdev_self(ep->remote_dev))
+		vma->vm_flags |= VM_IO | VM_PFNMAP;
+
+	/* Map this range of windows */
+	err = scif_rma_list_mmap(window, start_offset, nr_pages, vma);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto error_unlock;
+	}
+	/* Set up the driver call back */
+	vma->vm_ops = &scif_vm_ops;
+	vma->vm_private_data = vmapvt;
+error_unlock:
+	mutex_unlock(&ep->rma_info.rma_lock);
+	if (err) {
+		kfree(vmapvt);
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		scif_delete_vma(ep, vma);
+	}
+	return err;
+}
diff --git a/drivers/misc/mic/scif/scif_nm.c b/drivers/misc/mic/scif/scif_nm.c
index 9b4c5382d6a7..79f26a02a1cb 100644
--- a/drivers/misc/mic/scif/scif_nm.c
+++ b/drivers/misc/mic/scif/scif_nm.c
@@ -34,6 +34,7 @@ static void scif_invalidate_ep(int node)
 	list_for_each_safe(pos, tmpq, &scif_info.disconnected) {
 		ep = list_entry(pos, struct scif_endpt, list);
 		if (ep->remote_dev->node == node) {
+			scif_unmap_all_windows(ep);
 			spin_lock(&ep->lock);
 			scif_cleanup_ep_qp(ep);
 			spin_unlock(&ep->lock);
@@ -50,6 +51,7 @@ static void scif_invalidate_ep(int node)
 			wake_up_interruptible(&ep->sendwq);
 			wake_up_interruptible(&ep->recvwq);
 			spin_unlock(&ep->lock);
+			scif_unmap_all_windows(ep);
 		}
 	}
 	mutex_unlock(&scif_info.connlock);
@@ -61,8 +63,8 @@ void scif_free_qp(struct scif_dev *scifdev)
 
 	if (!qp)
 		return;
-	scif_free_coherent((void *)qp->inbound_q.rb_base,
-			   qp->local_buf, scifdev, qp->inbound_q.size);
+	scif_unmap_single(qp->local_buf, scifdev, qp->inbound_q.size);
+	kfree(qp->inbound_q.rb_base);
 	scif_unmap_single(qp->local_qp, scifdev, sizeof(struct scif_qp));
 	kfree(scifdev->qpairs);
 	scifdev->qpairs = NULL;
@@ -125,8 +127,12 @@ void scif_cleanup_scifdev(struct scif_dev *dev)
 		}
 		scif_destroy_intr_wq(dev);
 	}
+	flush_work(&scif_info.misc_work);
 	scif_destroy_p2p(dev);
 	scif_invalidate_ep(dev->node);
+	scif_zap_mmaps(dev->node);
+	scif_cleanup_rma_for_zombies(dev->node);
+	flush_work(&scif_info.misc_work);
 	scif_send_acks(dev);
 	if (!dev->node && scif_info.card_initiated_exit) {
 		/*
@@ -147,14 +153,8 @@ void scif_cleanup_scifdev(struct scif_dev *dev)
 void scif_handle_remove_node(int node)
 {
 	struct scif_dev *scifdev = &scif_dev[node];
-	struct scif_peer_dev *spdev;
-
-	rcu_read_lock();
-	spdev = rcu_dereference(scifdev->spdev);
-	rcu_read_unlock();
-	if (spdev)
-		scif_peer_unregister_device(spdev);
-	else
+
+	if (scif_peer_unregister_device(scifdev))
 		scif_send_acks(scifdev);
 }
 
diff --git a/drivers/misc/mic/scif/scif_nodeqp.c b/drivers/misc/mic/scif/scif_nodeqp.c
index 6dfdae3452d6..7180d566c74b 100644
--- a/drivers/misc/mic/scif/scif_nodeqp.c
+++ b/drivers/misc/mic/scif/scif_nodeqp.c
@@ -105,18 +105,22 @@
 int scif_setup_qp_connect(struct scif_qp *qp, dma_addr_t *qp_offset,
 			  int local_size, struct scif_dev *scifdev)
 {
-	void *local_q = NULL;
+	void *local_q = qp->inbound_q.rb_base;
 	int err = 0;
 	u32 tmp_rd = 0;
 
 	spin_lock_init(&qp->send_lock);
 	spin_lock_init(&qp->recv_lock);
 
-	local_q = kzalloc(local_size, GFP_KERNEL);
+	/* Allocate rb only if not already allocated */
 	if (!local_q) {
-		err = -ENOMEM;
-		return err;
+		local_q = kzalloc(local_size, GFP_KERNEL);
+		if (!local_q) {
+			err = -ENOMEM;
+			return err;
+		}
 	}
+
 	err = scif_map_single(&qp->local_buf, local_q, scifdev, local_size);
 	if (err)
 		goto kfree;
@@ -260,6 +264,11 @@ int scif_setup_qp_connect_response(struct scif_dev *scifdev,
 		     r_buf,
 		     get_count_order(remote_size));
 	/*
+	 * Because the node QP may already be processing an INIT message, set
+	 * the read pointer so the cached read offset isn't lost
+	 */
+	qp->remote_qp->local_read = qp->inbound_q.current_read_offset;
+	/*
 	 * resetup the inbound_q now that we know where the
 	 * inbound_read really is.
 	 */
@@ -529,27 +538,6 @@ static void scif_p2p_setup(void)
 	}
 }
 
-void scif_qp_response_ack(struct work_struct *work)
-{
-	struct scif_dev *scifdev = container_of(work, struct scif_dev,
-						init_msg_work);
-	struct scif_peer_dev *spdev;
-
-	/* Drop the INIT message if it has already been received */
-	if (_scifdev_alive(scifdev))
-		return;
-
-	spdev = scif_peer_register_device(scifdev);
-	if (IS_ERR(spdev))
-		return;
-
-	if (scif_is_mgmt_node()) {
-		mutex_lock(&scif_info.conflock);
-		scif_p2p_setup();
-		mutex_unlock(&scif_info.conflock);
-	}
-}
-
 static char *message_types[] = {"BAD",
 				"INIT",
 				"EXIT",
@@ -568,7 +556,29 @@ static char *message_types[] = {"BAD",
 				"DISCNT_ACK",
 				"CLIENT_SENT",
 				"CLIENT_RCVD",
-				"SCIF_GET_NODE_INFO"};
+				"SCIF_GET_NODE_INFO",
+				"REGISTER",
+				"REGISTER_ACK",
+				"REGISTER_NACK",
+				"UNREGISTER",
+				"UNREGISTER_ACK",
+				"UNREGISTER_NACK",
+				"ALLOC_REQ",
+				"ALLOC_GNT",
+				"ALLOC_REJ",
+				"FREE_PHYS",
+				"FREE_VIRT",
+				"MUNMAP",
+				"MARK",
+				"MARK_ACK",
+				"MARK_NACK",
+				"WAIT",
+				"WAIT_ACK",
+				"WAIT_NACK",
+				"SIGNAL_LOCAL",
+				"SIGNAL_REMOTE",
+				"SIG_ACK",
+				"SIG_NACK"};
 
 static void
 scif_display_message(struct scif_dev *scifdev, struct scifmsg *msg,
@@ -662,10 +672,16 @@ int scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg)
  *
  * Work queue handler for servicing miscellaneous SCIF tasks.
  * Examples include:
- * 1) Cleanup of zombie endpoints.
+ * 1) Remote fence requests.
+ * 2) Destruction of temporary registered windows
+ *    created during scif_vreadfrom()/scif_vwriteto().
+ * 3) Cleanup of zombie endpoints.
  */
 void scif_misc_handler(struct work_struct *work)
 {
+	scif_rma_handle_remote_fences();
+	scif_rma_destroy_windows();
+	scif_rma_destroy_tcw_invalid();
 	scif_cleanup_zombie_epd();
 }
 
@@ -682,13 +698,14 @@ scif_init(struct scif_dev *scifdev, struct scifmsg *msg)
 	 * address to complete initializing the inbound_q.
 	 */
 	flush_delayed_work(&scifdev->qp_dwork);
-	/*
-	 * Delegate the peer device registration to a workqueue, otherwise if
-	 * SCIF client probe (called during peer device registration) calls
-	 * scif_connect(..), it will block the message processing thread causing
-	 * a deadlock.
-	 */
-	schedule_work(&scifdev->init_msg_work);
+
+	scif_peer_register_device(scifdev);
+
+	if (scif_is_mgmt_node()) {
+		mutex_lock(&scif_info.conflock);
+		scif_p2p_setup();
+		mutex_unlock(&scif_info.conflock);
+	}
 }
 
 /**
@@ -838,13 +855,13 @@ void scif_poll_qp_state(struct work_struct *work)
 				      msecs_to_jiffies(SCIF_NODE_QP_TIMEOUT));
 		return;
 	}
-	scif_peer_register_device(peerdev);
 	return;
 timeout:
 	dev_err(&peerdev->sdev->dev,
 		"%s %d remote node %d offline,  state = 0x%x\n",
 		__func__, __LINE__, peerdev->node, qp->qp_state);
 	qp->remote_qp->qp_state = SCIF_QP_OFFLINE;
+	scif_peer_unregister_device(peerdev);
 	scif_cleanup_scifdev(peerdev);
 }
 
@@ -894,6 +911,9 @@ scif_node_add_ack(struct scif_dev *scifdev, struct scifmsg *msg)
 		goto local_error;
 	peerdev->rdb = msg->payload[2];
 	qp->remote_qp->qp_state = SCIF_QP_ONLINE;
+
+	scif_peer_register_device(peerdev);
+
 	schedule_delayed_work(&peerdev->p2p_dwork, 0);
 	return;
 local_error:
@@ -1007,6 +1027,27 @@ static void (*scif_intr_func[SCIF_MAX_MSG + 1])
 	scif_clientsend,	/* SCIF_CLIENT_SENT */
 	scif_clientrcvd,	/* SCIF_CLIENT_RCVD */
 	scif_get_node_info_resp,/* SCIF_GET_NODE_INFO */
+	scif_recv_reg,		/* SCIF_REGISTER */
+	scif_recv_reg_ack,	/* SCIF_REGISTER_ACK */
+	scif_recv_reg_nack,	/* SCIF_REGISTER_NACK */
+	scif_recv_unreg,	/* SCIF_UNREGISTER */
+	scif_recv_unreg_ack,	/* SCIF_UNREGISTER_ACK */
+	scif_recv_unreg_nack,	/* SCIF_UNREGISTER_NACK */
+	scif_alloc_req,		/* SCIF_ALLOC_REQ */
+	scif_alloc_gnt_rej,	/* SCIF_ALLOC_GNT */
+	scif_alloc_gnt_rej,	/* SCIF_ALLOC_REJ */
+	scif_free_virt,		/* SCIF_FREE_VIRT */
+	scif_recv_munmap,	/* SCIF_MUNMAP */
+	scif_recv_mark,		/* SCIF_MARK */
+	scif_recv_mark_resp,	/* SCIF_MARK_ACK */
+	scif_recv_mark_resp,	/* SCIF_MARK_NACK */
+	scif_recv_wait,		/* SCIF_WAIT */
+	scif_recv_wait_resp,	/* SCIF_WAIT_ACK */
+	scif_recv_wait_resp,	/* SCIF_WAIT_NACK */
+	scif_recv_sig_local,	/* SCIF_SIG_LOCAL */
+	scif_recv_sig_remote,	/* SCIF_SIG_REMOTE */
+	scif_recv_sig_resp,	/* SCIF_SIG_ACK */
+	scif_recv_sig_resp,	/* SCIF_SIG_NACK */
 };
 
 /**
@@ -1169,7 +1210,6 @@ int scif_setup_loopback_qp(struct scif_dev *scifdev)
 	int err = 0;
 	void *local_q;
 	struct scif_qp *qp;
-	struct scif_peer_dev *spdev;
 
 	err = scif_setup_intr_wq(scifdev);
 	if (err)
@@ -1216,15 +1256,11 @@ int scif_setup_loopback_qp(struct scif_dev *scifdev)
 		     &qp->local_write,
 		     local_q, get_count_order(SCIF_NODE_QP_SIZE));
 	scif_info.nodeid = scifdev->node;
-	spdev = scif_peer_register_device(scifdev);
-	if (IS_ERR(spdev)) {
-		err = PTR_ERR(spdev);
-		goto free_local_q;
-	}
+
+	scif_peer_register_device(scifdev);
+
 	scif_info.loopb_dev = scifdev;
 	return err;
-free_local_q:
-	kfree(local_q);
 free_qpairs:
 	kfree(scifdev->qpairs);
 destroy_loopb_wq:
@@ -1243,13 +1279,7 @@ exit:
  */
 int scif_destroy_loopback_qp(struct scif_dev *scifdev)
 {
-	struct scif_peer_dev *spdev;
-
-	rcu_read_lock();
-	spdev = rcu_dereference(scifdev->spdev);
-	rcu_read_unlock();
-	if (spdev)
-		scif_peer_unregister_device(spdev);
+	scif_peer_unregister_device(scifdev);
 	destroy_workqueue(scif_info.loopb_wq);
 	scif_destroy_intr_wq(scifdev);
 	kfree(scifdev->qpairs->outbound_q.rb_base);
diff --git a/drivers/misc/mic/scif/scif_nodeqp.h b/drivers/misc/mic/scif/scif_nodeqp.h
index 6c0ed6783479..95896273138e 100644
--- a/drivers/misc/mic/scif/scif_nodeqp.h
+++ b/drivers/misc/mic/scif/scif_nodeqp.h
@@ -74,7 +74,28 @@
 #define SCIF_CLIENT_SENT 16 /* Notify the peer that data has been written */
 #define SCIF_CLIENT_RCVD 17 /* Notify the peer that data has been read */
 #define SCIF_GET_NODE_INFO 18 /* Get current node mask from the mgmt node*/
-#define SCIF_MAX_MSG SCIF_GET_NODE_INFO
+#define SCIF_REGISTER 19 /* Tell peer about a new registered window */
+#define SCIF_REGISTER_ACK 20 /* Notify peer about unregistration success */
+#define SCIF_REGISTER_NACK 21 /* Notify peer about registration success */
+#define SCIF_UNREGISTER 22 /* Tell peer about unregistering a window */
+#define SCIF_UNREGISTER_ACK 23 /* Notify peer about registration failure */
+#define SCIF_UNREGISTER_NACK 24 /* Notify peer about unregistration failure */
+#define SCIF_ALLOC_REQ 25 /* Request a mapped buffer */
+#define SCIF_ALLOC_GNT 26 /* Notify peer about allocation success */
+#define SCIF_ALLOC_REJ 27 /* Notify peer about allocation failure */
+#define SCIF_FREE_VIRT 28 /* Free previously allocated virtual memory */
+#define SCIF_MUNMAP 29 /* Acknowledgment for a SCIF_MMAP request */
+#define SCIF_MARK 30 /* SCIF Remote Fence Mark Request */
+#define SCIF_MARK_ACK 31 /* SCIF Remote Fence Mark Success */
+#define SCIF_MARK_NACK 32 /* SCIF Remote Fence Mark Failure */
+#define SCIF_WAIT 33 /* SCIF Remote Fence Wait Request */
+#define SCIF_WAIT_ACK 34 /* SCIF Remote Fence Wait Success */
+#define SCIF_WAIT_NACK 35 /* SCIF Remote Fence Wait Failure */
+#define SCIF_SIG_LOCAL 36 /* SCIF Remote Fence Local Signal Request */
+#define SCIF_SIG_REMOTE 37 /* SCIF Remote Fence Remote Signal Request */
+#define SCIF_SIG_ACK 38 /* SCIF Remote Fence Remote Signal Success */
+#define SCIF_SIG_NACK 39 /* SCIF Remote Fence Remote Signal Failure */
+#define SCIF_MAX_MSG SCIF_SIG_NACK
 
 /*
  * struct scifmsg - Node QP message format
@@ -92,6 +113,24 @@ struct scifmsg {
 } __packed;
 
 /*
+ * struct scif_allocmsg - Used with SCIF_ALLOC_REQ to request
+ * the remote note to allocate memory
+ *
+ * phys_addr: Physical address of the buffer
+ * vaddr: Virtual address of the buffer
+ * size: Size of the buffer
+ * state: Current state
+ * allocwq: wait queue for status
+ */
+struct scif_allocmsg {
+	dma_addr_t phys_addr;
+	unsigned long vaddr;
+	size_t size;
+	enum scif_msg_state state;
+	wait_queue_head_t allocwq;
+};
+
+/*
  * struct scif_qp - Node Queue Pair
  *
  * Interesting structure -- a little difficult because we can only
@@ -158,7 +197,6 @@ int scif_setup_qp_connect_response(struct scif_dev *scifdev,
 int scif_setup_loopback_qp(struct scif_dev *scifdev);
 int scif_destroy_loopback_qp(struct scif_dev *scifdev);
 void scif_poll_qp_state(struct work_struct *work);
-void scif_qp_response_ack(struct work_struct *work);
 void scif_destroy_p2p(struct scif_dev *scifdev);
 void scif_send_exit(struct scif_dev *scifdev);
 static inline struct device *scif_get_peer_dev(struct scif_dev *scifdev)
diff --git a/drivers/misc/mic/scif/scif_peer_bus.c b/drivers/misc/mic/scif/scif_peer_bus.c
index 589ae9ad2501..6ffa3bdbd45b 100644
--- a/drivers/misc/mic/scif/scif_peer_bus.c
+++ b/drivers/misc/mic/scif/scif_peer_bus.c
@@ -24,93 +24,152 @@ dev_to_scif_peer(struct device *dev)
 	return container_of(dev, struct scif_peer_dev, dev);
 }
 
-static inline struct scif_peer_driver *
-drv_to_scif_peer(struct device_driver *drv)
-{
-	return container_of(drv, struct scif_peer_driver, driver);
-}
+struct bus_type scif_peer_bus = {
+	.name  = "scif_peer_bus",
+};
 
-static int scif_peer_dev_match(struct device *dv, struct device_driver *dr)
+static void scif_peer_release_dev(struct device *d)
 {
-	return !strncmp(dev_name(dv), dr->name, 4);
+	struct scif_peer_dev *sdev = dev_to_scif_peer(d);
+	struct scif_dev *scifdev = &scif_dev[sdev->dnode];
+
+	scif_cleanup_scifdev(scifdev);
+	kfree(sdev);
 }
 
-static int scif_peer_dev_probe(struct device *d)
+static int scif_peer_initialize_device(struct scif_dev *scifdev)
 {
-	struct scif_peer_dev *dev = dev_to_scif_peer(d);
-	struct scif_peer_driver *drv = drv_to_scif_peer(dev->dev.driver);
+	struct scif_peer_dev *spdev;
+	int ret;
 
-	return drv->probe(dev);
-}
+	spdev = kzalloc(sizeof(*spdev), GFP_KERNEL);
+	if (!spdev) {
+		ret = -ENOMEM;
+		goto err;
+	}
 
-static int scif_peer_dev_remove(struct device *d)
-{
-	struct scif_peer_dev *dev = dev_to_scif_peer(d);
-	struct scif_peer_driver *drv = drv_to_scif_peer(dev->dev.driver);
+	spdev->dev.parent = scifdev->sdev->dev.parent;
+	spdev->dev.release = scif_peer_release_dev;
+	spdev->dnode = scifdev->node;
+	spdev->dev.bus = &scif_peer_bus;
+	dev_set_name(&spdev->dev, "scif_peer-dev%u", spdev->dnode);
+
+	device_initialize(&spdev->dev);
+	get_device(&spdev->dev);
+	rcu_assign_pointer(scifdev->spdev, spdev);
 
-	drv->remove(dev);
+	mutex_lock(&scif_info.conflock);
+	scif_info.total++;
+	scif_info.maxid = max_t(u32, spdev->dnode, scif_info.maxid);
+	mutex_unlock(&scif_info.conflock);
 	return 0;
+err:
+	dev_err(&scifdev->sdev->dev,
+		"dnode %d: initialize_device rc %d\n", scifdev->node, ret);
+	return ret;
 }
 
-static struct bus_type scif_peer_bus = {
-	.name  = "scif_peer_bus",
-	.match = scif_peer_dev_match,
-	.probe = scif_peer_dev_probe,
-	.remove = scif_peer_dev_remove,
-};
-
-int scif_peer_register_driver(struct scif_peer_driver *driver)
+static int scif_peer_add_device(struct scif_dev *scifdev)
 {
-	driver->driver.bus = &scif_peer_bus;
-	return driver_register(&driver->driver);
+	struct scif_peer_dev *spdev = rcu_dereference(scifdev->spdev);
+	char pool_name[16];
+	int ret;
+
+	ret = device_add(&spdev->dev);
+	put_device(&spdev->dev);
+	if (ret) {
+		dev_err(&scifdev->sdev->dev,
+			"dnode %d: peer device_add failed\n", scifdev->node);
+		goto put_spdev;
+	}
+
+	scnprintf(pool_name, sizeof(pool_name), "scif-%d", spdev->dnode);
+	scifdev->signal_pool = dmam_pool_create(pool_name, &scifdev->sdev->dev,
+						sizeof(struct scif_status), 1,
+						0);
+	if (!scifdev->signal_pool) {
+		dev_err(&scifdev->sdev->dev,
+			"dnode %d: dmam_pool_create failed\n", scifdev->node);
+		ret = -ENOMEM;
+		goto del_spdev;
+	}
+	dev_dbg(&spdev->dev, "Added peer dnode %d\n", spdev->dnode);
+	return 0;
+del_spdev:
+	device_del(&spdev->dev);
+put_spdev:
+	RCU_INIT_POINTER(scifdev->spdev, NULL);
+	synchronize_rcu();
+	put_device(&spdev->dev);
+
+	mutex_lock(&scif_info.conflock);
+	scif_info.total--;
+	mutex_unlock(&scif_info.conflock);
+	return ret;
 }
 
-void scif_peer_unregister_driver(struct scif_peer_driver *driver)
+void scif_add_peer_device(struct work_struct *work)
 {
-	driver_unregister(&driver->driver);
+	struct scif_dev *scifdev = container_of(work, struct scif_dev,
+						peer_add_work);
+
+	scif_peer_add_device(scifdev);
 }
 
-static void scif_peer_release_dev(struct device *d)
+/*
+ * Peer device registration is split into a device_initialize and a device_add.
+ * The reason for doing this is as follows: First, peer device registration
+ * itself cannot be done in the message processing thread and must be delegated
+ * to another workqueue, otherwise if SCIF client probe, called during peer
+ * device registration, calls scif_connect(..), it will block the message
+ * processing thread causing a deadlock. Next, device_initialize is done in the
+ * "top-half" message processing thread and device_add in the "bottom-half"
+ * workqueue. If this is not done, SCIF_CNCT_REQ message processing executing
+ * concurrently with SCIF_INIT message processing is unable to get a reference
+ * on the peer device, thereby failing the connect request.
+ */
+void scif_peer_register_device(struct scif_dev *scifdev)
 {
-	struct scif_peer_dev *sdev = dev_to_scif_peer(d);
-	struct scif_dev *scifdev = &scif_dev[sdev->dnode];
+	int ret;
 
-	scif_cleanup_scifdev(scifdev);
-	kfree(sdev);
+	mutex_lock(&scifdev->lock);
+	ret = scif_peer_initialize_device(scifdev);
+	if (ret)
+		goto exit;
+	schedule_work(&scifdev->peer_add_work);
+exit:
+	mutex_unlock(&scifdev->lock);
 }
 
-struct scif_peer_dev *
-scif_peer_register_device(struct scif_dev *scifdev)
+int scif_peer_unregister_device(struct scif_dev *scifdev)
 {
-	int ret;
 	struct scif_peer_dev *spdev;
 
-	spdev = kzalloc(sizeof(*spdev), GFP_KERNEL);
-	if (!spdev)
-		return ERR_PTR(-ENOMEM);
-
-	spdev->dev.parent = scifdev->sdev->dev.parent;
-	spdev->dev.release = scif_peer_release_dev;
-	spdev->dnode = scifdev->node;
-	spdev->dev.bus = &scif_peer_bus;
+	mutex_lock(&scifdev->lock);
+	/* Flush work to ensure device register is complete */
+	flush_work(&scifdev->peer_add_work);
 
-	dev_set_name(&spdev->dev, "scif_peer-dev%u", spdev->dnode);
 	/*
-	 * device_register() causes the bus infrastructure to look for a
-	 * matching driver.
+	 * Continue holding scifdev->lock since theoretically unregister_device
+	 * can be called simultaneously from multiple threads
 	 */
-	ret = device_register(&spdev->dev);
-	if (ret)
-		goto free_spdev;
-	return spdev;
-free_spdev:
-	kfree(spdev);
-	return ERR_PTR(ret);
-}
-
-void scif_peer_unregister_device(struct scif_peer_dev *sdev)
-{
-	device_unregister(&sdev->dev);
+	spdev = rcu_dereference(scifdev->spdev);
+	if (!spdev) {
+		mutex_unlock(&scifdev->lock);
+		return -ENODEV;
+	}
+
+	RCU_INIT_POINTER(scifdev->spdev, NULL);
+	synchronize_rcu();
+	mutex_unlock(&scifdev->lock);
+
+	dev_dbg(&spdev->dev, "Removing peer dnode %d\n", spdev->dnode);
+	device_unregister(&spdev->dev);
+
+	mutex_lock(&scif_info.conflock);
+	scif_info.total--;
+	mutex_unlock(&scif_info.conflock);
+	return 0;
 }
 
 int scif_peer_bus_init(void)
diff --git a/drivers/misc/mic/scif/scif_peer_bus.h b/drivers/misc/mic/scif/scif_peer_bus.h
index 33f0dbb30152..a3b8dd2edaa5 100644
--- a/drivers/misc/mic/scif/scif_peer_bus.h
+++ b/drivers/misc/mic/scif/scif_peer_bus.h
@@ -19,47 +19,13 @@
 
 #include <linux/device.h>
 #include <linux/mic_common.h>
-
-/*
- * Peer devices show up as PCIe devices for the mgmt node but not the cards.
- * The mgmt node discovers all the cards on the PCIe bus and informs the other
- * cards about their peers. Upon notification of a peer a node adds a peer
- * device to the peer bus to maintain symmetry in the way devices are
- * discovered across all nodes in the SCIF network.
- */
-/**
- * scif_peer_dev - representation of a peer SCIF device
- * @dev: underlying device
- * @dnode - The destination node which this device will communicate with.
- */
-struct scif_peer_dev {
-	struct device dev;
-	u8 dnode;
-};
-
-/**
- * scif_peer_driver - operations for a scif_peer I/O driver
- * @driver: underlying device driver (populate name and owner).
- * @id_table: the ids serviced by this driver.
- * @probe: the function to call when a device is found.  Returns 0 or -errno.
- * @remove: the function to call when a device is removed.
- */
-struct scif_peer_driver {
-	struct device_driver driver;
-	const struct scif_peer_dev_id *id_table;
-
-	int (*probe)(struct scif_peer_dev *dev);
-	void (*remove)(struct scif_peer_dev *dev);
-};
+#include <linux/scif.h>
 
 struct scif_dev;
 
-int scif_peer_register_driver(struct scif_peer_driver *driver);
-void scif_peer_unregister_driver(struct scif_peer_driver *driver);
-
-struct scif_peer_dev *scif_peer_register_device(struct scif_dev *sdev);
-void scif_peer_unregister_device(struct scif_peer_dev *sdev);
-
+void scif_add_peer_device(struct work_struct *work);
+void scif_peer_register_device(struct scif_dev *sdev);
+int scif_peer_unregister_device(struct scif_dev *scifdev);
 int scif_peer_bus_init(void);
 void scif_peer_bus_exit(void);
 #endif /* _SCIF_PEER_BUS_H */
diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c
new file mode 100644
index 000000000000..e2889967036a
--- /dev/null
+++ b/drivers/misc/mic/scif/scif_rma.c
@@ -0,0 +1,1770 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Intel SCIF driver.
+ *
+ */
+#include <linux/dma_remapping.h>
+#include <linux/pagemap.h>
+#include "scif_main.h"
+#include "scif_map.h"
+
+/* Used to skip ulimit checks for registrations with SCIF_MAP_KERNEL flag */
+#define SCIF_MAP_ULIMIT 0x40
+
+bool scif_ulimit_check = 1;
+
+/**
+ * scif_rma_ep_init:
+ * @ep: end point
+ *
+ * Initialize RMA per EP data structures.
+ */
+void scif_rma_ep_init(struct scif_endpt *ep)
+{
+	struct scif_endpt_rma_info *rma = &ep->rma_info;
+
+	mutex_init(&rma->rma_lock);
+	init_iova_domain(&rma->iovad, PAGE_SIZE, SCIF_IOVA_START_PFN,
+			 SCIF_DMA_64BIT_PFN);
+	spin_lock_init(&rma->tc_lock);
+	mutex_init(&rma->mmn_lock);
+	INIT_LIST_HEAD(&rma->reg_list);
+	INIT_LIST_HEAD(&rma->remote_reg_list);
+	atomic_set(&rma->tw_refcount, 0);
+	atomic_set(&rma->tcw_refcount, 0);
+	atomic_set(&rma->tcw_total_pages, 0);
+	atomic_set(&rma->fence_refcount, 0);
+
+	rma->async_list_del = 0;
+	rma->dma_chan = NULL;
+	INIT_LIST_HEAD(&rma->mmn_list);
+	INIT_LIST_HEAD(&rma->vma_list);
+	init_waitqueue_head(&rma->markwq);
+}
+
+/**
+ * scif_rma_ep_can_uninit:
+ * @ep: end point
+ *
+ * Returns 1 if an endpoint can be uninitialized and 0 otherwise.
+ */
+int scif_rma_ep_can_uninit(struct scif_endpt *ep)
+{
+	int ret = 0;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	/* Destroy RMA Info only if both lists are empty */
+	if (list_empty(&ep->rma_info.reg_list) &&
+	    list_empty(&ep->rma_info.remote_reg_list) &&
+	    list_empty(&ep->rma_info.mmn_list) &&
+	    !atomic_read(&ep->rma_info.tw_refcount) &&
+	    !atomic_read(&ep->rma_info.tcw_refcount) &&
+	    !atomic_read(&ep->rma_info.fence_refcount))
+		ret = 1;
+	mutex_unlock(&ep->rma_info.rma_lock);
+	return ret;
+}
+
+/**
+ * scif_create_pinned_pages:
+ * @nr_pages: number of pages in window
+ * @prot: read/write protection
+ *
+ * Allocate and prepare a set of pinned pages.
+ */
+static struct scif_pinned_pages *
+scif_create_pinned_pages(int nr_pages, int prot)
+{
+	struct scif_pinned_pages *pin;
+
+	might_sleep();
+	pin = scif_zalloc(sizeof(*pin));
+	if (!pin)
+		goto error;
+
+	pin->pages = scif_zalloc(nr_pages * sizeof(*pin->pages));
+	if (!pin->pages)
+		goto error_free_pinned_pages;
+
+	pin->prot = prot;
+	pin->magic = SCIFEP_MAGIC;
+	return pin;
+
+error_free_pinned_pages:
+	scif_free(pin, sizeof(*pin));
+error:
+	return NULL;
+}
+
+/**
+ * scif_destroy_pinned_pages:
+ * @pin: A set of pinned pages.
+ *
+ * Deallocate resources for pinned pages.
+ */
+static int scif_destroy_pinned_pages(struct scif_pinned_pages *pin)
+{
+	int j;
+	int writeable = pin->prot & SCIF_PROT_WRITE;
+	int kernel = SCIF_MAP_KERNEL & pin->map_flags;
+
+	for (j = 0; j < pin->nr_pages; j++) {
+		if (pin->pages[j] && !kernel) {
+			if (writeable)
+				SetPageDirty(pin->pages[j]);
+			put_page(pin->pages[j]);
+		}
+	}
+
+	scif_free(pin->pages,
+		  pin->nr_pages * sizeof(*pin->pages));
+	scif_free(pin, sizeof(*pin));
+	return 0;
+}
+
+/*
+ * scif_create_window:
+ * @ep: end point
+ * @nr_pages: number of pages
+ * @offset: registration offset
+ * @temp: true if a temporary window is being created
+ *
+ * Allocate and prepare a self registration window.
+ */
+struct scif_window *scif_create_window(struct scif_endpt *ep, int nr_pages,
+				       s64 offset, bool temp)
+{
+	struct scif_window *window;
+
+	might_sleep();
+	window = scif_zalloc(sizeof(*window));
+	if (!window)
+		goto error;
+
+	window->dma_addr = scif_zalloc(nr_pages * sizeof(*window->dma_addr));
+	if (!window->dma_addr)
+		goto error_free_window;
+
+	window->num_pages = scif_zalloc(nr_pages * sizeof(*window->num_pages));
+	if (!window->num_pages)
+		goto error_free_window;
+
+	window->offset = offset;
+	window->ep = (u64)ep;
+	window->magic = SCIFEP_MAGIC;
+	window->reg_state = OP_IDLE;
+	init_waitqueue_head(&window->regwq);
+	window->unreg_state = OP_IDLE;
+	init_waitqueue_head(&window->unregwq);
+	INIT_LIST_HEAD(&window->list);
+	window->type = SCIF_WINDOW_SELF;
+	window->temp = temp;
+	return window;
+
+error_free_window:
+	scif_free(window->dma_addr,
+		  nr_pages * sizeof(*window->dma_addr));
+	scif_free(window, sizeof(*window));
+error:
+	return NULL;
+}
+
+/**
+ * scif_destroy_incomplete_window:
+ * @ep: end point
+ * @window: registration window
+ *
+ * Deallocate resources for self window.
+ */
+static void scif_destroy_incomplete_window(struct scif_endpt *ep,
+					   struct scif_window *window)
+{
+	int err;
+	int nr_pages = window->nr_pages;
+	struct scif_allocmsg *alloc = &window->alloc_handle;
+	struct scifmsg msg;
+
+retry:
+	/* Wait for a SCIF_ALLOC_GNT/REJ message */
+	err = wait_event_timeout(alloc->allocwq,
+				 alloc->state != OP_IN_PROGRESS,
+				 SCIF_NODE_ALIVE_TIMEOUT);
+	if (!err && scifdev_alive(ep))
+		goto retry;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	if (alloc->state == OP_COMPLETED) {
+		msg.uop = SCIF_FREE_VIRT;
+		msg.src = ep->port;
+		msg.payload[0] = ep->remote_ep;
+		msg.payload[1] = window->alloc_handle.vaddr;
+		msg.payload[2] = (u64)window;
+		msg.payload[3] = SCIF_REGISTER;
+		_scif_nodeqp_send(ep->remote_dev, &msg);
+	}
+	mutex_unlock(&ep->rma_info.rma_lock);
+
+	scif_free_window_offset(ep, window, window->offset);
+	scif_free(window->dma_addr, nr_pages * sizeof(*window->dma_addr));
+	scif_free(window->num_pages, nr_pages * sizeof(*window->num_pages));
+	scif_free(window, sizeof(*window));
+}
+
+/**
+ * scif_unmap_window:
+ * @remote_dev: SCIF remote device
+ * @window: registration window
+ *
+ * Delete any DMA mappings created for a registered self window
+ */
+void scif_unmap_window(struct scif_dev *remote_dev, struct scif_window *window)
+{
+	int j;
+
+	if (scif_is_iommu_enabled() && !scifdev_self(remote_dev)) {
+		if (window->st) {
+			dma_unmap_sg(&remote_dev->sdev->dev,
+				     window->st->sgl, window->st->nents,
+				     DMA_BIDIRECTIONAL);
+			sg_free_table(window->st);
+			kfree(window->st);
+			window->st = NULL;
+		}
+	} else {
+		for (j = 0; j < window->nr_contig_chunks; j++) {
+			if (window->dma_addr[j]) {
+				scif_unmap_single(window->dma_addr[j],
+						  remote_dev,
+						  window->num_pages[j] <<
+						  PAGE_SHIFT);
+				window->dma_addr[j] = 0x0;
+			}
+		}
+	}
+}
+
+static inline struct mm_struct *__scif_acquire_mm(void)
+{
+	if (scif_ulimit_check)
+		return get_task_mm(current);
+	return NULL;
+}
+
+static inline void __scif_release_mm(struct mm_struct *mm)
+{
+	if (mm)
+		mmput(mm);
+}
+
+static inline int
+__scif_dec_pinned_vm_lock(struct mm_struct *mm,
+			  int nr_pages, bool try_lock)
+{
+	if (!mm || !nr_pages || !scif_ulimit_check)
+		return 0;
+	if (try_lock) {
+		if (!down_write_trylock(&mm->mmap_sem)) {
+			dev_err(scif_info.mdev.this_device,
+				"%s %d err\n", __func__, __LINE__);
+			return -1;
+		}
+	} else {
+		down_write(&mm->mmap_sem);
+	}
+	mm->pinned_vm -= nr_pages;
+	up_write(&mm->mmap_sem);
+	return 0;
+}
+
+static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm,
+					     int nr_pages)
+{
+	unsigned long locked, lock_limit;
+
+	if (!mm || !nr_pages || !scif_ulimit_check)
+		return 0;
+
+	locked = nr_pages;
+	locked += mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		dev_err(scif_info.mdev.this_device,
+			"locked(%lu) > lock_limit(%lu)\n",
+			locked, lock_limit);
+		return -ENOMEM;
+	}
+	mm->pinned_vm = locked;
+	return 0;
+}
+
+/**
+ * scif_destroy_window:
+ * @ep: end point
+ * @window: registration window
+ *
+ * Deallocate resources for self window.
+ */
+int scif_destroy_window(struct scif_endpt *ep, struct scif_window *window)
+{
+	int j;
+	struct scif_pinned_pages *pinned_pages = window->pinned_pages;
+	int nr_pages = window->nr_pages;
+
+	might_sleep();
+	if (!window->temp && window->mm) {
+		__scif_dec_pinned_vm_lock(window->mm, window->nr_pages, 0);
+		__scif_release_mm(window->mm);
+		window->mm = NULL;
+	}
+
+	scif_free_window_offset(ep, window, window->offset);
+	scif_unmap_window(ep->remote_dev, window);
+	/*
+	 * Decrement references for this set of pinned pages from
+	 * this window.
+	 */
+	j = atomic_sub_return(1, &pinned_pages->ref_count);
+	if (j < 0)
+		dev_err(scif_info.mdev.this_device,
+			"%s %d incorrect ref count %d\n",
+			__func__, __LINE__, j);
+	/*
+	 * If the ref count for pinned_pages is zero then someone
+	 * has already called scif_unpin_pages() for it and we should
+	 * destroy the page cache.
+	 */
+	if (!j)
+		scif_destroy_pinned_pages(window->pinned_pages);
+	scif_free(window->dma_addr, nr_pages * sizeof(*window->dma_addr));
+	scif_free(window->num_pages, nr_pages * sizeof(*window->num_pages));
+	window->magic = 0;
+	scif_free(window, sizeof(*window));
+	return 0;
+}
+
+/**
+ * scif_create_remote_lookup:
+ * @remote_dev: SCIF remote device
+ * @window: remote window
+ *
+ * Allocate and prepare lookup entries for the remote
+ * end to copy over the physical addresses.
+ * Returns 0 on success and appropriate errno on failure.
+ */
+static int scif_create_remote_lookup(struct scif_dev *remote_dev,
+				     struct scif_window *window)
+{
+	int i, j, err = 0;
+	int nr_pages = window->nr_pages;
+	bool vmalloc_dma_phys, vmalloc_num_pages;
+
+	might_sleep();
+	/* Map window */
+	err = scif_map_single(&window->mapped_offset,
+			      window, remote_dev, sizeof(*window));
+	if (err)
+		goto error_window;
+
+	/* Compute the number of lookup entries. 21 == 2MB Shift */
+	window->nr_lookup = ALIGN(nr_pages * PAGE_SIZE,
+					((2) * 1024 * 1024)) >> 21;
+
+	window->dma_addr_lookup.lookup =
+		scif_alloc_coherent(&window->dma_addr_lookup.offset,
+				    remote_dev, window->nr_lookup *
+				    sizeof(*window->dma_addr_lookup.lookup),
+				    GFP_KERNEL | __GFP_ZERO);
+	if (!window->dma_addr_lookup.lookup)
+		goto error_window;
+
+	window->num_pages_lookup.lookup =
+		scif_alloc_coherent(&window->num_pages_lookup.offset,
+				    remote_dev, window->nr_lookup *
+				    sizeof(*window->num_pages_lookup.lookup),
+				    GFP_KERNEL | __GFP_ZERO);
+	if (!window->num_pages_lookup.lookup)
+		goto error_window;
+
+	vmalloc_dma_phys = is_vmalloc_addr(&window->dma_addr[0]);
+	vmalloc_num_pages = is_vmalloc_addr(&window->num_pages[0]);
+
+	/* Now map each of the pages containing physical addresses */
+	for (i = 0, j = 0; i < nr_pages; i += SCIF_NR_ADDR_IN_PAGE, j++) {
+		err = scif_map_page(&window->dma_addr_lookup.lookup[j],
+				    vmalloc_dma_phys ?
+				    vmalloc_to_page(&window->dma_addr[i]) :
+				    virt_to_page(&window->dma_addr[i]),
+				    remote_dev);
+		if (err)
+			goto error_window;
+		err = scif_map_page(&window->num_pages_lookup.lookup[j],
+				    vmalloc_dma_phys ?
+				    vmalloc_to_page(&window->num_pages[i]) :
+				    virt_to_page(&window->num_pages[i]),
+				    remote_dev);
+		if (err)
+			goto error_window;
+	}
+	return 0;
+error_window:
+	return err;
+}
+
+/**
+ * scif_destroy_remote_lookup:
+ * @remote_dev: SCIF remote device
+ * @window: remote window
+ *
+ * Destroy lookup entries used for the remote
+ * end to copy over the physical addresses.
+ */
+static void scif_destroy_remote_lookup(struct scif_dev *remote_dev,
+				       struct scif_window *window)
+{
+	int i, j;
+
+	if (window->nr_lookup) {
+		struct scif_rma_lookup *lup = &window->dma_addr_lookup;
+		struct scif_rma_lookup *npup = &window->num_pages_lookup;
+
+		for (i = 0, j = 0; i < window->nr_pages;
+			i += SCIF_NR_ADDR_IN_PAGE, j++) {
+			if (lup->lookup && lup->lookup[j])
+				scif_unmap_single(lup->lookup[j],
+						  remote_dev,
+						  PAGE_SIZE);
+			if (npup->lookup && npup->lookup[j])
+				scif_unmap_single(npup->lookup[j],
+						  remote_dev,
+						  PAGE_SIZE);
+		}
+		if (lup->lookup)
+			scif_free_coherent(lup->lookup, lup->offset,
+					   remote_dev, window->nr_lookup *
+					   sizeof(*lup->lookup));
+		if (npup->lookup)
+			scif_free_coherent(npup->lookup, npup->offset,
+					   remote_dev, window->nr_lookup *
+					   sizeof(*npup->lookup));
+		if (window->mapped_offset)
+			scif_unmap_single(window->mapped_offset,
+					  remote_dev, sizeof(*window));
+		window->nr_lookup = 0;
+	}
+}
+
+/**
+ * scif_create_remote_window:
+ * @ep: end point
+ * @nr_pages: number of pages in window
+ *
+ * Allocate and prepare a remote registration window.
+ */
+static struct scif_window *
+scif_create_remote_window(struct scif_dev *scifdev, int nr_pages)
+{
+	struct scif_window *window;
+
+	might_sleep();
+	window = scif_zalloc(sizeof(*window));
+	if (!window)
+		goto error_ret;
+
+	window->magic = SCIFEP_MAGIC;
+	window->nr_pages = nr_pages;
+
+	window->dma_addr = scif_zalloc(nr_pages * sizeof(*window->dma_addr));
+	if (!window->dma_addr)
+		goto error_window;
+
+	window->num_pages = scif_zalloc(nr_pages *
+					sizeof(*window->num_pages));
+	if (!window->num_pages)
+		goto error_window;
+
+	if (scif_create_remote_lookup(scifdev, window))
+		goto error_window;
+
+	window->type = SCIF_WINDOW_PEER;
+	window->unreg_state = OP_IDLE;
+	INIT_LIST_HEAD(&window->list);
+	return window;
+error_window:
+	scif_destroy_remote_window(window);
+error_ret:
+	return NULL;
+}
+
+/**
+ * scif_destroy_remote_window:
+ * @ep: end point
+ * @window: remote registration window
+ *
+ * Deallocate resources for remote window.
+ */
+void
+scif_destroy_remote_window(struct scif_window *window)
+{
+	scif_free(window->dma_addr, window->nr_pages *
+		  sizeof(*window->dma_addr));
+	scif_free(window->num_pages, window->nr_pages *
+		  sizeof(*window->num_pages));
+	window->magic = 0;
+	scif_free(window, sizeof(*window));
+}
+
+/**
+ * scif_iommu_map: create DMA mappings if the IOMMU is enabled
+ * @remote_dev: SCIF remote device
+ * @window: remote registration window
+ *
+ * Map the physical pages using dma_map_sg(..) and then detect the number
+ * of contiguous DMA mappings allocated
+ */
+static int scif_iommu_map(struct scif_dev *remote_dev,
+			  struct scif_window *window)
+{
+	struct scatterlist *sg;
+	int i, err;
+	scif_pinned_pages_t pin = window->pinned_pages;
+
+	window->st = kzalloc(sizeof(*window->st), GFP_KERNEL);
+	if (!window->st)
+		return -ENOMEM;
+
+	err = sg_alloc_table(window->st, window->nr_pages, GFP_KERNEL);
+	if (err)
+		return err;
+
+	for_each_sg(window->st->sgl, sg, window->st->nents, i)
+		sg_set_page(sg, pin->pages[i], PAGE_SIZE, 0x0);
+
+	err = dma_map_sg(&remote_dev->sdev->dev, window->st->sgl,
+			 window->st->nents, DMA_BIDIRECTIONAL);
+	if (!err)
+		return -ENOMEM;
+	/* Detect contiguous ranges of DMA mappings */
+	sg = window->st->sgl;
+	for (i = 0; sg; i++) {
+		dma_addr_t last_da;
+
+		window->dma_addr[i] = sg_dma_address(sg);
+		window->num_pages[i] = sg_dma_len(sg) >> PAGE_SHIFT;
+		last_da = sg_dma_address(sg) + sg_dma_len(sg);
+		while ((sg = sg_next(sg)) && sg_dma_address(sg) == last_da) {
+			window->num_pages[i] +=
+				(sg_dma_len(sg) >> PAGE_SHIFT);
+			last_da = window->dma_addr[i] +
+				sg_dma_len(sg);
+		}
+		window->nr_contig_chunks++;
+	}
+	return 0;
+}
+
+/**
+ * scif_map_window:
+ * @remote_dev: SCIF remote device
+ * @window: self registration window
+ *
+ * Map pages of a window into the aperture/PCI.
+ * Also determine addresses required for DMA.
+ */
+int
+scif_map_window(struct scif_dev *remote_dev, struct scif_window *window)
+{
+	int i, j, k, err = 0, nr_contig_pages;
+	scif_pinned_pages_t pin;
+	phys_addr_t phys_prev, phys_curr;
+
+	might_sleep();
+
+	pin = window->pinned_pages;
+
+	if (intel_iommu_enabled && !scifdev_self(remote_dev))
+		return scif_iommu_map(remote_dev, window);
+
+	for (i = 0, j = 0; i < window->nr_pages; i += nr_contig_pages, j++) {
+		phys_prev = page_to_phys(pin->pages[i]);
+		nr_contig_pages = 1;
+
+		/* Detect physically contiguous chunks */
+		for (k = i + 1; k < window->nr_pages; k++) {
+			phys_curr = page_to_phys(pin->pages[k]);
+			if (phys_curr != (phys_prev + PAGE_SIZE))
+				break;
+			phys_prev = phys_curr;
+			nr_contig_pages++;
+		}
+		window->num_pages[j] = nr_contig_pages;
+		window->nr_contig_chunks++;
+		if (scif_is_mgmt_node()) {
+			/*
+			 * Management node has to deal with SMPT on X100 and
+			 * hence the DMA mapping is required
+			 */
+			err = scif_map_single(&window->dma_addr[j],
+					      phys_to_virt(page_to_phys(
+							   pin->pages[i])),
+					      remote_dev,
+					      nr_contig_pages << PAGE_SHIFT);
+			if (err)
+				return err;
+		} else {
+			window->dma_addr[j] = page_to_phys(pin->pages[i]);
+		}
+	}
+	return err;
+}
+
+/**
+ * scif_send_scif_unregister:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Send a SCIF_UNREGISTER message.
+ */
+static int scif_send_scif_unregister(struct scif_endpt *ep,
+				     struct scif_window *window)
+{
+	struct scifmsg msg;
+
+	msg.uop = SCIF_UNREGISTER;
+	msg.src = ep->port;
+	msg.payload[0] = window->alloc_handle.vaddr;
+	msg.payload[1] = (u64)window;
+	return scif_nodeqp_send(ep->remote_dev, &msg);
+}
+
+/**
+ * scif_unregister_window:
+ * @window: self registration window
+ *
+ * Send an unregistration request and wait for a response.
+ */
+int scif_unregister_window(struct scif_window *window)
+{
+	int err = 0;
+	struct scif_endpt *ep = (struct scif_endpt *)window->ep;
+	bool send_msg = false;
+
+	might_sleep();
+	switch (window->unreg_state) {
+	case OP_IDLE:
+	{
+		window->unreg_state = OP_IN_PROGRESS;
+		send_msg = true;
+		/* fall through */
+	}
+	case OP_IN_PROGRESS:
+	{
+		scif_get_window(window, 1);
+		mutex_unlock(&ep->rma_info.rma_lock);
+		if (send_msg) {
+			err = scif_send_scif_unregister(ep, window);
+			if (err) {
+				window->unreg_state = OP_COMPLETED;
+				goto done;
+			}
+		} else {
+			/* Return ENXIO since unregistration is in progress */
+			return -ENXIO;
+		}
+retry:
+		/* Wait for a SCIF_UNREGISTER_(N)ACK message */
+		err = wait_event_timeout(window->unregwq,
+					 window->unreg_state != OP_IN_PROGRESS,
+					 SCIF_NODE_ALIVE_TIMEOUT);
+		if (!err && scifdev_alive(ep))
+			goto retry;
+		if (!err) {
+			err = -ENODEV;
+			window->unreg_state = OP_COMPLETED;
+			dev_err(scif_info.mdev.this_device,
+				"%s %d err %d\n", __func__, __LINE__, err);
+		}
+		if (err > 0)
+			err = 0;
+done:
+		mutex_lock(&ep->rma_info.rma_lock);
+		scif_put_window(window, 1);
+		break;
+	}
+	case OP_FAILED:
+	{
+		if (!scifdev_alive(ep)) {
+			err = -ENODEV;
+			window->unreg_state = OP_COMPLETED;
+		}
+		break;
+	}
+	case OP_COMPLETED:
+		break;
+	default:
+		err = -ENODEV;
+	}
+
+	if (window->unreg_state == OP_COMPLETED && window->ref_count)
+		scif_put_window(window, window->nr_pages);
+
+	if (!window->ref_count) {
+		atomic_inc(&ep->rma_info.tw_refcount);
+		list_del_init(&window->list);
+		scif_free_window_offset(ep, window, window->offset);
+		mutex_unlock(&ep->rma_info.rma_lock);
+		if ((!!(window->pinned_pages->map_flags & SCIF_MAP_KERNEL)) &&
+		    scifdev_alive(ep)) {
+			scif_drain_dma_intr(ep->remote_dev->sdev,
+					    ep->rma_info.dma_chan);
+		} else {
+			if (!__scif_dec_pinned_vm_lock(window->mm,
+						       window->nr_pages, 1)) {
+				__scif_release_mm(window->mm);
+				window->mm = NULL;
+			}
+		}
+		scif_queue_for_cleanup(window, &scif_info.rma);
+		mutex_lock(&ep->rma_info.rma_lock);
+	}
+	return err;
+}
+
+/**
+ * scif_send_alloc_request:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Send a remote window allocation request
+ */
+static int scif_send_alloc_request(struct scif_endpt *ep,
+				   struct scif_window *window)
+{
+	struct scifmsg msg;
+	struct scif_allocmsg *alloc = &window->alloc_handle;
+
+	/* Set up the Alloc Handle */
+	alloc->state = OP_IN_PROGRESS;
+	init_waitqueue_head(&alloc->allocwq);
+
+	/* Send out an allocation request */
+	msg.uop = SCIF_ALLOC_REQ;
+	msg.payload[1] = window->nr_pages;
+	msg.payload[2] = (u64)&window->alloc_handle;
+	return _scif_nodeqp_send(ep->remote_dev, &msg);
+}
+
+/**
+ * scif_prep_remote_window:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Send a remote window allocation request, wait for an allocation response,
+ * and prepares the remote window by copying over the page lists
+ */
+static int scif_prep_remote_window(struct scif_endpt *ep,
+				   struct scif_window *window)
+{
+	struct scifmsg msg;
+	struct scif_window *remote_window;
+	struct scif_allocmsg *alloc = &window->alloc_handle;
+	dma_addr_t *dma_phys_lookup, *tmp, *num_pages_lookup, *tmp1;
+	int i = 0, j = 0;
+	int nr_contig_chunks, loop_nr_contig_chunks;
+	int remaining_nr_contig_chunks, nr_lookup;
+	int err, map_err;
+
+	map_err = scif_map_window(ep->remote_dev, window);
+	if (map_err)
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d map_err %d\n", __func__, __LINE__, map_err);
+	remaining_nr_contig_chunks = window->nr_contig_chunks;
+	nr_contig_chunks = window->nr_contig_chunks;
+retry:
+	/* Wait for a SCIF_ALLOC_GNT/REJ message */
+	err = wait_event_timeout(alloc->allocwq,
+				 alloc->state != OP_IN_PROGRESS,
+				 SCIF_NODE_ALIVE_TIMEOUT);
+	mutex_lock(&ep->rma_info.rma_lock);
+	/* Synchronize with the thread waking up allocwq */
+	mutex_unlock(&ep->rma_info.rma_lock);
+	if (!err && scifdev_alive(ep))
+		goto retry;
+
+	if (!err)
+		err = -ENODEV;
+
+	if (err > 0)
+		err = 0;
+	else
+		return err;
+
+	/* Bail out. The remote end rejected this request */
+	if (alloc->state == OP_FAILED)
+		return -ENOMEM;
+
+	if (map_err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, map_err);
+		msg.uop = SCIF_FREE_VIRT;
+		msg.src = ep->port;
+		msg.payload[0] = ep->remote_ep;
+		msg.payload[1] = window->alloc_handle.vaddr;
+		msg.payload[2] = (u64)window;
+		msg.payload[3] = SCIF_REGISTER;
+		spin_lock(&ep->lock);
+		if (ep->state == SCIFEP_CONNECTED)
+			err = _scif_nodeqp_send(ep->remote_dev, &msg);
+		else
+			err = -ENOTCONN;
+		spin_unlock(&ep->lock);
+		return err;
+	}
+
+	remote_window = scif_ioremap(alloc->phys_addr, sizeof(*window),
+				     ep->remote_dev);
+
+	/* Compute the number of lookup entries. 21 == 2MB Shift */
+	nr_lookup = ALIGN(nr_contig_chunks, SCIF_NR_ADDR_IN_PAGE)
+			  >> ilog2(SCIF_NR_ADDR_IN_PAGE);
+
+	dma_phys_lookup =
+		scif_ioremap(remote_window->dma_addr_lookup.offset,
+			     nr_lookup *
+			     sizeof(*remote_window->dma_addr_lookup.lookup),
+			     ep->remote_dev);
+	num_pages_lookup =
+		scif_ioremap(remote_window->num_pages_lookup.offset,
+			     nr_lookup *
+			     sizeof(*remote_window->num_pages_lookup.lookup),
+			     ep->remote_dev);
+
+	while (remaining_nr_contig_chunks) {
+		loop_nr_contig_chunks = min_t(int, remaining_nr_contig_chunks,
+					      (int)SCIF_NR_ADDR_IN_PAGE);
+		/* #1/2 - Copy  physical addresses over to the remote side */
+
+		/* #2/2 - Copy DMA addresses (addresses that are fed into the
+		 * DMA engine) We transfer bus addresses which are then
+		 * converted into a MIC physical address on the remote
+		 * side if it is a MIC, if the remote node is a mgmt node we
+		 * transfer the MIC physical address
+		 */
+		tmp = scif_ioremap(dma_phys_lookup[j],
+				   loop_nr_contig_chunks *
+				   sizeof(*window->dma_addr),
+				   ep->remote_dev);
+		tmp1 = scif_ioremap(num_pages_lookup[j],
+				    loop_nr_contig_chunks *
+				    sizeof(*window->num_pages),
+				    ep->remote_dev);
+		if (scif_is_mgmt_node()) {
+			memcpy_toio((void __force __iomem *)tmp,
+				    &window->dma_addr[i], loop_nr_contig_chunks
+				    * sizeof(*window->dma_addr));
+			memcpy_toio((void __force __iomem *)tmp1,
+				    &window->num_pages[i], loop_nr_contig_chunks
+				    * sizeof(*window->num_pages));
+		} else {
+			if (scifdev_is_p2p(ep->remote_dev)) {
+				/*
+				 * add remote node's base address for this node
+				 * to convert it into a MIC address
+				 */
+				int m;
+				dma_addr_t dma_addr;
+
+				for (m = 0; m < loop_nr_contig_chunks; m++) {
+					dma_addr = window->dma_addr[i + m] +
+						ep->remote_dev->base_addr;
+					writeq(dma_addr,
+					       (void __force __iomem *)&tmp[m]);
+				}
+				memcpy_toio((void __force __iomem *)tmp1,
+					    &window->num_pages[i],
+					    loop_nr_contig_chunks
+					    * sizeof(*window->num_pages));
+			} else {
+				/* Mgmt node or loopback - transfer DMA
+				 * addresses as is, this is the same as a
+				 * MIC physical address (we use the dma_addr
+				 * and not the phys_addr array since the
+				 * phys_addr is only setup if there is a mmap()
+				 * request from the mgmt node)
+				 */
+				memcpy_toio((void __force __iomem *)tmp,
+					    &window->dma_addr[i],
+					    loop_nr_contig_chunks *
+					    sizeof(*window->dma_addr));
+				memcpy_toio((void __force __iomem *)tmp1,
+					    &window->num_pages[i],
+					    loop_nr_contig_chunks *
+					    sizeof(*window->num_pages));
+			}
+		}
+		remaining_nr_contig_chunks -= loop_nr_contig_chunks;
+		i += loop_nr_contig_chunks;
+		j++;
+		scif_iounmap(tmp, loop_nr_contig_chunks *
+			     sizeof(*window->dma_addr), ep->remote_dev);
+		scif_iounmap(tmp1, loop_nr_contig_chunks *
+			     sizeof(*window->num_pages), ep->remote_dev);
+	}
+
+	/* Prepare the remote window for the peer */
+	remote_window->peer_window = (u64)window;
+	remote_window->offset = window->offset;
+	remote_window->prot = window->prot;
+	remote_window->nr_contig_chunks = nr_contig_chunks;
+	remote_window->ep = ep->remote_ep;
+	scif_iounmap(num_pages_lookup,
+		     nr_lookup *
+		     sizeof(*remote_window->num_pages_lookup.lookup),
+		     ep->remote_dev);
+	scif_iounmap(dma_phys_lookup,
+		     nr_lookup *
+		     sizeof(*remote_window->dma_addr_lookup.lookup),
+		     ep->remote_dev);
+	scif_iounmap(remote_window, sizeof(*remote_window), ep->remote_dev);
+	window->peer_window = alloc->vaddr;
+	return err;
+}
+
+/**
+ * scif_send_scif_register:
+ * @ep: end point
+ * @window: self registration window
+ *
+ * Send a SCIF_REGISTER message if EP is connected and wait for a
+ * SCIF_REGISTER_(N)ACK message else send a SCIF_FREE_VIRT
+ * message so that the peer can free its remote window allocated earlier.
+ */
+static int scif_send_scif_register(struct scif_endpt *ep,
+				   struct scif_window *window)
+{
+	int err = 0;
+	struct scifmsg msg;
+
+	msg.src = ep->port;
+	msg.payload[0] = ep->remote_ep;
+	msg.payload[1] = window->alloc_handle.vaddr;
+	msg.payload[2] = (u64)window;
+	spin_lock(&ep->lock);
+	if (ep->state == SCIFEP_CONNECTED) {
+		msg.uop = SCIF_REGISTER;
+		window->reg_state = OP_IN_PROGRESS;
+		err = _scif_nodeqp_send(ep->remote_dev, &msg);
+		spin_unlock(&ep->lock);
+		if (!err) {
+retry:
+			/* Wait for a SCIF_REGISTER_(N)ACK message */
+			err = wait_event_timeout(window->regwq,
+						 window->reg_state !=
+						 OP_IN_PROGRESS,
+						 SCIF_NODE_ALIVE_TIMEOUT);
+			if (!err && scifdev_alive(ep))
+				goto retry;
+			err = !err ? -ENODEV : 0;
+			if (window->reg_state == OP_FAILED)
+				err = -ENOTCONN;
+		}
+	} else {
+		msg.uop = SCIF_FREE_VIRT;
+		msg.payload[3] = SCIF_REGISTER;
+		err = _scif_nodeqp_send(ep->remote_dev, &msg);
+		spin_unlock(&ep->lock);
+		if (!err)
+			err = -ENOTCONN;
+	}
+	return err;
+}
+
+/**
+ * scif_get_window_offset:
+ * @ep: end point descriptor
+ * @flags: flags
+ * @offset: offset hint
+ * @num_pages: number of pages
+ * @out_offset: computed offset returned by reference.
+ *
+ * Compute/Claim a new offset for this EP.
+ */
+int scif_get_window_offset(struct scif_endpt *ep, int flags, s64 offset,
+			   int num_pages, s64 *out_offset)
+{
+	s64 page_index;
+	struct iova *iova_ptr;
+	int err = 0;
+
+	if (flags & SCIF_MAP_FIXED) {
+		page_index = SCIF_IOVA_PFN(offset);
+		iova_ptr = reserve_iova(&ep->rma_info.iovad, page_index,
+					page_index + num_pages - 1);
+		if (!iova_ptr)
+			err = -EADDRINUSE;
+	} else {
+		iova_ptr = alloc_iova(&ep->rma_info.iovad, num_pages,
+				      SCIF_DMA_63BIT_PFN - 1, 0);
+		if (!iova_ptr)
+			err = -ENOMEM;
+	}
+	if (!err)
+		*out_offset = (iova_ptr->pfn_lo) << PAGE_SHIFT;
+	return err;
+}
+
+/**
+ * scif_free_window_offset:
+ * @ep: end point descriptor
+ * @window: registration window
+ * @offset: Offset to be freed
+ *
+ * Free offset for this EP. The callee is supposed to grab
+ * the RMA mutex before calling this API.
+ */
+void scif_free_window_offset(struct scif_endpt *ep,
+			     struct scif_window *window, s64 offset)
+{
+	if ((window && !window->offset_freed) || !window) {
+		free_iova(&ep->rma_info.iovad, offset >> PAGE_SHIFT);
+		if (window)
+			window->offset_freed = true;
+	}
+}
+
+/**
+ * scif_alloc_req: Respond to SCIF_ALLOC_REQ interrupt message
+ * @msg:        Interrupt message
+ *
+ * Remote side is requesting a memory allocation.
+ */
+void scif_alloc_req(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	int err;
+	struct scif_window *window = NULL;
+	int nr_pages = msg->payload[1];
+
+	window = scif_create_remote_window(scifdev, nr_pages);
+	if (!window) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	/* The peer's allocation request is granted */
+	msg->uop = SCIF_ALLOC_GNT;
+	msg->payload[0] = (u64)window;
+	msg->payload[1] = window->mapped_offset;
+	err = scif_nodeqp_send(scifdev, msg);
+	if (err)
+		scif_destroy_remote_window(window);
+	return;
+error:
+	/* The peer's allocation request is rejected */
+	dev_err(&scifdev->sdev->dev,
+		"%s %d error %d alloc_ptr %p nr_pages 0x%x\n",
+		__func__, __LINE__, err, window, nr_pages);
+	msg->uop = SCIF_ALLOC_REJ;
+	scif_nodeqp_send(scifdev, msg);
+}
+
+/**
+ * scif_alloc_gnt_rej: Respond to SCIF_ALLOC_GNT/REJ interrupt message
+ * @msg:        Interrupt message
+ *
+ * Remote side responded to a memory allocation.
+ */
+void scif_alloc_gnt_rej(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_allocmsg *handle = (struct scif_allocmsg *)msg->payload[2];
+	struct scif_window *window = container_of(handle, struct scif_window,
+						  alloc_handle);
+	struct scif_endpt *ep = (struct scif_endpt *)window->ep;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	handle->vaddr = msg->payload[0];
+	handle->phys_addr = msg->payload[1];
+	if (msg->uop == SCIF_ALLOC_GNT)
+		handle->state = OP_COMPLETED;
+	else
+		handle->state = OP_FAILED;
+	wake_up(&handle->allocwq);
+	mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+/**
+ * scif_free_virt: Respond to SCIF_FREE_VIRT interrupt message
+ * @msg:        Interrupt message
+ *
+ * Free up memory kmalloc'd earlier.
+ */
+void scif_free_virt(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_window *window = (struct scif_window *)msg->payload[1];
+
+	scif_destroy_remote_window(window);
+}
+
+static void
+scif_fixup_aper_base(struct scif_dev *dev, struct scif_window *window)
+{
+	int j;
+	struct scif_hw_dev *sdev = dev->sdev;
+	phys_addr_t apt_base = 0;
+
+	/*
+	 * Add the aperture base if the DMA address is not card relative
+	 * since the DMA addresses need to be an offset into the bar
+	 */
+	if (!scifdev_self(dev) && window->type == SCIF_WINDOW_PEER &&
+	    sdev->aper && !sdev->card_rel_da)
+		apt_base = sdev->aper->pa;
+	else
+		return;
+
+	for (j = 0; j < window->nr_contig_chunks; j++) {
+		if (window->num_pages[j])
+			window->dma_addr[j] += apt_base;
+		else
+			break;
+	}
+}
+
+/**
+ * scif_recv_reg: Respond to SCIF_REGISTER interrupt message
+ * @msg:        Interrupt message
+ *
+ * Update remote window list with a new registered window.
+ */
+void scif_recv_reg(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0];
+	struct scif_window *window =
+		(struct scif_window *)msg->payload[1];
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	spin_lock(&ep->lock);
+	if (ep->state == SCIFEP_CONNECTED) {
+		msg->uop = SCIF_REGISTER_ACK;
+		scif_nodeqp_send(ep->remote_dev, msg);
+		scif_fixup_aper_base(ep->remote_dev, window);
+		/* No further failures expected. Insert new window */
+		scif_insert_window(window, &ep->rma_info.remote_reg_list);
+	} else {
+		msg->uop = SCIF_REGISTER_NACK;
+		scif_nodeqp_send(ep->remote_dev, msg);
+	}
+	spin_unlock(&ep->lock);
+	mutex_unlock(&ep->rma_info.rma_lock);
+	/* free up any lookup resources now that page lists are transferred */
+	scif_destroy_remote_lookup(ep->remote_dev, window);
+	/*
+	 * We could not insert the window but we need to
+	 * destroy the window.
+	 */
+	if (msg->uop == SCIF_REGISTER_NACK)
+		scif_destroy_remote_window(window);
+}
+
+/**
+ * scif_recv_unreg: Respond to SCIF_UNREGISTER interrupt message
+ * @msg:        Interrupt message
+ *
+ * Remove window from remote registration list;
+ */
+void scif_recv_unreg(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_rma_req req;
+	struct scif_window *window = NULL;
+	struct scif_window *recv_window =
+		(struct scif_window *)msg->payload[0];
+	struct scif_endpt *ep;
+	int del_window = 0;
+
+	ep = (struct scif_endpt *)recv_window->ep;
+	req.out_window = &window;
+	req.offset = recv_window->offset;
+	req.prot = 0;
+	req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT;
+	req.type = SCIF_WINDOW_FULL;
+	req.head = &ep->rma_info.remote_reg_list;
+	msg->payload[0] = ep->remote_ep;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	/* Does a valid window exist? */
+	if (scif_query_window(&req)) {
+		dev_err(&scifdev->sdev->dev,
+			"%s %d -ENXIO\n", __func__, __LINE__);
+		msg->uop = SCIF_UNREGISTER_ACK;
+		goto error;
+	}
+	if (window) {
+		if (window->ref_count)
+			scif_put_window(window, window->nr_pages);
+		else
+			dev_err(&scifdev->sdev->dev,
+				"%s %d ref count should be +ve\n",
+				__func__, __LINE__);
+		window->unreg_state = OP_COMPLETED;
+		if (!window->ref_count) {
+			msg->uop = SCIF_UNREGISTER_ACK;
+			atomic_inc(&ep->rma_info.tw_refcount);
+			ep->rma_info.async_list_del = 1;
+			list_del_init(&window->list);
+			del_window = 1;
+		} else {
+			/* NACK! There are valid references to this window */
+			msg->uop = SCIF_UNREGISTER_NACK;
+		}
+	} else {
+		/* The window did not make its way to the list at all. ACK */
+		msg->uop = SCIF_UNREGISTER_ACK;
+		scif_destroy_remote_window(recv_window);
+	}
+error:
+	mutex_unlock(&ep->rma_info.rma_lock);
+	if (del_window)
+		scif_drain_dma_intr(ep->remote_dev->sdev,
+				    ep->rma_info.dma_chan);
+	scif_nodeqp_send(ep->remote_dev, msg);
+	if (del_window)
+		scif_queue_for_cleanup(window, &scif_info.rma);
+}
+
+/**
+ * scif_recv_reg_ack: Respond to SCIF_REGISTER_ACK interrupt message
+ * @msg:        Interrupt message
+ *
+ * Wake up the window waiting to complete registration.
+ */
+void scif_recv_reg_ack(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_window *window =
+		(struct scif_window *)msg->payload[2];
+	struct scif_endpt *ep = (struct scif_endpt *)window->ep;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	window->reg_state = OP_COMPLETED;
+	wake_up(&window->regwq);
+	mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+/**
+ * scif_recv_reg_nack: Respond to SCIF_REGISTER_NACK interrupt message
+ * @msg:        Interrupt message
+ *
+ * Wake up the window waiting to inform it that registration
+ * cannot be completed.
+ */
+void scif_recv_reg_nack(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_window *window =
+		(struct scif_window *)msg->payload[2];
+	struct scif_endpt *ep = (struct scif_endpt *)window->ep;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	window->reg_state = OP_FAILED;
+	wake_up(&window->regwq);
+	mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+/**
+ * scif_recv_unreg_ack: Respond to SCIF_UNREGISTER_ACK interrupt message
+ * @msg:        Interrupt message
+ *
+ * Wake up the window waiting to complete unregistration.
+ */
+void scif_recv_unreg_ack(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_window *window =
+		(struct scif_window *)msg->payload[1];
+	struct scif_endpt *ep = (struct scif_endpt *)window->ep;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	window->unreg_state = OP_COMPLETED;
+	wake_up(&window->unregwq);
+	mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+/**
+ * scif_recv_unreg_nack: Respond to SCIF_UNREGISTER_NACK interrupt message
+ * @msg:        Interrupt message
+ *
+ * Wake up the window waiting to inform it that unregistration
+ * cannot be completed immediately.
+ */
+void scif_recv_unreg_nack(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+	struct scif_window *window =
+		(struct scif_window *)msg->payload[1];
+	struct scif_endpt *ep = (struct scif_endpt *)window->ep;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	window->unreg_state = OP_FAILED;
+	wake_up(&window->unregwq);
+	mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+int __scif_pin_pages(void *addr, size_t len, int *out_prot,
+		     int map_flags, scif_pinned_pages_t *pages)
+{
+	struct scif_pinned_pages *pinned_pages;
+	int nr_pages, err = 0, i;
+	bool vmalloc_addr = false;
+	bool try_upgrade = false;
+	int prot = *out_prot;
+	int ulimit = 0;
+	struct mm_struct *mm = NULL;
+
+	/* Unsupported flags */
+	if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT))
+		return -EINVAL;
+	ulimit = !!(map_flags & SCIF_MAP_ULIMIT);
+
+	/* Unsupported protection requested */
+	if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))
+		return -EINVAL;
+
+	/* addr/len must be page aligned. len should be non zero */
+	if (!len ||
+	    (ALIGN((u64)addr, PAGE_SIZE) != (u64)addr) ||
+	    (ALIGN((u64)len, PAGE_SIZE) != (u64)len))
+		return -EINVAL;
+
+	might_sleep();
+
+	nr_pages = len >> PAGE_SHIFT;
+
+	/* Allocate a set of pinned pages */
+	pinned_pages = scif_create_pinned_pages(nr_pages, prot);
+	if (!pinned_pages)
+		return -ENOMEM;
+
+	if (map_flags & SCIF_MAP_KERNEL) {
+		if (is_vmalloc_addr(addr))
+			vmalloc_addr = true;
+
+		for (i = 0; i < nr_pages; i++) {
+			if (vmalloc_addr)
+				pinned_pages->pages[i] =
+					vmalloc_to_page(addr + (i * PAGE_SIZE));
+			else
+				pinned_pages->pages[i] =
+					virt_to_page(addr + (i * PAGE_SIZE));
+		}
+		pinned_pages->nr_pages = nr_pages;
+		pinned_pages->map_flags = SCIF_MAP_KERNEL;
+	} else {
+		/*
+		 * SCIF supports registration caching. If a registration has
+		 * been requested with read only permissions, then we try
+		 * to pin the pages with RW permissions so that a subsequent
+		 * transfer with RW permission can hit the cache instead of
+		 * invalidating it. If the upgrade fails with RW then we
+		 * revert back to R permission and retry
+		 */
+		if (prot == SCIF_PROT_READ)
+			try_upgrade = true;
+		prot |= SCIF_PROT_WRITE;
+retry:
+		mm = current->mm;
+		down_write(&mm->mmap_sem);
+		if (ulimit) {
+			err = __scif_check_inc_pinned_vm(mm, nr_pages);
+			if (err) {
+				up_write(&mm->mmap_sem);
+				pinned_pages->nr_pages = 0;
+				goto error_unmap;
+			}
+		}
+
+		pinned_pages->nr_pages = get_user_pages(
+				current,
+				mm,
+				(u64)addr,
+				nr_pages,
+				!!(prot & SCIF_PROT_WRITE),
+				0,
+				pinned_pages->pages,
+				NULL);
+		up_write(&mm->mmap_sem);
+		if (nr_pages != pinned_pages->nr_pages) {
+			if (try_upgrade) {
+				if (ulimit)
+					__scif_dec_pinned_vm_lock(mm,
+								  nr_pages, 0);
+				/* Roll back any pinned pages */
+				for (i = 0; i < pinned_pages->nr_pages; i++) {
+					if (pinned_pages->pages[i])
+						put_page(
+						pinned_pages->pages[i]);
+				}
+				prot &= ~SCIF_PROT_WRITE;
+				try_upgrade = false;
+				goto retry;
+			}
+		}
+		pinned_pages->map_flags = 0;
+	}
+
+	if (pinned_pages->nr_pages < nr_pages) {
+		err = -EFAULT;
+		pinned_pages->nr_pages = nr_pages;
+		goto dec_pinned;
+	}
+
+	*out_prot = prot;
+	atomic_set(&pinned_pages->ref_count, 1);
+	*pages = pinned_pages;
+	return err;
+dec_pinned:
+	if (ulimit)
+		__scif_dec_pinned_vm_lock(mm, nr_pages, 0);
+	/* Something went wrong! Rollback */
+error_unmap:
+	pinned_pages->nr_pages = nr_pages;
+	scif_destroy_pinned_pages(pinned_pages);
+	*pages = NULL;
+	dev_dbg(scif_info.mdev.this_device,
+		"%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len);
+	return err;
+}
+
+int scif_pin_pages(void *addr, size_t len, int prot,
+		   int map_flags, scif_pinned_pages_t *pages)
+{
+	return __scif_pin_pages(addr, len, &prot, map_flags, pages);
+}
+EXPORT_SYMBOL_GPL(scif_pin_pages);
+
+int scif_unpin_pages(scif_pinned_pages_t pinned_pages)
+{
+	int err = 0, ret;
+
+	if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic)
+		return -EINVAL;
+
+	ret = atomic_sub_return(1, &pinned_pages->ref_count);
+	if (ret < 0) {
+		dev_err(scif_info.mdev.this_device,
+			"%s %d scif_unpin_pages called without pinning? rc %d\n",
+			__func__, __LINE__, ret);
+		return -EINVAL;
+	}
+	/*
+	 * Destroy the window if the ref count for this set of pinned
+	 * pages has dropped to zero. If it is positive then there is
+	 * a valid registered window which is backed by these pages and
+	 * it will be destroyed once all such windows are unregistered.
+	 */
+	if (!ret)
+		err = scif_destroy_pinned_pages(pinned_pages);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_unpin_pages);
+
+static inline void
+scif_insert_local_window(struct scif_window *window, struct scif_endpt *ep)
+{
+	mutex_lock(&ep->rma_info.rma_lock);
+	scif_insert_window(window, &ep->rma_info.reg_list);
+	mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+off_t scif_register_pinned_pages(scif_epd_t epd,
+				 scif_pinned_pages_t pinned_pages,
+				 off_t offset, int map_flags)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	s64 computed_offset;
+	struct scif_window *window;
+	int err;
+	size_t len;
+	struct device *spdev;
+
+	/* Unsupported flags */
+	if (map_flags & ~SCIF_MAP_FIXED)
+		return -EINVAL;
+
+	len = pinned_pages->nr_pages << PAGE_SHIFT;
+
+	/*
+	 * Offset is not page aligned/negative or offset+len
+	 * wraps around with SCIF_MAP_FIXED.
+	 */
+	if ((map_flags & SCIF_MAP_FIXED) &&
+	    ((ALIGN(offset, PAGE_SIZE) != offset) ||
+	    (offset < 0) ||
+	    (offset + (off_t)len < offset)))
+		return -EINVAL;
+
+	might_sleep();
+
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+	/*
+	 * It is an error to pass pinned_pages to scif_register_pinned_pages()
+	 * after calling scif_unpin_pages().
+	 */
+	if (!atomic_add_unless(&pinned_pages->ref_count, 1, 0))
+		return -EINVAL;
+
+	/* Compute the offset for this registration */
+	err = scif_get_window_offset(ep, map_flags, offset,
+				     len, &computed_offset);
+	if (err) {
+		atomic_sub(1, &pinned_pages->ref_count);
+		return err;
+	}
+
+	/* Allocate and prepare self registration window */
+	window = scif_create_window(ep, pinned_pages->nr_pages,
+				    computed_offset, false);
+	if (!window) {
+		atomic_sub(1, &pinned_pages->ref_count);
+		scif_free_window_offset(ep, NULL, computed_offset);
+		return -ENOMEM;
+	}
+
+	window->pinned_pages = pinned_pages;
+	window->nr_pages = pinned_pages->nr_pages;
+	window->prot = pinned_pages->prot;
+
+	spdev = scif_get_peer_dev(ep->remote_dev);
+	if (IS_ERR(spdev)) {
+		err = PTR_ERR(spdev);
+		scif_destroy_window(ep, window);
+		return err;
+	}
+	err = scif_send_alloc_request(ep, window);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto error_unmap;
+	}
+
+	/* Prepare the remote registration window */
+	err = scif_prep_remote_window(ep, window);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto error_unmap;
+	}
+
+	/* Tell the peer about the new window */
+	err = scif_send_scif_register(ep, window);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto error_unmap;
+	}
+
+	scif_put_peer_dev(spdev);
+	/* No further failures expected. Insert new window */
+	scif_insert_local_window(window, ep);
+	return computed_offset;
+error_unmap:
+	scif_destroy_window(ep, window);
+	scif_put_peer_dev(spdev);
+	dev_err(&ep->remote_dev->sdev->dev,
+		"%s %d err %d\n", __func__, __LINE__, err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_register_pinned_pages);
+
+off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
+		    int prot, int map_flags)
+{
+	scif_pinned_pages_t pinned_pages;
+	off_t err;
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	s64 computed_offset;
+	struct scif_window *window;
+	struct mm_struct *mm = NULL;
+	struct device *spdev;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI register: ep %p addr %p len 0x%lx offset 0x%lx prot 0x%x map_flags 0x%x\n",
+		epd, addr, len, offset, prot, map_flags);
+	/* Unsupported flags */
+	if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL))
+		return -EINVAL;
+
+	/*
+	 * Offset is not page aligned/negative or offset+len
+	 * wraps around with SCIF_MAP_FIXED.
+	 */
+	if ((map_flags & SCIF_MAP_FIXED) &&
+	    ((ALIGN(offset, PAGE_SIZE) != offset) ||
+	    (offset < 0) ||
+	    (offset + (off_t)len < offset)))
+		return -EINVAL;
+
+	/* Unsupported protection requested */
+	if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE))
+		return -EINVAL;
+
+	/* addr/len must be page aligned. len should be non zero */
+	if (!len || (ALIGN((u64)addr, PAGE_SIZE) != (u64)addr) ||
+	    (ALIGN(len, PAGE_SIZE) != len))
+		return -EINVAL;
+
+	might_sleep();
+
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+
+	/* Compute the offset for this registration */
+	err = scif_get_window_offset(ep, map_flags, offset,
+				     len >> PAGE_SHIFT, &computed_offset);
+	if (err)
+		return err;
+
+	spdev = scif_get_peer_dev(ep->remote_dev);
+	if (IS_ERR(spdev)) {
+		err = PTR_ERR(spdev);
+		scif_free_window_offset(ep, NULL, computed_offset);
+		return err;
+	}
+	/* Allocate and prepare self registration window */
+	window = scif_create_window(ep, len >> PAGE_SHIFT,
+				    computed_offset, false);
+	if (!window) {
+		scif_free_window_offset(ep, NULL, computed_offset);
+		scif_put_peer_dev(spdev);
+		return -ENOMEM;
+	}
+
+	window->nr_pages = len >> PAGE_SHIFT;
+
+	err = scif_send_alloc_request(ep, window);
+	if (err) {
+		scif_destroy_incomplete_window(ep, window);
+		scif_put_peer_dev(spdev);
+		return err;
+	}
+
+	if (!(map_flags & SCIF_MAP_KERNEL)) {
+		mm = __scif_acquire_mm();
+		map_flags |= SCIF_MAP_ULIMIT;
+	}
+	/* Pin down the pages */
+	err = __scif_pin_pages(addr, len, &prot,
+			       map_flags & (SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT),
+			       &pinned_pages);
+	if (err) {
+		scif_destroy_incomplete_window(ep, window);
+		__scif_release_mm(mm);
+		goto error;
+	}
+
+	window->pinned_pages = pinned_pages;
+	window->prot = pinned_pages->prot;
+	window->mm = mm;
+
+	/* Prepare the remote registration window */
+	err = scif_prep_remote_window(ep, window);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %ld\n", __func__, __LINE__, err);
+		goto error_unmap;
+	}
+
+	/* Tell the peer about the new window */
+	err = scif_send_scif_register(ep, window);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %ld\n", __func__, __LINE__, err);
+		goto error_unmap;
+	}
+
+	scif_put_peer_dev(spdev);
+	/* No further failures expected. Insert new window */
+	scif_insert_local_window(window, ep);
+	dev_dbg(&ep->remote_dev->sdev->dev,
+		"SCIFAPI register: ep %p addr %p len 0x%lx computed_offset 0x%llx\n",
+		epd, addr, len, computed_offset);
+	return computed_offset;
+error_unmap:
+	scif_destroy_window(ep, window);
+error:
+	scif_put_peer_dev(spdev);
+	dev_err(&ep->remote_dev->sdev->dev,
+		"%s %d err %ld\n", __func__, __LINE__, err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_register);
+
+int
+scif_unregister(scif_epd_t epd, off_t offset, size_t len)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	struct scif_window *window = NULL;
+	struct scif_rma_req req;
+	int nr_pages, err;
+	struct device *spdev;
+
+	dev_dbg(scif_info.mdev.this_device,
+		"SCIFAPI unregister: ep %p offset 0x%lx len 0x%lx\n",
+		ep, offset, len);
+	/* len must be page aligned. len should be non zero */
+	if (!len ||
+	    (ALIGN((u64)len, PAGE_SIZE) != (u64)len))
+		return -EINVAL;
+
+	/* Offset is not page aligned or offset+len wraps around */
+	if ((ALIGN(offset, PAGE_SIZE) != offset) ||
+	    (offset + (off_t)len < offset))
+		return -EINVAL;
+
+	err = scif_verify_epd(ep);
+	if (err)
+		return err;
+
+	might_sleep();
+	nr_pages = len >> PAGE_SHIFT;
+
+	req.out_window = &window;
+	req.offset = offset;
+	req.prot = 0;
+	req.nr_bytes = len;
+	req.type = SCIF_WINDOW_FULL;
+	req.head = &ep->rma_info.reg_list;
+
+	spdev = scif_get_peer_dev(ep->remote_dev);
+	if (IS_ERR(spdev)) {
+		err = PTR_ERR(spdev);
+		return err;
+	}
+	mutex_lock(&ep->rma_info.rma_lock);
+	/* Does a valid window exist? */
+	err = scif_query_window(&req);
+	if (err) {
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+		goto error;
+	}
+	/* Unregister all the windows in this range */
+	err = scif_rma_list_unregister(window, offset, nr_pages);
+	if (err)
+		dev_err(&ep->remote_dev->sdev->dev,
+			"%s %d err %d\n", __func__, __LINE__, err);
+error:
+	mutex_unlock(&ep->rma_info.rma_lock);
+	scif_put_peer_dev(spdev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(scif_unregister);
diff --git a/drivers/misc/mic/scif/scif_rma.h b/drivers/misc/mic/scif/scif_rma.h
new file mode 100644
index 000000000000..fa6722279196
--- /dev/null
+++ b/drivers/misc/mic/scif/scif_rma.h
@@ -0,0 +1,464 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel SCIF driver.
+ *
+ */
+#ifndef SCIF_RMA_H
+#define SCIF_RMA_H
+
+#include <linux/dma_remapping.h>
+#include <linux/mmu_notifier.h>
+
+#include "../bus/scif_bus.h"
+
+/* If this bit is set then the mark is a remote fence mark */
+#define SCIF_REMOTE_FENCE_BIT          31
+/* Magic value used to indicate a remote fence request */
+#define SCIF_REMOTE_FENCE BIT_ULL(SCIF_REMOTE_FENCE_BIT)
+
+#define SCIF_MAX_UNALIGNED_BUF_SIZE (1024 * 1024ULL)
+#define SCIF_KMEM_UNALIGNED_BUF_SIZE (SCIF_MAX_UNALIGNED_BUF_SIZE + \
+				      (L1_CACHE_BYTES << 1))
+
+#define SCIF_IOVA_START_PFN		(1)
+#define SCIF_IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
+#define SCIF_DMA_64BIT_PFN SCIF_IOVA_PFN(DMA_BIT_MASK(64))
+#define SCIF_DMA_63BIT_PFN SCIF_IOVA_PFN(DMA_BIT_MASK(63))
+
+/*
+ * struct scif_endpt_rma_info - Per Endpoint Remote Memory Access Information
+ *
+ * @reg_list: List of registration windows for self
+ * @remote_reg_list: List of registration windows for peer
+ * @iovad: Offset generator
+ * @rma_lock: Synchronizes access to self/remote list and also protects the
+ *	      window from being destroyed while RMAs are in progress.
+ * @tc_lock: Synchronizes access to temporary cached windows list
+ *	     for SCIF Registration Caching.
+ * @mmn_lock: Synchronizes access to the list of MMU notifiers registered
+ * @tw_refcount: Keeps track of number of outstanding temporary registered
+ *		 windows created by scif_vreadfrom/scif_vwriteto which have
+ *		 not been destroyed.
+ * @tcw_refcount: Same as tw_refcount but for temporary cached windows
+ * @tcw_total_pages: Same as tcw_refcount but in terms of pages pinned
+ * @mmn_list: MMU notifier so that we can destroy the windows when required
+ * @fence_refcount: Keeps track of number of outstanding remote fence
+ *		    requests which have been received by the peer.
+ * @dma_chan: DMA channel used for all DMA transfers for this endpoint.
+ * @async_list_del: Detect asynchronous list entry deletion
+ * @vma_list: List of vmas with remote memory mappings
+ * @markwq: Wait queue used for scif_fence_mark/scif_fence_wait
+*/
+struct scif_endpt_rma_info {
+	struct list_head reg_list;
+	struct list_head remote_reg_list;
+	struct iova_domain iovad;
+	struct mutex rma_lock;
+	spinlock_t tc_lock;
+	struct mutex mmn_lock;
+	atomic_t tw_refcount;
+	atomic_t tcw_refcount;
+	atomic_t tcw_total_pages;
+	struct list_head mmn_list;
+	atomic_t fence_refcount;
+	struct dma_chan	*dma_chan;
+	int async_list_del;
+	struct list_head vma_list;
+	wait_queue_head_t markwq;
+};
+
+/*
+ * struct scif_fence_info - used for tracking fence requests
+ *
+ * @state: State of this transfer
+ * @wq: Fences wait on this queue
+ * @dma_mark: Used for storing the DMA mark
+ */
+struct scif_fence_info {
+	enum scif_msg_state state;
+	struct completion comp;
+	int dma_mark;
+};
+
+/*
+ * struct scif_remote_fence_info - used for tracking remote fence requests
+ *
+ * @msg: List of SCIF node QP fence messages
+ * @list: Link to list of remote fence requests
+ */
+struct scif_remote_fence_info {
+	struct scifmsg msg;
+	struct list_head list;
+};
+
+/*
+ * Specifies whether an RMA operation can span across partial windows, a single
+ * window or multiple contiguous windows. Mmaps can span across partial windows.
+ * Unregistration can span across complete windows. scif_get_pages() can span a
+ * single window. A window can also be of type self or peer.
+ */
+enum scif_window_type {
+	SCIF_WINDOW_PARTIAL,
+	SCIF_WINDOW_SINGLE,
+	SCIF_WINDOW_FULL,
+	SCIF_WINDOW_SELF,
+	SCIF_WINDOW_PEER
+};
+
+/* The number of physical addresses that can be stored in a PAGE. */
+#define SCIF_NR_ADDR_IN_PAGE   (0x1000 >> 3)
+
+/*
+ * struct scif_rma_lookup - RMA lookup data structure for page list transfers
+ *
+ * Store an array of lookup offsets. Each offset in this array maps
+ * one 4K page containing 512 physical addresses i.e. 2MB. 512 such
+ * offsets in a 4K page will correspond to 1GB of registered address space.
+
+ * @lookup: Array of offsets
+ * @offset: DMA offset of lookup array
+ */
+struct scif_rma_lookup {
+	dma_addr_t *lookup;
+	dma_addr_t offset;
+};
+
+/*
+ * struct scif_pinned_pages - A set of pinned pages obtained with
+ * scif_pin_pages() which could be part of multiple registered
+ * windows across different end points.
+ *
+ * @nr_pages: Number of pages which is defined as a s64 instead of an int
+ * to avoid sign extension with buffers >= 2GB
+ * @prot: read/write protections
+ * @map_flags: Flags specified during the pin operation
+ * @ref_count: Reference count bumped in terms of number of pages
+ * @magic: A magic value
+ * @pages: Array of pointers to struct pages populated with get_user_pages(..)
+ */
+struct scif_pinned_pages {
+	s64 nr_pages;
+	int prot;
+	int map_flags;
+	atomic_t ref_count;
+	u64 magic;
+	struct page **pages;
+};
+
+/*
+ * struct scif_status - Stores DMA status update information
+ *
+ * @src_dma_addr: Source buffer DMA address
+ * @val: src location for value to be written to the destination
+ * @ep: SCIF endpoint
+ */
+struct scif_status {
+	dma_addr_t src_dma_addr;
+	u64 val;
+	struct scif_endpt *ep;
+};
+
+/*
+ * struct scif_window - Registration Window for Self and Remote
+ *
+ * @nr_pages: Number of pages which is defined as a s64 instead of an int
+ * to avoid sign extension with buffers >= 2GB
+ * @nr_contig_chunks: Number of contiguous physical chunks
+ * @prot: read/write protections
+ * @ref_count: reference count in terms of number of pages
+ * @magic: Cookie to detect corruption
+ * @offset: registered offset
+ * @va_for_temp: va address that this window represents
+ * @dma_mark: Used to determine if all DMAs against the window are done
+ * @ep: Pointer to EP. Useful for passing EP around with messages to
+	avoid expensive list traversals.
+ * @list: link to list of windows for the endpoint
+ * @type: self or peer window
+ * @peer_window: Pointer to peer window. Useful for sending messages to peer
+ *		 without requiring an extra list traversal
+ * @unreg_state: unregistration state
+ * @offset_freed: True if the offset has been freed
+ * @temp: True for temporary windows created via scif_vreadfrom/scif_vwriteto
+ * @mm: memory descriptor for the task_struct which initiated the RMA
+ * @st: scatter gather table for DMA mappings with IOMMU enabled
+ * @pinned_pages: The set of pinned_pages backing this window
+ * @alloc_handle: Handle for sending ALLOC_REQ
+ * @regwq: Wait Queue for an registration (N)ACK
+ * @reg_state: Registration state
+ * @unregwq: Wait Queue for an unregistration (N)ACK
+ * @dma_addr_lookup: Lookup for physical addresses used for DMA
+ * @nr_lookup: Number of entries in lookup
+ * @mapped_offset: Offset used to map the window by the peer
+ * @dma_addr: Array of physical addresses used for Mgmt node & MIC initiated DMA
+ * @num_pages: Array specifying number of pages for each physical address
+ */
+struct scif_window {
+	s64 nr_pages;
+	int nr_contig_chunks;
+	int prot;
+	int ref_count;
+	u64 magic;
+	s64 offset;
+	unsigned long va_for_temp;
+	int dma_mark;
+	u64 ep;
+	struct list_head list;
+	enum scif_window_type type;
+	u64 peer_window;
+	enum scif_msg_state unreg_state;
+	bool offset_freed;
+	bool temp;
+	struct mm_struct *mm;
+	struct sg_table *st;
+	union {
+		struct {
+			struct scif_pinned_pages *pinned_pages;
+			struct scif_allocmsg alloc_handle;
+			wait_queue_head_t regwq;
+			enum scif_msg_state reg_state;
+			wait_queue_head_t unregwq;
+		};
+		struct {
+			struct scif_rma_lookup dma_addr_lookup;
+			struct scif_rma_lookup num_pages_lookup;
+			int nr_lookup;
+			dma_addr_t mapped_offset;
+		};
+	};
+	dma_addr_t *dma_addr;
+	u64 *num_pages;
+} __packed;
+
+/*
+ * scif_mmu_notif - SCIF mmu notifier information
+ *
+ * @mmu_notifier ep_mmu_notifier: MMU notifier operations
+ * @tc_reg_list: List of temp registration windows for self
+ * @mm: memory descriptor for the task_struct which initiated the RMA
+ * @ep: SCIF endpoint
+ * @list: link to list of MMU notifier information
+ */
+struct scif_mmu_notif {
+#ifdef CONFIG_MMU_NOTIFIER
+	struct mmu_notifier ep_mmu_notifier;
+#endif
+	struct list_head tc_reg_list;
+	struct mm_struct *mm;
+	struct scif_endpt *ep;
+	struct list_head list;
+};
+
+enum scif_rma_dir {
+	SCIF_LOCAL_TO_REMOTE,
+	SCIF_REMOTE_TO_LOCAL
+};
+
+extern struct kmem_cache *unaligned_cache;
+/* Initialize RMA for this EP */
+void scif_rma_ep_init(struct scif_endpt *ep);
+/* Check if epd can be uninitialized */
+int scif_rma_ep_can_uninit(struct scif_endpt *ep);
+/* Obtain a new offset. Callee must grab RMA lock */
+int scif_get_window_offset(struct scif_endpt *ep, int flags,
+			   s64 offset, int nr_pages, s64 *out_offset);
+/* Free offset. Callee must grab RMA lock */
+void scif_free_window_offset(struct scif_endpt *ep,
+			     struct scif_window *window, s64 offset);
+/* Create self registration window */
+struct scif_window *scif_create_window(struct scif_endpt *ep, int nr_pages,
+				       s64 offset, bool temp);
+/* Destroy self registration window.*/
+int scif_destroy_window(struct scif_endpt *ep, struct scif_window *window);
+void scif_unmap_window(struct scif_dev *remote_dev, struct scif_window *window);
+/* Map pages of self window to Aperture/PCI */
+int scif_map_window(struct scif_dev *remote_dev,
+		    struct scif_window *window);
+/* Unregister a self window */
+int scif_unregister_window(struct scif_window *window);
+/* Destroy remote registration window */
+void
+scif_destroy_remote_window(struct scif_window *window);
+/* remove valid remote memory mappings from process address space */
+void scif_zap_mmaps(int node);
+/* Query if any applications have remote memory mappings */
+bool scif_rma_do_apps_have_mmaps(int node);
+/* Cleanup remote registration lists for zombie endpoints */
+void scif_cleanup_rma_for_zombies(int node);
+/* Reserve a DMA channel for a particular endpoint */
+int scif_reserve_dma_chan(struct scif_endpt *ep);
+/* Setup a DMA mark for an endpoint */
+int _scif_fence_mark(scif_epd_t epd, int *mark);
+int scif_prog_signal(scif_epd_t epd, off_t offset, u64 val,
+		     enum scif_window_type type);
+void scif_alloc_req(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_alloc_gnt_rej(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_free_virt(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_reg(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_unreg(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_reg_ack(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_reg_nack(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_unreg_ack(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_unreg_nack(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_mark(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_mark_resp(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_wait(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_wait_resp(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_sig_local(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_sig_remote(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_recv_sig_resp(struct scif_dev *scifdev, struct scifmsg *msg);
+void scif_mmu_notif_handler(struct work_struct *work);
+void scif_rma_handle_remote_fences(void);
+void scif_rma_destroy_windows(void);
+void scif_rma_destroy_tcw_invalid(void);
+int scif_drain_dma_intr(struct scif_hw_dev *sdev, struct dma_chan *chan);
+
+struct scif_window_iter {
+	s64 offset;
+	int index;
+};
+
+static inline void
+scif_init_window_iter(struct scif_window *window, struct scif_window_iter *iter)
+{
+	iter->offset = window->offset;
+	iter->index = 0;
+}
+
+dma_addr_t scif_off_to_dma_addr(struct scif_window *window, s64 off,
+				size_t *nr_bytes,
+				struct scif_window_iter *iter);
+static inline
+dma_addr_t __scif_off_to_dma_addr(struct scif_window *window, s64 off)
+{
+	return scif_off_to_dma_addr(window, off, NULL, NULL);
+}
+
+static inline bool scif_unaligned(off_t src_offset, off_t dst_offset)
+{
+	src_offset = src_offset & (L1_CACHE_BYTES - 1);
+	dst_offset = dst_offset & (L1_CACHE_BYTES - 1);
+	return !(src_offset == dst_offset);
+}
+
+/*
+ * scif_zalloc:
+ * @size: Size of the allocation request.
+ *
+ * Helper API which attempts to allocate zeroed pages via
+ * __get_free_pages(..) first and then falls back on
+ * vzalloc(..) if that fails.
+ */
+static inline void *scif_zalloc(size_t size)
+{
+	void *ret = NULL;
+	size_t align = ALIGN(size, PAGE_SIZE);
+
+	if (align && get_order(align) < MAX_ORDER)
+		ret = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					       get_order(align));
+	return ret ? ret : vzalloc(align);
+}
+
+/*
+ * scif_free:
+ * @addr: Address to be freed.
+ * @size: Size of the allocation.
+ * Helper API which frees memory allocated via scif_zalloc().
+ */
+static inline void scif_free(void *addr, size_t size)
+{
+	size_t align = ALIGN(size, PAGE_SIZE);
+
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		free_pages((unsigned long)addr, get_order(align));
+}
+
+static inline void scif_get_window(struct scif_window *window, int nr_pages)
+{
+	window->ref_count += nr_pages;
+}
+
+static inline void scif_put_window(struct scif_window *window, int nr_pages)
+{
+	window->ref_count -= nr_pages;
+}
+
+static inline void scif_set_window_ref(struct scif_window *window, int nr_pages)
+{
+	window->ref_count = nr_pages;
+}
+
+static inline void
+scif_queue_for_cleanup(struct scif_window *window, struct list_head *list)
+{
+	spin_lock(&scif_info.rmalock);
+	list_add_tail(&window->list, list);
+	spin_unlock(&scif_info.rmalock);
+	schedule_work(&scif_info.misc_work);
+}
+
+static inline void __scif_rma_destroy_tcw_helper(struct scif_window *window)
+{
+	list_del_init(&window->list);
+	scif_queue_for_cleanup(window, &scif_info.rma_tc);
+}
+
+static inline bool scif_is_iommu_enabled(void)
+{
+#ifdef CONFIG_INTEL_IOMMU
+	return intel_iommu_enabled;
+#else
+	return false;
+#endif
+}
+#endif /* SCIF_RMA_H */
diff --git a/drivers/misc/mic/scif/scif_rma_list.c b/drivers/misc/mic/scif/scif_rma_list.c
new file mode 100644
index 000000000000..e1ef8daedd5a
--- /dev/null
+++ b/drivers/misc/mic/scif/scif_rma_list.c
@@ -0,0 +1,291 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Intel SCIF driver.
+ *
+ */
+#include "scif_main.h"
+#include <linux/mmu_notifier.h>
+#include <linux/highmem.h>
+
+/*
+ * scif_insert_tcw:
+ *
+ * Insert a temp window to the temp registration list sorted by va_for_temp.
+ * RMA lock must be held.
+ */
+void scif_insert_tcw(struct scif_window *window, struct list_head *head)
+{
+	struct scif_window *curr = NULL;
+	struct scif_window *prev = list_entry(head, struct scif_window, list);
+	struct list_head *item;
+
+	INIT_LIST_HEAD(&window->list);
+	/* Compare with tail and if the entry is new tail add it to the end */
+	if (!list_empty(head)) {
+		curr = list_entry(head->prev, struct scif_window, list);
+		if (curr->va_for_temp < window->va_for_temp) {
+			list_add_tail(&window->list, head);
+			return;
+		}
+	}
+	list_for_each(item, head) {
+		curr = list_entry(item, struct scif_window, list);
+		if (curr->va_for_temp > window->va_for_temp)
+			break;
+		prev = curr;
+	}
+	list_add(&window->list, &prev->list);
+}
+
+/*
+ * scif_insert_window:
+ *
+ * Insert a window to the self registration list sorted by offset.
+ * RMA lock must be held.
+ */
+void scif_insert_window(struct scif_window *window, struct list_head *head)
+{
+	struct scif_window *curr = NULL, *prev = NULL;
+	struct list_head *item;
+
+	INIT_LIST_HEAD(&window->list);
+	list_for_each(item, head) {
+		curr = list_entry(item, struct scif_window, list);
+		if (curr->offset > window->offset)
+			break;
+		prev = curr;
+	}
+	if (!prev)
+		list_add(&window->list, head);
+	else
+		list_add(&window->list, &prev->list);
+	scif_set_window_ref(window, window->nr_pages);
+}
+
+/*
+ * scif_query_tcw:
+ *
+ * Query the temp cached registration list of ep for an overlapping window
+ * in case of permission mismatch, destroy the previous window. if permissions
+ * match and overlap is partial, destroy the window but return the new range
+ * RMA lock must be held.
+ */
+int scif_query_tcw(struct scif_endpt *ep, struct scif_rma_req *req)
+{
+	struct list_head *item, *temp, *head = req->head;
+	struct scif_window *window;
+	u64 start_va_window, start_va_req = req->va_for_temp;
+	u64 end_va_window, end_va_req = start_va_req + req->nr_bytes;
+
+	if (!req->nr_bytes)
+		return -EINVAL;
+	/*
+	 * Avoid traversing the entire list to find out that there
+	 * is no entry that matches
+	 */
+	if (!list_empty(head)) {
+		window = list_last_entry(head, struct scif_window, list);
+		end_va_window = window->va_for_temp +
+			(window->nr_pages << PAGE_SHIFT);
+		if (start_va_req > end_va_window)
+			return -ENXIO;
+	}
+	list_for_each_safe(item, temp, head) {
+		window = list_entry(item, struct scif_window, list);
+		start_va_window = window->va_for_temp;
+		end_va_window = window->va_for_temp +
+			(window->nr_pages << PAGE_SHIFT);
+		if (start_va_req < start_va_window &&
+		    end_va_req < start_va_window)
+			break;
+		if (start_va_req >= end_va_window)
+			continue;
+		if ((window->prot & req->prot) == req->prot) {
+			if (start_va_req >= start_va_window &&
+			    end_va_req <= end_va_window) {
+				*req->out_window = window;
+				return 0;
+			}
+			/* expand window */
+			if (start_va_req < start_va_window) {
+				req->nr_bytes +=
+					start_va_window - start_va_req;
+				req->va_for_temp = start_va_window;
+			}
+			if (end_va_req >= end_va_window)
+				req->nr_bytes += end_va_window - end_va_req;
+		}
+		/* Destroy the old window to create a new one */
+		__scif_rma_destroy_tcw_helper(window);
+		break;
+	}
+	return -ENXIO;
+}
+
+/*
+ * scif_query_window:
+ *
+ * Query the registration list and check if a valid contiguous
+ * range of windows exist.
+ * RMA lock must be held.
+ */
+int scif_query_window(struct scif_rma_req *req)
+{
+	struct list_head *item;
+	struct scif_window *window;
+	s64 end_offset, offset = req->offset;
+	u64 tmp_min, nr_bytes_left = req->nr_bytes;
+
+	if (!req->nr_bytes)
+		return -EINVAL;
+
+	list_for_each(item, req->head) {
+		window = list_entry(item, struct scif_window, list);
+		end_offset = window->offset +
+			(window->nr_pages << PAGE_SHIFT);
+		if (offset < window->offset)
+			/* Offset not found! */
+			return -ENXIO;
+		if (offset >= end_offset)
+			continue;
+		/* Check read/write protections. */
+		if ((window->prot & req->prot) != req->prot)
+			return -EPERM;
+		if (nr_bytes_left == req->nr_bytes)
+			/* Store the first window */
+			*req->out_window = window;
+		tmp_min = min((u64)end_offset - offset, nr_bytes_left);
+		nr_bytes_left -= tmp_min;
+		offset += tmp_min;
+		/*
+		 * Range requested encompasses
+		 * multiple windows contiguously.
+		 */
+		if (!nr_bytes_left) {
+			/* Done for partial window */
+			if (req->type == SCIF_WINDOW_PARTIAL ||
+			    req->type == SCIF_WINDOW_SINGLE)
+				return 0;
+			/* Extra logic for full windows */
+			if (offset == end_offset)
+				/* Spanning multiple whole windows */
+				return 0;
+				/* Not spanning multiple whole windows */
+			return -ENXIO;
+		}
+		if (req->type == SCIF_WINDOW_SINGLE)
+			break;
+	}
+	dev_err(scif_info.mdev.this_device,
+		"%s %d ENXIO\n", __func__, __LINE__);
+	return -ENXIO;
+}
+
+/*
+ * scif_rma_list_unregister:
+ *
+ * Traverse the self registration list starting from window:
+ * 1) Call scif_unregister_window(..)
+ * RMA lock must be held.
+ */
+int scif_rma_list_unregister(struct scif_window *window,
+			     s64 offset, int nr_pages)
+{
+	struct scif_endpt *ep = (struct scif_endpt *)window->ep;
+	struct list_head *head = &ep->rma_info.reg_list;
+	s64 end_offset;
+	int err = 0;
+	int loop_nr_pages;
+	struct scif_window *_window;
+
+	list_for_each_entry_safe_from(window, _window, head, list) {
+		end_offset = window->offset + (window->nr_pages << PAGE_SHIFT);
+		loop_nr_pages = min((int)((end_offset - offset) >> PAGE_SHIFT),
+				    nr_pages);
+		err = scif_unregister_window(window);
+		if (err)
+			return err;
+		nr_pages -= loop_nr_pages;
+		offset += (loop_nr_pages << PAGE_SHIFT);
+		if (!nr_pages)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * scif_unmap_all_window:
+ *
+ * Traverse all the windows in the self registration list and:
+ * 1) Delete any DMA mappings created
+ */
+void scif_unmap_all_windows(scif_epd_t epd)
+{
+	struct list_head *item, *tmp;
+	struct scif_window *window;
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	struct list_head *head = &ep->rma_info.reg_list;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+	list_for_each_safe(item, tmp, head) {
+		window = list_entry(item, struct scif_window, list);
+		scif_unmap_window(ep->remote_dev, window);
+	}
+	mutex_unlock(&ep->rma_info.rma_lock);
+}
+
+/*
+ * scif_unregister_all_window:
+ *
+ * Traverse all the windows in the self registration list and:
+ * 1) Call scif_unregister_window(..)
+ * RMA lock must be held.
+ */
+int scif_unregister_all_windows(scif_epd_t epd)
+{
+	struct list_head *item, *tmp;
+	struct scif_window *window;
+	struct scif_endpt *ep = (struct scif_endpt *)epd;
+	struct list_head *head = &ep->rma_info.reg_list;
+	int err = 0;
+
+	mutex_lock(&ep->rma_info.rma_lock);
+retry:
+	item = NULL;
+	tmp = NULL;
+	list_for_each_safe(item, tmp, head) {
+		window = list_entry(item, struct scif_window, list);
+		ep->rma_info.async_list_del = 0;
+		err = scif_unregister_window(window);
+		if (err)
+			dev_err(scif_info.mdev.this_device,
+				"%s %d err %d\n",
+				__func__, __LINE__, err);
+		/*
+		 * Need to restart list traversal if there has been
+		 * an asynchronous list entry deletion.
+		 */
+		if (ACCESS_ONCE(ep->rma_info.async_list_del))
+			goto retry;
+	}
+	mutex_unlock(&ep->rma_info.rma_lock);
+	if (!list_empty(&ep->rma_info.mmn_list)) {
+		spin_lock(&scif_info.rmalock);
+		list_add_tail(&ep->mmu_list, &scif_info.mmu_notif_cleanup);
+		spin_unlock(&scif_info.rmalock);
+		schedule_work(&scif_info.mmu_notif_work);
+	}
+	return err;
+}
diff --git a/drivers/misc/mic/scif/scif_rma_list.h b/drivers/misc/mic/scif/scif_rma_list.h
new file mode 100644
index 000000000000..7d58d1d551b0
--- /dev/null
+++ b/drivers/misc/mic/scif/scif_rma_list.h
@@ -0,0 +1,57 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Intel SCIF driver.
+ *
+ */
+#ifndef SCIF_RMA_LIST_H
+#define SCIF_RMA_LIST_H
+
+/*
+ * struct scif_rma_req - Self Registration list RMA Request query
+ *
+ * @out_window - Returns the window if found
+ * @offset: Starting offset
+ * @nr_bytes: number of bytes
+ * @prot: protection requested i.e. read or write or both
+ * @type: Specify single, partial or multiple windows
+ * @head: Head of list on which to search
+ * @va_for_temp: VA for searching temporary cached windows
+ */
+struct scif_rma_req {
+	struct scif_window **out_window;
+	union {
+		s64 offset;
+		unsigned long va_for_temp;
+	};
+	size_t nr_bytes;
+	int prot;
+	enum scif_window_type type;
+	struct list_head *head;
+};
+
+/* Insert */
+void scif_insert_window(struct scif_window *window, struct list_head *head);
+void scif_insert_tcw(struct scif_window *window,
+		     struct list_head *head);
+/* Query */
+int scif_query_window(struct scif_rma_req *request);
+int scif_query_tcw(struct scif_endpt *ep, struct scif_rma_req *request);
+/* Called from close to unregister all self windows */
+int scif_unregister_all_windows(scif_epd_t epd);
+void scif_unmap_all_windows(scif_epd_t epd);
+/* Traverse list and unregister */
+int scif_rma_list_unregister(struct scif_window *window, s64 offset,
+			     int nr_pages);
+#endif /* SCIF_RMA_LIST_H */
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
index 2f30badc6ffd..1ee8e82ba710 100644
--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -196,12 +196,6 @@ void tfh_write_restart(struct gru_tlb_fault_handle *tfh,
 	start_instruction(tfh);
 }
 
-void tfh_restart(struct gru_tlb_fault_handle *tfh)
-{
-	tfh->opc = TFHOP_RESTART;
-	start_instruction(tfh);
-}
-
 void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh)
 {
 	tfh->opc = TFHOP_USER_POLLING_MODE;
diff --git a/drivers/misc/sgi-gru/gruhandles.h b/drivers/misc/sgi-gru/gruhandles.h
index 3f998b924d8f..3d7bd36a1c89 100644
--- a/drivers/misc/sgi-gru/gruhandles.h
+++ b/drivers/misc/sgi-gru/gruhandles.h
@@ -524,7 +524,6 @@ int tfh_write_only(struct gru_tlb_fault_handle *tfh, unsigned long paddr,
 	int gaa, unsigned long vaddr, int asid, int dirty, int pagesize);
 void tfh_write_restart(struct gru_tlb_fault_handle *tfh, unsigned long paddr,
 	int gaa, unsigned long vaddr, int asid, int dirty, int pagesize);
-void tfh_restart(struct gru_tlb_fault_handle *tfh);
 void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh);
 void tfh_exception(struct gru_tlb_fault_handle *tfh);
 
diff --git a/drivers/misc/sgi-gru/grukdump.c b/drivers/misc/sgi-gru/grukdump.c
index a3700a56b8ff..313da3150262 100644
--- a/drivers/misc/sgi-gru/grukdump.c
+++ b/drivers/misc/sgi-gru/grukdump.c
@@ -78,11 +78,10 @@ static int gru_dump_tfm(struct gru_state *gru,
 		void __user *ubuf, void __user *ubufend)
 {
 	struct gru_tlb_fault_map *tfm;
-	int i, ret, bytes;
+	int i;
 
-	bytes = GRU_NUM_TFM * GRU_CACHE_LINE_BYTES;
-	if (bytes > ubufend - ubuf)
-		ret = -EFBIG;
+	if (GRU_NUM_TFM * GRU_CACHE_LINE_BYTES > ubufend - ubuf)
+		return -EFBIG;
 
 	for (i = 0; i < GRU_NUM_TFM; i++) {
 		tfm = get_tfm(gru->gs_gru_base_vaddr, i);
@@ -99,11 +98,10 @@ static int gru_dump_tgh(struct gru_state *gru,
 		void __user *ubuf, void __user *ubufend)
 {
 	struct gru_tlb_global_handle *tgh;
-	int i, ret, bytes;
+	int i;
 
-	bytes = GRU_NUM_TGH * GRU_CACHE_LINE_BYTES;
-	if (bytes > ubufend - ubuf)
-		ret = -EFBIG;
+	if (GRU_NUM_TGH * GRU_CACHE_LINE_BYTES > ubufend - ubuf)
+		return -EFBIG;
 
 	for (i = 0; i < GRU_NUM_TGH; i++) {
 		tgh = get_tgh(gru->gs_gru_base_vaddr, i);
@@ -196,7 +194,7 @@ int gru_dump_chiplet_request(unsigned long arg)
 		return -EFAULT;
 
 	/* Currently, only dump by gid is implemented */
-	if (req.gid >= gru_max_gids || req.gid < 0)
+	if (req.gid >= gru_max_gids)
 		return -EINVAL;
 
 	gru = GID_TO_GRU(req.gid);
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
index 913de07e577c..967b9dd24fe9 100644
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -160,7 +160,12 @@ static void gru_load_kernel_context(struct gru_blade_state *bs, int blade_id)
 	down_write(&bs->bs_kgts_sema);
 
 	if (!bs->bs_kgts) {
-		bs->bs_kgts = gru_alloc_gts(NULL, 0, 0, 0, 0, 0);
+		do {
+			bs->bs_kgts = gru_alloc_gts(NULL, 0, 0, 0, 0, 0);
+			if (!IS_ERR(bs->bs_kgts))
+				break;
+			msleep(1);
+		} while (true);
 		bs->bs_kgts->ts_user_blade_id = blade_id;
 	}
 	kgts = bs->bs_kgts;
@@ -429,8 +434,8 @@ int gru_get_cb_exception_detail(void *cb,
 	return 0;
 }
 
-char *gru_get_cb_exception_detail_str(int ret, void *cb,
-				      char *buf, int size)
+static char *gru_get_cb_exception_detail_str(int ret, void *cb,
+					     char *buf, int size)
 {
 	struct gru_control_block_status *gen = (void *)cb;
 	struct control_block_extended_exc_detail excdet;
@@ -505,7 +510,7 @@ int gru_wait_proc(void *cb)
 	return ret;
 }
 
-void gru_abort(int ret, void *cb, char *str)
+static void gru_abort(int ret, void *cb, char *str)
 {
 	char buf[GRU_EXC_STR_SIZE];
 
@@ -997,7 +1002,6 @@ static int quicktest1(unsigned long arg)
 {
 	struct gru_message_queue_desc mqd;
 	void *p, *mq;
-	unsigned long *dw;
 	int i, ret = -EIO;
 	char mes[GRU_CACHE_LINE_BYTES], *m;
 
@@ -1007,7 +1011,6 @@ static int quicktest1(unsigned long arg)
 		return -ENOMEM;
 	mq = ALIGNUP(p, 1024);
 	memset(mes, 0xee, sizeof(mes));
-	dw = mq;
 
 	gru_create_message_queue(&mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);
 	for (i = 0; i < 6; i++) {
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index ae16c8cb4f3e..1525870f460a 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -930,6 +930,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct gru_thread_state *gts;
 	unsigned long paddr, vaddr;
+	unsigned long expires;
 
 	vaddr = (unsigned long)vmf->virtual_address;
 	gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
@@ -954,7 +955,8 @@ again:
 			mutex_unlock(&gts->ts_ctxlock);
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(GRU_ASSIGN_DELAY);  /* true hack ZZZ */
-			if (gts->ts_steal_jiffies + GRU_STEAL_DELAY < jiffies)
+			expires = gts->ts_steal_jiffies + GRU_STEAL_DELAY;
+			if (time_before(expires, jiffies))
 				gru_steal_context(gts);
 			goto again;
 		}
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
index 2129274ef7ab..e936d43895d2 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -306,19 +306,20 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
 		atomic_inc(&gms->ms_refcnt);
 	} else {
 		gms = kzalloc(sizeof(*gms), GFP_KERNEL);
-		if (gms) {
-			STAT(gms_alloc);
-			spin_lock_init(&gms->ms_asid_lock);
-			gms->ms_notifier.ops = &gru_mmuops;
-			atomic_set(&gms->ms_refcnt, 1);
-			init_waitqueue_head(&gms->ms_wait_queue);
-			err = __mmu_notifier_register(&gms->ms_notifier, current->mm);
-			if (err)
-				goto error;
-		}
+		if (!gms)
+			return ERR_PTR(-ENOMEM);
+		STAT(gms_alloc);
+		spin_lock_init(&gms->ms_asid_lock);
+		gms->ms_notifier.ops = &gru_mmuops;
+		atomic_set(&gms->ms_refcnt, 1);
+		init_waitqueue_head(&gms->ms_wait_queue);
+		err = __mmu_notifier_register(&gms->ms_notifier, current->mm);
+		if (err)
+			goto error;
 	}
-	gru_dbg(grudev, "gms %p, refcnt %d\n", gms,
-		atomic_read(&gms->ms_refcnt));
+	if (gms)
+		gru_dbg(grudev, "gms %p, refcnt %d\n", gms,
+			atomic_read(&gms->ms_refcnt));
 	return gms;
 error:
 	kfree(gms);
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index c8c6a363069c..6e3af8b42cdd 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -460,6 +460,13 @@ static void st_int_enqueue(struct st_data_s *st_gdata, struct sk_buff *skb)
  * - TTY layer when write's finished
  * - st_write (in context of the protocol stack)
  */
+static void work_fn_write_wakeup(struct work_struct *work)
+{
+	struct st_data_s *st_gdata = container_of(work, struct st_data_s,
+			work_write_wakeup);
+
+	st_tx_wakeup((void *)st_gdata);
+}
 void st_tx_wakeup(struct st_data_s *st_data)
 {
 	struct sk_buff *skb;
@@ -812,8 +819,12 @@ static void st_tty_wakeup(struct tty_struct *tty)
 	/* don't do an wakeup for now */
 	clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
 
-	/* call our internal wakeup */
-	st_tx_wakeup((void *)st_gdata);
+	/*
+	 * schedule the internal wakeup instead of calling directly to
+	 * avoid lockup (port->lock needed in tty->ops->write is
+	 * already taken here
+	 */
+	schedule_work(&st_gdata->work_write_wakeup);
 }
 
 static void st_tty_flush_buffer(struct tty_struct *tty)
@@ -881,6 +892,9 @@ int st_core_init(struct st_data_s **core_data)
 			pr_err("unable to un-register ldisc");
 		return err;
 	}
+
+	INIT_WORK(&st_gdata->work_write_wakeup, work_fn_write_wakeup);
+
 	*core_data = st_gdata;
 	return 0;
 }
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index ffb56340d0c7..89300870fefb 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1,7 +1,7 @@
 /*
  * VMware Balloon driver.
  *
- * Copyright (C) 2000-2010, VMware, Inc. All Rights Reserved.
+ * Copyright (C) 2000-2014, VMware, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -37,16 +37,19 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/vmalloc.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/workqueue.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/vmw_vmci_defs.h>
+#include <linux/vmw_vmci_api.h>
 #include <asm/hypervisor.h>
 
 MODULE_AUTHOR("VMware, Inc.");
 MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
-MODULE_VERSION("1.3.0.0-k");
+MODULE_VERSION("1.5.0.0-k");
 MODULE_ALIAS("dmi:*:svnVMware*:*");
 MODULE_ALIAS("vmware_vmmemctl");
 MODULE_LICENSE("GPL");
@@ -57,12 +60,6 @@ MODULE_LICENSE("GPL");
  */
 
 /*
- * Rate of allocating memory when there is no memory pressure
- * (driver performs non-sleeping allocations).
- */
-#define VMW_BALLOON_NOSLEEP_ALLOC_MAX	16384U
-
-/*
  * Rates of memory allocaton when guest experiences memory pressure
  * (driver performs sleeping allocations).
  */
@@ -71,13 +68,6 @@ MODULE_LICENSE("GPL");
 #define VMW_BALLOON_RATE_ALLOC_INC	16U
 
 /*
- * Rates for releasing pages while deflating balloon.
- */
-#define VMW_BALLOON_RATE_FREE_MIN	512U
-#define VMW_BALLOON_RATE_FREE_MAX	16384U
-#define VMW_BALLOON_RATE_FREE_INC	16U
-
-/*
  * When guest is under memory pressure, use a reduced page allocation
  * rate for next several cycles.
  */
@@ -99,9 +89,6 @@ MODULE_LICENSE("GPL");
  */
 #define VMW_PAGE_ALLOC_CANSLEEP		(GFP_HIGHUSER)
 
-/* Maximum number of page allocations without yielding processor */
-#define VMW_BALLOON_YIELD_THRESHOLD	1024
-
 /* Maximum number of refused pages we accumulate during inflation cycle */
 #define VMW_BALLOON_MAX_REFUSED		16
 
@@ -116,17 +103,45 @@ enum vmwballoon_capabilities {
 	/*
 	 * Bit 0 is reserved and not associated to any capability.
 	 */
-	VMW_BALLOON_BASIC_CMDS		= (1 << 1),
-	VMW_BALLOON_BATCHED_CMDS	= (1 << 2)
+	VMW_BALLOON_BASIC_CMDS			= (1 << 1),
+	VMW_BALLOON_BATCHED_CMDS		= (1 << 2),
+	VMW_BALLOON_BATCHED_2M_CMDS		= (1 << 3),
+	VMW_BALLOON_SIGNALLED_WAKEUP_CMD	= (1 << 4),
 };
 
-#define VMW_BALLOON_CAPABILITIES	(VMW_BALLOON_BASIC_CMDS)
+#define VMW_BALLOON_CAPABILITIES	(VMW_BALLOON_BASIC_CMDS \
+					| VMW_BALLOON_BATCHED_CMDS \
+					| VMW_BALLOON_BATCHED_2M_CMDS \
+					| VMW_BALLOON_SIGNALLED_WAKEUP_CMD)
+
+#define VMW_BALLOON_2M_SHIFT		(9)
+#define VMW_BALLOON_NUM_PAGE_SIZES	(2)
+
+/*
+ * Backdoor commands availability:
+ *
+ * START, GET_TARGET and GUEST_ID are always available,
+ *
+ * VMW_BALLOON_BASIC_CMDS:
+ *	LOCK and UNLOCK commands,
+ * VMW_BALLOON_BATCHED_CMDS:
+ *	BATCHED_LOCK and BATCHED_UNLOCK commands.
+ * VMW BALLOON_BATCHED_2M_CMDS:
+ *	BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
+ * VMW VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
+ *	VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
+ */
+#define VMW_BALLOON_CMD_START			0
+#define VMW_BALLOON_CMD_GET_TARGET		1
+#define VMW_BALLOON_CMD_LOCK			2
+#define VMW_BALLOON_CMD_UNLOCK			3
+#define VMW_BALLOON_CMD_GUEST_ID		4
+#define VMW_BALLOON_CMD_BATCHED_LOCK		6
+#define VMW_BALLOON_CMD_BATCHED_UNLOCK		7
+#define VMW_BALLOON_CMD_BATCHED_2M_LOCK		8
+#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK	9
+#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET	10
 
-#define VMW_BALLOON_CMD_START		0
-#define VMW_BALLOON_CMD_GET_TARGET	1
-#define VMW_BALLOON_CMD_LOCK		2
-#define VMW_BALLOON_CMD_UNLOCK		3
-#define VMW_BALLOON_CMD_GUEST_ID	4
 
 /* error codes */
 #define VMW_BALLOON_SUCCESS		        0
@@ -142,18 +157,60 @@ enum vmwballoon_capabilities {
 
 #define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES	(0x03000000)
 
-#define VMWARE_BALLOON_CMD(cmd, data, result)			\
+/* Batch page description */
+
+/*
+ * Layout of a page in the batch page:
+ *
+ * +-------------+----------+--------+
+ * |             |          |        |
+ * | Page number | Reserved | Status |
+ * |             |          |        |
+ * +-------------+----------+--------+
+ * 64  PAGE_SHIFT          6         0
+ *
+ * The reserved field should be set to 0.
+ */
+#define VMW_BALLOON_BATCH_MAX_PAGES	(PAGE_SIZE / sizeof(u64))
+#define VMW_BALLOON_BATCH_STATUS_MASK	((1UL << 5) - 1)
+#define VMW_BALLOON_BATCH_PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
+
+struct vmballoon_batch_page {
+	u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
+};
+
+static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
+{
+	return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
+}
+
+static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
+				int idx)
+{
+	return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
+}
+
+static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
+				u64 pa)
+{
+	batch->pages[idx] = pa;
+}
+
+
+#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)		\
 ({								\
-	unsigned long __status, __dummy1, __dummy2;		\
+	unsigned long __status, __dummy1, __dummy2, __dummy3;	\
 	__asm__ __volatile__ ("inl %%dx" :			\
 		"=a"(__status),					\
 		"=c"(__dummy1),					\
 		"=d"(__dummy2),					\
-		"=b"(result) :					\
+		"=b"(result),					\
+		"=S" (__dummy3) :				\
 		"0"(VMW_BALLOON_HV_MAGIC),			\
 		"1"(VMW_BALLOON_CMD_##cmd),			\
 		"2"(VMW_BALLOON_HV_PORT),			\
-		"3"(data) :					\
+		"3"(arg1),					\
+		"4" (arg2) :					\
 		"memory");					\
 	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)	\
 		result = __dummy1;				\
@@ -164,27 +221,30 @@ enum vmwballoon_capabilities {
 #ifdef CONFIG_DEBUG_FS
 struct vmballoon_stats {
 	unsigned int timer;
+	unsigned int doorbell;
 
 	/* allocation statistics */
-	unsigned int alloc;
-	unsigned int alloc_fail;
+	unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
+	unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
 	unsigned int sleep_alloc;
 	unsigned int sleep_alloc_fail;
-	unsigned int refused_alloc;
-	unsigned int refused_free;
-	unsigned int free;
+	unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
+	unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
+	unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];
 
 	/* monitor operations */
-	unsigned int lock;
-	unsigned int lock_fail;
-	unsigned int unlock;
-	unsigned int unlock_fail;
+	unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
+	unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
+	unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
+	unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
 	unsigned int target;
 	unsigned int target_fail;
 	unsigned int start;
 	unsigned int start_fail;
 	unsigned int guest_type;
 	unsigned int guest_type_fail;
+	unsigned int doorbell_set;
+	unsigned int doorbell_unset;
 };
 
 #define STATS_INC(stat) (stat)++
@@ -192,14 +252,30 @@ struct vmballoon_stats {
 #define STATS_INC(stat)
 #endif
 
-struct vmballoon {
+struct vmballoon;
 
+struct vmballoon_ops {
+	void (*add_page)(struct vmballoon *b, int idx, struct page *p);
+	int (*lock)(struct vmballoon *b, unsigned int num_pages,
+			bool is_2m_pages, unsigned int *target);
+	int (*unlock)(struct vmballoon *b, unsigned int num_pages,
+			bool is_2m_pages, unsigned int *target);
+};
+
+struct vmballoon_page_size {
 	/* list of reserved physical pages */
 	struct list_head pages;
 
 	/* transient list of non-balloonable pages */
 	struct list_head refused_pages;
 	unsigned int n_refused_pages;
+};
+
+struct vmballoon {
+	struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];
+
+	/* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
+	unsigned supported_page_sizes;
 
 	/* balloon size in pages */
 	unsigned int size;
@@ -210,11 +286,18 @@ struct vmballoon {
 
 	/* adjustment rates (pages per second) */
 	unsigned int rate_alloc;
-	unsigned int rate_free;
 
 	/* slowdown page allocations for next few cycles */
 	unsigned int slow_allocation_cycles;
 
+	unsigned long capabilities;
+
+	struct vmballoon_batch_page *batch_page;
+	unsigned int batch_max_pages;
+	struct page *page;
+
+	const struct vmballoon_ops *ops;
+
 #ifdef CONFIG_DEBUG_FS
 	/* statistics */
 	struct vmballoon_stats stats;
@@ -226,6 +309,8 @@ struct vmballoon {
 	struct sysinfo sysinfo;
 
 	struct delayed_work dwork;
+
+	struct vmci_handle vmci_doorbell;
 };
 
 static struct vmballoon balloon;
@@ -234,20 +319,38 @@ static struct vmballoon balloon;
  * Send "start" command to the host, communicating supported version
  * of the protocol.
  */
-static bool vmballoon_send_start(struct vmballoon *b)
+static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
 {
-	unsigned long status, capabilities;
+	unsigned long status, capabilities, dummy = 0;
+	bool success;
 
 	STATS_INC(b->stats.start);
 
-	status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_CAPABILITIES,
-				capabilities);
-	if (status == VMW_BALLOON_SUCCESS)
-		return true;
+	status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
 
-	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
-	STATS_INC(b->stats.start_fail);
-	return false;
+	switch (status) {
+	case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
+		b->capabilities = capabilities;
+		success = true;
+		break;
+	case VMW_BALLOON_SUCCESS:
+		b->capabilities = VMW_BALLOON_BASIC_CMDS;
+		success = true;
+		break;
+	default:
+		success = false;
+	}
+
+	if (b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS)
+		b->supported_page_sizes = 2;
+	else
+		b->supported_page_sizes = 1;
+
+	if (!success) {
+		pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+		STATS_INC(b->stats.start_fail);
+	}
+	return success;
 }
 
 static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
@@ -273,9 +376,10 @@ static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
  */
 static bool vmballoon_send_guest_id(struct vmballoon *b)
 {
-	unsigned long status, dummy;
+	unsigned long status, dummy = 0;
 
-	status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy);
+	status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
+				dummy);
 
 	STATS_INC(b->stats.guest_type);
 
@@ -287,6 +391,14 @@ static bool vmballoon_send_guest_id(struct vmballoon *b)
 	return false;
 }
 
+static u16 vmballoon_page_size(bool is_2m_page)
+{
+	if (is_2m_page)
+		return 1 << VMW_BALLOON_2M_SHIFT;
+
+	return 1;
+}
+
 /*
  * Retrieve desired balloon size from the host.
  */
@@ -295,6 +407,7 @@ static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
 	unsigned long status;
 	unsigned long target;
 	unsigned long limit;
+	unsigned long dummy = 0;
 	u32 limit32;
 
 	/*
@@ -313,7 +426,7 @@ static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
 	/* update stats */
 	STATS_INC(b->stats.target);
 
-	status = VMWARE_BALLOON_CMD(GET_TARGET, limit, target);
+	status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
 	if (vmballoon_check_status(b, status)) {
 		*new_target = target;
 		return true;
@@ -330,23 +443,46 @@ static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
  * check the return value and maybe submit a different page.
  */
 static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
-				     unsigned int *hv_status)
+				unsigned int *hv_status, unsigned int *target)
 {
-	unsigned long status, dummy;
+	unsigned long status, dummy = 0;
 	u32 pfn32;
 
 	pfn32 = (u32)pfn;
 	if (pfn32 != pfn)
 		return -1;
 
-	STATS_INC(b->stats.lock);
+	STATS_INC(b->stats.lock[false]);
 
-	*hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy);
+	*hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
 	if (vmballoon_check_status(b, status))
 		return 0;
 
 	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
-	STATS_INC(b->stats.lock_fail);
+	STATS_INC(b->stats.lock_fail[false]);
+	return 1;
+}
+
+static int vmballoon_send_batched_lock(struct vmballoon *b,
+		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
+{
+	unsigned long status;
+	unsigned long pfn = page_to_pfn(b->page);
+
+	STATS_INC(b->stats.lock[is_2m_pages]);
+
+	if (is_2m_pages)
+		status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
+				*target);
+	else
+		status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
+				*target);
+
+	if (vmballoon_check_status(b, status))
+		return 0;
+
+	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
+	STATS_INC(b->stats.lock_fail[is_2m_pages]);
 	return 1;
 }
 
@@ -354,26 +490,66 @@ static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
  * Notify the host that guest intends to release given page back into
  * the pool of available (to the guest) pages.
  */
-static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn)
+static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
+							unsigned int *target)
 {
-	unsigned long status, dummy;
+	unsigned long status, dummy = 0;
 	u32 pfn32;
 
 	pfn32 = (u32)pfn;
 	if (pfn32 != pfn)
 		return false;
 
-	STATS_INC(b->stats.unlock);
+	STATS_INC(b->stats.unlock[false]);
 
-	status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy);
+	status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
 	if (vmballoon_check_status(b, status))
 		return true;
 
 	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
-	STATS_INC(b->stats.unlock_fail);
+	STATS_INC(b->stats.unlock_fail[false]);
+	return false;
+}
+
+static bool vmballoon_send_batched_unlock(struct vmballoon *b,
+		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
+{
+	unsigned long status;
+	unsigned long pfn = page_to_pfn(b->page);
+
+	STATS_INC(b->stats.unlock[is_2m_pages]);
+
+	if (is_2m_pages)
+		status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
+				*target);
+	else
+		status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
+				*target);
+
+	if (vmballoon_check_status(b, status))
+		return true;
+
+	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
+	STATS_INC(b->stats.unlock_fail[is_2m_pages]);
 	return false;
 }
 
+static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
+{
+	if (is_2m_page)
+		return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
+
+	return alloc_page(flags);
+}
+
+static void vmballoon_free_page(struct page *page, bool is_2m_page)
+{
+	if (is_2m_page)
+		__free_pages(page, VMW_BALLOON_2M_SHIFT);
+	else
+		__free_page(page);
+}
+
 /*
  * Quickly release all pages allocated for the balloon. This function is
  * called when host decides to "reset" balloon for one reason or another.
@@ -383,35 +559,31 @@ static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn)
 static void vmballoon_pop(struct vmballoon *b)
 {
 	struct page *page, *next;
-	unsigned int count = 0;
-
-	list_for_each_entry_safe(page, next, &b->pages, lru) {
-		list_del(&page->lru);
-		__free_page(page);
-		STATS_INC(b->stats.free);
-		b->size--;
-
-		if (++count >= b->rate_free) {
-			count = 0;
+	unsigned is_2m_pages;
+
+	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
+			is_2m_pages++) {
+		struct vmballoon_page_size *page_size =
+				&b->page_sizes[is_2m_pages];
+		u16 size_per_page = vmballoon_page_size(is_2m_pages);
+
+		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
+			list_del(&page->lru);
+			vmballoon_free_page(page, is_2m_pages);
+			STATS_INC(b->stats.free[is_2m_pages]);
+			b->size -= size_per_page;
 			cond_resched();
 		}
 	}
-}
 
-/*
- * Perform standard reset sequence by popping the balloon (in case it
- * is not  empty) and then restarting protocol. This operation normally
- * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
- */
-static void vmballoon_reset(struct vmballoon *b)
-{
-	/* free all pages, skipping monitor unlock */
-	vmballoon_pop(b);
+	if (b->batch_page) {
+		vunmap(b->batch_page);
+		b->batch_page = NULL;
+	}
 
-	if (vmballoon_send_start(b)) {
-		b->reset_required = false;
-		if (!vmballoon_send_guest_id(b))
-			pr_err("failed to send guest ID to the host\n");
+	if (b->page) {
+		__free_page(b->page);
+		b->page = NULL;
 	}
 }
 
@@ -420,17 +592,23 @@ static void vmballoon_reset(struct vmballoon *b)
  * refuse list, those refused page are then released at the end of the
  * inflation cycle.
  */
-static int vmballoon_lock_page(struct vmballoon *b, struct page *page)
+static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
+				bool is_2m_pages, unsigned int *target)
 {
 	int locked, hv_status;
+	struct page *page = b->page;
+	struct vmballoon_page_size *page_size = &b->page_sizes[false];
+
+	/* is_2m_pages can never happen as 2m pages support implies batching */
 
-	locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status);
+	locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
+								target);
 	if (locked > 0) {
-		STATS_INC(b->stats.refused_alloc);
+		STATS_INC(b->stats.refused_alloc[false]);
 
 		if (hv_status == VMW_BALLOON_ERROR_RESET ||
 				hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED) {
-			__free_page(page);
+			vmballoon_free_page(page, false);
 			return -EIO;
 		}
 
@@ -439,17 +617,17 @@ static int vmballoon_lock_page(struct vmballoon *b, struct page *page)
 		 * and retry allocation, unless we already accumulated
 		 * too many of them, in which case take a breather.
 		 */
-		if (b->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
-			b->n_refused_pages++;
-			list_add(&page->lru, &b->refused_pages);
+		if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
+			page_size->n_refused_pages++;
+			list_add(&page->lru, &page_size->refused_pages);
 		} else {
-			__free_page(page);
+			vmballoon_free_page(page, false);
 		}
 		return -EIO;
 	}
 
 	/* track allocated page */
-	list_add(&page->lru, &b->pages);
+	list_add(&page->lru, &page_size->pages);
 
 	/* update balloon size */
 	b->size++;
@@ -457,21 +635,81 @@ static int vmballoon_lock_page(struct vmballoon *b, struct page *page)
 	return 0;
 }
 
+static int vmballoon_lock_batched_page(struct vmballoon *b,
+		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
+{
+	int locked, i;
+	u16 size_per_page = vmballoon_page_size(is_2m_pages);
+
+	locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
+			target);
+	if (locked > 0) {
+		for (i = 0; i < num_pages; i++) {
+			u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
+			struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
+
+			vmballoon_free_page(p, is_2m_pages);
+		}
+
+		return -EIO;
+	}
+
+	for (i = 0; i < num_pages; i++) {
+		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
+		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
+		struct vmballoon_page_size *page_size =
+				&b->page_sizes[is_2m_pages];
+
+		locked = vmballoon_batch_get_status(b->batch_page, i);
+
+		switch (locked) {
+		case VMW_BALLOON_SUCCESS:
+			list_add(&p->lru, &page_size->pages);
+			b->size += size_per_page;
+			break;
+		case VMW_BALLOON_ERROR_PPN_PINNED:
+		case VMW_BALLOON_ERROR_PPN_INVALID:
+			if (page_size->n_refused_pages
+					< VMW_BALLOON_MAX_REFUSED) {
+				list_add(&p->lru, &page_size->refused_pages);
+				page_size->n_refused_pages++;
+				break;
+			}
+			/* Fallthrough */
+		case VMW_BALLOON_ERROR_RESET:
+		case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
+			vmballoon_free_page(p, is_2m_pages);
+			break;
+		default:
+			/* This should never happen */
+			WARN_ON_ONCE(true);
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Release the page allocated for the balloon. Note that we first notify
  * the host so it can make sure the page will be available for the guest
  * to use, if needed.
  */
-static int vmballoon_release_page(struct vmballoon *b, struct page *page)
+static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
+		bool is_2m_pages, unsigned int *target)
 {
-	if (!vmballoon_send_unlock_page(b, page_to_pfn(page)))
-		return -EIO;
+	struct page *page = b->page;
+	struct vmballoon_page_size *page_size = &b->page_sizes[false];
+
+	/* is_2m_pages can never happen as 2m pages support implies batching */
 
-	list_del(&page->lru);
+	if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
+		list_add(&page->lru, &page_size->pages);
+		return -EIO;
+	}
 
 	/* deallocate page */
-	__free_page(page);
-	STATS_INC(b->stats.free);
+	vmballoon_free_page(page, false);
+	STATS_INC(b->stats.free[false]);
 
 	/* update balloon size */
 	b->size--;
@@ -479,21 +717,76 @@ static int vmballoon_release_page(struct vmballoon *b, struct page *page)
 	return 0;
 }
 
+static int vmballoon_unlock_batched_page(struct vmballoon *b,
+				unsigned int num_pages, bool is_2m_pages,
+				unsigned int *target)
+{
+	int locked, i, ret = 0;
+	bool hv_success;
+	u16 size_per_page = vmballoon_page_size(is_2m_pages);
+
+	hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
+			target);
+	if (!hv_success)
+		ret = -EIO;
+
+	for (i = 0; i < num_pages; i++) {
+		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
+		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
+		struct vmballoon_page_size *page_size =
+				&b->page_sizes[is_2m_pages];
+
+		locked = vmballoon_batch_get_status(b->batch_page, i);
+		if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
+			/*
+			 * That page wasn't successfully unlocked by the
+			 * hypervisor, re-add it to the list of pages owned by
+			 * the balloon driver.
+			 */
+			list_add(&p->lru, &page_size->pages);
+		} else {
+			/* deallocate page */
+			vmballoon_free_page(p, is_2m_pages);
+			STATS_INC(b->stats.free[is_2m_pages]);
+
+			/* update balloon size */
+			b->size -= size_per_page;
+		}
+	}
+
+	return ret;
+}
+
 /*
  * Release pages that were allocated while attempting to inflate the
  * balloon but were refused by the host for one reason or another.
  */
-static void vmballoon_release_refused_pages(struct vmballoon *b)
+static void vmballoon_release_refused_pages(struct vmballoon *b,
+		bool is_2m_pages)
 {
 	struct page *page, *next;
+	struct vmballoon_page_size *page_size =
+			&b->page_sizes[is_2m_pages];
 
-	list_for_each_entry_safe(page, next, &b->refused_pages, lru) {
+	list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
 		list_del(&page->lru);
-		__free_page(page);
-		STATS_INC(b->stats.refused_free);
+		vmballoon_free_page(page, is_2m_pages);
+		STATS_INC(b->stats.refused_free[is_2m_pages]);
 	}
 
-	b->n_refused_pages = 0;
+	page_size->n_refused_pages = 0;
+}
+
+static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
+{
+	b->page = p;
+}
+
+static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
+				struct page *p)
+{
+	vmballoon_batch_set_pa(b->batch_page, idx,
+			(u64)page_to_pfn(p) << PAGE_SHIFT);
 }
 
 /*
@@ -503,12 +796,12 @@ static void vmballoon_release_refused_pages(struct vmballoon *b)
  */
 static void vmballoon_inflate(struct vmballoon *b)
 {
-	unsigned int goal;
-	unsigned int rate;
-	unsigned int i;
+	unsigned rate;
 	unsigned int allocations = 0;
+	unsigned int num_pages = 0;
 	int error = 0;
 	gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
+	bool is_2m_pages;
 
 	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
 
@@ -527,27 +820,50 @@ static void vmballoon_inflate(struct vmballoon *b)
 	 * slowdown page allocations considerably.
 	 */
 
-	goal = b->target - b->size;
 	/*
 	 * Start with no sleep allocation rate which may be higher
 	 * than sleeping allocation rate.
 	 */
-	rate = b->slow_allocation_cycles ?
-			b->rate_alloc : VMW_BALLOON_NOSLEEP_ALLOC_MAX;
+	if (b->slow_allocation_cycles) {
+		rate = b->rate_alloc;
+		is_2m_pages = false;
+	} else {
+		rate = UINT_MAX;
+		is_2m_pages =
+			b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;
+	}
 
-	pr_debug("%s - goal: %d, no-sleep rate: %d, sleep rate: %d\n",
-		 __func__, goal, rate, b->rate_alloc);
+	pr_debug("%s - goal: %d, no-sleep rate: %u, sleep rate: %d\n",
+		 __func__, b->target - b->size, rate, b->rate_alloc);
 
-	for (i = 0; i < goal; i++) {
+	while (!b->reset_required &&
+		b->size + num_pages * vmballoon_page_size(is_2m_pages)
+		< b->target) {
 		struct page *page;
 
 		if (flags == VMW_PAGE_ALLOC_NOSLEEP)
-			STATS_INC(b->stats.alloc);
+			STATS_INC(b->stats.alloc[is_2m_pages]);
 		else
 			STATS_INC(b->stats.sleep_alloc);
 
-		page = alloc_page(flags);
+		page = vmballoon_alloc_page(flags, is_2m_pages);
 		if (!page) {
+			STATS_INC(b->stats.alloc_fail[is_2m_pages]);
+
+			if (is_2m_pages) {
+				b->ops->lock(b, num_pages, true, &b->target);
+
+				/*
+				 * ignore errors from locking as we now switch
+				 * to 4k pages and we might get different
+				 * errors.
+				 */
+
+				num_pages = 0;
+				is_2m_pages = false;
+				continue;
+			}
+
 			if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
 				/*
 				 * CANSLEEP page allocation failed, so guest
@@ -559,7 +875,6 @@ static void vmballoon_inflate(struct vmballoon *b)
 				STATS_INC(b->stats.sleep_alloc_fail);
 				break;
 			}
-			STATS_INC(b->stats.alloc_fail);
 
 			/*
 			 * NOSLEEP page allocation failed, so the guest is
@@ -571,7 +886,7 @@ static void vmballoon_inflate(struct vmballoon *b)
 			 */
 			b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;
 
-			if (i >= b->rate_alloc)
+			if (allocations >= b->rate_alloc)
 				break;
 
 			flags = VMW_PAGE_ALLOC_CANSLEEP;
@@ -580,34 +895,40 @@ static void vmballoon_inflate(struct vmballoon *b)
 			continue;
 		}
 
-		error = vmballoon_lock_page(b, page);
-		if (error)
-			break;
-
-		if (++allocations > VMW_BALLOON_YIELD_THRESHOLD) {
-			cond_resched();
-			allocations = 0;
+		b->ops->add_page(b, num_pages++, page);
+		if (num_pages == b->batch_max_pages) {
+			error = b->ops->lock(b, num_pages, is_2m_pages,
+					&b->target);
+			num_pages = 0;
+			if (error)
+				break;
 		}
 
-		if (i >= rate) {
+		cond_resched();
+
+		if (allocations >= rate) {
 			/* We allocated enough pages, let's take a break. */
 			break;
 		}
 	}
 
+	if (num_pages > 0)
+		b->ops->lock(b, num_pages, is_2m_pages, &b->target);
+
 	/*
 	 * We reached our goal without failures so try increasing
 	 * allocation rate.
 	 */
-	if (error == 0 && i >= b->rate_alloc) {
-		unsigned int mult = i / b->rate_alloc;
+	if (error == 0 && allocations >= b->rate_alloc) {
+		unsigned int mult = allocations / b->rate_alloc;
 
 		b->rate_alloc =
 			min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
 			    VMW_BALLOON_RATE_ALLOC_MAX);
 	}
 
-	vmballoon_release_refused_pages(b);
+	vmballoon_release_refused_pages(b, true);
+	vmballoon_release_refused_pages(b, false);
 }
 
 /*
@@ -615,35 +936,176 @@ static void vmballoon_inflate(struct vmballoon *b)
  */
 static void vmballoon_deflate(struct vmballoon *b)
 {
-	struct page *page, *next;
-	unsigned int i = 0;
-	unsigned int goal;
-	int error;
+	unsigned is_2m_pages;
 
 	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
 
-	/* limit deallocation rate */
-	goal = min(b->size - b->target, b->rate_free);
+	/* free pages to reach target */
+	for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
+			is_2m_pages++) {
+		struct page *page, *next;
+		unsigned int num_pages = 0;
+		struct vmballoon_page_size *page_size =
+				&b->page_sizes[is_2m_pages];
+
+		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
+			if (b->reset_required ||
+				(b->target > 0 &&
+					b->size - num_pages
+					* vmballoon_page_size(is_2m_pages)
+				< b->target + vmballoon_page_size(true)))
+				break;
+
+			list_del(&page->lru);
+			b->ops->add_page(b, num_pages++, page);
 
-	pr_debug("%s - goal: %d, rate: %d\n", __func__, goal, b->rate_free);
+			if (num_pages == b->batch_max_pages) {
+				int error;
 
-	/* free pages to reach target */
-	list_for_each_entry_safe(page, next, &b->pages, lru) {
-		error = vmballoon_release_page(b, page);
-		if (error) {
-			/* quickly decrease rate in case of error */
-			b->rate_free = max(b->rate_free / 2,
-					   VMW_BALLOON_RATE_FREE_MIN);
-			return;
+				error = b->ops->unlock(b, num_pages,
+						is_2m_pages, &b->target);
+				num_pages = 0;
+				if (error)
+					return;
+			}
+
+			cond_resched();
 		}
 
-		if (++i >= goal)
-			break;
+		if (num_pages > 0)
+			b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
+	}
+}
+
+static const struct vmballoon_ops vmballoon_basic_ops = {
+	.add_page = vmballoon_add_page,
+	.lock = vmballoon_lock_page,
+	.unlock = vmballoon_unlock_page
+};
+
+static const struct vmballoon_ops vmballoon_batched_ops = {
+	.add_page = vmballoon_add_batched_page,
+	.lock = vmballoon_lock_batched_page,
+	.unlock = vmballoon_unlock_batched_page
+};
+
+static bool vmballoon_init_batching(struct vmballoon *b)
+{
+	b->page = alloc_page(VMW_PAGE_ALLOC_NOSLEEP);
+	if (!b->page)
+		return false;
+
+	b->batch_page = vmap(&b->page, 1, VM_MAP, PAGE_KERNEL);
+	if (!b->batch_page) {
+		__free_page(b->page);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Receive notification and resize balloon
+ */
+static void vmballoon_doorbell(void *client_data)
+{
+	struct vmballoon *b = client_data;
+
+	STATS_INC(b->stats.doorbell);
+
+	mod_delayed_work(system_freezable_wq, &b->dwork, 0);
+}
+
+/*
+ * Clean up vmci doorbell
+ */
+static void vmballoon_vmci_cleanup(struct vmballoon *b)
+{
+	int error;
+
+	VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
+			VMCI_INVALID_ID, error);
+	STATS_INC(b->stats.doorbell_unset);
+
+	if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
+		vmci_doorbell_destroy(b->vmci_doorbell);
+		b->vmci_doorbell = VMCI_INVALID_HANDLE;
+	}
+}
+
+/*
+ * Initialize vmci doorbell, to get notified as soon as balloon changes
+ */
+static int vmballoon_vmci_init(struct vmballoon *b)
+{
+	int error = 0;
+
+	if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) != 0) {
+		error = vmci_doorbell_create(&b->vmci_doorbell,
+				VMCI_FLAG_DELAYED_CB,
+				VMCI_PRIVILEGE_FLAG_RESTRICTED,
+				vmballoon_doorbell, b);
+
+		if (error == VMCI_SUCCESS) {
+			VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET,
+					b->vmci_doorbell.context,
+					b->vmci_doorbell.resource, error);
+			STATS_INC(b->stats.doorbell_set);
+		}
+	}
+
+	if (error != 0) {
+		vmballoon_vmci_cleanup(b);
+
+		return -EIO;
 	}
 
-	/* slowly increase rate if there were no errors */
-	b->rate_free = min(b->rate_free + VMW_BALLOON_RATE_FREE_INC,
-			   VMW_BALLOON_RATE_FREE_MAX);
+	return 0;
+}
+
+/*
+ * Perform standard reset sequence by popping the balloon (in case it
+ * is not  empty) and then restarting protocol. This operation normally
+ * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
+ */
+static void vmballoon_reset(struct vmballoon *b)
+{
+	int error;
+
+	vmballoon_vmci_cleanup(b);
+
+	/* free all pages, skipping monitor unlock */
+	vmballoon_pop(b);
+
+	if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
+		return;
+
+	if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
+		b->ops = &vmballoon_batched_ops;
+		b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
+		if (!vmballoon_init_batching(b)) {
+			/*
+			 * We failed to initialize batching, inform the monitor
+			 * about it by sending a null capability.
+			 *
+			 * The guest will retry in one second.
+			 */
+			vmballoon_send_start(b, 0);
+			return;
+		}
+	} else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
+		b->ops = &vmballoon_basic_ops;
+		b->batch_max_pages = 1;
+	}
+
+	b->reset_required = false;
+
+	error = vmballoon_vmci_init(b);
+	if (error)
+		pr_err("failed to initialize vmci doorbell\n");
+
+	if (!vmballoon_send_guest_id(b))
+		pr_err("failed to send guest ID to the host\n");
 }
 
 /*
@@ -664,13 +1126,14 @@ static void vmballoon_work(struct work_struct *work)
 	if (b->slow_allocation_cycles > 0)
 		b->slow_allocation_cycles--;
 
-	if (vmballoon_send_get_target(b, &target)) {
+	if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
 		/* update target, adjust size */
 		b->target = target;
 
 		if (b->size < target)
 			vmballoon_inflate(b);
-		else if (b->size > target)
+		else if (target == 0 ||
+				b->size > target + vmballoon_page_size(true))
 			vmballoon_deflate(b);
 	}
 
@@ -692,6 +1155,14 @@ static int vmballoon_debug_show(struct seq_file *f, void *offset)
 	struct vmballoon *b = f->private;
 	struct vmballoon_stats *stats = &b->stats;
 
+	/* format capabilities info */
+	seq_printf(f,
+		   "balloon capabilities:   %#4x\n"
+		   "used capabilities:      %#4lx\n"
+		   "is resetting:           %c\n",
+		   VMW_BALLOON_CAPABILITIES, b->capabilities,
+		   b->reset_required ? 'y' : 'n');
+
 	/* format size info */
 	seq_printf(f,
 		   "target:             %8d pages\n"
@@ -700,35 +1171,48 @@ static int vmballoon_debug_show(struct seq_file *f, void *offset)
 
 	/* format rate info */
 	seq_printf(f,
-		   "rateNoSleepAlloc:   %8d pages/sec\n"
-		   "rateSleepAlloc:     %8d pages/sec\n"
-		   "rateFree:           %8d pages/sec\n",
-		   VMW_BALLOON_NOSLEEP_ALLOC_MAX,
-		   b->rate_alloc, b->rate_free);
+		   "rateSleepAlloc:     %8d pages/sec\n",
+		   b->rate_alloc);
 
 	seq_printf(f,
 		   "\n"
 		   "timer:              %8u\n"
+		   "doorbell:           %8u\n"
 		   "start:              %8u (%4u failed)\n"
 		   "guestType:          %8u (%4u failed)\n"
+		   "2m-lock:            %8u (%4u failed)\n"
 		   "lock:               %8u (%4u failed)\n"
+		   "2m-unlock:          %8u (%4u failed)\n"
 		   "unlock:             %8u (%4u failed)\n"
 		   "target:             %8u (%4u failed)\n"
+		   "prim2mAlloc:        %8u (%4u failed)\n"
 		   "primNoSleepAlloc:   %8u (%4u failed)\n"
 		   "primCanSleepAlloc:  %8u (%4u failed)\n"
+		   "prim2mFree:         %8u\n"
 		   "primFree:           %8u\n"
+		   "err2mAlloc:         %8u\n"
 		   "errAlloc:           %8u\n"
-		   "errFree:            %8u\n",
+		   "err2mFree:          %8u\n"
+		   "errFree:            %8u\n"
+		   "doorbellSet:        %8u\n"
+		   "doorbellUnset:      %8u\n",
 		   stats->timer,
+		   stats->doorbell,
 		   stats->start, stats->start_fail,
 		   stats->guest_type, stats->guest_type_fail,
-		   stats->lock,  stats->lock_fail,
-		   stats->unlock, stats->unlock_fail,
+		   stats->lock[true],  stats->lock_fail[true],
+		   stats->lock[false],  stats->lock_fail[false],
+		   stats->unlock[true], stats->unlock_fail[true],
+		   stats->unlock[false], stats->unlock_fail[false],
 		   stats->target, stats->target_fail,
-		   stats->alloc, stats->alloc_fail,
+		   stats->alloc[true], stats->alloc_fail[true],
+		   stats->alloc[false], stats->alloc_fail[false],
 		   stats->sleep_alloc, stats->sleep_alloc_fail,
-		   stats->free,
-		   stats->refused_alloc, stats->refused_free);
+		   stats->free[true],
+		   stats->free[false],
+		   stats->refused_alloc[true], stats->refused_alloc[false],
+		   stats->refused_free[true], stats->refused_free[false],
+		   stats->doorbell_set, stats->doorbell_unset);
 
 	return 0;
 }
@@ -782,7 +1266,7 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b)
 static int __init vmballoon_init(void)
 {
 	int error;
-
+	unsigned is_2m_pages;
 	/*
 	 * Check if we are running on VMware's hypervisor and bail out
 	 * if we are not.
@@ -790,32 +1274,26 @@ static int __init vmballoon_init(void)
 	if (x86_hyper != &x86_hyper_vmware)
 		return -ENODEV;
 
-	INIT_LIST_HEAD(&balloon.pages);
-	INIT_LIST_HEAD(&balloon.refused_pages);
+	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
+			is_2m_pages++) {
+		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
+		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
+	}
 
 	/* initialize rates */
 	balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
-	balloon.rate_free = VMW_BALLOON_RATE_FREE_MAX;
 
 	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
 
-	/*
-	 * Start balloon.
-	 */
-	if (!vmballoon_send_start(&balloon)) {
-		pr_err("failed to send start command to the host\n");
-		return -EIO;
-	}
-
-	if (!vmballoon_send_guest_id(&balloon)) {
-		pr_err("failed to send guest ID to the host\n");
-		return -EIO;
-	}
-
 	error = vmballoon_debugfs_init(&balloon);
 	if (error)
 		return error;
 
+	balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
+	balloon.batch_page = NULL;
+	balloon.page = NULL;
+	balloon.reset_required = true;
+
 	queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);
 
 	return 0;
@@ -824,6 +1302,7 @@ module_init(vmballoon_init);
 
 static void __exit vmballoon_exit(void)
 {
+	vmballoon_vmci_cleanup(&balloon);
 	cancel_delayed_work_sync(&balloon.dwork);
 
 	vmballoon_debugfs_exit(&balloon);
@@ -833,7 +1312,7 @@ static void __exit vmballoon_exit(void)
 	 * Reset connection before deallocating memory to avoid potential for
 	 * additional spurious resets from guest touching deallocated pages.
 	 */
-	vmballoon_send_start(&balloon);
+	vmballoon_send_start(&balloon, 0);
 	vmballoon_pop(&balloon);
 }
 module_exit(vmballoon_exit);
diff --git a/drivers/misc/vmw_vmci/vmci_datagram.c b/drivers/misc/vmw_vmci/vmci_datagram.c
index 822665245588..8a4b6bbe1bee 100644
--- a/drivers/misc/vmw_vmci/vmci_datagram.c
+++ b/drivers/misc/vmw_vmci/vmci_datagram.c
@@ -276,11 +276,10 @@ static int dg_dispatch_as_host(u32 context_id, struct vmci_datagram *dg)
 		}
 
 		/* We make a copy to enqueue. */
-		new_dg = kmalloc(dg_size, GFP_KERNEL);
+		new_dg = kmemdup(dg, dg_size, GFP_KERNEL);
 		if (new_dg == NULL)
 			return VMCI_ERROR_NO_MEM;
 
-		memcpy(new_dg, dg, dg_size);
 		retval = vmci_ctx_enqueue_datagram(dg->dst.context, new_dg);
 		if (retval < VMCI_SUCCESS) {
 			kfree(new_dg);
diff --git a/drivers/nfc/mei_phy.c b/drivers/nfc/mei_phy.c
index 754a9bb0f58d..83deda4bb4d6 100644
--- a/drivers/nfc/mei_phy.c
+++ b/drivers/nfc/mei_phy.c
@@ -118,7 +118,7 @@ static int mei_nfc_if_version(struct nfc_mei_phy *phy)
 	cmd.sub_command = MEI_NFC_SUBCMD_IF_VERSION;
 
 	MEI_DUMP_NFC_HDR("version", &cmd.hdr);
-	r = mei_cl_send(phy->device, (u8 *)&cmd, sizeof(struct mei_nfc_cmd));
+	r = mei_cldev_send(phy->cldev, (u8 *)&cmd, sizeof(struct mei_nfc_cmd));
 	if (r < 0) {
 		pr_err("Could not send IF version cmd\n");
 		return r;
@@ -132,7 +132,7 @@ static int mei_nfc_if_version(struct nfc_mei_phy *phy)
 	if (!reply)
 		return -ENOMEM;
 
-	bytes_recv = mei_cl_recv(phy->device, (u8 *)reply, if_version_length);
+	bytes_recv = mei_cldev_recv(phy->cldev, (u8 *)reply, if_version_length);
 	if (bytes_recv < 0 || bytes_recv < sizeof(struct mei_nfc_reply)) {
 		pr_err("Could not read IF version\n");
 		r = -EIO;
@@ -186,13 +186,14 @@ static int mei_nfc_connect(struct nfc_mei_phy *phy)
 	connect->vendor_id = phy->vendor_id;
 
 	MEI_DUMP_NFC_HDR("connect request", &cmd->hdr);
-	r = mei_cl_send(phy->device, (u8 *)cmd, connect_length);
+	r = mei_cldev_send(phy->cldev, (u8 *)cmd, connect_length);
 	if (r < 0) {
 		pr_err("Could not send connect cmd %d\n", r);
 		goto err;
 	}
 
-	bytes_recv = mei_cl_recv(phy->device, (u8 *)reply, connect_resp_length);
+	bytes_recv = mei_cldev_recv(phy->cldev, (u8 *)reply,
+				    connect_resp_length);
 	if (bytes_recv < 0) {
 		r = bytes_recv;
 		pr_err("Could not read connect response %d\n", r);
@@ -238,7 +239,7 @@ static int mei_nfc_send(struct nfc_mei_phy *phy, u8 *buf, size_t length)
 	MEI_DUMP_NFC_HDR("send", hdr);
 
 	memcpy(mei_buf + MEI_NFC_HEADER_SIZE, buf, length);
-	err = mei_cl_send(phy->device, mei_buf, length + MEI_NFC_HEADER_SIZE);
+	err = mei_cldev_send(phy->cldev, mei_buf, length + MEI_NFC_HEADER_SIZE);
 	if (err < 0)
 		goto out;
 
@@ -278,7 +279,7 @@ static int mei_nfc_recv(struct nfc_mei_phy *phy, u8 *buf, size_t length)
 	struct mei_nfc_hdr *hdr;
 	int received_length;
 
-	received_length = mei_cl_recv(phy->device, buf, length);
+	received_length = mei_cldev_recv(phy->cldev, buf, length);
 	if (received_length < 0)
 		return received_length;
 
@@ -296,7 +297,7 @@ static int mei_nfc_recv(struct nfc_mei_phy *phy, u8 *buf, size_t length)
 }
 
 
-static void nfc_mei_event_cb(struct mei_cl_device *device, u32 events,
+static void nfc_mei_event_cb(struct mei_cl_device *cldev, u32 events,
 			     void *context)
 {
 	struct nfc_mei_phy *phy = context;
@@ -337,7 +338,7 @@ static int nfc_mei_phy_enable(void *phy_id)
 	if (phy->powered == 1)
 		return 0;
 
-	r = mei_cl_enable_device(phy->device);
+	r = mei_cldev_enable(phy->cldev);
 	if (r < 0) {
 		pr_err("Could not enable device %d\n", r);
 		return r;
@@ -355,7 +356,7 @@ static int nfc_mei_phy_enable(void *phy_id)
 		goto err;
 	}
 
-	r = mei_cl_register_event_cb(phy->device, BIT(MEI_CL_EVENT_RX),
+	r = mei_cldev_register_event_cb(phy->cldev, BIT(MEI_CL_EVENT_RX),
 				     nfc_mei_event_cb, phy);
 	if (r) {
 		pr_err("Event cb registration failed %d\n", r);
@@ -368,7 +369,7 @@ static int nfc_mei_phy_enable(void *phy_id)
 
 err:
 	phy->powered = 0;
-	mei_cl_disable_device(phy->device);
+	mei_cldev_disable(phy->cldev);
 	return r;
 }
 
@@ -378,7 +379,7 @@ static void nfc_mei_phy_disable(void *phy_id)
 
 	pr_info("%s\n", __func__);
 
-	mei_cl_disable_device(phy->device);
+	mei_cldev_disable(phy->cldev);
 
 	phy->powered = 0;
 }
@@ -390,7 +391,7 @@ struct nfc_phy_ops mei_phy_ops = {
 };
 EXPORT_SYMBOL_GPL(mei_phy_ops);
 
-struct nfc_mei_phy *nfc_mei_phy_alloc(struct mei_cl_device *device)
+struct nfc_mei_phy *nfc_mei_phy_alloc(struct mei_cl_device *cldev)
 {
 	struct nfc_mei_phy *phy;
 
@@ -398,9 +399,9 @@ struct nfc_mei_phy *nfc_mei_phy_alloc(struct mei_cl_device *device)
 	if (!phy)
 		return NULL;
 
-	phy->device = device;
+	phy->cldev = cldev;
 	init_waitqueue_head(&phy->send_wq);
-	mei_cl_set_drvdata(device, phy);
+	mei_cldev_set_drvdata(cldev, phy);
 
 	return phy;
 }
@@ -408,7 +409,7 @@ EXPORT_SYMBOL_GPL(nfc_mei_phy_alloc);
 
 void nfc_mei_phy_free(struct nfc_mei_phy *phy)
 {
-	mei_cl_disable_device(phy->device);
+	mei_cldev_disable(phy->cldev);
 	kfree(phy);
 }
 EXPORT_SYMBOL_GPL(nfc_mei_phy_free);
diff --git a/drivers/nfc/mei_phy.h b/drivers/nfc/mei_phy.h
index fbfa3e61738f..acd3a1fc69e6 100644
--- a/drivers/nfc/mei_phy.h
+++ b/drivers/nfc/mei_phy.h
@@ -13,7 +13,7 @@
 /**
  * struct nfc_mei_phy
  *
- * @device: mei device
+ * @cldev: mei client device
  * @hdev:   nfc hci device
 
  * @send_wq: send completion wait queue
@@ -28,7 +28,7 @@
  *    and prevents normal operation.
  */
 struct nfc_mei_phy {
-	struct mei_cl_device *device;
+	struct mei_cl_device *cldev;
 	struct nfc_hci_dev *hdev;
 
 	wait_queue_head_t send_wq;
diff --git a/drivers/nfc/microread/mei.c b/drivers/nfc/microread/mei.c
index f9f5fc97cdd7..3092501f26c4 100644
--- a/drivers/nfc/microread/mei.c
+++ b/drivers/nfc/microread/mei.c
@@ -29,7 +29,7 @@
 
 #define MICROREAD_DRIVER_NAME "microread"
 
-static int microread_mei_probe(struct mei_cl_device *device,
+static int microread_mei_probe(struct mei_cl_device *cldev,
 			       const struct mei_cl_device_id *id)
 {
 	struct nfc_mei_phy *phy;
@@ -37,7 +37,7 @@ static int microread_mei_probe(struct mei_cl_device *device,
 
 	pr_info("Probing NFC microread\n");
 
-	phy = nfc_mei_phy_alloc(device);
+	phy = nfc_mei_phy_alloc(cldev);
 	if (!phy) {
 		pr_err("Cannot allocate memory for microread mei phy.\n");
 		return -ENOMEM;
@@ -55,9 +55,9 @@ static int microread_mei_probe(struct mei_cl_device *device,
 	return 0;
 }
 
-static int microread_mei_remove(struct mei_cl_device *device)
+static int microread_mei_remove(struct mei_cl_device *cldev)
 {
-	struct nfc_mei_phy *phy = mei_cl_get_drvdata(device);
+	struct nfc_mei_phy *phy = mei_cldev_get_drvdata(cldev);
 
 	microread_remove(phy->hdev);
 
@@ -67,7 +67,7 @@ static int microread_mei_remove(struct mei_cl_device *device)
 }
 
 static struct mei_cl_device_id microread_mei_tbl[] = {
-	{ MICROREAD_DRIVER_NAME, MEI_NFC_UUID},
+	{ MICROREAD_DRIVER_NAME, MEI_NFC_UUID, MEI_CL_VERSION_ANY},
 
 	/* required last entry */
 	{ }
@@ -88,7 +88,7 @@ static int microread_mei_init(void)
 
 	pr_debug(DRIVER_DESC ": %s\n", __func__);
 
-	r = mei_cl_driver_register(&microread_driver);
+	r = mei_cldev_driver_register(&microread_driver);
 	if (r) {
 		pr_err(MICROREAD_DRIVER_NAME ": driver registration failed\n");
 		return r;
@@ -99,7 +99,7 @@ static int microread_mei_init(void)
 
 static void microread_mei_exit(void)
 {
-	mei_cl_driver_unregister(&microread_driver);
+	mei_cldev_driver_unregister(&microread_driver);
 }
 
 module_init(microread_mei_init);
diff --git a/drivers/nfc/pn544/mei.c b/drivers/nfc/pn544/mei.c
index 101a37e12efa..46d0eb24eef9 100644
--- a/drivers/nfc/pn544/mei.c
+++ b/drivers/nfc/pn544/mei.c
@@ -27,7 +27,7 @@
 
 #define PN544_DRIVER_NAME "pn544"
 
-static int pn544_mei_probe(struct mei_cl_device *device,
+static int pn544_mei_probe(struct mei_cl_device *cldev,
 			       const struct mei_cl_device_id *id)
 {
 	struct nfc_mei_phy *phy;
@@ -35,7 +35,7 @@ static int pn544_mei_probe(struct mei_cl_device *device,
 
 	pr_info("Probing NFC pn544\n");
 
-	phy = nfc_mei_phy_alloc(device);
+	phy = nfc_mei_phy_alloc(cldev);
 	if (!phy) {
 		pr_err("Cannot allocate memory for pn544 mei phy.\n");
 		return -ENOMEM;
@@ -53,9 +53,9 @@ static int pn544_mei_probe(struct mei_cl_device *device,
 	return 0;
 }
 
-static int pn544_mei_remove(struct mei_cl_device *device)
+static int pn544_mei_remove(struct mei_cl_device *cldev)
 {
-	struct nfc_mei_phy *phy = mei_cl_get_drvdata(device);
+	struct nfc_mei_phy *phy = mei_cldev_get_drvdata(cldev);
 
 	pr_info("Removing pn544\n");
 
@@ -67,7 +67,7 @@ static int pn544_mei_remove(struct mei_cl_device *device)
 }
 
 static struct mei_cl_device_id pn544_mei_tbl[] = {
-	{ PN544_DRIVER_NAME, MEI_NFC_UUID},
+	{ PN544_DRIVER_NAME, MEI_NFC_UUID, MEI_CL_VERSION_ANY},
 
 	/* required last entry */
 	{ }
@@ -88,7 +88,7 @@ static int pn544_mei_init(void)
 
 	pr_debug(DRIVER_DESC ": %s\n", __func__);
 
-	r = mei_cl_driver_register(&pn544_driver);
+	r = mei_cldev_driver_register(&pn544_driver);
 	if (r) {
 		pr_err(PN544_DRIVER_NAME ": driver registration failed\n");
 		return r;
@@ -99,7 +99,7 @@ static int pn544_mei_init(void)
 
 static void pn544_mei_exit(void)
 {
-	mei_cl_driver_unregister(&pn544_driver);
+	mei_cldev_driver_unregister(&pn544_driver);
 }
 
 module_init(pn544_mei_init);
diff --git a/drivers/nvmem/Kconfig b/drivers/nvmem/Kconfig
index 8db297821f78..bc4ea585b42e 100644
--- a/drivers/nvmem/Kconfig
+++ b/drivers/nvmem/Kconfig
@@ -14,6 +14,28 @@ menuconfig NVMEM
 
 if NVMEM
 
+config NVMEM_IMX_OCOTP
+	tristate "i.MX6 On-Chip OTP Controller support"
+	depends on SOC_IMX6
+	help
+	  This is a driver for the On-Chip OTP Controller (OCOTP) available on
+	  i.MX6 SoCs, providing access to 4 Kbits of one-time programmable
+	  eFuses.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called nvmem-imx-ocotp.
+
+config NVMEM_MXS_OCOTP
+	tristate "Freescale MXS On-Chip OTP Memory Support"
+	depends on ARCH_MXS || COMPILE_TEST
+	help
+	  If you say Y here, you will get readonly access to the
+	  One Time Programmable memory pages that are stored
+	  on the Freescale i.MX23/i.MX28 processor.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called nvmem-mxs-ocotp.
+
 config QCOM_QFPROM
 	tristate "QCOM QFPROM Support"
 	depends on ARCH_QCOM || COMPILE_TEST
@@ -25,6 +47,16 @@ config QCOM_QFPROM
 	  This driver can also be built as a module. If so, the module
 	  will be called nvmem_qfprom.
 
+config ROCKCHIP_EFUSE
+	tristate "Rockchip eFuse Support"
+	depends on ARCH_ROCKCHIP || COMPILE_TEST
+	help
+	  This is a simple drive to dump specified values of Rockchip SoC
+	  from eFuse, such as cpu-leakage.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called nvmem_rockchip_efuse.
+
 config NVMEM_SUNXI_SID
 	tristate "Allwinner SoCs SID support"
 	depends on ARCH_SUNXI
@@ -36,4 +68,14 @@ config NVMEM_SUNXI_SID
 	  This driver can also be built as a module. If so, the module
 	  will be called nvmem_sunxi_sid.
 
+config NVMEM_VF610_OCOTP
+	tristate "VF610 SoC OCOTP support"
+	depends on SOC_VF610 || COMPILE_TEST
+	help
+	  This is a driver for the 'OCOTP' peripheral available on Vybrid
+	  devices like VF5xx and VF6xx.
+
+	  This driver can also be build as a module. If so, the module will
+	  be called nvmem-vf610-ocotp.
+
 endif
diff --git a/drivers/nvmem/Makefile b/drivers/nvmem/Makefile
index 4328b930ad9a..95dde3f8f085 100644
--- a/drivers/nvmem/Makefile
+++ b/drivers/nvmem/Makefile
@@ -6,7 +6,15 @@ obj-$(CONFIG_NVMEM)		+= nvmem_core.o
 nvmem_core-y			:= core.o
 
 # Devices
+obj-$(CONFIG_NVMEM_IMX_OCOTP)	+= nvmem-imx-ocotp.o
+nvmem-imx-ocotp-y		:= imx-ocotp.o
+obj-$(CONFIG_NVMEM_MXS_OCOTP)	+= nvmem-mxs-ocotp.o
+nvmem-mxs-ocotp-y		:= mxs-ocotp.o
 obj-$(CONFIG_QCOM_QFPROM)	+= nvmem_qfprom.o
 nvmem_qfprom-y			:= qfprom.o
+obj-$(CONFIG_ROCKCHIP_EFUSE)	+= nvmem_rockchip_efuse.o
+nvmem_rockchip_efuse-y		:= rockchip-efuse.o
 obj-$(CONFIG_NVMEM_SUNXI_SID)	+= nvmem_sunxi_sid.o
 nvmem_sunxi_sid-y		:= sunxi_sid.o
+obj-$(CONFIG_NVMEM_VF610_OCOTP)	+= nvmem-vf610-ocotp.o
+nvmem-vf610-ocotp-y		:= vf610-ocotp.o
diff --git a/drivers/nvmem/imx-ocotp.c b/drivers/nvmem/imx-ocotp.c
new file mode 100644
index 000000000000..b7971d410b60
--- /dev/null
+++ b/drivers/nvmem/imx-ocotp.c
@@ -0,0 +1,154 @@
+/*
+ * i.MX6 OCOTP fusebox driver
+ *
+ * Copyright (c) 2015 Pengutronix, Philipp Zabel <p.zabel@pengutronix.de>
+ *
+ * Based on the barebox ocotp driver,
+ * Copyright (c) 2010 Baruch Siach <baruch@tkos.co.il>,
+ *	Orex Computed Radiography
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * http://www.opensource.org/licenses/gpl-license.html
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/nvmem-provider.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+struct ocotp_priv {
+	struct device *dev;
+	void __iomem *base;
+	unsigned int nregs;
+};
+
+static int imx_ocotp_read(void *context, const void *reg, size_t reg_size,
+			  void *val, size_t val_size)
+{
+	struct ocotp_priv *priv = context;
+	unsigned int offset = *(u32 *)reg;
+	unsigned int count;
+	int i;
+	u32 index;
+
+	index = offset >> 2;
+	count = val_size >> 2;
+
+	if (count > (priv->nregs - index))
+		count = priv->nregs - index;
+
+	for (i = index; i < (index + count); i++) {
+		*(u32 *)val = readl(priv->base + 0x400 + i * 0x10);
+		val += 4;
+	}
+
+	return (i - index) * 4;
+}
+
+static int imx_ocotp_write(void *context, const void *data, size_t count)
+{
+	/* Not implemented */
+	return 0;
+}
+
+static struct regmap_bus imx_ocotp_bus = {
+	.read = imx_ocotp_read,
+	.write = imx_ocotp_write,
+	.reg_format_endian_default = REGMAP_ENDIAN_NATIVE,
+	.val_format_endian_default = REGMAP_ENDIAN_NATIVE,
+};
+
+static bool imx_ocotp_writeable_reg(struct device *dev, unsigned int reg)
+{
+	return false;
+}
+
+static struct regmap_config imx_ocotp_regmap_config = {
+	.reg_bits = 32,
+	.val_bits = 32,
+	.reg_stride = 4,
+	.writeable_reg = imx_ocotp_writeable_reg,
+	.name = "imx-ocotp",
+};
+
+static struct nvmem_config imx_ocotp_nvmem_config = {
+	.name = "imx-ocotp",
+	.read_only = true,
+	.owner = THIS_MODULE,
+};
+
+static const struct of_device_id imx_ocotp_dt_ids[] = {
+	{ .compatible = "fsl,imx6q-ocotp",  (void *)128 },
+	{ .compatible = "fsl,imx6sl-ocotp", (void *)32 },
+	{ .compatible = "fsl,imx6sx-ocotp", (void *)128 },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, imx_ocotp_dt_ids);
+
+static int imx_ocotp_probe(struct platform_device *pdev)
+{
+	const struct of_device_id *of_id;
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+	struct regmap *regmap;
+	struct ocotp_priv *priv;
+	struct nvmem_device *nvmem;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	priv->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(priv->base))
+		return PTR_ERR(priv->base);
+
+	of_id = of_match_device(imx_ocotp_dt_ids, dev);
+	priv->nregs = (unsigned int)of_id->data;
+	imx_ocotp_regmap_config.max_register = 4 * priv->nregs - 4;
+
+	regmap = devm_regmap_init(dev, &imx_ocotp_bus, priv,
+				  &imx_ocotp_regmap_config);
+	if (IS_ERR(regmap)) {
+		dev_err(dev, "regmap init failed\n");
+		return PTR_ERR(regmap);
+	}
+	imx_ocotp_nvmem_config.dev = dev;
+	nvmem = nvmem_register(&imx_ocotp_nvmem_config);
+	if (IS_ERR(nvmem))
+		return PTR_ERR(nvmem);
+
+	platform_set_drvdata(pdev, nvmem);
+
+	return 0;
+}
+
+static int imx_ocotp_remove(struct platform_device *pdev)
+{
+	struct nvmem_device *nvmem = platform_get_drvdata(pdev);
+
+	return nvmem_unregister(nvmem);
+}
+
+static struct platform_driver imx_ocotp_driver = {
+	.probe	= imx_ocotp_probe,
+	.remove	= imx_ocotp_remove,
+	.driver = {
+		.name	= "imx_ocotp",
+		.of_match_table = imx_ocotp_dt_ids,
+	},
+};
+module_platform_driver(imx_ocotp_driver);
+
+MODULE_AUTHOR("Philipp Zabel <p.zabel@pengutronix.de>");
+MODULE_DESCRIPTION("i.MX6 OCOTP fuse box driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvmem/mxs-ocotp.c b/drivers/nvmem/mxs-ocotp.c
new file mode 100644
index 000000000000..8ba19bba3156
--- /dev/null
+++ b/drivers/nvmem/mxs-ocotp.c
@@ -0,0 +1,257 @@
+/*
+ * Freescale MXS On-Chip OTP driver
+ *
+ * Copyright (C) 2015 Stefan Wahren <stefan.wahren@i2se.com>
+ *
+ * Based on the driver from Huang Shijie and Christoph G. Baumann
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/nvmem-provider.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/stmp_device.h>
+
+/* OCOTP registers and bits */
+
+#define BM_OCOTP_CTRL_RD_BANK_OPEN	BIT(12)
+#define BM_OCOTP_CTRL_ERROR		BIT(9)
+#define BM_OCOTP_CTRL_BUSY		BIT(8)
+
+#define OCOTP_TIMEOUT		10000
+#define OCOTP_DATA_OFFSET	0x20
+
+struct mxs_ocotp {
+	struct clk *clk;
+	void __iomem *base;
+	struct nvmem_device *nvmem;
+};
+
+static int mxs_ocotp_wait(struct mxs_ocotp *otp)
+{
+	int timeout = OCOTP_TIMEOUT;
+	unsigned int status = 0;
+
+	while (timeout--) {
+		status = readl(otp->base);
+
+		if (!(status & (BM_OCOTP_CTRL_BUSY | BM_OCOTP_CTRL_ERROR)))
+			break;
+
+		cpu_relax();
+	}
+
+	if (status & BM_OCOTP_CTRL_BUSY)
+		return -EBUSY;
+	else if (status & BM_OCOTP_CTRL_ERROR)
+		return -EIO;
+
+	return 0;
+}
+
+static int mxs_ocotp_read(void *context, const void *reg, size_t reg_size,
+			  void *val, size_t val_size)
+{
+	struct mxs_ocotp *otp = context;
+	unsigned int offset = *(u32 *)reg;
+	u32 *buf = val;
+	int ret;
+
+	ret = clk_enable(otp->clk);
+	if (ret)
+		return ret;
+
+	writel(BM_OCOTP_CTRL_ERROR, otp->base + STMP_OFFSET_REG_CLR);
+
+	ret = mxs_ocotp_wait(otp);
+	if (ret)
+		goto disable_clk;
+
+	/* open OCOTP banks for read */
+	writel(BM_OCOTP_CTRL_RD_BANK_OPEN, otp->base + STMP_OFFSET_REG_SET);
+
+	/* approximately wait 33 hclk cycles */
+	udelay(1);
+
+	ret = mxs_ocotp_wait(otp);
+	if (ret)
+		goto close_banks;
+
+	while (val_size) {
+		if ((offset < OCOTP_DATA_OFFSET) || (offset % 16)) {
+			/* fill up non-data register */
+			*buf = 0;
+		} else {
+			*buf = readl(otp->base + offset);
+		}
+
+		buf++;
+		val_size--;
+		offset += reg_size;
+	}
+
+close_banks:
+	/* close banks for power saving */
+	writel(BM_OCOTP_CTRL_RD_BANK_OPEN, otp->base + STMP_OFFSET_REG_CLR);
+
+disable_clk:
+	clk_disable(otp->clk);
+
+	return ret;
+}
+
+static int mxs_ocotp_write(void *context, const void *data, size_t count)
+{
+	/* We don't want to support writing */
+	return 0;
+}
+
+static bool mxs_ocotp_writeable_reg(struct device *dev, unsigned int reg)
+{
+	return false;
+}
+
+static struct nvmem_config ocotp_config = {
+	.name = "mxs-ocotp",
+	.owner = THIS_MODULE,
+};
+
+static const struct regmap_range imx23_ranges[] = {
+	regmap_reg_range(OCOTP_DATA_OFFSET, 0x210),
+};
+
+static const struct regmap_access_table imx23_access = {
+	.yes_ranges = imx23_ranges,
+	.n_yes_ranges = ARRAY_SIZE(imx23_ranges),
+};
+
+static const struct regmap_range imx28_ranges[] = {
+	regmap_reg_range(OCOTP_DATA_OFFSET, 0x290),
+};
+
+static const struct regmap_access_table imx28_access = {
+	.yes_ranges = imx28_ranges,
+	.n_yes_ranges = ARRAY_SIZE(imx28_ranges),
+};
+
+static struct regmap_bus mxs_ocotp_bus = {
+	.read = mxs_ocotp_read,
+	.write = mxs_ocotp_write, /* make regmap_init() happy */
+	.reg_format_endian_default = REGMAP_ENDIAN_NATIVE,
+	.val_format_endian_default = REGMAP_ENDIAN_NATIVE,
+};
+
+static struct regmap_config mxs_ocotp_config = {
+	.reg_bits = 32,
+	.val_bits = 32,
+	.reg_stride = 16,
+	.writeable_reg = mxs_ocotp_writeable_reg,
+};
+
+static const struct of_device_id mxs_ocotp_match[] = {
+	{ .compatible = "fsl,imx23-ocotp", .data = &imx23_access },
+	{ .compatible = "fsl,imx28-ocotp", .data = &imx28_access },
+	{ /* sentinel */},
+};
+MODULE_DEVICE_TABLE(of, mxs_ocotp_match);
+
+static int mxs_ocotp_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct mxs_ocotp *otp;
+	struct resource *res;
+	const struct of_device_id *match;
+	struct regmap *regmap;
+	const struct regmap_access_table *access;
+	int ret;
+
+	match = of_match_device(dev->driver->of_match_table, dev);
+	if (!match || !match->data)
+		return -EINVAL;
+
+	otp = devm_kzalloc(dev, sizeof(*otp), GFP_KERNEL);
+	if (!otp)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	otp->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(otp->base))
+		return PTR_ERR(otp->base);
+
+	otp->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(otp->clk))
+		return PTR_ERR(otp->clk);
+
+	ret = clk_prepare(otp->clk);
+	if (ret < 0) {
+		dev_err(dev, "failed to prepare clk: %d\n", ret);
+		return ret;
+	}
+
+	access = match->data;
+	mxs_ocotp_config.rd_table = access;
+	mxs_ocotp_config.max_register = access->yes_ranges[0].range_max;
+
+	regmap = devm_regmap_init(dev, &mxs_ocotp_bus, otp, &mxs_ocotp_config);
+	if (IS_ERR(regmap)) {
+		dev_err(dev, "regmap init failed\n");
+		ret = PTR_ERR(regmap);
+		goto err_clk;
+	}
+
+	ocotp_config.dev = dev;
+	otp->nvmem = nvmem_register(&ocotp_config);
+	if (IS_ERR(otp->nvmem)) {
+		ret = PTR_ERR(otp->nvmem);
+		goto err_clk;
+	}
+
+	platform_set_drvdata(pdev, otp);
+
+	return 0;
+
+err_clk:
+	clk_unprepare(otp->clk);
+
+	return ret;
+}
+
+static int mxs_ocotp_remove(struct platform_device *pdev)
+{
+	struct mxs_ocotp *otp = platform_get_drvdata(pdev);
+
+	clk_unprepare(otp->clk);
+
+	return nvmem_unregister(otp->nvmem);
+}
+
+static struct platform_driver mxs_ocotp_driver = {
+	.probe = mxs_ocotp_probe,
+	.remove = mxs_ocotp_remove,
+	.driver = {
+		.name = "mxs-ocotp",
+		.of_match_table = mxs_ocotp_match,
+	},
+};
+
+module_platform_driver(mxs_ocotp_driver);
+MODULE_AUTHOR("Stefan Wahren <stefan.wahren@i2se.com>");
+MODULE_DESCRIPTION("driver for OCOTP in i.MX23/i.MX28");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvmem/rockchip-efuse.c b/drivers/nvmem/rockchip-efuse.c
new file mode 100644
index 000000000000..f55213424222
--- /dev/null
+++ b/drivers/nvmem/rockchip-efuse.c
@@ -0,0 +1,186 @@
+/*
+ * Rockchip eFuse Driver
+ *
+ * Copyright (c) 2015 Rockchip Electronics Co. Ltd.
+ * Author: Caesar Wang <wxt@rock-chips.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/nvmem-provider.h>
+#include <linux/slab.h>
+#include <linux/regmap.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/of.h>
+#include <linux/clk.h>
+
+#define EFUSE_A_SHIFT			6
+#define EFUSE_A_MASK			0x3ff
+#define EFUSE_PGENB			BIT(3)
+#define EFUSE_LOAD			BIT(2)
+#define EFUSE_STROBE			BIT(1)
+#define EFUSE_CSB			BIT(0)
+
+#define REG_EFUSE_CTRL			0x0000
+#define REG_EFUSE_DOUT			0x0004
+
+struct rockchip_efuse_context {
+	struct device *dev;
+	void __iomem *base;
+	struct clk *efuse_clk;
+};
+
+static int rockchip_efuse_write(void *context, const void *data, size_t count)
+{
+	/* Nothing TBD, Read-Only */
+	return 0;
+}
+
+static int rockchip_efuse_read(void *context,
+			       const void *reg, size_t reg_size,
+			       void *val, size_t val_size)
+{
+	unsigned int offset = *(u32 *)reg;
+	struct rockchip_efuse_context *_context = context;
+	void __iomem *base = _context->base;
+	struct clk *clk = _context->efuse_clk;
+	u8 *buf = val;
+	int ret;
+
+	ret = clk_prepare_enable(clk);
+	if (ret < 0) {
+		dev_err(_context->dev, "failed to prepare/enable efuse clk\n");
+		return ret;
+	}
+
+	writel(EFUSE_LOAD | EFUSE_PGENB, base + REG_EFUSE_CTRL);
+	udelay(1);
+	while (val_size) {
+		writel(readl(base + REG_EFUSE_CTRL) &
+			     (~(EFUSE_A_MASK << EFUSE_A_SHIFT)),
+			     base + REG_EFUSE_CTRL);
+		writel(readl(base + REG_EFUSE_CTRL) |
+			     ((offset & EFUSE_A_MASK) << EFUSE_A_SHIFT),
+			     base + REG_EFUSE_CTRL);
+		udelay(1);
+		writel(readl(base + REG_EFUSE_CTRL) |
+			     EFUSE_STROBE, base + REG_EFUSE_CTRL);
+		udelay(1);
+		*buf++ = readb(base + REG_EFUSE_DOUT);
+		writel(readl(base + REG_EFUSE_CTRL) &
+		     (~EFUSE_STROBE), base + REG_EFUSE_CTRL);
+		udelay(1);
+
+		val_size -= 1;
+		offset += 1;
+	}
+
+	/* Switch to standby mode */
+	writel(EFUSE_PGENB | EFUSE_CSB, base + REG_EFUSE_CTRL);
+
+	clk_disable_unprepare(clk);
+
+	return 0;
+}
+
+static struct regmap_bus rockchip_efuse_bus = {
+	.read = rockchip_efuse_read,
+	.write = rockchip_efuse_write,
+	.reg_format_endian_default = REGMAP_ENDIAN_NATIVE,
+	.val_format_endian_default = REGMAP_ENDIAN_NATIVE,
+};
+
+static struct regmap_config rockchip_efuse_regmap_config = {
+	.reg_bits = 32,
+	.reg_stride = 1,
+	.val_bits = 8,
+};
+
+static struct nvmem_config econfig = {
+	.name = "rockchip-efuse",
+	.owner = THIS_MODULE,
+	.read_only = true,
+};
+
+static const struct of_device_id rockchip_efuse_match[] = {
+	{ .compatible = "rockchip,rockchip-efuse",},
+	{ /* sentinel */},
+};
+MODULE_DEVICE_TABLE(of, rockchip_efuse_match);
+
+static int rockchip_efuse_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+	struct nvmem_device *nvmem;
+	struct regmap *regmap;
+	void __iomem *base;
+	struct clk *clk;
+	struct rockchip_efuse_context *context;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	context = devm_kzalloc(dev, sizeof(struct rockchip_efuse_context),
+			       GFP_KERNEL);
+	if (IS_ERR(context))
+		return PTR_ERR(context);
+
+	clk = devm_clk_get(dev, "pclk_efuse");
+	if (IS_ERR(clk))
+		return PTR_ERR(clk);
+
+	context->dev = dev;
+	context->base = base;
+	context->efuse_clk = clk;
+
+	rockchip_efuse_regmap_config.max_register = resource_size(res) - 1;
+
+	regmap = devm_regmap_init(dev, &rockchip_efuse_bus,
+				  context, &rockchip_efuse_regmap_config);
+	if (IS_ERR(regmap)) {
+		dev_err(dev, "regmap init failed\n");
+		return PTR_ERR(regmap);
+	}
+	econfig.dev = dev;
+	nvmem = nvmem_register(&econfig);
+	if (IS_ERR(nvmem))
+		return PTR_ERR(nvmem);
+
+	platform_set_drvdata(pdev, nvmem);
+
+	return 0;
+}
+
+static int rockchip_efuse_remove(struct platform_device *pdev)
+{
+	struct nvmem_device *nvmem = platform_get_drvdata(pdev);
+
+	return nvmem_unregister(nvmem);
+}
+
+static struct platform_driver rockchip_efuse_driver = {
+	.probe = rockchip_efuse_probe,
+	.remove = rockchip_efuse_remove,
+	.driver = {
+		.name = "rockchip-efuse",
+		.of_match_table = rockchip_efuse_match,
+	},
+};
+
+module_platform_driver(rockchip_efuse_driver);
+MODULE_DESCRIPTION("rockchip_efuse driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvmem/vf610-ocotp.c b/drivers/nvmem/vf610-ocotp.c
new file mode 100644
index 000000000000..8641319efeda
--- /dev/null
+++ b/drivers/nvmem/vf610-ocotp.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (C) 2015 Toradex AG.
+ *
+ * Author: Sanchayan Maity <sanchayan.maity@toradex.com>
+ *
+ * Based on the barebox ocotp driver,
+ * Copyright (c) 2010 Baruch Siach <baruch@tkos.co.il>
+ *	Orex Computed Radiography
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/nvmem-provider.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+/* OCOTP Register Offsets */
+#define OCOTP_CTRL_REG				0x00
+#define OCOTP_CTRL_SET				0x04
+#define OCOTP_CTRL_CLR				0x08
+#define OCOTP_TIMING				0x10
+#define OCOTP_DATA				0x20
+#define OCOTP_READ_CTRL_REG			0x30
+#define OCOTP_READ_FUSE_DATA			0x40
+
+/* OCOTP Register bits and masks */
+#define OCOTP_CTRL_WR_UNLOCK			16
+#define OCOTP_CTRL_WR_UNLOCK_KEY		0x3E77
+#define OCOTP_CTRL_WR_UNLOCK_MASK		GENMASK(31, 16)
+#define OCOTP_CTRL_ADDR				0
+#define OCOTP_CTRL_ADDR_MASK			GENMASK(6, 0)
+#define OCOTP_CTRL_RELOAD_SHADOWS		BIT(10)
+#define OCOTP_CTRL_ERR				BIT(9)
+#define OCOTP_CTRL_BUSY				BIT(8)
+
+#define OCOTP_TIMING_STROBE_READ		16
+#define OCOTP_TIMING_STROBE_READ_MASK		GENMASK(21, 16)
+#define OCOTP_TIMING_RELAX			12
+#define OCOTP_TIMING_RELAX_MASK			GENMASK(15, 12)
+#define OCOTP_TIMING_STROBE_PROG		0
+#define OCOTP_TIMING_STROBE_PROG_MASK		GENMASK(11, 0)
+
+#define OCOTP_READ_CTRL_READ_FUSE		0x1
+
+#define VF610_OCOTP_TIMEOUT			100000
+
+#define BF(value, field)		(((value) << field) & field##_MASK)
+
+#define DEF_RELAX				20
+
+static const int base_to_fuse_addr_mappings[][2] = {
+	{0x400, 0x00},
+	{0x410, 0x01},
+	{0x420, 0x02},
+	{0x450, 0x05},
+	{0x4F0, 0x0F},
+	{0x600, 0x20},
+	{0x610, 0x21},
+	{0x620, 0x22},
+	{0x630, 0x23},
+	{0x640, 0x24},
+	{0x650, 0x25},
+	{0x660, 0x26},
+	{0x670, 0x27},
+	{0x6F0, 0x2F},
+	{0x880, 0x38},
+	{0x890, 0x39},
+	{0x8A0, 0x3A},
+	{0x8B0, 0x3B},
+	{0x8C0, 0x3C},
+	{0x8D0, 0x3D},
+	{0x8E0, 0x3E},
+	{0x8F0, 0x3F},
+	{0xC80, 0x78},
+	{0xC90, 0x79},
+	{0xCA0, 0x7A},
+	{0xCB0, 0x7B},
+	{0xCC0, 0x7C},
+	{0xCD0, 0x7D},
+	{0xCE0, 0x7E},
+	{0xCF0, 0x7F},
+};
+
+struct vf610_ocotp {
+	void __iomem *base;
+	struct clk *clk;
+	struct device *dev;
+	struct nvmem_device *nvmem;
+	int timing;
+};
+
+static int vf610_ocotp_wait_busy(void __iomem *base)
+{
+	int timeout = VF610_OCOTP_TIMEOUT;
+
+	while ((readl(base) & OCOTP_CTRL_BUSY) && --timeout)
+		udelay(10);
+
+	if (!timeout) {
+		writel(OCOTP_CTRL_ERR, base + OCOTP_CTRL_CLR);
+		return -ETIMEDOUT;
+	}
+
+	udelay(10);
+
+	return 0;
+}
+
+static int vf610_ocotp_calculate_timing(struct vf610_ocotp *ocotp_dev)
+{
+	u32 clk_rate;
+	u32 relax, strobe_read, strobe_prog;
+	u32 timing;
+
+	clk_rate = clk_get_rate(ocotp_dev->clk);
+
+	/* Refer section OTP read/write timing parameters in TRM */
+	relax = clk_rate / (1000000000 / DEF_RELAX) - 1;
+	strobe_prog = clk_rate / (1000000000 / 10000) + 2 * (DEF_RELAX + 1) - 1;
+	strobe_read = clk_rate / (1000000000 / 40) + 2 * (DEF_RELAX + 1) - 1;
+
+	timing = BF(relax, OCOTP_TIMING_RELAX);
+	timing |= BF(strobe_read, OCOTP_TIMING_STROBE_READ);
+	timing |= BF(strobe_prog, OCOTP_TIMING_STROBE_PROG);
+
+	return timing;
+}
+
+static int vf610_get_fuse_address(int base_addr_offset)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(base_to_fuse_addr_mappings); i++) {
+		if (base_to_fuse_addr_mappings[i][0] == base_addr_offset)
+			return base_to_fuse_addr_mappings[i][1];
+	}
+
+	return -EINVAL;
+}
+
+static int vf610_ocotp_write(void *context, const void *data, size_t count)
+{
+	return 0;
+}
+
+static int vf610_ocotp_read(void *context,
+			const void *off, size_t reg_size,
+			void *val, size_t val_size)
+{
+	struct vf610_ocotp *ocotp = context;
+	void __iomem *base = ocotp->base;
+	unsigned int offset = *(u32 *)off;
+	u32 reg, *buf = val;
+	int fuse_addr;
+	int ret;
+
+	while (val_size > 0) {
+		fuse_addr = vf610_get_fuse_address(offset);
+		if (fuse_addr > 0) {
+			writel(ocotp->timing, base + OCOTP_TIMING);
+			ret = vf610_ocotp_wait_busy(base + OCOTP_CTRL_REG);
+			if (ret)
+				return ret;
+
+			reg = readl(base + OCOTP_CTRL_REG);
+			reg &= ~OCOTP_CTRL_ADDR_MASK;
+			reg &= ~OCOTP_CTRL_WR_UNLOCK_MASK;
+			reg |= BF(fuse_addr, OCOTP_CTRL_ADDR);
+			writel(reg, base + OCOTP_CTRL_REG);
+
+			writel(OCOTP_READ_CTRL_READ_FUSE,
+				base + OCOTP_READ_CTRL_REG);
+			ret = vf610_ocotp_wait_busy(base + OCOTP_CTRL_REG);
+			if (ret)
+				return ret;
+
+			if (readl(base) & OCOTP_CTRL_ERR) {
+				dev_dbg(ocotp->dev, "Error reading from fuse address %x\n",
+					fuse_addr);
+				writel(OCOTP_CTRL_ERR, base + OCOTP_CTRL_CLR);
+			}
+
+			/*
+			 * In case of error, we do not abort and expect to read
+			 * 0xBADABADA as mentioned by the TRM. We just read this
+			 * value and return.
+			 */
+			*buf = readl(base + OCOTP_READ_FUSE_DATA);
+		} else {
+			*buf = 0;
+		}
+
+		buf++;
+		val_size--;
+		offset += reg_size;
+	}
+
+	return 0;
+}
+
+static struct regmap_bus vf610_ocotp_bus = {
+	.read = vf610_ocotp_read,
+	.write = vf610_ocotp_write,
+	.reg_format_endian_default = REGMAP_ENDIAN_NATIVE,
+	.val_format_endian_default = REGMAP_ENDIAN_NATIVE,
+};
+
+static struct regmap_config ocotp_regmap_config = {
+	.reg_bits = 32,
+	.val_bits = 32,
+	.reg_stride = 4,
+};
+
+static struct nvmem_config ocotp_config = {
+	.name = "ocotp",
+	.owner = THIS_MODULE,
+};
+
+static const struct of_device_id ocotp_of_match[] = {
+	{ .compatible = "fsl,vf610-ocotp", },
+	{/* sentinel */},
+};
+MODULE_DEVICE_TABLE(of, ocotp_of_match);
+
+static int vf610_ocotp_remove(struct platform_device *pdev)
+{
+	struct vf610_ocotp *ocotp_dev = platform_get_drvdata(pdev);
+
+	return nvmem_unregister(ocotp_dev->nvmem);
+}
+
+static int vf610_ocotp_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+	struct regmap *regmap;
+	struct vf610_ocotp *ocotp_dev;
+
+	ocotp_dev = devm_kzalloc(&pdev->dev,
+			sizeof(struct vf610_ocotp), GFP_KERNEL);
+	if (!ocotp_dev)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	ocotp_dev->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(ocotp_dev->base))
+		return PTR_ERR(ocotp_dev->base);
+
+	ocotp_dev->clk = devm_clk_get(dev, NULL);
+	if (IS_ERR(ocotp_dev->clk)) {
+		dev_err(dev, "failed getting clock, err = %ld\n",
+			PTR_ERR(ocotp_dev->clk));
+		return PTR_ERR(ocotp_dev->clk);
+	}
+
+	ocotp_regmap_config.max_register = resource_size(res);
+	regmap = devm_regmap_init(dev,
+		&vf610_ocotp_bus, ocotp_dev, &ocotp_regmap_config);
+	if (IS_ERR(regmap)) {
+		dev_err(dev, "regmap init failed\n");
+		return PTR_ERR(regmap);
+	}
+	ocotp_config.dev = dev;
+
+	ocotp_dev->nvmem = nvmem_register(&ocotp_config);
+	if (IS_ERR(ocotp_dev->nvmem))
+		return PTR_ERR(ocotp_dev->nvmem);
+
+	ocotp_dev->dev = dev;
+	platform_set_drvdata(pdev, ocotp_dev);
+
+	ocotp_dev->timing = vf610_ocotp_calculate_timing(ocotp_dev);
+
+	return 0;
+}
+
+static struct platform_driver vf610_ocotp_driver = {
+	.probe = vf610_ocotp_probe,
+	.remove = vf610_ocotp_remove,
+	.driver = {
+		.name = "vf610-ocotp",
+		.of_match_table = ocotp_of_match,
+	},
+};
+module_platform_driver(vf610_ocotp_driver);
+MODULE_AUTHOR("Sanchayan Maity <sanchayan.maity@toradex.com>");
+MODULE_DESCRIPTION("Vybrid OCOTP driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 0decee6c556e..489ea1098c96 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -468,12 +468,10 @@ static int pcmcia_device_query(struct pcmcia_device *p_dev)
 			if ((length < 2) || (length > 255))
 				continue;
 
-			new = kmalloc(sizeof(char) * length, GFP_KERNEL);
+			new = kstrdup(tmp, GFP_KERNEL);
 			if (!new)
 				continue;
 
-			new = strncpy(new, tmp, length);
-
 			tmp = p_dev->prod_id[i];
 			p_dev->prod_id[i] = new;
 			kfree(tmp);
diff --git a/drivers/spmi/spmi-pmic-arb.c b/drivers/spmi/spmi-pmic-arb.c
index 4a3cf9ba152f..96615d807694 100644
--- a/drivers/spmi/spmi-pmic-arb.c
+++ b/drivers/spmi/spmi-pmic-arb.c
@@ -168,11 +168,6 @@ struct pmic_arb_ver_ops {
 	u32 (*irq_clear)(u8 n);
 };
 
-static inline u32 pmic_arb_base_read(struct spmi_pmic_arb_dev *dev, u32 offset)
-{
-	return readl_relaxed(dev->rd_base + offset);
-}
-
 static inline void pmic_arb_base_write(struct spmi_pmic_arb_dev *dev,
 				       u32 offset, u32 val)
 {
@@ -193,7 +188,7 @@ static inline void pmic_arb_set_rd_cmd(struct spmi_pmic_arb_dev *dev,
  */
 static void pa_read_data(struct spmi_pmic_arb_dev *dev, u8 *buf, u32 reg, u8 bc)
 {
-	u32 data = pmic_arb_base_read(dev, reg);
+	u32 data = __raw_readl(dev->rd_base + reg);
 	memcpy(buf, &data, (bc & 3) + 1);
 }
 
@@ -208,7 +203,7 @@ pa_write_data(struct spmi_pmic_arb_dev *dev, const u8 *buf, u32 reg, u8 bc)
 {
 	u32 data = 0;
 	memcpy(&data, buf, (bc & 3) + 1);
-	pmic_arb_base_write(dev, reg, data);
+	__raw_writel(data, dev->wr_base + reg);
 }
 
 static int pmic_arb_wait_for_done(struct spmi_controller *ctrl,
@@ -365,7 +360,7 @@ static int pmic_arb_write_cmd(struct spmi_controller *ctrl, u8 opc, u8 sid,
 		opc = PMIC_ARB_OP_EXT_WRITE;
 	else if (opc >= 0x30 && opc <= 0x37)
 		opc = PMIC_ARB_OP_EXT_WRITEL;
-	else if (opc >= 0x80 && opc <= 0xFF)
+	else if (opc >= 0x80)
 		opc = PMIC_ARB_OP_ZERO_WRITE;
 	else
 		return -EINVAL;
diff --git a/drivers/spmi/spmi.c b/drivers/spmi/spmi.c
index 11467e17bdd8..6b3da1bb0d63 100644
--- a/drivers/spmi/spmi.c
+++ b/drivers/spmi/spmi.c
@@ -560,12 +560,13 @@ EXPORT_SYMBOL_GPL(spmi_controller_remove);
  * This API will register the client driver with the SPMI framework.
  * It is typically called from the driver's module-init function.
  */
-int spmi_driver_register(struct spmi_driver *sdrv)
+int __spmi_driver_register(struct spmi_driver *sdrv, struct module *owner)
 {
 	sdrv->driver.bus = &spmi_bus_type;
+	sdrv->driver.owner = owner;
 	return driver_register(&sdrv->driver);
 }
-EXPORT_SYMBOL_GPL(spmi_driver_register);
+EXPORT_SYMBOL_GPL(__spmi_driver_register);
 
 static void __exit spmi_exit(void)
 {
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index 8196581f54c2..bcc1fc027311 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -524,6 +524,7 @@ static ssize_t uio_read(struct file *filep, char __user *buf,
 
 		event_count = atomic_read(&idev->event);
 		if (event_count != listener->event_count) {
+			__set_current_state(TASK_RUNNING);
 			if (copy_to_user(buf, &event_count, count))
 				retval = -EFAULT;
 			else {
diff --git a/drivers/uio/uio_fsl_elbc_gpcm.c b/drivers/uio/uio_fsl_elbc_gpcm.c
index 2bcf80c159c1..b46323d9dc18 100644
--- a/drivers/uio/uio_fsl_elbc_gpcm.c
+++ b/drivers/uio/uio_fsl_elbc_gpcm.c
@@ -470,6 +470,7 @@ static const struct of_device_id uio_fsl_elbc_gpcm_match[] = {
 	{ .compatible = "fsl,elbc-gpcm-uio", },
 	{}
 };
+MODULE_DEVICE_TABLE(of, uio_fsl_elbc_gpcm_match);
 
 static struct platform_driver uio_fsl_elbc_gpcm_driver = {
 	.driver = {
diff --git a/drivers/w1/masters/omap_hdq.c b/drivers/w1/masters/omap_hdq.c
index e7d448963a24..0e2f43bccf1f 100644
--- a/drivers/w1/masters/omap_hdq.c
+++ b/drivers/w1/masters/omap_hdq.c
@@ -17,6 +17,7 @@
 #include <linux/io.h>
 #include <linux/sched.h>
 #include <linux/pm_runtime.h>
+#include <linux/of.h>
 
 #include "../w1.h"
 #include "../w1_int.h"
@@ -27,21 +28,23 @@
 #define OMAP_HDQ_TX_DATA			0x04
 #define OMAP_HDQ_RX_DATA			0x08
 #define OMAP_HDQ_CTRL_STATUS			0x0c
-#define OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK	(1<<6)
-#define OMAP_HDQ_CTRL_STATUS_CLOCKENABLE	(1<<5)
-#define OMAP_HDQ_CTRL_STATUS_GO			(1<<4)
-#define OMAP_HDQ_CTRL_STATUS_INITIALIZATION	(1<<2)
-#define OMAP_HDQ_CTRL_STATUS_DIR		(1<<1)
-#define OMAP_HDQ_CTRL_STATUS_MODE		(1<<0)
+#define OMAP_HDQ_CTRL_STATUS_SINGLE		BIT(7)
+#define OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK	BIT(6)
+#define OMAP_HDQ_CTRL_STATUS_CLOCKENABLE	BIT(5)
+#define OMAP_HDQ_CTRL_STATUS_GO                 BIT(4)
+#define OMAP_HDQ_CTRL_STATUS_PRESENCE		BIT(3)
+#define OMAP_HDQ_CTRL_STATUS_INITIALIZATION	BIT(2)
+#define OMAP_HDQ_CTRL_STATUS_DIR		BIT(1)
 #define OMAP_HDQ_INT_STATUS			0x10
-#define OMAP_HDQ_INT_STATUS_TXCOMPLETE		(1<<2)
-#define OMAP_HDQ_INT_STATUS_RXCOMPLETE		(1<<1)
-#define OMAP_HDQ_INT_STATUS_TIMEOUT		(1<<0)
+#define OMAP_HDQ_INT_STATUS_TXCOMPLETE		BIT(2)
+#define OMAP_HDQ_INT_STATUS_RXCOMPLETE		BIT(1)
+#define OMAP_HDQ_INT_STATUS_TIMEOUT		BIT(0)
 #define OMAP_HDQ_SYSCONFIG			0x14
-#define OMAP_HDQ_SYSCONFIG_SOFTRESET		(1<<1)
-#define OMAP_HDQ_SYSCONFIG_AUTOIDLE		(1<<0)
+#define OMAP_HDQ_SYSCONFIG_SOFTRESET		BIT(1)
+#define OMAP_HDQ_SYSCONFIG_AUTOIDLE		BIT(0)
+#define OMAP_HDQ_SYSCONFIG_NOIDLE		0x0
 #define OMAP_HDQ_SYSSTATUS			0x18
-#define OMAP_HDQ_SYSSTATUS_RESETDONE		(1<<0)
+#define OMAP_HDQ_SYSSTATUS_RESETDONE		BIT(0)
 
 #define OMAP_HDQ_FLAG_CLEAR			0
 #define OMAP_HDQ_FLAG_SET			1
@@ -67,6 +70,10 @@ struct hdq_data {
 	 * the data wrire or read.
 	 */
 	int			init_trans;
+	int                     rrw;
+	/* mode: 0-HDQ 1-W1 */
+	int                     mode;
+
 };
 
 static int omap_hdq_probe(struct platform_device *pdev);
@@ -74,6 +81,7 @@ static int omap_hdq_remove(struct platform_device *pdev);
 
 static const struct of_device_id omap_hdq_dt_ids[] = {
 	{ .compatible = "ti,omap3-1w" },
+	{ .compatible = "ti,am4372-hdq" },
 	{}
 };
 MODULE_DEVICE_TABLE(of, omap_hdq_dt_ids);
@@ -90,15 +98,12 @@ static struct platform_driver omap_hdq_driver = {
 static u8 omap_w1_read_byte(void *_hdq);
 static void omap_w1_write_byte(void *_hdq, u8 byte);
 static u8 omap_w1_reset_bus(void *_hdq);
-static void omap_w1_search_bus(void *_hdq, struct w1_master *master_dev,
-		u8 search_type,	w1_slave_found_callback slave_found);
 
 
 static struct w1_bus_master omap_w1_master = {
 	.read_byte	= omap_w1_read_byte,
 	.write_byte	= omap_w1_write_byte,
 	.reset_bus	= omap_w1_reset_bus,
-	.search		= omap_w1_search_bus,
 };
 
 /* HDQ register I/O routines */
@@ -122,6 +127,15 @@ static inline u8 hdq_reg_merge(struct hdq_data *hdq_data, u32 offset,
 	return new_val;
 }
 
+static void hdq_disable_interrupt(struct hdq_data *hdq_data, u32 offset,
+				  u32 mask)
+{
+	u32 ie;
+
+	ie = readl(hdq_data->hdq_base + offset);
+	writel(ie & mask, hdq_data->hdq_base + offset);
+}
+
 /*
  * Wait for one or more bits in flag change.
  * HDQ_FLAG_SET: wait until any bit in the flag is set.
@@ -229,13 +243,7 @@ static irqreturn_t hdq_isr(int irq, void *_hdq)
 	return IRQ_HANDLED;
 }
 
-/* HDQ Mode: always return success */
-static u8 omap_w1_reset_bus(void *_hdq)
-{
-	return 0;
-}
-
-/* W1 search callback function */
+/* W1 search callback function  in HDQ mode */
 static void omap_w1_search_bus(void *_hdq, struct w1_master *master_dev,
 		u8 search_type, w1_slave_found_callback slave_found)
 {
@@ -262,9 +270,10 @@ static int _omap_hdq_reset(struct hdq_data *hdq_data)
 	int ret;
 	u8 tmp_status;
 
-	hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG, OMAP_HDQ_SYSCONFIG_SOFTRESET);
+	hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
+		    OMAP_HDQ_SYSCONFIG_SOFTRESET);
 	/*
-	 * Select HDQ mode & enable clocks.
+	 * Select HDQ/1W mode & enable clocks.
 	 * It is observed that INT flags can't be cleared via a read and GO/INIT
 	 * won't return to zero if interrupt is disabled. So we always enable
 	 * interrupt.
@@ -282,7 +291,8 @@ static int _omap_hdq_reset(struct hdq_data *hdq_data)
 	else {
 		hdq_reg_out(hdq_data, OMAP_HDQ_CTRL_STATUS,
 			OMAP_HDQ_CTRL_STATUS_CLOCKENABLE |
-			OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+			OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK |
+			hdq_data->mode);
 		hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
 			OMAP_HDQ_SYSCONFIG_AUTOIDLE);
 	}
@@ -334,6 +344,18 @@ static int omap_hdq_break(struct hdq_data *hdq_data)
 		ret = -ETIMEDOUT;
 		goto out;
 	}
+
+	/*
+	 * check for the presence detect bit to get
+	 * set to show that the slave is responding
+	 */
+	if (!(hdq_reg_in(hdq_data, OMAP_HDQ_CTRL_STATUS) &
+			OMAP_HDQ_CTRL_STATUS_PRESENCE)) {
+		dev_dbg(hdq_data->dev, "Presence bit not set\n");
+		ret = -ETIMEDOUT;
+		goto out;
+	}
+
 	/*
 	 * wait for both INIT and GO bits rerurn to zero.
 	 * zero wait time expected for interrupt mode.
@@ -368,6 +390,8 @@ static int hdq_read_byte(struct hdq_data *hdq_data, u8 *val)
 		goto out;
 	}
 
+	hdq_data->hdq_irqstatus = 0;
+
 	if (!(hdq_data->hdq_irqstatus & OMAP_HDQ_INT_STATUS_RXCOMPLETE)) {
 		hdq_reg_merge(hdq_data, OMAP_HDQ_CTRL_STATUS,
 			OMAP_HDQ_CTRL_STATUS_DIR | OMAP_HDQ_CTRL_STATUS_GO,
@@ -400,7 +424,7 @@ rtn:
 
 }
 
-/* Enable clocks and set the controller to HDQ mode */
+/* Enable clocks and set the controller to HDQ/1W mode */
 static int omap_hdq_get(struct hdq_data *hdq_data)
 {
 	int ret = 0;
@@ -422,7 +446,7 @@ static int omap_hdq_get(struct hdq_data *hdq_data)
 
 			pm_runtime_get_sync(hdq_data->dev);
 
-			/* make sure HDQ is out of reset */
+			/* make sure HDQ/1W is out of reset */
 			if (!(hdq_reg_in(hdq_data, OMAP_HDQ_SYSSTATUS) &
 				OMAP_HDQ_SYSSTATUS_RESETDONE)) {
 				ret = _omap_hdq_reset(hdq_data);
@@ -430,12 +454,13 @@ static int omap_hdq_get(struct hdq_data *hdq_data)
 					/* back up the count */
 					hdq_data->hdq_usecount--;
 			} else {
-				/* select HDQ mode & enable clocks */
+				/* select HDQ/1W mode & enable clocks */
 				hdq_reg_out(hdq_data, OMAP_HDQ_CTRL_STATUS,
 					OMAP_HDQ_CTRL_STATUS_CLOCKENABLE |
-					OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+					OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK |
+					hdq_data->mode);
 				hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
-					OMAP_HDQ_SYSCONFIG_AUTOIDLE);
+					OMAP_HDQ_SYSCONFIG_NOIDLE);
 				hdq_reg_in(hdq_data, OMAP_HDQ_INT_STATUS);
 			}
 		}
@@ -456,6 +481,8 @@ static int omap_hdq_put(struct hdq_data *hdq_data)
 	if (ret < 0)
 		return -EINTR;
 
+	hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
+		    OMAP_HDQ_SYSCONFIG_AUTOIDLE);
 	if (0 == hdq_data->hdq_usecount) {
 		dev_dbg(hdq_data->dev, "attempt to decrement use count"
 			" when it is zero");
@@ -471,6 +498,100 @@ static int omap_hdq_put(struct hdq_data *hdq_data)
 	return ret;
 }
 
+/*
+ * W1 triplet callback function - used for searching ROM addresses.
+ * Registered only when controller is in 1-wire mode.
+ */
+static u8 omap_w1_triplet(void *_hdq, u8 bdir)
+{
+	u8 id_bit, comp_bit;
+	int err;
+	u8 ret = 0x3; /* no slaves responded */
+	struct hdq_data *hdq_data = _hdq;
+	u8 ctrl = OMAP_HDQ_CTRL_STATUS_SINGLE | OMAP_HDQ_CTRL_STATUS_GO |
+		  OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK;
+	u8 mask = ctrl | OMAP_HDQ_CTRL_STATUS_DIR;
+
+	omap_hdq_get(_hdq);
+
+	err = mutex_lock_interruptible(&hdq_data->hdq_mutex);
+	if (err < 0) {
+		dev_dbg(hdq_data->dev, "Could not acquire mutex\n");
+		goto rtn;
+	}
+
+	hdq_data->hdq_irqstatus = 0;
+	/* read id_bit */
+	hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS,
+		      ctrl | OMAP_HDQ_CTRL_STATUS_DIR, mask);
+	err = wait_event_timeout(hdq_wait_queue,
+				 (hdq_data->hdq_irqstatus
+				  & OMAP_HDQ_INT_STATUS_RXCOMPLETE),
+				 OMAP_HDQ_TIMEOUT);
+	if (err == 0) {
+		dev_dbg(hdq_data->dev, "RX wait elapsed\n");
+		goto out;
+	}
+	id_bit = (hdq_reg_in(_hdq, OMAP_HDQ_RX_DATA) & 0x01);
+
+	hdq_data->hdq_irqstatus = 0;
+	/* read comp_bit */
+	hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS,
+		      ctrl | OMAP_HDQ_CTRL_STATUS_DIR, mask);
+	err = wait_event_timeout(hdq_wait_queue,
+				 (hdq_data->hdq_irqstatus
+				  & OMAP_HDQ_INT_STATUS_RXCOMPLETE),
+				 OMAP_HDQ_TIMEOUT);
+	if (err == 0) {
+		dev_dbg(hdq_data->dev, "RX wait elapsed\n");
+		goto out;
+	}
+	comp_bit = (hdq_reg_in(_hdq, OMAP_HDQ_RX_DATA) & 0x01);
+
+	if (id_bit && comp_bit) {
+		ret = 0x03;  /* no slaves responded */
+		goto out;
+	}
+	if (!id_bit && !comp_bit) {
+		/* Both bits are valid, take the direction given */
+		ret = bdir ? 0x04 : 0;
+	} else {
+		/* Only one bit is valid, take that direction */
+		bdir = id_bit;
+		ret = id_bit ? 0x05 : 0x02;
+	}
+
+	/* write bdir bit */
+	hdq_reg_out(_hdq, OMAP_HDQ_TX_DATA, bdir);
+	hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS, ctrl, mask);
+	err = wait_event_timeout(hdq_wait_queue,
+				 (hdq_data->hdq_irqstatus
+				  & OMAP_HDQ_INT_STATUS_TXCOMPLETE),
+				 OMAP_HDQ_TIMEOUT);
+	if (err == 0) {
+		dev_dbg(hdq_data->dev, "TX wait elapsed\n");
+		goto out;
+	}
+
+	hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS, 0,
+		      OMAP_HDQ_CTRL_STATUS_SINGLE);
+
+out:
+	mutex_unlock(&hdq_data->hdq_mutex);
+rtn:
+	omap_hdq_put(_hdq);
+	return ret;
+}
+
+/* reset callback */
+static u8 omap_w1_reset_bus(void *_hdq)
+{
+	omap_hdq_get(_hdq);
+	omap_hdq_break(_hdq);
+	omap_hdq_put(_hdq);
+	return 0;
+}
+
 /* Read a byte of data from the device */
 static u8 omap_w1_read_byte(void *_hdq)
 {
@@ -478,6 +599,10 @@ static u8 omap_w1_read_byte(void *_hdq)
 	u8 val = 0;
 	int ret;
 
+	/* First write to initialize the transfer */
+	if (hdq_data->init_trans == 0)
+		omap_hdq_get(hdq_data);
+
 	ret = hdq_read_byte(hdq_data, &val);
 	if (ret) {
 		ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
@@ -491,6 +616,10 @@ static u8 omap_w1_read_byte(void *_hdq)
 		return -1;
 	}
 
+	hdq_disable_interrupt(hdq_data, OMAP_HDQ_CTRL_STATUS,
+			      ~OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+	hdq_data->hdq_usecount = 0;
+
 	/* Write followed by a read, release the module */
 	if (hdq_data->init_trans) {
 		ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
@@ -517,6 +646,14 @@ static void omap_w1_write_byte(void *_hdq, u8 byte)
 	if (hdq_data->init_trans == 0)
 		omap_hdq_get(hdq_data);
 
+	/*
+	 * We need to reset the slave before
+	 * issuing the SKIP ROM command, else
+	 * the slave will not work.
+	 */
+	if (byte == W1_SKIP_ROM)
+		omap_hdq_break(hdq_data);
+
 	ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
 	if (ret < 0) {
 		dev_dbg(hdq_data->dev, "Could not acquire mutex\n");
@@ -551,6 +688,7 @@ static int omap_hdq_probe(struct platform_device *pdev)
 	struct resource *res;
 	int ret, irq;
 	u8 rev;
+	const char *mode;
 
 	hdq_data = devm_kzalloc(dev, sizeof(*hdq_data), GFP_KERNEL);
 	if (!hdq_data) {
@@ -567,10 +705,21 @@ static int omap_hdq_probe(struct platform_device *pdev)
 		return PTR_ERR(hdq_data->hdq_base);
 
 	hdq_data->hdq_usecount = 0;
+	hdq_data->rrw = 0;
 	mutex_init(&hdq_data->hdq_mutex);
 
 	pm_runtime_enable(&pdev->dev);
-	pm_runtime_get_sync(&pdev->dev);
+	ret = pm_runtime_get_sync(&pdev->dev);
+	if (ret < 0) {
+		dev_dbg(&pdev->dev, "pm_runtime_get_sync failed\n");
+		goto err_w1;
+	}
+
+	ret = _omap_hdq_reset(hdq_data);
+	if (ret) {
+		dev_dbg(&pdev->dev, "reset failed\n");
+		return -EINVAL;
+	}
 
 	rev = hdq_reg_in(hdq_data, OMAP_HDQ_REVISION);
 	dev_info(&pdev->dev, "OMAP HDQ Hardware Rev %c.%c. Driver in %s mode\n",
@@ -594,6 +743,15 @@ static int omap_hdq_probe(struct platform_device *pdev)
 
 	pm_runtime_put_sync(&pdev->dev);
 
+	ret = of_property_read_string(pdev->dev.of_node, "ti,mode", &mode);
+	if (ret < 0 || !strcmp(mode, "hdq")) {
+		hdq_data->mode = 0;
+		omap_w1_master.search = omap_w1_search_bus;
+	} else {
+		hdq_data->mode = 1;
+		omap_w1_master.triplet = omap_w1_triplet;
+	}
+
 	omap_w1_master.data = hdq_data;
 
 	ret = w1_add_master_device(&omap_w1_master);
@@ -635,8 +793,8 @@ static int omap_hdq_remove(struct platform_device *pdev)
 module_platform_driver(omap_hdq_driver);
 
 module_param(w1_id, int, S_IRUSR);
-MODULE_PARM_DESC(w1_id, "1-wire id for the slave detection");
+MODULE_PARM_DESC(w1_id, "1-wire id for the slave detection in HDQ mode");
 
 MODULE_AUTHOR("Texas Instruments");
-MODULE_DESCRIPTION("HDQ driver Library");
+MODULE_DESCRIPTION("HDQ-1W driver Library");
 MODULE_LICENSE("GPL");
diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c
index 47249a30eae3..20f766afa4c7 100644
--- a/drivers/w1/w1_int.c
+++ b/drivers/w1/w1_int.c
@@ -91,8 +91,7 @@ static struct w1_master *w1_alloc_dev(u32 id, int slave_count, int slave_ttl,
 	err = device_register(&dev->dev);
 	if (err) {
 		pr_err("Failed to register master device. err=%d\n", err);
-		memset(dev, 0, sizeof(struct w1_master));
-		kfree(dev);
+		put_device(&dev->dev);
 		dev = NULL;
 	}