summaryrefslogtreecommitdiffstats
path: root/drivers/misc/genwqe
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/genwqe')
-rw-r--r--drivers/misc/genwqe/Kconfig6
-rw-r--r--drivers/misc/genwqe/card_base.c217
-rw-r--r--drivers/misc/genwqe/card_base.h2
-rw-r--r--drivers/misc/genwqe/card_ddcb.c28
-rw-r--r--drivers/misc/genwqe/card_debugfs.c7
-rw-r--r--drivers/misc/genwqe/card_dev.c5
-rw-r--r--drivers/misc/genwqe/card_sysfs.c25
-rw-r--r--drivers/misc/genwqe/card_utils.c20
-rw-r--r--drivers/misc/genwqe/genwqe_driver.h2
9 files changed, 284 insertions, 28 deletions
diff --git a/drivers/misc/genwqe/Kconfig b/drivers/misc/genwqe/Kconfig
index 6069d8cd79d7..4c0a033cbfdb 100644
--- a/drivers/misc/genwqe/Kconfig
+++ b/drivers/misc/genwqe/Kconfig
@@ -11,3 +11,9 @@ menuconfig GENWQE
Enables PCIe card driver for IBM GenWQE accelerators.
The user-space interface is described in
include/linux/genwqe/genwqe_card.h.
+
+config GENWQE_PLATFORM_ERROR_RECOVERY
+ int "Use platform recovery procedures (0=off, 1=on)"
+ depends on GENWQE
+ default 1 if PPC64
+ default 0
diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c
index 74d51c9bb858..43bbabc96b6c 100644
--- a/drivers/misc/genwqe/card_base.c
+++ b/drivers/misc/genwqe/card_base.c
@@ -38,7 +38,6 @@
#include <linux/notifier.h>
#include <linux/device.h>
#include <linux/log2.h>
-#include <linux/genwqe/genwqe_card.h>
#include "card_base.h"
#include "card_ddcb.h"
@@ -58,7 +57,7 @@ static struct dentry *debugfs_genwqe;
static struct genwqe_dev *genwqe_devices[GENWQE_CARD_NO_MAX];
/* PCI structure for identifying device by PCI vendor and device ID */
-static DEFINE_PCI_DEVICE_TABLE(genwqe_device_table) = {
+static const struct pci_device_id genwqe_device_table[] = {
{ .vendor = PCI_VENDOR_ID_IBM,
.device = PCI_DEVICE_GENWQE,
.subvendor = PCI_SUBVENDOR_ID_IBM,
@@ -140,6 +139,12 @@ static struct genwqe_dev *genwqe_dev_alloc(void)
cd->class_genwqe = class_genwqe;
cd->debugfs_genwqe = debugfs_genwqe;
+ /*
+ * This comes from kernel config option and can be overritten via
+ * debugfs.
+ */
+ cd->use_platform_recovery = CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY;
+
init_waitqueue_head(&cd->queue_waitq);
spin_lock_init(&cd->file_lock);
@@ -761,6 +766,124 @@ static u64 genwqe_fir_checking(struct genwqe_dev *cd)
}
/**
+ * genwqe_pci_fundamental_reset() - trigger a PCIe fundamental reset on the slot
+ *
+ * Note: pci_set_pcie_reset_state() is not implemented on all archs, so this
+ * reset method will not work in all cases.
+ *
+ * Return: 0 on success or error code from pci_set_pcie_reset_state()
+ */
+static int genwqe_pci_fundamental_reset(struct pci_dev *pci_dev)
+{
+ int rc;
+
+ /*
+ * lock pci config space access from userspace,
+ * save state and issue PCIe fundamental reset
+ */
+ pci_cfg_access_lock(pci_dev);
+ pci_save_state(pci_dev);
+ rc = pci_set_pcie_reset_state(pci_dev, pcie_warm_reset);
+ if (!rc) {
+ /* keep PCIe reset asserted for 250ms */
+ msleep(250);
+ pci_set_pcie_reset_state(pci_dev, pcie_deassert_reset);
+ /* Wait for 2s to reload flash and train the link */
+ msleep(2000);
+ }
+ pci_restore_state(pci_dev);
+ pci_cfg_access_unlock(pci_dev);
+ return rc;
+}
+
+
+static int genwqe_platform_recovery(struct genwqe_dev *cd)
+{
+ struct pci_dev *pci_dev = cd->pci_dev;
+ int rc;
+
+ dev_info(&pci_dev->dev,
+ "[%s] resetting card for error recovery\n", __func__);
+
+ /* Clear out error injection flags */
+ cd->err_inject &= ~(GENWQE_INJECT_HARDWARE_FAILURE |
+ GENWQE_INJECT_GFIR_FATAL |
+ GENWQE_INJECT_GFIR_INFO);
+
+ genwqe_stop(cd);
+
+ /* Try recoverying the card with fundamental reset */
+ rc = genwqe_pci_fundamental_reset(pci_dev);
+ if (!rc) {
+ rc = genwqe_start(cd);
+ if (!rc)
+ dev_info(&pci_dev->dev,
+ "[%s] card recovered\n", __func__);
+ else
+ dev_err(&pci_dev->dev,
+ "[%s] err: cannot start card services! (err=%d)\n",
+ __func__, rc);
+ } else {
+ dev_err(&pci_dev->dev,
+ "[%s] card reset failed\n", __func__);
+ }
+
+ return rc;
+}
+
+/*
+ * genwqe_reload_bistream() - reload card bitstream
+ *
+ * Set the appropriate register and call fundamental reset to reaload the card
+ * bitstream.
+ *
+ * Return: 0 on success, error code otherwise
+ */
+static int genwqe_reload_bistream(struct genwqe_dev *cd)
+{
+ struct pci_dev *pci_dev = cd->pci_dev;
+ int rc;
+
+ dev_info(&pci_dev->dev,
+ "[%s] resetting card for bitstream reload\n",
+ __func__);
+
+ genwqe_stop(cd);
+
+ /*
+ * Cause a CPLD reprogram with the 'next_bitstream'
+ * partition on PCIe hot or fundamental reset
+ */
+ __genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET,
+ (cd->softreset & 0xcull) | 0x70ull);
+
+ rc = genwqe_pci_fundamental_reset(pci_dev);
+ if (rc) {
+ /*
+ * A fundamental reset failure can be caused
+ * by lack of support on the arch, so we just
+ * log the error and try to start the card
+ * again.
+ */
+ dev_err(&pci_dev->dev,
+ "[%s] err: failed to reset card for bitstream reload\n",
+ __func__);
+ }
+
+ rc = genwqe_start(cd);
+ if (rc) {
+ dev_err(&pci_dev->dev,
+ "[%s] err: cannot start card services! (err=%d)\n",
+ __func__, rc);
+ return rc;
+ }
+ dev_info(&pci_dev->dev,
+ "[%s] card reloaded\n", __func__);
+ return 0;
+}
+
+
+/**
* genwqe_health_thread() - Health checking thread
*
* This thread is only started for the PF of the card.
@@ -786,6 +909,7 @@ static int genwqe_health_thread(void *data)
struct pci_dev *pci_dev = cd->pci_dev;
u64 gfir, gfir_masked, slu_unitcfg, app_unitcfg;
+ health_thread_begin:
while (!kthread_should_stop()) {
rc = wait_event_interruptible_timeout(cd->health_waitq,
(genwqe_health_check_cond(cd, &gfir) ||
@@ -846,6 +970,13 @@ static int genwqe_health_thread(void *data)
}
}
+ if (cd->card_state == GENWQE_CARD_RELOAD_BITSTREAM) {
+ /* Userspace requested card bitstream reload */
+ rc = genwqe_reload_bistream(cd);
+ if (rc)
+ goto fatal_error;
+ }
+
cd->last_gfir = gfir;
cond_resched();
}
@@ -853,6 +984,28 @@ static int genwqe_health_thread(void *data)
return 0;
fatal_error:
+ if (cd->use_platform_recovery) {
+ /*
+ * Since we use raw accessors, EEH errors won't be detected
+ * by the platform until we do a non-raw MMIO or config space
+ * read
+ */
+ readq(cd->mmio + IO_SLC_CFGREG_GFIR);
+
+ /* We do nothing if the card is going over PCI recovery */
+ if (pci_channel_offline(pci_dev))
+ return -EIO;
+
+ /*
+ * If it's supported by the platform, we try a fundamental reset
+ * to recover from a fatal error. Otherwise, we continue to wait
+ * for an external recovery procedure to take care of it.
+ */
+ rc = genwqe_platform_recovery(cd);
+ if (!rc)
+ goto health_thread_begin;
+ }
+
dev_err(&pci_dev->dev,
"[%s] card unusable. Please trigger unbind!\n", __func__);
@@ -958,6 +1111,9 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
pci_set_master(pci_dev);
pci_enable_pcie_error_reporting(pci_dev);
+ /* EEH recovery requires PCIe fundamental reset */
+ pci_dev->needs_freset = 1;
+
/* request complete BAR-0 space (length = 0) */
cd->mmio_len = pci_resource_len(pci_dev, 0);
cd->mmio = pci_iomap(pci_dev, 0, 0);
@@ -1096,23 +1252,40 @@ static pci_ers_result_t genwqe_err_error_detected(struct pci_dev *pci_dev,
dev_err(&pci_dev->dev, "[%s] state=%d\n", __func__, state);
- if (pci_dev == NULL)
- return PCI_ERS_RESULT_NEED_RESET;
-
cd = dev_get_drvdata(&pci_dev->dev);
if (cd == NULL)
- return PCI_ERS_RESULT_NEED_RESET;
+ return PCI_ERS_RESULT_DISCONNECT;
- switch (state) {
- case pci_channel_io_normal:
- return PCI_ERS_RESULT_CAN_RECOVER;
- case pci_channel_io_frozen:
- return PCI_ERS_RESULT_NEED_RESET;
- case pci_channel_io_perm_failure:
+ /* Stop the card */
+ genwqe_health_check_stop(cd);
+ genwqe_stop(cd);
+
+ /*
+ * On permanent failure, the PCI code will call device remove
+ * after the return of this function.
+ * genwqe_stop() can be called twice.
+ */
+ if (state == pci_channel_io_perm_failure) {
return PCI_ERS_RESULT_DISCONNECT;
+ } else {
+ genwqe_pci_remove(cd);
+ return PCI_ERS_RESULT_NEED_RESET;
}
+}
+
+static pci_ers_result_t genwqe_err_slot_reset(struct pci_dev *pci_dev)
+{
+ int rc;
+ struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
- return PCI_ERS_RESULT_NEED_RESET;
+ rc = genwqe_pci_setup(cd);
+ if (!rc) {
+ return PCI_ERS_RESULT_RECOVERED;
+ } else {
+ dev_err(&pci_dev->dev,
+ "err: problems with PCI setup (err=%d)\n", rc);
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
}
static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev)
@@ -1120,8 +1293,22 @@ static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev)
return PCI_ERS_RESULT_NONE;
}
-static void genwqe_err_resume(struct pci_dev *dev)
+static void genwqe_err_resume(struct pci_dev *pci_dev)
{
+ int rc;
+ struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
+
+ rc = genwqe_start(cd);
+ if (!rc) {
+ rc = genwqe_health_check_start(cd);
+ if (rc)
+ dev_err(&pci_dev->dev,
+ "err: cannot start health checking! (err=%d)\n",
+ rc);
+ } else {
+ dev_err(&pci_dev->dev,
+ "err: cannot start card services! (err=%d)\n", rc);
+ }
}
static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs)
@@ -1144,7 +1331,7 @@ static struct pci_error_handlers genwqe_err_handler = {
.error_detected = genwqe_err_error_detected,
.mmio_enabled = genwqe_err_result_none,
.link_reset = genwqe_err_result_none,
- .slot_reset = genwqe_err_result_none,
+ .slot_reset = genwqe_err_slot_reset,
.resume = genwqe_err_resume,
};
diff --git a/drivers/misc/genwqe/card_base.h b/drivers/misc/genwqe/card_base.h
index 0e608a288603..67abd8cb2247 100644
--- a/drivers/misc/genwqe/card_base.h
+++ b/drivers/misc/genwqe/card_base.h
@@ -291,6 +291,8 @@ struct genwqe_dev {
struct task_struct *health_thread;
wait_queue_head_t health_waitq;
+ int use_platform_recovery; /* use platform recovery mechanisms */
+
/* char device */
dev_t devnum_genwqe; /* major/minor num card */
struct class *class_genwqe; /* reference to class object */
diff --git a/drivers/misc/genwqe/card_ddcb.c b/drivers/misc/genwqe/card_ddcb.c
index c8046db2d5a2..dc9851a5540e 100644
--- a/drivers/misc/genwqe/card_ddcb.c
+++ b/drivers/misc/genwqe/card_ddcb.c
@@ -1118,7 +1118,21 @@ static irqreturn_t genwqe_pf_isr(int irq, void *dev_id)
* safer, but slower for the good-case ... See above.
*/
gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
- if ((gfir & GFIR_ERR_TRIGGER) != 0x0) {
+ if (((gfir & GFIR_ERR_TRIGGER) != 0x0) &&
+ !pci_channel_offline(pci_dev)) {
+
+ if (cd->use_platform_recovery) {
+ /*
+ * Since we use raw accessors, EEH errors won't be
+ * detected by the platform until we do a non-raw
+ * MMIO or config space read
+ */
+ readq(cd->mmio + IO_SLC_CFGREG_GFIR);
+
+ /* Don't do anything if the PCI channel is frozen */
+ if (pci_channel_offline(pci_dev))
+ goto exit;
+ }
wake_up_interruptible(&cd->health_waitq);
@@ -1126,12 +1140,12 @@ static irqreturn_t genwqe_pf_isr(int irq, void *dev_id)
* By default GFIRs causes recovery actions. This
* count is just for debug when recovery is masked.
*/
- printk_ratelimited(KERN_ERR
- "%s %s: [%s] GFIR=%016llx\n",
- GENWQE_DEVNAME, dev_name(&pci_dev->dev),
- __func__, gfir);
+ dev_err_ratelimited(&pci_dev->dev,
+ "[%s] GFIR=%016llx\n",
+ __func__, gfir);
}
+ exit:
return IRQ_HANDLED;
}
@@ -1237,9 +1251,7 @@ int genwqe_setup_service_layer(struct genwqe_dev *cd)
}
rc = genwqe_set_interrupt_capability(cd, GENWQE_MSI_IRQS);
- if (rc > 0)
- rc = genwqe_set_interrupt_capability(cd, rc);
- if (rc != 0) {
+ if (rc) {
rc = -ENODEV;
goto stop_kthread;
}
diff --git a/drivers/misc/genwqe/card_debugfs.c b/drivers/misc/genwqe/card_debugfs.c
index 0a33ade64109..c9b4d6d0eb99 100644
--- a/drivers/misc/genwqe/card_debugfs.c
+++ b/drivers/misc/genwqe/card_debugfs.c
@@ -485,6 +485,13 @@ int genwqe_init_debugfs(struct genwqe_dev *cd)
goto err1;
}
+ file = debugfs_create_u32("use_platform_recovery", 0666, root,
+ &cd->use_platform_recovery);
+ if (!file) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
cd->debugfs_root = root;
return 0;
err1:
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index 1d2f163a1906..aae42555e2ca 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -1048,10 +1048,15 @@ static long genwqe_ioctl(struct file *filp, unsigned int cmd,
int rc = 0;
struct genwqe_file *cfile = (struct genwqe_file *)filp->private_data;
struct genwqe_dev *cd = cfile->cd;
+ struct pci_dev *pci_dev = cd->pci_dev;
struct genwqe_reg_io __user *io;
u64 val;
u32 reg_offs;
+ /* Return -EIO if card hit EEH */
+ if (pci_channel_offline(pci_dev))
+ return -EIO;
+
if (_IOC_TYPE(cmd) != GENWQE_IOC_CODE)
return -EINVAL;
diff --git a/drivers/misc/genwqe/card_sysfs.c b/drivers/misc/genwqe/card_sysfs.c
index a72a99266c3c..7232e40a3ad9 100644
--- a/drivers/misc/genwqe/card_sysfs.c
+++ b/drivers/misc/genwqe/card_sysfs.c
@@ -223,6 +223,30 @@ static ssize_t next_bitstream_store(struct device *dev,
}
static DEVICE_ATTR_RW(next_bitstream);
+static ssize_t reload_bitstream_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int reload;
+ struct genwqe_dev *cd = dev_get_drvdata(dev);
+
+ if (kstrtoint(buf, 0, &reload) < 0)
+ return -EINVAL;
+
+ if (reload == 0x1) {
+ if (cd->card_state == GENWQE_CARD_UNUSED ||
+ cd->card_state == GENWQE_CARD_USED)
+ cd->card_state = GENWQE_CARD_RELOAD_BITSTREAM;
+ else
+ return -EIO;
+ } else {
+ return -EINVAL;
+ }
+
+ return count;
+}
+static DEVICE_ATTR_WO(reload_bitstream);
+
/*
* Create device_attribute structures / params: name, mode, show, store
* additional flag if valid in VF
@@ -239,6 +263,7 @@ static struct attribute *genwqe_attributes[] = {
&dev_attr_status.attr,
&dev_attr_freerunning_timer.attr,
&dev_attr_queue_working_time.attr,
+ &dev_attr_reload_bitstream.attr,
NULL,
};
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index 62cc6bb3f62e..a6400f09229c 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -53,12 +53,17 @@
*/
int __genwqe_writeq(struct genwqe_dev *cd, u64 byte_offs, u64 val)
{
+ struct pci_dev *pci_dev = cd->pci_dev;
+
if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE)
return -EIO;
if (cd->mmio == NULL)
return -EIO;
+ if (pci_channel_offline(pci_dev))
+ return -EIO;
+
__raw_writeq((__force u64)cpu_to_be64(val), cd->mmio + byte_offs);
return 0;
}
@@ -99,12 +104,17 @@ u64 __genwqe_readq(struct genwqe_dev *cd, u64 byte_offs)
*/
int __genwqe_writel(struct genwqe_dev *cd, u64 byte_offs, u32 val)
{
+ struct pci_dev *pci_dev = cd->pci_dev;
+
if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE)
return -EIO;
if (cd->mmio == NULL)
return -EIO;
+ if (pci_channel_offline(pci_dev))
+ return -EIO;
+
__raw_writel((__force u32)cpu_to_be32(val), cd->mmio + byte_offs);
return 0;
}
@@ -718,10 +728,12 @@ int genwqe_set_interrupt_capability(struct genwqe_dev *cd, int count)
int rc;
struct pci_dev *pci_dev = cd->pci_dev;
- rc = pci_enable_msi_exact(pci_dev, count);
- if (rc == 0)
- cd->flags |= GENWQE_FLAG_MSI_ENABLED;
- return rc;
+ rc = pci_enable_msi_range(pci_dev, 1, count);
+ if (rc < 0)
+ return rc;
+
+ cd->flags |= GENWQE_FLAG_MSI_ENABLED;
+ return 0;
}
/**
diff --git a/drivers/misc/genwqe/genwqe_driver.h b/drivers/misc/genwqe/genwqe_driver.h
index cd5263163a6e..a506e9aa2d57 100644
--- a/drivers/misc/genwqe/genwqe_driver.h
+++ b/drivers/misc/genwqe/genwqe_driver.h
@@ -36,7 +36,7 @@
#include <asm/byteorder.h>
#include <linux/genwqe/genwqe_card.h>
-#define DRV_VERS_STRING "2.0.15"
+#define DRV_VERS_STRING "2.0.21"
/*
* Static minor number assignement, until we decide/implement