diff options
Diffstat (limited to 'drivers/misc/genwqe')
-rw-r--r-- | drivers/misc/genwqe/Kconfig | 6 | ||||
-rw-r--r-- | drivers/misc/genwqe/card_base.c | 217 | ||||
-rw-r--r-- | drivers/misc/genwqe/card_base.h | 2 | ||||
-rw-r--r-- | drivers/misc/genwqe/card_ddcb.c | 28 | ||||
-rw-r--r-- | drivers/misc/genwqe/card_debugfs.c | 7 | ||||
-rw-r--r-- | drivers/misc/genwqe/card_dev.c | 5 | ||||
-rw-r--r-- | drivers/misc/genwqe/card_sysfs.c | 25 | ||||
-rw-r--r-- | drivers/misc/genwqe/card_utils.c | 20 | ||||
-rw-r--r-- | drivers/misc/genwqe/genwqe_driver.h | 2 |
9 files changed, 284 insertions, 28 deletions
diff --git a/drivers/misc/genwqe/Kconfig b/drivers/misc/genwqe/Kconfig index 6069d8cd79d7..4c0a033cbfdb 100644 --- a/drivers/misc/genwqe/Kconfig +++ b/drivers/misc/genwqe/Kconfig @@ -11,3 +11,9 @@ menuconfig GENWQE Enables PCIe card driver for IBM GenWQE accelerators. The user-space interface is described in include/linux/genwqe/genwqe_card.h. + +config GENWQE_PLATFORM_ERROR_RECOVERY + int "Use platform recovery procedures (0=off, 1=on)" + depends on GENWQE + default 1 if PPC64 + default 0 diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c index 74d51c9bb858..43bbabc96b6c 100644 --- a/drivers/misc/genwqe/card_base.c +++ b/drivers/misc/genwqe/card_base.c @@ -38,7 +38,6 @@ #include <linux/notifier.h> #include <linux/device.h> #include <linux/log2.h> -#include <linux/genwqe/genwqe_card.h> #include "card_base.h" #include "card_ddcb.h" @@ -58,7 +57,7 @@ static struct dentry *debugfs_genwqe; static struct genwqe_dev *genwqe_devices[GENWQE_CARD_NO_MAX]; /* PCI structure for identifying device by PCI vendor and device ID */ -static DEFINE_PCI_DEVICE_TABLE(genwqe_device_table) = { +static const struct pci_device_id genwqe_device_table[] = { { .vendor = PCI_VENDOR_ID_IBM, .device = PCI_DEVICE_GENWQE, .subvendor = PCI_SUBVENDOR_ID_IBM, @@ -140,6 +139,12 @@ static struct genwqe_dev *genwqe_dev_alloc(void) cd->class_genwqe = class_genwqe; cd->debugfs_genwqe = debugfs_genwqe; + /* + * This comes from kernel config option and can be overritten via + * debugfs. + */ + cd->use_platform_recovery = CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY; + init_waitqueue_head(&cd->queue_waitq); spin_lock_init(&cd->file_lock); @@ -761,6 +766,124 @@ static u64 genwqe_fir_checking(struct genwqe_dev *cd) } /** + * genwqe_pci_fundamental_reset() - trigger a PCIe fundamental reset on the slot + * + * Note: pci_set_pcie_reset_state() is not implemented on all archs, so this + * reset method will not work in all cases. + * + * Return: 0 on success or error code from pci_set_pcie_reset_state() + */ +static int genwqe_pci_fundamental_reset(struct pci_dev *pci_dev) +{ + int rc; + + /* + * lock pci config space access from userspace, + * save state and issue PCIe fundamental reset + */ + pci_cfg_access_lock(pci_dev); + pci_save_state(pci_dev); + rc = pci_set_pcie_reset_state(pci_dev, pcie_warm_reset); + if (!rc) { + /* keep PCIe reset asserted for 250ms */ + msleep(250); + pci_set_pcie_reset_state(pci_dev, pcie_deassert_reset); + /* Wait for 2s to reload flash and train the link */ + msleep(2000); + } + pci_restore_state(pci_dev); + pci_cfg_access_unlock(pci_dev); + return rc; +} + + +static int genwqe_platform_recovery(struct genwqe_dev *cd) +{ + struct pci_dev *pci_dev = cd->pci_dev; + int rc; + + dev_info(&pci_dev->dev, + "[%s] resetting card for error recovery\n", __func__); + + /* Clear out error injection flags */ + cd->err_inject &= ~(GENWQE_INJECT_HARDWARE_FAILURE | + GENWQE_INJECT_GFIR_FATAL | + GENWQE_INJECT_GFIR_INFO); + + genwqe_stop(cd); + + /* Try recoverying the card with fundamental reset */ + rc = genwqe_pci_fundamental_reset(pci_dev); + if (!rc) { + rc = genwqe_start(cd); + if (!rc) + dev_info(&pci_dev->dev, + "[%s] card recovered\n", __func__); + else + dev_err(&pci_dev->dev, + "[%s] err: cannot start card services! (err=%d)\n", + __func__, rc); + } else { + dev_err(&pci_dev->dev, + "[%s] card reset failed\n", __func__); + } + + return rc; +} + +/* + * genwqe_reload_bistream() - reload card bitstream + * + * Set the appropriate register and call fundamental reset to reaload the card + * bitstream. + * + * Return: 0 on success, error code otherwise + */ +static int genwqe_reload_bistream(struct genwqe_dev *cd) +{ + struct pci_dev *pci_dev = cd->pci_dev; + int rc; + + dev_info(&pci_dev->dev, + "[%s] resetting card for bitstream reload\n", + __func__); + + genwqe_stop(cd); + + /* + * Cause a CPLD reprogram with the 'next_bitstream' + * partition on PCIe hot or fundamental reset + */ + __genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET, + (cd->softreset & 0xcull) | 0x70ull); + + rc = genwqe_pci_fundamental_reset(pci_dev); + if (rc) { + /* + * A fundamental reset failure can be caused + * by lack of support on the arch, so we just + * log the error and try to start the card + * again. + */ + dev_err(&pci_dev->dev, + "[%s] err: failed to reset card for bitstream reload\n", + __func__); + } + + rc = genwqe_start(cd); + if (rc) { + dev_err(&pci_dev->dev, + "[%s] err: cannot start card services! (err=%d)\n", + __func__, rc); + return rc; + } + dev_info(&pci_dev->dev, + "[%s] card reloaded\n", __func__); + return 0; +} + + +/** * genwqe_health_thread() - Health checking thread * * This thread is only started for the PF of the card. @@ -786,6 +909,7 @@ static int genwqe_health_thread(void *data) struct pci_dev *pci_dev = cd->pci_dev; u64 gfir, gfir_masked, slu_unitcfg, app_unitcfg; + health_thread_begin: while (!kthread_should_stop()) { rc = wait_event_interruptible_timeout(cd->health_waitq, (genwqe_health_check_cond(cd, &gfir) || @@ -846,6 +970,13 @@ static int genwqe_health_thread(void *data) } } + if (cd->card_state == GENWQE_CARD_RELOAD_BITSTREAM) { + /* Userspace requested card bitstream reload */ + rc = genwqe_reload_bistream(cd); + if (rc) + goto fatal_error; + } + cd->last_gfir = gfir; cond_resched(); } @@ -853,6 +984,28 @@ static int genwqe_health_thread(void *data) return 0; fatal_error: + if (cd->use_platform_recovery) { + /* + * Since we use raw accessors, EEH errors won't be detected + * by the platform until we do a non-raw MMIO or config space + * read + */ + readq(cd->mmio + IO_SLC_CFGREG_GFIR); + + /* We do nothing if the card is going over PCI recovery */ + if (pci_channel_offline(pci_dev)) + return -EIO; + + /* + * If it's supported by the platform, we try a fundamental reset + * to recover from a fatal error. Otherwise, we continue to wait + * for an external recovery procedure to take care of it. + */ + rc = genwqe_platform_recovery(cd); + if (!rc) + goto health_thread_begin; + } + dev_err(&pci_dev->dev, "[%s] card unusable. Please trigger unbind!\n", __func__); @@ -958,6 +1111,9 @@ static int genwqe_pci_setup(struct genwqe_dev *cd) pci_set_master(pci_dev); pci_enable_pcie_error_reporting(pci_dev); + /* EEH recovery requires PCIe fundamental reset */ + pci_dev->needs_freset = 1; + /* request complete BAR-0 space (length = 0) */ cd->mmio_len = pci_resource_len(pci_dev, 0); cd->mmio = pci_iomap(pci_dev, 0, 0); @@ -1096,23 +1252,40 @@ static pci_ers_result_t genwqe_err_error_detected(struct pci_dev *pci_dev, dev_err(&pci_dev->dev, "[%s] state=%d\n", __func__, state); - if (pci_dev == NULL) - return PCI_ERS_RESULT_NEED_RESET; - cd = dev_get_drvdata(&pci_dev->dev); if (cd == NULL) - return PCI_ERS_RESULT_NEED_RESET; + return PCI_ERS_RESULT_DISCONNECT; - switch (state) { - case pci_channel_io_normal: - return PCI_ERS_RESULT_CAN_RECOVER; - case pci_channel_io_frozen: - return PCI_ERS_RESULT_NEED_RESET; - case pci_channel_io_perm_failure: + /* Stop the card */ + genwqe_health_check_stop(cd); + genwqe_stop(cd); + + /* + * On permanent failure, the PCI code will call device remove + * after the return of this function. + * genwqe_stop() can be called twice. + */ + if (state == pci_channel_io_perm_failure) { return PCI_ERS_RESULT_DISCONNECT; + } else { + genwqe_pci_remove(cd); + return PCI_ERS_RESULT_NEED_RESET; } +} + +static pci_ers_result_t genwqe_err_slot_reset(struct pci_dev *pci_dev) +{ + int rc; + struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev); - return PCI_ERS_RESULT_NEED_RESET; + rc = genwqe_pci_setup(cd); + if (!rc) { + return PCI_ERS_RESULT_RECOVERED; + } else { + dev_err(&pci_dev->dev, + "err: problems with PCI setup (err=%d)\n", rc); + return PCI_ERS_RESULT_DISCONNECT; + } } static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev) @@ -1120,8 +1293,22 @@ static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev) return PCI_ERS_RESULT_NONE; } -static void genwqe_err_resume(struct pci_dev *dev) +static void genwqe_err_resume(struct pci_dev *pci_dev) { + int rc; + struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev); + + rc = genwqe_start(cd); + if (!rc) { + rc = genwqe_health_check_start(cd); + if (rc) + dev_err(&pci_dev->dev, + "err: cannot start health checking! (err=%d)\n", + rc); + } else { + dev_err(&pci_dev->dev, + "err: cannot start card services! (err=%d)\n", rc); + } } static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs) @@ -1144,7 +1331,7 @@ static struct pci_error_handlers genwqe_err_handler = { .error_detected = genwqe_err_error_detected, .mmio_enabled = genwqe_err_result_none, .link_reset = genwqe_err_result_none, - .slot_reset = genwqe_err_result_none, + .slot_reset = genwqe_err_slot_reset, .resume = genwqe_err_resume, }; diff --git a/drivers/misc/genwqe/card_base.h b/drivers/misc/genwqe/card_base.h index 0e608a288603..67abd8cb2247 100644 --- a/drivers/misc/genwqe/card_base.h +++ b/drivers/misc/genwqe/card_base.h @@ -291,6 +291,8 @@ struct genwqe_dev { struct task_struct *health_thread; wait_queue_head_t health_waitq; + int use_platform_recovery; /* use platform recovery mechanisms */ + /* char device */ dev_t devnum_genwqe; /* major/minor num card */ struct class *class_genwqe; /* reference to class object */ diff --git a/drivers/misc/genwqe/card_ddcb.c b/drivers/misc/genwqe/card_ddcb.c index c8046db2d5a2..dc9851a5540e 100644 --- a/drivers/misc/genwqe/card_ddcb.c +++ b/drivers/misc/genwqe/card_ddcb.c @@ -1118,7 +1118,21 @@ static irqreturn_t genwqe_pf_isr(int irq, void *dev_id) * safer, but slower for the good-case ... See above. */ gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR); - if ((gfir & GFIR_ERR_TRIGGER) != 0x0) { + if (((gfir & GFIR_ERR_TRIGGER) != 0x0) && + !pci_channel_offline(pci_dev)) { + + if (cd->use_platform_recovery) { + /* + * Since we use raw accessors, EEH errors won't be + * detected by the platform until we do a non-raw + * MMIO or config space read + */ + readq(cd->mmio + IO_SLC_CFGREG_GFIR); + + /* Don't do anything if the PCI channel is frozen */ + if (pci_channel_offline(pci_dev)) + goto exit; + } wake_up_interruptible(&cd->health_waitq); @@ -1126,12 +1140,12 @@ static irqreturn_t genwqe_pf_isr(int irq, void *dev_id) * By default GFIRs causes recovery actions. This * count is just for debug when recovery is masked. */ - printk_ratelimited(KERN_ERR - "%s %s: [%s] GFIR=%016llx\n", - GENWQE_DEVNAME, dev_name(&pci_dev->dev), - __func__, gfir); + dev_err_ratelimited(&pci_dev->dev, + "[%s] GFIR=%016llx\n", + __func__, gfir); } + exit: return IRQ_HANDLED; } @@ -1237,9 +1251,7 @@ int genwqe_setup_service_layer(struct genwqe_dev *cd) } rc = genwqe_set_interrupt_capability(cd, GENWQE_MSI_IRQS); - if (rc > 0) - rc = genwqe_set_interrupt_capability(cd, rc); - if (rc != 0) { + if (rc) { rc = -ENODEV; goto stop_kthread; } diff --git a/drivers/misc/genwqe/card_debugfs.c b/drivers/misc/genwqe/card_debugfs.c index 0a33ade64109..c9b4d6d0eb99 100644 --- a/drivers/misc/genwqe/card_debugfs.c +++ b/drivers/misc/genwqe/card_debugfs.c @@ -485,6 +485,13 @@ int genwqe_init_debugfs(struct genwqe_dev *cd) goto err1; } + file = debugfs_create_u32("use_platform_recovery", 0666, root, + &cd->use_platform_recovery); + if (!file) { + ret = -ENOMEM; + goto err1; + } + cd->debugfs_root = root; return 0; err1: diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index 1d2f163a1906..aae42555e2ca 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -1048,10 +1048,15 @@ static long genwqe_ioctl(struct file *filp, unsigned int cmd, int rc = 0; struct genwqe_file *cfile = (struct genwqe_file *)filp->private_data; struct genwqe_dev *cd = cfile->cd; + struct pci_dev *pci_dev = cd->pci_dev; struct genwqe_reg_io __user *io; u64 val; u32 reg_offs; + /* Return -EIO if card hit EEH */ + if (pci_channel_offline(pci_dev)) + return -EIO; + if (_IOC_TYPE(cmd) != GENWQE_IOC_CODE) return -EINVAL; diff --git a/drivers/misc/genwqe/card_sysfs.c b/drivers/misc/genwqe/card_sysfs.c index a72a99266c3c..7232e40a3ad9 100644 --- a/drivers/misc/genwqe/card_sysfs.c +++ b/drivers/misc/genwqe/card_sysfs.c @@ -223,6 +223,30 @@ static ssize_t next_bitstream_store(struct device *dev, } static DEVICE_ATTR_RW(next_bitstream); +static ssize_t reload_bitstream_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int reload; + struct genwqe_dev *cd = dev_get_drvdata(dev); + + if (kstrtoint(buf, 0, &reload) < 0) + return -EINVAL; + + if (reload == 0x1) { + if (cd->card_state == GENWQE_CARD_UNUSED || + cd->card_state == GENWQE_CARD_USED) + cd->card_state = GENWQE_CARD_RELOAD_BITSTREAM; + else + return -EIO; + } else { + return -EINVAL; + } + + return count; +} +static DEVICE_ATTR_WO(reload_bitstream); + /* * Create device_attribute structures / params: name, mode, show, store * additional flag if valid in VF @@ -239,6 +263,7 @@ static struct attribute *genwqe_attributes[] = { &dev_attr_status.attr, &dev_attr_freerunning_timer.attr, &dev_attr_queue_working_time.attr, + &dev_attr_reload_bitstream.attr, NULL, }; diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index 62cc6bb3f62e..a6400f09229c 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -53,12 +53,17 @@ */ int __genwqe_writeq(struct genwqe_dev *cd, u64 byte_offs, u64 val) { + struct pci_dev *pci_dev = cd->pci_dev; + if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE) return -EIO; if (cd->mmio == NULL) return -EIO; + if (pci_channel_offline(pci_dev)) + return -EIO; + __raw_writeq((__force u64)cpu_to_be64(val), cd->mmio + byte_offs); return 0; } @@ -99,12 +104,17 @@ u64 __genwqe_readq(struct genwqe_dev *cd, u64 byte_offs) */ int __genwqe_writel(struct genwqe_dev *cd, u64 byte_offs, u32 val) { + struct pci_dev *pci_dev = cd->pci_dev; + if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE) return -EIO; if (cd->mmio == NULL) return -EIO; + if (pci_channel_offline(pci_dev)) + return -EIO; + __raw_writel((__force u32)cpu_to_be32(val), cd->mmio + byte_offs); return 0; } @@ -718,10 +728,12 @@ int genwqe_set_interrupt_capability(struct genwqe_dev *cd, int count) int rc; struct pci_dev *pci_dev = cd->pci_dev; - rc = pci_enable_msi_exact(pci_dev, count); - if (rc == 0) - cd->flags |= GENWQE_FLAG_MSI_ENABLED; - return rc; + rc = pci_enable_msi_range(pci_dev, 1, count); + if (rc < 0) + return rc; + + cd->flags |= GENWQE_FLAG_MSI_ENABLED; + return 0; } /** diff --git a/drivers/misc/genwqe/genwqe_driver.h b/drivers/misc/genwqe/genwqe_driver.h index cd5263163a6e..a506e9aa2d57 100644 --- a/drivers/misc/genwqe/genwqe_driver.h +++ b/drivers/misc/genwqe/genwqe_driver.h @@ -36,7 +36,7 @@ #include <asm/byteorder.h> #include <linux/genwqe/genwqe_card.h> -#define DRV_VERS_STRING "2.0.15" +#define DRV_VERS_STRING "2.0.21" /* * Static minor number assignement, until we decide/implement |