From cd0ac51c5760d4eed4981be5de9cad0255976512 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 19 Nov 2022 07:48:17 +0000 Subject: crypto: hisilicon/qm - define the device isolation strategy Define the device isolation strategy by the device driver. The user configures a hardware error threshold value by uacce interface. If the number of hardware errors exceeds the value of setting error threshold in one hour. The device will not be available in user space. The VF device use the PF device isolation strategy. All the hardware errors are processed by PF driver. Signed-off-by: Kai Ye Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20221119074817.12063-4-yekai13@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/hisilicon/qm.c | 169 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 154 insertions(+), 15 deletions(-) (limited to 'drivers/crypto') diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 007ac7a69ce7..58d023f6fba3 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -367,6 +367,16 @@ struct hisi_qm_resource { struct list_head list; }; +/** + * struct qm_hw_err - Structure describing the device errors + * @list: hardware error list + * @timestamp: timestamp when the error occurred + */ +struct qm_hw_err { + struct list_head list; + unsigned long long timestamp; +}; + struct hisi_qm_hw_ops { int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number); void (*qm_db)(struct hisi_qm *qm, u16 qn, @@ -2469,6 +2479,113 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd, return -EINVAL; } +/** + * qm_hw_err_isolate() - Try to set the isolation status of the uacce device + * according to user's configuration of error threshold. + * @qm: the uacce device + */ +static int qm_hw_err_isolate(struct hisi_qm *qm) +{ + struct qm_hw_err *err, *tmp, *hw_err; + struct qm_err_isolate *isolate; + u32 count = 0; + + isolate = &qm->isolate_data; + +#define SECONDS_PER_HOUR 3600 + + /* All the hw errs are processed by PF driver */ + if (qm->uacce->is_vf || isolate->is_isolate || !isolate->err_threshold) + return 0; + + hw_err = kzalloc(sizeof(*hw_err), GFP_KERNEL); + if (!hw_err) + return -ENOMEM; + + /* + * Time-stamp every slot AER error. Then check the AER error log when the + * next device AER error occurred. if the device slot AER error count exceeds + * the setting error threshold in one hour, the isolated state will be set + * to true. And the AER error logs that exceed one hour will be cleared. + */ + mutex_lock(&isolate->isolate_lock); + hw_err->timestamp = jiffies; + list_for_each_entry_safe(err, tmp, &isolate->qm_hw_errs, list) { + if ((hw_err->timestamp - err->timestamp) / HZ > + SECONDS_PER_HOUR) { + list_del(&err->list); + kfree(err); + } else { + count++; + } + } + list_add(&hw_err->list, &isolate->qm_hw_errs); + mutex_unlock(&isolate->isolate_lock); + + if (count >= isolate->err_threshold) + isolate->is_isolate = true; + + return 0; +} + +static void qm_hw_err_destroy(struct hisi_qm *qm) +{ + struct qm_hw_err *err, *tmp; + + mutex_lock(&qm->isolate_data.isolate_lock); + list_for_each_entry_safe(err, tmp, &qm->isolate_data.qm_hw_errs, list) { + list_del(&err->list); + kfree(err); + } + mutex_unlock(&qm->isolate_data.isolate_lock); +} + +static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce) +{ + struct hisi_qm *qm = uacce->priv; + struct hisi_qm *pf_qm; + + if (uacce->is_vf) + pf_qm = pci_get_drvdata(pci_physfn(qm->pdev)); + else + pf_qm = qm; + + return pf_qm->isolate_data.is_isolate ? + UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL; +} + +static int hisi_qm_isolate_threshold_write(struct uacce_device *uacce, u32 num) +{ + struct hisi_qm *qm = uacce->priv; + + /* Must be set by PF */ + if (uacce->is_vf) + return -EPERM; + + if (qm->isolate_data.is_isolate) + return -EPERM; + + qm->isolate_data.err_threshold = num; + + /* After the policy is updated, need to reset the hardware err list */ + qm_hw_err_destroy(qm); + + return 0; +} + +static u32 hisi_qm_isolate_threshold_read(struct uacce_device *uacce) +{ + struct hisi_qm *qm = uacce->priv; + struct hisi_qm *pf_qm; + + if (uacce->is_vf) { + pf_qm = pci_get_drvdata(pci_physfn(qm->pdev)); + return pf_qm->isolate_data.err_threshold; + } + + return qm->isolate_data.err_threshold; +} + static const struct uacce_ops uacce_qm_ops = { .get_available_instances = hisi_qm_get_available_instances, .get_queue = hisi_qm_uacce_get_queue, @@ -2478,8 +2595,22 @@ static const struct uacce_ops uacce_qm_ops = { .mmap = hisi_qm_uacce_mmap, .ioctl = hisi_qm_uacce_ioctl, .is_q_updated = hisi_qm_is_q_updated, + .get_isolate_state = hisi_qm_get_isolate_state, + .isolate_err_threshold_write = hisi_qm_isolate_threshold_write, + .isolate_err_threshold_read = hisi_qm_isolate_threshold_read, }; +static void qm_remove_uacce(struct hisi_qm *qm) +{ + struct uacce_device *uacce = qm->uacce; + + if (qm->use_sva) { + qm_hw_err_destroy(qm); + uacce_remove(uacce); + qm->uacce = NULL; + } +} + static int qm_alloc_uacce(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -2506,8 +2637,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm) qm->use_sva = true; } else { /* only consider sva case */ - uacce_remove(uacce); - qm->uacce = NULL; + qm_remove_uacce(qm); return -EINVAL; } @@ -2540,6 +2670,8 @@ static int qm_alloc_uacce(struct hisi_qm *qm) uacce->qf_pg_num[UACCE_QFRT_DUS] = dus_page_nr; qm->uacce = uacce; + INIT_LIST_HEAD(&qm->isolate_data.qm_hw_errs); + mutex_init(&qm->isolate_data.isolate_lock); return 0; } @@ -4029,6 +4161,12 @@ static int qm_controller_reset_prepare(struct hisi_qm *qm) return ret; } + if (qm->use_sva) { + ret = qm_hw_err_isolate(qm); + if (ret) + pci_err(pdev, "failed to isolate hw err!\n"); + } + ret = qm_wait_vf_prepare_finish(qm); if (ret) pci_err(pdev, "failed to stop by vfs in soft reset!\n"); @@ -4336,21 +4474,25 @@ static int qm_controller_reset(struct hisi_qm *qm) qm->err_ini->show_last_dfx_regs(qm); ret = qm_soft_reset(qm); - if (ret) { - pci_err(pdev, "Controller reset failed (%d)\n", ret); - qm_reset_bit_clear(qm); - return ret; - } + if (ret) + goto err_reset; ret = qm_controller_reset_done(qm); - if (ret) { - qm_reset_bit_clear(qm); - return ret; - } + if (ret) + goto err_reset; pci_info(pdev, "Controller reset complete\n"); return 0; + +err_reset: + pci_err(pdev, "Controller reset failed (%d)\n", ret); + qm_reset_bit_clear(qm); + + /* if resetting fails, isolate the device */ + if (qm->use_sva) + qm->isolate_data.is_isolate = true; + return ret; } /** @@ -5271,10 +5413,7 @@ int hisi_qm_init(struct hisi_qm *qm) err_free_qm_memory: hisi_qm_memory_uninit(qm); err_alloc_uacce: - if (qm->use_sva) { - uacce_remove(qm->uacce); - qm->uacce = NULL; - } + qm_remove_uacce(qm); err_irq_register: qm_irqs_unregister(qm); err_pci_init: -- cgit v1.2.3