summary | refs | log | tree | commit | diff | stats
path: root/drivers/accel/habanalabs
diff options
context:
space:
mode:
author: Tomer Tayar <ttayar@habana.ai> 2023-12-24 23:28:36 +0100
committer: Oded Gabbay <ogabbay@kernel.org> 2024-02-26 08:30:40 +0100
commit: 246d8b6cfb80a31e3cc287e3c1db6a5515b7c20a (patch)
tree: a6b459211b2a6a5f0002f4c0069c3d1dd52a4600 /drivers/accel/habanalabs
parent: accel/habanalabs: fix DRAM BAR base address calculation (diff)
download: linux-246d8b6cfb80a31e3cc287e3c1db6a5515b7c20a.tar.xz
linux-246d8b6cfb80a31e3cc287e3c1db6a5515b7c20a.zip
accel/habanalabs: abort device reset for consecutive heartbeat failures
The mechanism for aborting a device reset on consecutive fatal errors currently covers only fatal errors that are reported by FW. A non-responsive FW and consecutive heartbeat failures are also considered fatal, so add them to this mechanism as well to avoid recurring device resets in such a case.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/accel/habanalabs')
-rw-r--r-- drivers/accel/habanalabs/common/device.c | 14
1 file changed, 8 insertions, 6 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 5c46826e3659..cf004baf5e62 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1769,14 +1769,16 @@ kill_processes:
hdev->device_cpu_disabled = false;
hdev->reset_info.hard_reset_pending = false;
+ /*
+ * Put the device in an unusable state if there are 2 back to back resets due to
+ * fatal errors.
+ */
if (hdev->reset_info.reset_trigger_repeated &&
- (hdev->reset_info.prev_reset_trigger ==
- HL_DRV_RESET_FW_FATAL_ERR)) {
- /* if there 2 back to back resets from FW,
- * ensure driver puts the driver in a unusable state
- */
+ (hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
+ hdev->reset_info.prev_reset_trigger ==
+ HL_DRV_RESET_HEARTBEAT)) {
dev_crit(hdev->dev,
- "%s Consecutive FW fatal errors received, stopping hard reset\n",
+ "%s Consecutive fatal errors, stopping hard reset\n",
dev_name(&(hdev)->pdev->dev));
rc = -EIO;
goto out_err;