diff options
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 12 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs.h | 15 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi2/gaudi2.c | 3 |
3 files changed, 29 insertions, 1 deletion
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index bb3f44392908..35502e938b5d 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1052,12 +1052,22 @@ static bool is_pci_link_healthy(struct hl_device *hdev) static bool hl_device_eq_heartbeat_received(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 cpu_q_id; if (!prop->cpucp_info.eq_health_check_supported) return true; if (!hdev->eq_heartbeat_received) { + cpu_q_id = hdev->heartbeat_debug_info.cpu_queue_id; + dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); + + dev_err(hdev->dev, "Heartbeat events counter: %u, Q_PI: %u, Q_CI: %u, EQ CI: %u, EQ prev: %u\n", + hdev->heartbeat_debug_info.heartbeat_event_counter, + hdev->kernel_queues[cpu_q_id].pi, + atomic_read(&hdev->kernel_queues[cpu_q_id].ci), + hdev->event_queue.ci, + hdev->event_queue.prev_eqe_index); return false; } @@ -1138,6 +1148,8 @@ static int device_late_init(struct hl_device *hdev) hdev->high_pll = hdev->asic_prop.high_pll; if (hdev->heartbeat) { + hdev->heartbeat_debug_info.heartbeat_event_counter = 0; + /* * Before scheduling the heartbeat driver will check if eq event has received. * for the first schedule we need to set the indication as true then for the next diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 55495861f432..5e9f54ca336a 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -71,7 +71,7 @@ struct hl_fpriv; #define HL_DEVICE_TIMEOUT_USEC 1000000 /* 1 s */ -#define HL_HEARTBEAT_PER_USEC 5000000 /* 5 s */ +#define HL_HEARTBEAT_PER_USEC 10000000 /* 10 s */ #define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */ @@ -3175,6 +3175,16 @@ struct hl_reset_info { }; /** + * struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure. 
+ * @heartbeat_event_counter: number of heartbeat events received. + * @cpu_queue_id: used to read the queue pi/ci + */ +struct eq_heartbeat_debug_info { + u32 heartbeat_event_counter; + u32 cpu_queue_id; +}; + +/** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. * @pcie_bar_phys: array of available PCIe bars physical addresses. @@ -3262,6 +3272,7 @@ struct hl_reset_info { * @clk_throttling: holds information about current/previous clock throttling events * @captured_err_info: holds information about errors. * @reset_info: holds current device reset information. + * @heartbeat_debug_info: counters used to debug heartbeat failures. * @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling. * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @fw_inner_major_ver: the major of current loaded preboot inner version. @@ -3452,6 +3463,8 @@ struct hl_device { struct hl_reset_info reset_info; + struct eq_heartbeat_debug_info heartbeat_debug_info; + cpumask_t irq_affinity_mask; u32 *stream_master_qid_arr; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 962b7fcd4318..08276f03c80f 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -3796,6 +3796,8 @@ static int gaudi2_sw_init(struct hl_device *hdev) if (rc) goto special_blocks_free; + hdev->heartbeat_debug_info.cpu_queue_id = GAUDI2_QUEUE_ID_CPU_PQ; + return 0; special_blocks_free: @@ -9777,6 +9779,7 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type) static void hl_eq_heartbeat_event_handle(struct hl_device *hdev) { + hdev->heartbeat_debug_info.heartbeat_event_counter++; hdev->eq_heartbeat_received = true; } |