summaryrefslogtreecommitdiffstats
path: root/drivers/misc/habanalabs/gaudi/gaudi.c
diff options
context:
space:
mode:
authorMaxime Ripard <maxime@cerno.tech>2021-05-11 13:35:52 +0200
committerMaxime Ripard <maxime@cerno.tech>2021-05-11 13:35:52 +0200
commitc55b44c9386f3ee1b08752638559f19deaf6040d (patch)
treec843a21f45180387fcd9eb2625cc9d1f166a3156 /drivers/misc/habanalabs/gaudi/gaudi.c
parentMAINTAINERS: Update my e-mail (diff)
parentLinux 5.13-rc1 (diff)
downloadlinux-c55b44c9386f3ee1b08752638559f19deaf6040d.tar.xz
linux-c55b44c9386f3ee1b08752638559f19deaf6040d.zip
Merge drm/drm-fixes into drm-misc-fixes
Start this new release drm-misc-fixes branch Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Diffstat (limited to 'drivers/misc/habanalabs/gaudi/gaudi.c')
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi.c357
1 files changed, 314 insertions, 43 deletions
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 9152242778f5..b751652f80a8 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -105,6 +105,36 @@
#define GAUDI_PLL_MAX 10
+/*
+ * this enum kept here for compatibility with old FW (in which each asic has
+ * unique PLL numbering
+ */
+enum gaudi_pll_index {
+ GAUDI_CPU_PLL = 0,
+ GAUDI_PCI_PLL,
+ GAUDI_SRAM_PLL,
+ GAUDI_HBM_PLL,
+ GAUDI_NIC_PLL,
+ GAUDI_DMA_PLL,
+ GAUDI_MESH_PLL,
+ GAUDI_MME_PLL,
+ GAUDI_TPC_PLL,
+ GAUDI_IF_PLL,
+};
+
+static enum pll_index gaudi_pll_map[PLL_MAX] = {
+ [CPU_PLL] = GAUDI_CPU_PLL,
+ [PCI_PLL] = GAUDI_PCI_PLL,
+ [SRAM_PLL] = GAUDI_SRAM_PLL,
+ [HBM_PLL] = GAUDI_HBM_PLL,
+ [NIC_PLL] = GAUDI_NIC_PLL,
+ [DMA_PLL] = GAUDI_DMA_PLL,
+ [MESH_PLL] = GAUDI_MESH_PLL,
+ [MME_PLL] = GAUDI_MME_PLL,
+ [TPC_PLL] = GAUDI_TPC_PLL,
+ [IF_PLL] = GAUDI_IF_PLL,
+};
+
static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
"gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
"gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -396,6 +426,19 @@ get_collective_mode(struct hl_device *hdev, u32 queue_id)
return HL_COLLECTIVE_NOT_SUPPORTED;
}
+static inline void set_default_power_values(struct hl_device *hdev)
+{
+ struct asic_fixed_properties *prop = &hdev->asic_prop;
+
+ if (hdev->card_type == cpucp_card_type_pmc) {
+ prop->max_power_default = MAX_POWER_DEFAULT_PMC;
+ prop->dc_power_default = DC_POWER_DEFAULT_PMC;
+ } else {
+ prop->max_power_default = MAX_POWER_DEFAULT_PCI;
+ prop->dc_power_default = DC_POWER_DEFAULT_PCI;
+ }
+}
+
static int gaudi_get_fixed_properties(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -507,7 +550,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->num_of_events = GAUDI_EVENT_SIZE;
prop->tpc_enabled_mask = TPC_ENABLED_MASK;
- prop->max_power_default = MAX_POWER_DEFAULT_PCI;
+ set_default_power_values(hdev);
prop->cb_pool_cb_cnt = GAUDI_CB_POOL_CB_CNT;
prop->cb_pool_cb_size = GAUDI_CB_POOL_CB_SIZE;
@@ -532,8 +575,6 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
for (i = 0 ; i < HL_MAX_DCORES ; i++)
prop->first_available_cq[i] = USHRT_MAX;
- /* disable fw security for now, set it in a later stage */
- prop->fw_security_disabled = true;
prop->fw_security_status_valid = false;
prop->hard_reset_done_by_fw = false;
@@ -588,6 +629,11 @@ static int gaudi_init_iatu(struct hl_device *hdev)
struct hl_outbound_pci_region outbound_region;
int rc;
+ if (hdev->asic_prop.iatu_done_by_fw) {
+ hdev->asic_funcs->set_dma_mask_from_fw(hdev);
+ return 0;
+ }
+
/* Inbound Region 0 - Bar 0 - Point to SRAM + CFG */
inbound_region.mode = PCI_BAR_MATCH_MODE;
inbound_region.bar = SRAM_BAR_ID;
@@ -632,6 +678,7 @@ static int gaudi_early_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct pci_dev *pdev = hdev->pdev;
+ u32 fw_boot_status;
int rc;
rc = gaudi_get_fixed_properties(hdev);
@@ -665,6 +712,23 @@ static int gaudi_early_init(struct hl_device *hdev)
prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID);
+ /* If FW security is enabled at this point it means no access to ELBI */
+ if (!hdev->asic_prop.fw_security_disabled) {
+ hdev->asic_prop.iatu_done_by_fw = true;
+ goto pci_init;
+ }
+
+ rc = hl_pci_elbi_read(hdev, CFG_BASE + mmCPU_BOOT_DEV_STS0,
+ &fw_boot_status);
+ if (rc)
+ goto free_queue_props;
+
+ /* Check whether FW is configuring iATU */
+ if ((fw_boot_status & CPU_BOOT_DEV_STS0_ENABLED) &&
+ (fw_boot_status & CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN))
+ hdev->asic_prop.iatu_done_by_fw = true;
+
+pci_init:
rc = hl_pci_init(hdev);
if (rc)
goto free_queue_props;
@@ -1588,6 +1652,9 @@ static int gaudi_sw_init(struct hl_device *hdev)
hdev->asic_specific = gaudi;
+ /* store legacy PLL map */
+ hdev->legacy_pll_map = gaudi_pll_map;
+
/* Create DMA pool for small allocations */
hdev->dma_pool = dma_pool_create(dev_name(hdev->dev),
&hdev->pdev->dev, GAUDI_DMA_POOL_BLK_SIZE, 8, 0);
@@ -1766,8 +1833,7 @@ static int gaudi_enable_msi(struct hl_device *hdev)
if (gaudi->hw_cap_initialized & HW_CAP_MSI)
return 0;
- rc = pci_alloc_irq_vectors(hdev->pdev, 1, GAUDI_MSI_ENTRIES,
- PCI_IRQ_MSI);
+ rc = pci_alloc_irq_vectors(hdev->pdev, 1, 1, PCI_IRQ_MSI);
if (rc < 0) {
dev_err(hdev->dev, "MSI: Failed to enable support %d\n", rc);
return rc;
@@ -3701,7 +3767,7 @@ static int gaudi_init_cpu(struct hl_device *hdev)
struct gaudi_device *gaudi = hdev->asic_specific;
int rc;
- if (!hdev->cpu_enable)
+ if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU))
return 0;
if (gaudi->hw_cap_initialized & HW_CAP_CPU)
@@ -4873,7 +4939,7 @@ static int gaudi_pin_memory_before_cs(struct hl_device *hdev,
parser->job_userptr_list, &userptr))
goto already_pinned;
- userptr = kzalloc(sizeof(*userptr), GFP_ATOMIC);
+ userptr = kzalloc(sizeof(*userptr), GFP_KERNEL);
if (!userptr)
return -ENOMEM;
@@ -5684,18 +5750,26 @@ release_cb:
static int gaudi_schedule_register_memset(struct hl_device *hdev,
u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val)
{
- struct hl_ctx *ctx = hdev->compute_ctx;
+ struct hl_ctx *ctx;
struct hl_pending_cb *pending_cb;
struct packet_msg_long *pkt;
u32 cb_size, ctl;
struct hl_cb *cb;
- int i;
+ int i, rc;
+
+ mutex_lock(&hdev->fpriv_list_lock);
+ ctx = hdev->compute_ctx;
/* If no compute context available or context is going down
* memset registers directly
*/
- if (!ctx || kref_read(&ctx->refcount) == 0)
- return gaudi_memset_registers(hdev, reg_base, num_regs, val);
+ if (!ctx || kref_read(&ctx->refcount) == 0) {
+ rc = gaudi_memset_registers(hdev, reg_base, num_regs, val);
+ mutex_unlock(&hdev->fpriv_list_lock);
+ return rc;
+ }
+
+ mutex_unlock(&hdev->fpriv_list_lock);
cb_size = (sizeof(*pkt) * num_regs) +
sizeof(struct packet_msg_prot) * 2;
@@ -5911,13 +5985,16 @@ static void gaudi_restore_phase_topology(struct hl_device *hdev)
}
-static int gaudi_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
+static int gaudi_debugfs_read32(struct hl_device *hdev, u64 addr,
+ bool user_address, u32 *val)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct gaudi_device *gaudi = hdev->asic_specific;
- u64 hbm_bar_addr;
+ u64 hbm_bar_addr, host_phys_end;
int rc = 0;
+ host_phys_end = HOST_PHYS_BASE + HOST_PHYS_SIZE;
+
if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) {
if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
@@ -5949,6 +6026,9 @@ static int gaudi_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
}
if (hbm_bar_addr == U64_MAX)
rc = -EIO;
+ } else if (addr >= HOST_PHYS_BASE && addr < host_phys_end &&
+ user_address && !iommu_present(&pci_bus_type)) {
+ *val = *(u32 *) phys_to_virt(addr - HOST_PHYS_BASE);
} else {
rc = -EFAULT;
}
@@ -5956,13 +6036,16 @@ static int gaudi_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
return rc;
}
-static int gaudi_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
+static int gaudi_debugfs_write32(struct hl_device *hdev, u64 addr,
+ bool user_address, u32 val)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct gaudi_device *gaudi = hdev->asic_specific;
- u64 hbm_bar_addr;
+ u64 hbm_bar_addr, host_phys_end;
int rc = 0;
+ host_phys_end = HOST_PHYS_BASE + HOST_PHYS_SIZE;
+
if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) {
if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
@@ -5994,6 +6077,9 @@ static int gaudi_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
}
if (hbm_bar_addr == U64_MAX)
rc = -EIO;
+ } else if (addr >= HOST_PHYS_BASE && addr < host_phys_end &&
+ user_address && !iommu_present(&pci_bus_type)) {
+ *(u32 *) phys_to_virt(addr - HOST_PHYS_BASE) = val;
} else {
rc = -EFAULT;
}
@@ -6001,13 +6087,16 @@ static int gaudi_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
return rc;
}
-static int gaudi_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
+static int gaudi_debugfs_read64(struct hl_device *hdev, u64 addr,
+ bool user_address, u64 *val)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct gaudi_device *gaudi = hdev->asic_specific;
- u64 hbm_bar_addr;
+ u64 hbm_bar_addr, host_phys_end;
int rc = 0;
+ host_phys_end = HOST_PHYS_BASE + HOST_PHYS_SIZE;
+
if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
@@ -6043,6 +6132,9 @@ static int gaudi_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
}
if (hbm_bar_addr == U64_MAX)
rc = -EIO;
+ } else if (addr >= HOST_PHYS_BASE && addr < host_phys_end &&
+ user_address && !iommu_present(&pci_bus_type)) {
+ *val = *(u64 *) phys_to_virt(addr - HOST_PHYS_BASE);
} else {
rc = -EFAULT;
}
@@ -6050,13 +6142,16 @@ static int gaudi_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
return rc;
}
-static int gaudi_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
+static int gaudi_debugfs_write64(struct hl_device *hdev, u64 addr,
+ bool user_address, u64 val)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct gaudi_device *gaudi = hdev->asic_specific;
- u64 hbm_bar_addr;
+ u64 hbm_bar_addr, host_phys_end;
int rc = 0;
+ host_phys_end = HOST_PHYS_BASE + HOST_PHYS_SIZE;
+
if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
@@ -6091,6 +6186,9 @@ static int gaudi_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
}
if (hbm_bar_addr == U64_MAX)
rc = -EIO;
+ } else if (addr >= HOST_PHYS_BASE && addr < host_phys_end &&
+ user_address && !iommu_present(&pci_bus_type)) {
+ *(u64 *) phys_to_virt(addr - HOST_PHYS_BASE) = val;
} else {
rc = -EFAULT;
}
@@ -6098,6 +6196,164 @@ static int gaudi_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
return rc;
}
+static int gaudi_dma_core_transfer(struct hl_device *hdev, int dma_id, u64 addr,
+ u32 size_to_dma, dma_addr_t dma_addr)
+{
+ u32 err_cause, val;
+ u64 dma_offset;
+ int rc;
+
+ dma_offset = dma_id * DMA_CORE_OFFSET;
+
+ WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, lower_32_bits(addr));
+ WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, upper_32_bits(addr));
+ WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset, lower_32_bits(dma_addr));
+ WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset, upper_32_bits(dma_addr));
+ WREG32(mmDMA0_CORE_DST_TSIZE_0 + dma_offset, size_to_dma);
+ WREG32(mmDMA0_CORE_COMMIT + dma_offset,
+ (1 << DMA0_CORE_COMMIT_LIN_SHIFT));
+
+ rc = hl_poll_timeout(
+ hdev,
+ mmDMA0_CORE_STS0 + dma_offset,
+ val,
+ ((val & DMA0_CORE_STS0_BUSY_MASK) == 0),
+ 0,
+ 1000000);
+
+ if (rc) {
+ dev_err(hdev->dev,
+ "DMA %d timed-out during reading of 0x%llx\n",
+ dma_id, addr);
+ return -EIO;
+ }
+
+ /* Verify DMA is OK */
+ err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset);
+ if (err_cause) {
+ dev_err(hdev->dev, "DMA Failed, cause 0x%x\n", err_cause);
+ dev_dbg(hdev->dev,
+ "Clearing DMA0 engine from errors (cause 0x%x)\n",
+ err_cause);
+ WREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset, err_cause);
+
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size,
+ void *blob_addr)
+{
+ u32 dma_core_sts0, err_cause, cfg1, size_left, pos, size_to_dma;
+ struct gaudi_device *gaudi = hdev->asic_specific;
+ u64 dma_offset, qm_offset;
+ dma_addr_t dma_addr;
+ void *kernel_addr;
+ bool is_eng_idle;
+ int rc = 0, dma_id;
+
+ kernel_addr = hdev->asic_funcs->asic_dma_alloc_coherent(
+ hdev, SZ_2M,
+ &dma_addr,
+ GFP_KERNEL | __GFP_ZERO);
+
+ if (!kernel_addr)
+ return -ENOMEM;
+
+ mutex_lock(&gaudi->clk_gate_mutex);
+
+ hdev->asic_funcs->disable_clock_gating(hdev);
+
+ hdev->asic_funcs->hw_queues_lock(hdev);
+
+ dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_1];
+ dma_offset = dma_id * DMA_CORE_OFFSET;
+ qm_offset = dma_id * DMA_QMAN_OFFSET;
+ dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset);
+ is_eng_idle = IS_DMA_IDLE(dma_core_sts0);
+
+ if (!is_eng_idle) {
+ dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_2];
+ dma_offset = dma_id * DMA_CORE_OFFSET;
+ qm_offset = dma_id * DMA_QMAN_OFFSET;
+ dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset);
+ is_eng_idle = IS_DMA_IDLE(dma_core_sts0);
+
+ if (!is_eng_idle) {
+ dev_err_ratelimited(hdev->dev,
+ "Can't read via DMA because it is BUSY\n");
+ rc = -EAGAIN;
+ goto out;
+ }
+ }
+
+ cfg1 = RREG32(mmDMA0_QM_GLBL_CFG1 + qm_offset);
+ WREG32(mmDMA0_QM_GLBL_CFG1 + qm_offset,
+ 0xF << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT);
+
+ /* TODO: remove this by mapping the DMA temporary buffer to the MMU
+ * using the compute ctx ASID, if exists. If not, use the kernel ctx
+ * ASID
+ */
+ WREG32_OR(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_VAL_SHIFT));
+
+ /* Verify DMA is OK */
+ err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset);
+ if (err_cause) {
+ dev_dbg(hdev->dev,
+ "Clearing DMA0 engine from errors (cause 0x%x)\n",
+ err_cause);
+ WREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset, err_cause);
+ }
+
+ pos = 0;
+ size_left = size;
+ size_to_dma = SZ_2M;
+
+ while (size_left > 0) {
+
+ if (size_left < SZ_2M)
+ size_to_dma = size_left;
+
+ rc = gaudi_dma_core_transfer(hdev, dma_id, addr, size_to_dma,
+ dma_addr);
+ if (rc)
+ break;
+
+ memcpy(blob_addr + pos, kernel_addr, size_to_dma);
+
+ if (size_left <= SZ_2M)
+ break;
+
+ pos += SZ_2M;
+ addr += SZ_2M;
+ size_left -= SZ_2M;
+ }
+
+ /* TODO: remove this by mapping the DMA temporary buffer to the MMU
+ * using the compute ctx ASID, if exists. If not, use the kernel ctx
+ * ASID
+ */
+ WREG32_AND(mmDMA0_CORE_PROT + dma_offset,
+ ~BIT(DMA0_CORE_PROT_VAL_SHIFT));
+
+ WREG32(mmDMA0_QM_GLBL_CFG1 + qm_offset, cfg1);
+
+out:
+ hdev->asic_funcs->hw_queues_unlock(hdev);
+
+ hdev->asic_funcs->set_clock_gating(hdev);
+
+ mutex_unlock(&gaudi->clk_gate_mutex);
+
+ hdev->asic_funcs->asic_dma_free_coherent(hdev, SZ_2M, kernel_addr,
+ dma_addr);
+
+ return rc;
+}
+
static u64 gaudi_read_pte(struct hl_device *hdev, u64 addr)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@@ -6851,7 +7107,8 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
}
/* Write 1 clear errors */
- WREG32(glbl_sts_addr + 4 * i, glbl_sts_clr_val);
+ if (!hdev->stop_on_err)
+ WREG32(glbl_sts_addr + 4 * i, glbl_sts_clr_val);
}
arb_err_val = RREG32(arb_err_addr);
@@ -7097,6 +7354,15 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
}
}
+static void gaudi_print_out_of_sync_info(struct hl_device *hdev,
+ struct cpucp_pkt_sync_err *sync_err)
+{
+ struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ];
+
+ dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
+ sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+}
+
static int gaudi_soft_reset_late_init(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@@ -7371,18 +7637,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_MMU_DERR:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
- if (hdev->hard_reset_on_fw_events)
- hl_device_reset(hdev, true, false);
- break;
+ goto reset_device;
case GAUDI_EVENT_GIC500:
case GAUDI_EVENT_AXI_ECC:
case GAUDI_EVENT_L2_RAM_ECC:
case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
gaudi_print_irq_info(hdev, event_type, false);
- if (hdev->hard_reset_on_fw_events)
- hl_device_reset(hdev, true, false);
- break;
+ goto reset_device;
case GAUDI_EVENT_HBM0_SPI_0:
case GAUDI_EVENT_HBM1_SPI_0:
@@ -7392,9 +7654,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
- if (hdev->hard_reset_on_fw_events)
- hl_device_reset(hdev, true, false);
- break;
+ goto reset_device;
case GAUDI_EVENT_HBM0_SPI_1:
case GAUDI_EVENT_HBM1_SPI_1:
@@ -7423,8 +7683,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
dev_err(hdev->dev, "hard reset required due to %s\n",
gaudi_irq_map_table[event_type].name);
- if (hdev->hard_reset_on_fw_events)
- hl_device_reset(hdev, true, false);
+ goto reset_device;
} else {
hl_fw_unmask_irq(hdev, event_type);
}
@@ -7446,8 +7705,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
dev_err(hdev->dev, "hard reset required due to %s\n",
gaudi_irq_map_table[event_type].name);
- if (hdev->hard_reset_on_fw_events)
- hl_device_reset(hdev, true, false);
+ goto reset_device;
} else {
hl_fw_unmask_irq(hdev, event_type);
}
@@ -7516,9 +7774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_RAZWI_OR_ADC_SW:
gaudi_print_irq_info(hdev, event_type, true);
- if (hdev->hard_reset_on_fw_events)
- hl_device_reset(hdev, true, false);
- break;
+ goto reset_device;
case GAUDI_EVENT_TPC0_BMON_SPMU:
case GAUDI_EVENT_TPC1_BMON_SPMU:
@@ -7552,11 +7808,28 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
event_type, cause);
break;
+ case GAUDI_EVENT_DEV_RESET_REQ:
+ gaudi_print_irq_info(hdev, event_type, false);
+ goto reset_device;
+
+ case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
+ gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
+ goto reset_device;
+
default:
dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
event_type);
break;
}
+
+ return;
+
+reset_device:
+ if (hdev->hard_reset_on_fw_events)
+ hl_device_reset(hdev, HL_RESET_HARD);
+ else
+ hl_fw_unmask_irq(hdev, event_type);
}
static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate,
@@ -7607,7 +7880,7 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
- hl_device_reset(hdev, true, false);
+ hl_device_reset(hdev, HL_RESET_HARD);
}
return rc;
@@ -7656,7 +7929,7 @@ static int gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
- hl_device_reset(hdev, true, false);
+ hl_device_reset(hdev, HL_RESET_HARD);
}
return rc;
@@ -7714,7 +7987,7 @@ static int gaudi_cpucp_info_get(struct hl_device *hdev)
if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q))
return 0;
- rc = hl_fw_cpucp_info_get(hdev, mmCPU_BOOT_DEV_STS0, mmCPU_BOOT_ERR0);
+ rc = hl_fw_cpucp_handshake(hdev, mmCPU_BOOT_DEV_STS0, mmCPU_BOOT_ERR0);
if (rc)
return rc;
@@ -7724,10 +7997,7 @@ static int gaudi_cpucp_info_get(struct hl_device *hdev)
hdev->card_type = le32_to_cpu(hdev->asic_prop.cpucp_info.card_type);
- if (hdev->card_type == cpucp_card_type_pci)
- prop->max_power_default = MAX_POWER_DEFAULT_PCI;
- else if (hdev->card_type == cpucp_card_type_pmc)
- prop->max_power_default = MAX_POWER_DEFAULT_PMC;
+ set_default_power_values(hdev);
hdev->max_power = prop->max_power_default;
@@ -8549,6 +8819,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
.debugfs_write32 = gaudi_debugfs_write32,
.debugfs_read64 = gaudi_debugfs_read64,
.debugfs_write64 = gaudi_debugfs_write64,
+ .debugfs_read_dma = gaudi_debugfs_read_dma,
.add_device_attr = gaudi_add_device_attr,
.handle_eqe = gaudi_handle_eqe,
.set_pll_profile = gaudi_set_pll_profile,